Example 1
def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('interval'), value=list(map(str, range(1, 23))) + ['X'])

    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
            )
        )

    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
            )
        )

    return workflow
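
run_MutationSeq only assembles and returns the workflow; nothing executes until it is handed to a runner. A minimal sketch, assuming the same pypeliner.app.Pypeline pattern used in Example 4 below (the config dict, runner arguments, and file paths are hypothetical placeholders):

import pypeliner
import pypeliner.app

config = {}  # settings consumed by tasks.run_museq / tasks.merge_vcfs
workflow = run_MutationSeq(config, 'normal.bam', 'tumour.bam', 'museq_merged.vcf')

pyp = pypeliner.app.Pypeline(config={'tmpdir': './pipeline_tmp'})  # hypothetical runner args
pyp.run(workflow)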
Example 2
def call_copynumber(
        samples, config, tumours, normals, breakpoints,
        titan_raw_dir, remixt_results,
        remixt_raw_dir, titan_segments, titan_params, titan_markers
):
    breakpoints = {sampid: breakpoints[sampid] for sampid in samples}
    remixt_results = {sampid: remixt_results[sampid] for sampid in samples}
    titan_segments = {sampid: titan_segments[sampid] for sampid in samples}
    titan_params = {sampid: titan_params[sampid] for sampid in samples}
    titan_markers = {sampid: titan_markers[sampid] for sampid in samples}

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments', 'sample_id', fnames=titan_segments),
            mgd.OutputFile('titan_params', 'sample_id', fnames=titan_params),
            mgd.OutputFile('titan_markers', 'sample_id', fnames=titan_markers),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id', fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id', fnames=normals, extensions=['.bai']),
            mgd.InputFile('breakpoints', 'sample_id', fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results', 'sample_id', fnames=remixt_results),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    return workflow
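
The fnames= dicts passed to mgd.InputFile/mgd.OutputFile are keyed by the axis value, so every per-sample argument to call_copynumber must be a dict over the same sample IDs. A minimal sketch of the expected input shape, with hypothetical sample IDs and paths:

samples = ['SA123', 'SA456']  # hypothetical sample IDs
tumours = {s: '/data/{}.tumour.bam'.format(s) for s in samples}
normals = {s: '/data/{}.normal.bam'.format(s) for s in samples}
breakpoints = {s: '/data/{}.breakpoints.csv'.format(s) for s in samples}
# remixt_results, titan_segments, titan_params and titan_markers follow the
# same {sample_id: path} shape; extensions=['.bai'] additionally expects a
# .bai index file alongside each BAM.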
Example 3
def patient_workflow(config, patient_id, patient_input, output_file):
    workflow = pypeliner.workflow.Workflow()

    patient_bam_dir = os.path.join(config["bam_directory"], patient_id)
    patient_result_dir = os.path.join(config["results_dir"], patient_id)

    helpers.makedirs(patient_bam_dir)
    helpers.makedirs(patient_result_dir)

    input_args = helpers.create_input_args(patient_input, patient_bam_dir)

    workflow.setobj(obj=mgd.OutputChunks('sample_id', ),
                    value=input_args['all_samples'])

    workflow.subworkflow(name='align_samples',
                         func=alignment.align_sample,
                         axes=('sample_id', ),
                         args=(
                             config,
                             mgd.InputFile('fastq_1',
                                           'sample_id',
                                           fnames=input_args['fastqs_r1']),
                             mgd.InputFile('fastq_2',
                                           'sample_id',
                                           fnames=input_args['fastqs_r2']),
                             mgd.InputInstance('sample_id'),
                             mgd.OutputFile('sample.bam',
                                            'sample_id',
                                            fnames=input_args['all_bams']),
                             mgd.OutputFile('sample.bam.bai',
                                            'sample_id',
                                            fnames=input_args['all_bais']),
                         ))

    workflow.subworkflow(name='run_analyses',
                         func=analysis.partition_tumour,
                         args=(
                             config,
                             input_args,
                             patient_id,
                             patient_result_dir,
                             mgd.InputFile('sample.bam',
                                           'sample_id',
                                           fnames=input_args['all_bams'],
                                           axes_origin=[]),
                             mgd.InputFile('sample.bam.bai',
                                           'sample_id',
                                           fnames=input_args['all_bais'],
                                           axes_origin=[]),
                             mgd.OutputFile(output_file),
                         ))

    return workflow
Example 4
def ctDNA_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config'])
    for arg, value in args.items():
        config[arg] = value

    helpers.makedirs(config["bam_directory"])

    helpers.makedirs(config["results_dir"])

    inputs = helpers.load_yaml(args['input_yaml'])
    patients = list(inputs.keys())

    workflow.setobj(obj=mgd.OutputChunks('patient_id', ), value=patients)

    workflow.transform(name='get_input_by_patient',
                       func=helpers.get_input_by_patient,
                       ret=mgd.TempOutputObj('patient_input', 'patient_id'),
                       axes=('patient_id', ),
                       args=(
                           inputs,
                           mgd.InputInstance('patient_id'),
                       ))

    workflow.subworkflow(name='patient_workflow',
                         func=patient_workflow,
                         axes=('patient_id', ),
                         args=(
                             config,
                             mgd.InputInstance('patient_id'),
                             mgd.TempInputObj('patient_input', 'patient_id'),
                             mgd.OutputFile(
                                 os.path.join(config['results_dir'],
                                              '{patient_id}.log'),
                                 'patient_id'),
                         ))

    pyp.run(workflow)
Example 5
def create_museq_workflow(
        normal_bam, tumour_bam, ref_genome, snv_vcf,
        config):

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=normal_bam.keys(),
    )

    workflow.transform(
        name='run_museq',
        ctx=dict(mem=config["memory"]['med'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        axes=('region',),
        func='single_cell.workflows.mutationseq.tasks.run_museq',
        args=(
            mgd.InputFile('merged_bam', 'region', fnames=tumour_bam),
            mgd.InputFile('normal.split.bam', 'region', fnames=normal_bam),
            mgd.TempOutputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.log', 'region'),
            mgd.InputInstance('region'),
            config,
        ),
        kwargs={'docker_kwargs': helpers.get_container_ctx(config['containers'], 'mutationseq')}
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=config["memory"]['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('museq.vcf', 'region'),
            mgd.OutputFile(snv_vcf),
        ),
    )

    return workflow
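
This variant reads only a few config keys; a sketch of the minimal structure those lookups imply (values are placeholders, not real defaults):

config = {
    'memory': {'med': 8},                                     # per-task memory request
    'pools': {'highmem': 'hm_pool', 'standard': 'std_pool'},  # scheduler pool ids
    'containers': {},                                         # fed to helpers.get_container_ctx
}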
Example 6
def create_extract_seqdata_workflow(
        bam_filename,
        seqdata_filename,
        config,
        ref_data_dir,
):
    chromosomes = remixt.config.get_chromosomes(config, ref_data_dir)
    snp_positions_filename = remixt.config.get_filename(config, ref_data_dir, 'snp_positions')

    bam_max_fragment_length = remixt.config.get_param(config, 'bam_max_fragment_length')
    bam_max_soft_clipped = remixt.config.get_param(config, 'bam_max_soft_clipped')
    bam_check_proper_pair = remixt.config.get_param(config, 'bam_check_proper_pair')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('chromosome'), value=chromosomes)

    workflow.transform(
        name='create_chromosome_seqdata',
        axes=('chromosome',),
        ctx={'mem': 16},
        func=remixt.seqdataio.create_chromosome_seqdata,
        args=(
            mgd.TempOutputFile('seqdata', 'chromosome'),
            mgd.InputFile(bam_filename),
            mgd.InputFile(snp_positions_filename),
            mgd.InputInstance('chromosome'),
            bam_max_fragment_length,
            bam_max_soft_clipped,
            bam_check_proper_pair,
        ),
    )

    workflow.transform(
        name='merge_seqdata',
        ctx={'mem': 16},
        func=remixt.seqdataio.merge_seqdata,
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.TempInputFile('seqdata', 'chromosome'),
        ),
    )

    return workflow
Example 7
def align_samples(config, fastq1_inputs, fastq2_inputs, bam_outputs, outdir):
    samples = list(bam_outputs.keys())

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name='align_samples',
        func=align_sample,
        axes=('sample_id', ),
        args=(config,
              mgd.InputFile('input.r1.fastq.gz',
                            'sample_id',
                            fnames=fastq1_inputs),
              mgd.InputFile('input.r2.fastq.gz',
                            'sample_id',
                            fnames=fastq2_inputs),
              mgd.OutputFile('output.bam', 'sample_id', fnames=bam_outputs),
              mgd.InputInstance("sample_id"), outdir),
    )

    return workflow
Example 8
def cohort_qc_pipeline(args):
    """Process maf, run classify copynumber, make plots.
    Args:
        args ([dict]): [pipeline arguments]
    """
    config = inpututils.load_config(args)
    config = config["cohort_qc"]

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = pypeliner.workflow.Workflow()

    out_dir = args["out_dir"]
    api_key = args["API_key"]

    meta_yaml = os.path.join(args['out_dir'], 'metadata.yaml')
    input_yaml_blob = os.path.join(args['out_dir'], 'input.yaml')

    # inputs
    cohort, germline_mafs, vcfs, hmmcopy = inpututils.load_cohort_qc_inputs(
        args["input_yaml"]
    )

    museq = {
        label: data["museq"] for label, data in vcfs.items()
    }
    strelka_snv = {
        label: data["strelka_snv"] for label, data in vcfs.items()
    }
    strelka_indel = {
        label: data["strelka_indel"] for label, data in vcfs.items()
    }
    hmmcopy_files = {
        label: data["hmmcopy"] for label, data in hmmcopy.items()
    }
    hmmcopy_metrics_files = {
        label: data["hmmcopy_metrics"] for label, data in hmmcopy.items()
    }
    # outputs
    cbiofile_paths = get_cbioportal_paths(os.path.join(out_dir, cohort))
    maftools_filepaths = get_maftools_paths(os.path.join(out_dir, cohort))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_label', 'library_label'),
        value=list(museq.keys()),
    )
    workflow.subworkflow(
        name="merge_somatic_mafs",
        func="single_cell.workflows.cohort_qc.merge_somatic_mafs",
        axes=('sample_label',),
        args=(
            mgd.InputInstance('sample_label'),
            config,
            mgd.InputFile(
                'museq', 'sample_label', 'library_label',
                fnames=museq, axes_origin=[]
            ),
            mgd.InputFile(
                'strelka_snv', 'sample_label', 'library_label',
                fnames=strelka_snv, axes_origin=[]
            ),
            mgd.InputFile(
                'strelka_indel', 'sample_label', 'library_label',
                fnames=strelka_indel, axes_origin=[]
            ),
            mgd.TempOutputFile('somatic_maf', 'sample_label')
        ),
    )
    
    workflow.subworkflow(
        name="classifycopynumber",
        func="single_cell.workflows.cohort_qc.cna_annotation_workflow",
        args=(
            config,
            mgd.InputFile(
                'hmmcopy_dict', 'sample_label', 'library_label',
                fnames=hmmcopy_files, axes_origin=[]
            ),
            mgd.InputFile(
                'hmmcopy_metrics_dict', 'sample_label', 'library_label',
                fnames=hmmcopy_metrics_files, axes_origin=[]
            ),
            mgd.OutputFile(cbiofile_paths["cna_table"]),
            mgd.OutputFile(maftools_filepaths["maftools_cna"]),
            mgd.OutputFile(cbiofile_paths["segments"]),
            config["gtf"],

        ),
    )

    workflow.subworkflow(
        name="maf_annotation_workflow",
        func="single_cell.workflows.cohort_qc.preprocess_mafs_workflow",
        args=(
            config,
            mgd.InputFile(
                'germline_mafs_dict', 'sample_label',
                fnames=germline_mafs, axes_origin=[]
            ),
            mgd.TempInputFile(
                'somatic_maf', 'sample_label',
                axes_origin=[]
            ),
            mgd.OutputFile(cbiofile_paths["filtered_germline_maf"]),
            mgd.OutputFile(cbiofile_paths["annotated_somatic_maf"]),
            api_key
        ),
    )
    workflow.subworkflow(
        name="make_plots_and_report",
        func="single_cell.workflows.cohort_qc.create_cohort_oncoplot",
        args=(
            config,
            mgd.InputFile(cbiofile_paths["filtered_germline_maf"]),
            mgd.InputFile(cbiofile_paths["annotated_somatic_maf"]),
            mgd.InputFile(maftools_filepaths["maftools_cna"]),
            mgd.OutputFile(maftools_filepaths["maftools_maf"]),
            mgd.OutputFile(maftools_filepaths["cohort_oncoplot"]),
            mgd.OutputFile(maftools_filepaths["report"]),
            cohort
        ),
    )

    workflow.transform(
        name='generate_meta_files_results',
        func='single_cell.utils.helpers.generate_and_upload_metadata',
        args=(
            sys.argv[0:],
            args['out_dir'],
            list(cbiofile_paths.values()) + list(maftools_filepaths.values()),
            mgd.OutputFile(meta_yaml)
        ),
        kwargs={
            'input_yaml_data': inpututils.load_yaml(args['input_yaml']),
            'input_yaml': mgd.OutputFile(input_yaml_blob),
            'metadata': {'type': 'cohort_qc'}
        }
    )
    pyp.run(workflow)
Example 9
def process_cells_destruct(destruct_config,
                           cell_bam_files,
                           reads_1,
                           reads_2,
                           sample_1,
                           sample_2,
                           stats,
                           tag=False):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
    }

    cells = list(cell_bam_files.keys())

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cells,
    )

    workflow.transform(
        name='bamdisc_and_numreads_cell',
        func=
        "single_cell.workflows.destruct_singlecell.tasks.destruct_bamdisc_and_numreads",
        axes=('cell_id', ),
        ctx={
            'io': 1,
            'mem': 8
        },
        ret=mgd.TempOutputObj("numreads", "cell_id"),
        args=(
            destruct_config,
            mgd.InputFile('bam', 'cell_id', fnames=cell_bam_files),
            mgd.TempOutputFile('cell_stats', 'cell_id'),
            mgd.TempOutputFile('cell_reads_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_2.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_sample_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_sample_2.fastq.gz', 'cell_id'),
            mgd.TempSpace('bamdisc_cell_tempspace', 'cell_id'),
        ),
    )

    workflow.transform(
        name='merge_read_counts',
        ret=mgd.TempOutputObj("readcounts"),
        func=
        "single_cell.workflows.destruct_singlecell.tasks.merge_read_counts",
        ctx={
            'io': 1,
            'mem': 8
        },
        args=(mgd.TempInputObj('numreads', 'cell_id'), ))

    workflow.transform(
        name='reindex_reads',
        func=
        "single_cell.workflows.destruct_singlecell.tasks.re_index_reads_both",
        ctx={
            'io': 1,
            'mem': 8
        },
        axes=('cell_id', ),
        args=(
            mgd.TempInputFile('cell_reads_1.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_1_reindex.fastq.gz', 'cell_id'),
            mgd.TempInputFile('cell_reads_2.fastq.gz', 'cell_id'),
            mgd.TempOutputFile('cell_reads_2_reindex.fastq.gz', 'cell_id'),
            mgd.InputInstance('cell_id'),
            cells,
            mgd.TempInputObj('readcounts'),
        ),
        kwargs={'tag': tag})

    workflow.transform(
        name='merge_reads_r1',
        ctx={
            'io': 1,
            'mem': 8,
            'disk': 100
        },
        func=
        "single_cell.workflows.destruct_singlecell.tasks.merge_cell_fastqs",
        args=(
            mgd.TempInputFile('cell_reads_1_reindex.fastq.gz', 'cell_id'),
            mgd.OutputFile(reads_1),
        ),
    )

    workflow.transform(
        name='merge_reads_r2',
        ctx={
            'io': 1,
            'mem': 8,
            'disk': 100
        },
        func=
        "single_cell.workflows.destruct_singlecell.tasks.merge_cell_fastqs",
        args=(
            mgd.TempInputFile('cell_reads_2_reindex.fastq.gz', 'cell_id'),
            mgd.OutputFile(reads_2),
        ),
    )

    workflow.transform(
        name='merge_sample',
        ctx={
            'io': 1,
            'mem': 8,
            'disk': 100
        },
        func="single_cell.workflows.destruct_singlecell.tasks.resample_fastqs",
        args=(
            mgd.TempInputFile('cell_sample_1.fastq.gz', 'cell_id'),
            mgd.TempInputFile('cell_sample_2.fastq.gz', 'cell_id'),
            mgd.OutputFile(sample_1),
            mgd.OutputFile(sample_2),
            destruct_config['num_read_samples'],
        ),
    )

    workflow.transform(
        name='merge_stats',
        ctx={
            'io': 1,
            'mem': 8
        },
        func="single_cell.workflows.destruct_singlecell.tasks.merge_stats",
        args=(
            mgd.TempInputFile('cell_stats', 'cell_id'),
            mgd.OutputFile(stats),
        ),
    )

    return workflow
Example 10
def create_split_workflow(normal_bam,
                          normal_split_bam,
                          regions,
                          config,
                          by_reads=False):

    normal_split_bam = {ival: normal_split_bam[ival] for ival in regions}

    one_split_job = config["one_split_job"]

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    # split by reads always runs on a single node
    if by_reads:
        workflow.transform(
            name='split_normal_bam',
            ctx={
                'mem': config['memory']['low'],
                'ncpus': config['max_cores']
            },
            func=
            "single_cell.workflows.split_bams.tasks.split_bam_file_by_reads",
            args=(
                mgd.InputFile(normal_bam, extensions=['.bai']),
                mgd.OutputFile("normal.split.bam",
                               "region",
                               fnames=normal_split_bam,
                               axes_origin=[],
                               extensions=['.bai']),
                mgd.TempSpace("bam_split_by_reads"),
                regions,
            ),
        )

    elif one_split_job:
        workflow.transform(
            name='split_normal_bam',
            ctx={
                'mem': config['memory']['low'],
                'ncpus': config['max_cores']
            },
            func=
            "single_cell.workflows.split_bams.tasks.split_bam_file_one_job",
            args=(mgd.InputFile(normal_bam, extensions=['.bai']),
                  mgd.OutputFile(
                      "normal.split.bam",
                      "region",
                      fnames=normal_split_bam,
                      axes_origin=[],
                      extensions=['.bai'],
                  ), regions, mgd.TempSpace("one_job_split_tempdir")),
            kwargs={"ncores": config["max_cores"]})

    else:
        workflow.transform(
            name='split_normal_bam',
            ctx={
                'mem': config['memory']['low'],
                'ncpus': config['max_cores']
            },
            axes=('region', ),
            func="single_cell.workflows.split_bams.tasks.split_bam_file",
            args=(mgd.InputFile(normal_bam, extensions=['.bai']),
                  mgd.OutputFile("normal.split.bam",
                                 "region",
                                 fnames=normal_split_bam,
                                 extensions=['.bai']),
                  mgd.InputInstance('region')))

    return workflow
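
A hedged invocation sketch for the split workflow above; regions drives both the output chunking and the per-region filename dict, while by_reads and config["one_split_job"] pick one of the three branches (paths, region strings, and config values are hypothetical):

regions = ['1-1-10000000', '1-10000001-20000000']  # hypothetical region strings
normal_split_bam = {r: '/out/normal.split.{}.bam'.format(r) for r in regions}
config = {'one_split_job': True, 'memory': {'low': 4}, 'max_cores': 8}

workflow = create_split_workflow('/data/normal.bam', normal_split_bam, regions, config)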
Example 11
def create_split_workflow(normal_bam,
                          normal_bai,
                          normal_split_bam,
                          normal_split_bai,
                          regions,
                          config,
                          by_reads=False):

    ctx = {'mem_retry_increment': 2}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    normal_split_bam = {ival: normal_split_bam[ival] for ival in regions}
    normal_split_bai = {ival: normal_split_bai[ival] for ival in regions}

    one_split_job = config["one_split_job"]

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    # split by reads always runs on a single node
    if by_reads:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['multicore'],
                     ncpus=config['max_cores'],
                     **ctx),
            func=
            "single_cell.workflows.split_bams.tasks.split_bam_file_by_reads",
            args=(mgd.InputFile(normal_bam), mgd.InputFile(normal_bai),
                  mgd.OutputFile("normal.split.bam",
                                 "region",
                                 fnames=normal_split_bam,
                                 axes_origin=[]),
                  mgd.OutputFile("normal.split.bam.bai",
                                 "region",
                                 fnames=normal_split_bai,
                                 axes_origin=[]),
                  mgd.TempSpace("bam_split_by_reads"), regions,
                  helpers.get_container_ctx(config['containers'], 'samtools')),
        )

    elif one_split_job:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['multicore'],
                     ncpus=config['max_cores'],
                     **ctx),
            func=
            "single_cell.workflows.split_bams.tasks.split_bam_file_one_job",
            args=(mgd.InputFile(normal_bam, extensions=['.bai']),
                  mgd.OutputFile(
                      "normal.split.bam",
                      "region",
                      fnames=normal_split_bam,
                      axes_origin=[],
                      extensions=['.bai'],
                  ), regions,
                  helpers.get_container_ctx(config['containers'], 'samtools')),
            kwargs={"ncores": config["max_cores"]})

    else:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['standard'],
                     ncpus=1,
                     **ctx),
            axes=('region', ),
            func="single_cell.workflows.split_bams.tasks.split_bam_file",
            args=(mgd.InputFile(normal_bam), mgd.InputFile(normal_bai),
                  mgd.OutputFile("normal.split.bam",
                                 "region",
                                 fnames=normal_split_bam),
                  mgd.OutputFile("normal.split.bam.bai",
                                 "region",
                                 fnames=normal_split_bai),
                  mgd.InputInstance('region'),
                  helpers.get_container_ctx(config['containers'], 'samtools')))

    return workflow
Example 12
def create_titan_workflow(tumour_bam, normal_bam, targets, titan_raw_dir,
                          segments, params, markers, global_config, config,
                          intervals, sample_id):
    titan_outdir = os.path.join(titan_raw_dir, 'clusters_{numclusters}',
                                'ploidy_{ploidy}')
    igv_template = os.path.join(titan_outdir, 'igv_segs.txt')
    outfile_template = os.path.join(titan_outdir, 'titan_markers.txt')
    params_template = os.path.join(titan_outdir, 'titan_params.txt')
    segs_template = os.path.join(titan_outdir, 'titan_segs.txt')
    plots_template = os.path.join(titan_outdir, 'titan_plots.tar.gz')
    parsed_template = os.path.join(titan_outdir, 'titan_parsed.csv')
    museq_vcf = os.path.join(titan_raw_dir, 'museq.vcf')

    chunks = [(v['num_clusters'], v['ploidy']) for v in intervals]

    targets = mgd.InputFile(targets) if targets else None

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    workflow.transform(name='generate_intervals',
                       func=tasks.generate_intervals,
                       ctx={
                           'mem': global_config['memory']['low'],
                           'ncpus': 1,
                           'walltime': '02:00'
                       },
                       ret=mgd.OutputChunks('interval'),
                       args=(config['reference_genome'],
                             config['chromosomes']))

    workflow.transform(
        name='run_museq',
        ctx={
            'mem': global_config['memory']['high'],
            'ncpus': global_config['threads'],
            'walltime': '02:00'
        },
        func=tasks.run_museq,
        axes=('interval', ),
        args=(mgd.InputFile(tumour_bam, extensions=['.bai']),
              mgd.InputFile(normal_bam, extensions=['.bai']),
              mgd.TempOutputFile('museq.vcf', 'interval'),
              mgd.TempOutputFile('museq.log',
                                 'interval'), config['reference_genome'],
              mgd.InputInstance('interval'), config['museq_params']),
    )

    workflow.transform(name='merge_vcfs',
                       ctx={
                           'num_retry': 3,
                           'mem_retry_increment': 2,
                           'mem': global_config['memory']['high'],
                           'ncpus': 1
                       },
                       func=tasks.merge_vcfs,
                       args=(
                           mgd.TempInputFile('museq.vcf', 'interval'),
                           mgd.OutputFile(museq_vcf),
                       ))

    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx={
            'mem': global_config['memory']['high'],
            'ncpus': 1,
            'walltime': '02:00'
        },
        func=tasks.convert_museq_vcf2counts,
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            config,
        ),
    )

    workflow.transform(
        name='run_readcounter_tumour',
        ctx={
            'mem': global_config['memory']['high'],
            'ncpus': 1,
            'walltime': '02:00'
        },
        func=tasks.run_readcounter,
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            config,
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx={
            'mem': global_config['memory']['high'],
            'ncpus': 1,
            'walltime': '02:00'
        },
        func=tasks.run_readcounter,
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            config,
        ),
    )

    workflow.transform(
        name='calc_correctreads_wig',
        ctx={
            'mem': global_config['memory']['low'],
            'ncpus': 1,
            'walltime': '02:00'
        },
        func=tasks.calc_correctreads_wig,
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            config,
        ),
    )

    workflow.transform(name='run_titan',
                       axes=('numclusters', 'ploidy'),
                       ctx={
                           'mem': global_config['memory']['high'],
                           'ncpus': 1,
                           'walltime': '06:00'
                       },
                       func=tasks.run_titan,
                       args=(mgd.TempInputFile('museq_postprocess.txt'),
                             mgd.TempInputFile('correct_reads.txt'),
                             mgd.OutputFile('titan_outfile',
                                            'numclusters',
                                            'ploidy',
                                            template=outfile_template),
                             mgd.TempOutputFile('titan.Rdata', 'numclusters',
                                                'ploidy'),
                             mgd.OutputFile('titan_params',
                                            'numclusters',
                                            'ploidy',
                                            template=params_template),
                             config['titan_params'],
                             mgd.InputInstance('numclusters'),
                             mgd.InputInstance('ploidy')))

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx={
            'mem': global_config['memory']['low'],
            'ncpus': 1,
            'walltime': '02:00'
        },
        func=tasks.plot_titan,
        args=(mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
              mgd.InputFile('titan_params',
                            'numclusters',
                            'ploidy',
                            template=params_template),
              mgd.OutputFile('titan_plots',
                             'numclusters',
                             'ploidy',
                             template=plots_template),
              mgd.TempSpace("titan_plots_tempdir",
                            'numclusters', 'ploidy'), config,
              mgd.InputInstance('numclusters'), mgd.InputInstance('ploidy')),
    )

    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx={
            'mem': global_config['memory']['low'],
            'ncpus': 1,
            'walltime': '02:00'
        },
        func=tasks.calc_cnsegments_titan,
        args=(
            mgd.InputFile('titan_outfile',
                          'numclusters',
                          'ploidy',
                          template=outfile_template),
            mgd.OutputFile('titan_igv',
                           'numclusters',
                           'ploidy',
                           template=igv_template),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
        ),
    )

    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx={
            'mem': global_config['memory']['low'],
            'ncpus': 1,
            'walltime': '02:00'
        },
        func=tasks.annot_pygenes,
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.OutputFile('titan_segs.csv',
                           'numclusters',
                           'ploidy',
                           template=segs_template),
            config,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx={
            'mem': global_config['memory']['low'],
            'ncpus': 1,
            'walltime': '02:00'
        },
        func=tasks.parse_titan,
        args=(
            mgd.InputFile('titan_segs.csv',
                          'numclusters',
                          'ploidy',
                          template=segs_template),
            mgd.InputFile('titan_params',
                          'numclusters',
                          'ploidy',
                          template=params_template),
            mgd.InputFile('titan_outfile',
                          'numclusters',
                          'ploidy',
                          template=outfile_template),
            mgd.OutputFile('titan_parsed.csv',
                           'numclusters',
                           'ploidy',
                           template=parsed_template),
            config['parse_titan'],
            sample_id,
        ),
    )

    workflow.transform(
        name='segments_h5',
        ctx={
            'mem': global_config['memory']['low'],
            'ncpus': 1,
            'walltime': '02:00'
        },
        func=tasks.merge_to_h5,
        args=(mgd.InputFile('titan_segs.csv',
                            'numclusters',
                            'ploidy',
                            template=segs_template), mgd.OutputFile(segments),
              intervals),
    )

    workflow.transform(
        name='params_h5',
        ctx={
            'mem': global_config['memory']['low'],
            'ncpus': 1,
            'walltime': '02:00'
        },
        func=tasks.merge_to_h5,
        args=(mgd.InputFile('titan_params',
                            'numclusters',
                            'ploidy',
                            template=params_template), mgd.OutputFile(params),
              intervals),
    )

    workflow.transform(name='markers_h5',
                       ctx={
                           'mem': global_config['memory']['low'],
                           'ncpus': 1,
                           'walltime': '02:00'
                       },
                       func=tasks.merge_to_h5,
                       args=(mgd.InputFile('titan_outfile',
                                           'numclusters',
                                           'ploidy',
                                           template=outfile_template),
                             mgd.OutputFile(markers), intervals),
                       kwargs={'dtype': {
                           'Chr': str
                       }})

    return workflow
Example 13
def infer_haps(
        bam_file,
        haplotypes_filename,
        config,
        from_tumour=False,
):
    baseimage = {'docker_image': config['docker']['single_cell_pipeline']}

    remixt_image = config['docker']['remixt']

    remixt_config = config.get('extract_seqdata', {})
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']
    remixt_config['chromosomes'] = chromosomes

    ctx = dict(mem_retry_increment=2, disk_retry_increment=50, ncpus=1, **baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )

        # don't parallelize over chromosomes for per-cell bams
        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id',),
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'docker_image': remixt_image},
            args=(
                mgd.InputFile(
                    'bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']
                ),
                mgd.TempOutputFile('seqdata_cell.h5', 'cell_id'),
                remixt_config,
                remixt_ref_data_dir,
            ),
            kwargs={'no_parallelism': True}
        )
        workflow.transform(
            name='merge_all_seqdata',
            func="remixt.seqdataio.merge_overlapping_seqdata",
            ctx={'docker_image': remixt_image},
            args=(
                mgd.TempOutputFile('seqdata_file.h5'),
                mgd.TempInputFile("seqdata_cell.h5", "cell_id"),
                config["chromosomes"]
            ),
        )
    else:
        workflow.subworkflow(
            name='extract_seqdata',
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'disk': 150, 'docker_image': remixt_image},
            args=(
                mgd.InputFile(bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_file.h5'),
                remixt_config,
                remixt_ref_data_dir,
            ),
        )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosomes,
    )

    if from_tumour:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_tumour'
    else:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_normal'

    workflow.transform(
        name='infer_snp_genotype',
        axes=('chromosome',),
        ctx={'mem': 16, 'docker_image': remixt_image},
        func=func,
        args=(
            mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputInstance('chromosome'),
            config,
        ),
    )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome',),
        ctx={'mem': 16, 'docker_image': remixt_image},
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haplotypes.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(
        name='merge_haps',
        ctx={'mem': 16, 'docker_image': remixt_image},
        func='remixt.utils.merge_tables',
        args=(
            mgd.TempOutputFile('haplotypes_merged.tsv'),
            mgd.TempInputFile('haplotypes.tsv', 'chromosome'),
        )
    )

    workflow.transform(
        name='finalize_csv',
        ctx={'mem': 16},
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(
            mgd.TempInputFile('haplotypes_merged.tsv'),
            mgd.OutputFile(haplotypes_filename, extensions=['.yaml']),
        ),
        kwargs={
            'write_header': True,
            'dtypes': dtypes()['haplotypes']
        },
    )

    return workflow
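
infer_haps dispatches on the type of bam_file: a dict of per-cell BAMs triggers per-cell seqdata extraction followed by a merge, while a single path goes through one extraction. A sketch of both call shapes (paths and config values are hypothetical placeholders):

config = {
    'docker': {'single_cell_pipeline': 'img:scp', 'remixt': 'img:remixt'},  # placeholder images
    'ref_data_dir': '/refdata',
    'chromosomes': ['1', '2', 'X'],
}
wf = infer_haps('/data/normal.bam', '/out/haplotypes.csv.gz', config)    # bulk BAM
cell_bams = {'cell_A': '/data/cell_A.bam', 'cell_B': '/data/cell_B.bam'}
wf = infer_haps(cell_bams, '/out/haplotypes.csv.gz', config)             # per-cell BAMs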
Example 14
def create_titan_workflow(
        tumour_bam, normal_bam, targets, outfile, params, segs, igv_segs,
        parsed, plots, tar_outputs, museq_vcf,
        sample_id, reference, chromosomes, het_positions, map_wig, gc_wig, pygenes_gtf,
        single_node=None
):
    cn_params = config.default_params('copynumber_calling')

    chunks = [(v['num_clusters'], v['ploidy']) for v in cn_params['titan_intervals']]

    targets = mgd.InputFile(targets) if targets else None

    ctx = {'docker_image': config.containers('wgs')}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('numclusters', 'ploidy'),
        value=chunks,
    )

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.titan.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='2:00', ),
        ret=mgd.OutputChunks('interval'),
        args=(
            reference,
            chromosomes,
        ),
        kwargs={'size': cn_params['split_size']}
    )

    if single_node:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='96:00',
                ncpus=8),
            func='wgs.utils.museq_utils.run_museq_one_job',
            args=(
                mgd.TempSpace("run_museq_temp"),
                mgd.OutputFile(museq_vcf),
                reference,
                mgd.InputChunks('interval'),
                cn_params['museq_params'],
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'museq_docker_image': config.containers('mutationseq'),
                'vcftools_docker_image': config.containers('vcftools')
            }
        )
    else:
        workflow.transform(
            name='run_museq',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00'),
            axes=('interval',),
            func='wgs.utils.museq_utils.run_museq',
            args=(
                mgd.TempOutputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('museq.log', 'interval'),
                reference,
                mgd.InputInstance('interval'),
                cn_params['museq_params']
            ),
            kwargs={
                'tumour_bam': mgd.InputFile(tumour_bam, extensions=['.bai']),
                'normal_bam': mgd.InputFile(normal_bam, extensions=['.bai']),
                'titan_mode': True,
                'docker_image': config.containers('mutationseq')
            }
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='4:00', ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.OutputFile(museq_vcf),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')}
        )

    workflow.transform(
        name='convert_museq_vcf2counts',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.convert_museq_vcf2counts',
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq_postprocess.txt'),
            het_positions,
        ),
    )

    workflow.transform(
        name='run_readcounter_tumour',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(tumour_bam, extensions=['.bai']),
            mgd.TempOutputFile('tumour.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='run_readcounter_normal',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00',
            disk=200
        ),
        func='wgs.workflows.titan.tasks.run_readcounter',
        args=(
            mgd.InputFile(normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('normal.wig'),
            chromosomes,
            cn_params['readcounter']
        ),
    )

    workflow.transform(
        name='calc_correctreads_wig',
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.calc_correctreads_wig',
        args=(
            mgd.TempInputFile('tumour.wig'),
            mgd.TempInputFile('normal.wig'),
            targets,
            mgd.TempOutputFile('correct_reads.txt'),
            gc_wig,
            map_wig,
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='run_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='72:00',
            ncpus='8'),
        func='wgs.workflows.titan.tasks.run_titan',
        args=(
            mgd.TempInputFile('museq_postprocess.txt'),
            mgd.TempInputFile('correct_reads.txt'),
            mgd.TempOutputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_params', 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy'),
            sample_id,
            map_wig,
            cn_params['titan_params'],
            cn_params['genome_type']
        ),
        kwargs={'docker_image': config.containers('titan'), 'threads': '8'}
    )

    workflow.transform(
        name='plot_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='16:00', ),
        func='wgs.workflows.titan.tasks.plot_titan',
        args=(
            mgd.TempInputFile('titan.Rdata', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_plots', 'numclusters', 'ploidy'),
            mgd.TempSpace("titan_plots_tempdir", 'numclusters', 'ploidy'),
            mgd.InputInstance('numclusters'),
            mgd.InputInstance('ploidy')
        ),
        kwargs={
            'chromosomes': chromosomes,
            'docker_image': config.containers('titan'),
        },
    )

    workflow.transform(
        name='calc_cnsegments_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.calc_cnsegments_titan',
        args=(
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('segs.csv', 'numclusters', 'ploidy'),
            sample_id,
        ),
        kwargs={'docker_image': config.containers('titan')}
    )

    workflow.transform(
        name='annot_pygenes',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=10,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.annot_pygenes',
        args=(
            mgd.TempInputFile('segs.csv', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            pygenes_gtf,
        ),
    )

    workflow.transform(
        name='parse_titan',
        axes=('numclusters', 'ploidy'),
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func='wgs.workflows.titan.tasks.parse_titan_data',
        args=(
            mgd.TempInputFile('titan_segs.csv', 'numclusters', 'ploidy'),
            mgd.TempInputFile('titan_outfile', 'numclusters', 'ploidy'),
            mgd.TempOutputFile('titan_parsed.csv', 'numclusters', 'ploidy'),
        ),
    )

    # select optimal solution
    workflow.transform(
        name="select_optimal_solution",
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func="wgs.workflows.titan.tasks.select_optimal_solution",
        args=(
            chunks,
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(segs, extensions=['.yaml']),
            mgd.OutputFile(igv_segs, extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
            mgd.OutputFile(outfile, extensions=['.yaml']),
            mgd.OutputFile(parsed, extensions=['.yaml']),
            mgd.OutputFile(plots),
        )
    )

    workflow.transform(
        name='tar_all_data',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='4:00', ),
        func="wgs.workflows.titan.tasks.tar_all_data",
        args=(
            mgd.TempInputFile('titan_params', 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_segs.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile('titan_igv', 'numclusters', 'ploidy'),
            mgd.TempInputFile("titan_outfile", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_parsed.csv", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.TempInputFile("titan_plots", 'numclusters', 'ploidy', axes_origin=[]),
            mgd.OutputFile(tar_outputs),
            mgd.TempSpace("titan_all_parameters_data"),
            chunks
        )
    )

    return workflow
Example 15
def cna_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()

    config = helpers.load_yaml(args['config_file'])
    inputs = helpers.load_yaml(args['input_yaml'])

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    targets = helpers.get_values_from_input(inputs, 'target_list')
    breakpoints = helpers.get_values_from_input(inputs, 'breakpoints')
    samples = list(tumours.keys())

    cna_outdir = os.path.join(args['out_dir'], 'copynumber', '{sample_id}')
    remixt_results_filename = os.path.join(cna_outdir, 'remixt', 'results.h5')
    remixt_raw_dir = os.path.join(cna_outdir, 'remixt', 'raw_data')

    titan_raw_dir = os.path.join(cna_outdir, 'titan')
    titan_segments_filename = os.path.join(titan_raw_dir, 'segments.h5')
    titan_markers_filename = os.path.join(titan_raw_dir, 'markers.h5')
    titan_params_filename = os.path.join(titan_raw_dir, 'params.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=samples)

    workflow.subworkflow(
        name='titan',
        func=titan.create_titan_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile("tumour.bam", 'sample_id', fnames=tumours,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("normal.bam", 'sample_id', fnames=normals,
                          extensions=['.bai'], axes_origin=[]),
            mgd.InputFile("target_list", 'sample_id', fnames=targets,
                          axes_origin=[]),
            mgd.Template(titan_raw_dir, 'sample_id'),
            mgd.OutputFile('titan_segments_filename', 'sample_id',
                           axes_origin=[], template=titan_segments_filename),
            mgd.OutputFile('titan_params_filename', 'sample_id',
                           axes_origin=[], template=titan_params_filename),
            mgd.OutputFile('titan_markers_filename', 'sample_id',
                           axes_origin=[], template=titan_markers_filename),
            config['globals'],
            config['cna_calling'],
            config['cna_calling']['titan_intervals'],
            mgd.InputInstance('sample_id'),
        ),
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.create_remixt_workflow,
        axes=('sample_id',),
        args=(
            mgd.InputFile('tumour_bam', 'sample_id',
                          fnames=tumours, extensions=['.bai']),
            mgd.InputFile('normal_bam', 'sample_id',
                          fnames=normals, extensions=['.bai']),
            mgd.InputFile('destruct_breakpoints', 'sample_id',
                          axes_origin=[], fnames=breakpoints),
            mgd.InputInstance('sample_id'),
            config['cna_calling']['remixt_refdata'],
            mgd.OutputFile('remixt_results_filename', 'sample_id',
                           axes_origin=[], template=remixt_results_filename),
            mgd.Template(remixt_raw_dir, 'sample_id'),
            config['cna_calling']['min_num_reads']
        ),
    )

    pyp.run(workflow)
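
Two filename mechanisms appear side by side in this example: template= formats the path from the axis value at runtime, while fnames= (as in Example 2) looks the chunk up in a precomputed dict. A sketch of the equivalence (paths and sample IDs hypothetical):

import pypeliner.managed as mgd

# template-based: the path is formatted from the axis value
mgd.OutputFile('titan_segments_filename', 'sample_id',
               template='/out/copynumber/{sample_id}/titan/segments.h5')

# fnames-based: the path is looked up in a dict keyed by the axis value
segments = {'SA123': '/out/copynumber/SA123/titan/segments.h5'}
mgd.OutputFile('titan_segments', 'sample_id', fnames=segments)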
Example 16
def create_museq_workflow(
        normal_bam, tumour_bam, ref_genome, snv_vcf,
        config):
    museq_docker = {'docker_image': config['docker']['mutationseq']}
    vcftools_docker = {'docker_image': config['docker']['vcftools']}

    ctx = {'mem_retry_increment': 2, 'disk_retry_increment': 50, 'ncpus': 1, 'num_retry': 3,
           'docker_image': config['docker']['single_cell_pipeline']}

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bam.keys()),
    )

    workflow.transform(
        name='run_museq',
        ctx=dict(mem=config["memory"]['med']),
        axes=('region',),
        func='single_cell.workflows.mutationseq.tasks.run_museq',
        args=(
            mgd.InputFile('merged_bam', 'region', fnames=tumour_bam, extensions=['.bai']),
            mgd.InputFile('normal.split.bam', 'region', fnames=normal_bam, extensions=['.bai']),
            mgd.TempOutputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.log', 'region'),
            mgd.InputInstance('region'),
            config,
        ),
        kwargs={'docker_kwargs': museq_docker}
    )

    workflow.transform(
        name='finalise_region_vcfs',
        axes=('region',),
        ctx=dict(mem=config["memory"]['med']),
        func='biowrappers.components.io.vcf.tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.vcf.gz', 'region', extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_config': vcftools_docker}
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=config["memory"]['med']),
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('museq.vcf.gz', 'region', extensions=['.tbi', '.csi']),
            mgd.TempOutputFile('museq.vcf.gz', extensions=['.tbi', '.csi']),
        ),
        kwargs={
            'allow_overlap': True,
            'docker_config': vcftools_docker
        },
    )

    workflow.transform(
        name='finalise_vcf',
        func='biowrappers.components.io.vcf.tasks.finalise_vcf',
        ctx=dict(mem=config["memory"]['med']),
        args=(
            mgd.TempInputFile('museq.vcf.gz', extensions=['.tbi', '.csi']),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
        kwargs={'docker_config': vcftools_docker}
    )

    return workflow
Example 17
def extract_allele_readcounts(
    haplotypes_filename,
    cell_bams,
    allele_counts_filename,
    config,
):
    baseimage = {'docker_image': config['docker']['single_cell_pipeline']}

    remixt_image = config['docker']['remixt']

    remixt_config = config.get('extract_seqdata', {})
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']
    remixt_config['chromosomes'] = chromosomes

    workflow = pypeliner.workflow.Workflow(ctx=baseimage)

    workflow.set_filenames('cell.bam', 'cell_id', fnames=cell_bams)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(cell_bams.keys()),
    )

    workflow.subworkflow(
        name='create_seqdata_readcounts',
        axes=('cell_id', ),
        func='remixt.workflow.create_extract_seqdata_workflow',
        ctx={'docker_image': remixt_image},
        args=(
            mgd.InputFile('cell.bam', 'cell_id', extensions=['.bai']),
            mgd.TempOutputFile('seqdata.h5', 'cell_id', axes_origin=[]),
            remixt_config,
            remixt_ref_data_dir,
        ),
        kwargs={'no_parallelism': True})

    # TODO Segments with bin width from single cell
    workflow.transform(
        name='create_segments',
        func='remixt.analysis.segment.create_segments',
        ctx={
            'mem': 16,
            'docker_image': remixt_image
        },
        args=(
            mgd.TempOutputFile('segments.tsv'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(
        name='generate_haplotypes_tsv',
        func='single_cell.workflows.extract_allele_readcounts.tasks.convert_csv_to_tsv',
        args=(
            mgd.InputFile(haplotypes_filename, extensions=['.yaml']),
            mgd.TempOutputFile('haplotypes.tsv'),
        ),
    )

    workflow.transform(
        name='haplotype_allele_readcount',
        axes=('cell_id', ),
        ctx={
            'mem': 16,
            'docker_image': remixt_image
        },
        func='remixt.analysis.readcount.haplotype_allele_readcount',
        args=(
            mgd.TempOutputFile('allele_counts.tsv', 'cell_id', axes_origin=[]),
            mgd.TempInputFile('segments.tsv'),
            mgd.TempInputFile('seqdata.h5', 'cell_id'),
            mgd.TempInputFile('haplotypes.tsv'),
            remixt_config,
        ),
    )

    workflow.transform(
        name='prep_readcount_csv',
        axes=('cell_id', ),
        func='single_cell.utils.csvutils.rewrite_csv_file',
        args=(
            mgd.TempInputFile('allele_counts.tsv', 'cell_id'),
            mgd.TempOutputFile('allele_counts.csv.gz',
                               'cell_id',
                               extensions=['.yaml']),
        ),
        kwargs={
            'write_header': True,
            'dtypes': dtypes()['readcount']
        },
    )

    workflow.transform(
        name='readcounts_cell_id_annotate',
        axes=('cell_id', ),
        func='single_cell.utils.csvutils.add_col_from_dict',
        args=(
            mgd.TempInputFile('allele_counts.csv.gz',
                              'cell_id',
                              extensions=['.yaml']),
            {
                'cell_id': mgd.InputInstance('cell_id')
            },
            mgd.TempOutputFile('allele_counts_annotate.csv.gz',
                               'cell_id',
                               extensions=['.yaml']),
            dtypes()['readcount'],
        ),
    )

    workflow.transform(
        name='merge_allele_readcount',
        ctx={'mem': 16},
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('allele_counts_annotate.csv.gz',
                              'cell_id',
                              extensions=['.yaml']),
            mgd.OutputFile(allele_counts_filename, extensions=['.yaml']),
        ),
        kwargs={'write_header': True},
    )

    return workflow
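
workflow.set_filenames registers the per-cell paths once under the logical name 'cell.bam', so later nodes can write mgd.InputFile('cell.bam', 'cell_id') without repeating fnames. A self-contained sketch of that pattern, with an illustrative task:

import pypeliner.workflow
import pypeliner.managed as mgd

def record_path(bam_path, out_path):
    # stand-in task: just record which bam was seen
    with open(out_path, 'w') as f:
        f.write(bam_path + '\n')

def set_filenames_sketch(cell_bams):
    workflow = pypeliner.workflow.Workflow()
    workflow.set_filenames('cell.bam', 'cell_id', fnames=cell_bams)
    workflow.setobj(obj=mgd.OutputChunks('cell_id'), value=list(cell_bams.keys()))
    workflow.transform(
        name='record_path',
        axes=('cell_id',),
        func=record_path,
        args=(
            mgd.InputFile('cell.bam', 'cell_id'),  # resolved via set_filenames
            mgd.TempOutputFile('seen.txt', 'cell_id'),
        ),
    )
    return workflow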
Example 18
def create_infer_haps_workflow(
    seqdata_filenames,
    haps_filename,
    config,
    ref_data_dir,
    normal_id=None,
):
    chromosomes = remixt.config.get_chromosomes(config, ref_data_dir)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('chromosome'), value=chromosomes)

    if normal_id is not None:
        normal_seqdata_filename = seqdata_filenames[normal_id]
        
        workflow.transform(
            name='infer_snp_genotype_from_normal',
            axes=('chromosome',),
            ctx={'mem': 16},
            func=remixt.analysis.haplotype.infer_snp_genotype_from_normal,
            args=(
                mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
                mgd.InputFile(normal_seqdata_filename),
                mgd.InputInstance('chromosome'),
                config,
            ),
        )
    
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('tumour_id'),
            value=list(seqdata_filenames.keys()),
        )

        workflow.transform(
            name='infer_snp_genotype_from_tumour',
            axes=('chromosome',),
            ctx={'mem': 16},
            func=remixt.analysis.haplotype.infer_snp_genotype_from_tumour,
            args=(
                mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
                mgd.InputFile('tumour_seqdata', 'tumour_id', fnames=seqdata_filenames),
                mgd.InputInstance('chromosome'),
                config,
            ),
        )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome',),
        ctx={'mem': 16},
        func=remixt.analysis.haplotype.infer_haps,
        args=(
            mgd.TempOutputFile('haps.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            config,
            ref_data_dir,
        )
    )

    workflow.transform(
        name='merge_haps',
        ctx={'mem': 16},
        func=remixt.utils.merge_tables,
        args=(
            mgd.OutputFile(haps_filename),
            mgd.TempInputFile('haps.tsv', 'chromosome'),
        )
    )

    return workflow
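
The normal_id switch selects the genotyping source: with a matched normal, SNP genotypes come from that one seqdata file; without one, every tumour contributes through the 'tumour_id' axis. Illustrative calls (sample IDs, paths, and the empty config are assumptions):

seqdata = {
    'N1': '/data/N1_seqdata.h5',   # matched normal
    'T1': '/data/T1_seqdata.h5',
    'T2': '/data/T2_seqdata.h5',
}
config = {}                        # remixt config dict, elided here
ref_data_dir = '/refdata/remixt'

# matched normal available: genotype SNPs from the normal only
wf = create_infer_haps_workflow(seqdata, 'haps.tsv', config, ref_data_dir,
                                normal_id='N1')

# no matched normal: genotype pooled across all samples
wf = create_infer_haps_workflow(seqdata, 'haps.tsv', config, ref_data_dir)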
Example 19
def create_remixt_seqdata_workflow(
    breakpoint_filename,
    seqdata_filenames,
    results_filenames,
    raw_data_directory,
    config,
    ref_data_dir,
    normal_id=None,
):
    sample_ids = list(seqdata_filenames.keys())

    tumour_ids = list(seqdata_filenames.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    results_filenames = dict([(tumour_id, results_filenames[tumour_id]) for tumour_id in tumour_ids])

    segment_filename = os.path.join(raw_data_directory, 'segments.tsv')
    haplotypes_filename = os.path.join(raw_data_directory, 'haplotypes.tsv')
    counts_table_template = os.path.join(raw_data_directory, 'counts', 'sample_{tumour_id}.tsv')
    experiment_template = os.path.join(raw_data_directory, 'experiment', 'sample_{tumour_id}.pickle')
    ploidy_plots_template = os.path.join(raw_data_directory, 'ploidy_plots', 'sample_{tumour_id}.pdf')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    workflow.transform(
        name='create_segments',
        func=remixt.analysis.segment.create_segments,
        args=(
            mgd.OutputFile(segment_filename),
            config,
            ref_data_dir,
        ),
        kwargs={
            'breakpoint_filename': mgd.InputFile(breakpoint_filename),
        },
    )

    workflow.subworkflow(
        name='infer_haps_workflow',
        func=remixt.workflow.create_infer_haps_workflow,
        args=(
            mgd.InputFile('seqdata', 'sample_id', fnames=seqdata_filenames),
            mgd.OutputFile(haplotypes_filename),
            config,
            ref_data_dir,
        ),
        kwargs={
            'normal_id': normal_id,
        }
    )

    workflow.subworkflow(
        name='prepare_counts_workflow',
        func=remixt.workflow.create_prepare_counts_workflow,
        args=(
            mgd.InputFile(segment_filename),
            mgd.InputFile(haplotypes_filename),
            mgd.InputFile('seqdata', 'tumour_id', fnames=seqdata_filenames),
            mgd.TempOutputFile('rawcounts', 'tumour_id', axes_origin=[]),
            config,
        ),
    )

    workflow.subworkflow(
        name='calc_bias_workflow',
        axes=('tumour_id',),
        func=remixt.workflow.create_calc_bias_workflow,
        args=(
            mgd.InputFile('seqdata', 'tumour_id', fnames=seqdata_filenames),
            mgd.TempInputFile('rawcounts', 'tumour_id'),
            mgd.OutputFile('counts', 'tumour_id', template=counts_table_template),
            config,
            ref_data_dir,
        ),
    )

    workflow.transform(
        name='create_experiment',
        axes=('tumour_id',),
        ctx={'mem': 8},
        func=remixt.analysis.experiment.create_experiment,
        args=(
            mgd.InputFile('counts', 'tumour_id', template=counts_table_template),
            mgd.InputFile(breakpoint_filename),
            mgd.OutputFile('experiment', 'tumour_id', template=experiment_template),
        ),
    )

    workflow.transform(
        name='ploidy_analysis_plots',
        axes=('tumour_id',),
        ctx={'mem': 8},
        func=remixt.cn_plot.ploidy_analysis_plots,
        args=(
            mgd.InputFile('experiment', 'tumour_id', template=experiment_template),
            mgd.OutputFile('plots', 'tumour_id', template=ploidy_plots_template),
        ),
    )

    workflow.subworkflow(
        name='fit_model',
        axes=('tumour_id',),
        func=remixt.workflow.create_fit_model_workflow,
        args=(
            mgd.InputFile('experiment', 'tumour_id', template=experiment_template),
            mgd.OutputFile('results', 'tumour_id', fnames=results_filenames),
            config,
            ref_data_dir,
        ),
        kwargs={
            'tumour_id': mgd.InputInstance('tumour_id'),
        },
    )

    return workflow
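
The raw-data templates above carry a '{tumour_id}' placeholder that pypeliner fills per chunk of the 'tumour_id' axis; the expansion is equivalent to str.format, as this small illustration (with an assumed directory) shows:

import os

raw_data_directory = '/results/remixt'  # assumed location
counts_table_template = os.path.join(
    raw_data_directory, 'counts', 'sample_{tumour_id}.tsv')

# pypeliner substitutes the axis chunk value, equivalent to:
print(counts_table_template.format(tumour_id='T1'))
# /results/remixt/counts/sample_T1.tsv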
Example 20
def create_museq_workflow(snv_vcf,
                          museqportrait_pdf,
                          reference,
                          chromosomes,
                          thousand_genomes=None,
                          dbsnp=None,
                          germline_refdata=None,
                          tumour_bam=None,
                          normal_bam=None,
                          single_node=None):
    name = 'run_museq'
    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam, extensions=['.bai'])
        name += '_tumour'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam, extensions=['.bai'])
        name += '_normal'
    single = name != 'run_museq_tumour_normal'

    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow(
        ctx={'docker_image': config.containers('wgs')})

    workflow.transform(
        name='generate_intervals',
        func='wgs.workflows.mutationseq.tasks.generate_intervals',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='1:00',
        ),
        ret=mgd.OutputChunks('interval'),
        args=(reference, chromosomes),
        kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(name=name,
                           ctx=helpers.get_default_ctx(memory=15,
                                                       walltime='48:00',
                                                       ncpus=8,
                                                       disk=600),
                           func='wgs.utils.museq_utils.run_museq_one_job',
                           args=(
                               mgd.TempSpace("run_museq_temp"),
                               mgd.TempOutputFile('merged.vcf'),
                               reference,
                               mgd.InputChunks('interval'),
                               params['museq_params'],
                           ),
                           kwargs={
                               'tumour_bam': tumour_bam,
                               'normal_bam': normal_bam,
                               'museq_docker_image': config.containers('mutationseq'),
                               'vcftools_docker_image': config.containers('vcftools'),
                           })
    else:
        workflow.transform(name=name,
                           ctx=helpers.get_default_ctx(
                               memory=15,
                               walltime='24:00',
                           ),
                           axes=('interval', ),
                           func='wgs.utils.museq_utils.run_museq',
                           args=(
                               mgd.TempOutputFile('museq.vcf', 'interval'),
                               mgd.TempOutputFile('museq.log', 'interval'),
                               reference,
                               mgd.InputInstance('interval'),
                               params['museq_params'],
                           ),
                           kwargs={
                               'tumour_bam': tumour_bam,
                               'normal_bam': normal_bam,
                               'docker_image': config.containers('mutationseq'),
                           })

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.utils.museq_utils.merge_vcfs',
            args=(
                mgd.TempInputFile('museq.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
            kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(name='finalise_snvs',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcf_tasks.finalise_vcf',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.OutputFile(snv_vcf, extensions=['.tbi',
                                                               '.csi']),
                       ),
                       kwargs={'docker_image': config.containers('vcftools')})

    workflow.transform(
        name='run_museqportrait',
        ctx=helpers.get_default_ctx(
            memory=5,
            walltime='8:00',
        ),
        func='wgs.workflows.mutationseq.tasks.run_museqportrait',
        args=(
            mgd.InputFile(snv_vcf, extensions=['.tbi', '.csi']),
            mgd.OutputFile(museqportrait_pdf),
            mgd.TempOutputFile('museqportrait.txt'),
            mgd.TempOutputFile('museqportrait.log'),
            single,
        ),
        kwargs={
            'docker_image': config.containers('mutationseq'),
            'thousand_genomes': thousand_genomes,
            'dbsnp': dbsnp,
            'germline_refdata': germline_refdata,
            'germline_plot_threshold': params['germline_portrait_threshold']
        })

    return workflow
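
The task name suffix and the derived single flag encode the calling mode. Illustrative calls (paths are placeholders):

reference = '/refdata/genome.fa'
chromosomes = [str(c) for c in range(1, 23)] + ['X']

# paired: name becomes 'run_museq_tumour_normal', single=False
wf = create_museq_workflow('snv.vcf.gz', 'portrait.pdf', reference, chromosomes,
                           tumour_bam='tumour.bam', normal_bam='normal.bam')

# tumour-only: name becomes 'run_museq_tumour', single=True
wf = create_museq_workflow('snv.vcf.gz', 'portrait.pdf', reference, chromosomes,
                           tumour_bam='tumour.bam')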
Example 21
def create_mutect_workflow(normal_bam,
                           tumour_bam,
                           snv_vcf,
                           snv_maf,
                           reference,
                           reference_vep,
                           chromosomes,
                           normal_id,
                           tumour_id,
                           single_node=None):
    params = config.default_params('variant_calling')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='generate_intervals',
                       func='wgs.workflows.mutect.tasks.generate_intervals',
                       ctx=helpers.get_default_ctx(
                           memory=5,
                           walltime='1:00',
                       ),
                       ret=mgd.OutputChunks('interval'),
                       args=(reference, chromosomes),
                       kwargs={'size': params['split_size']})

    if single_node:
        workflow.transform(
            name='mutect_one_node',
            ctx=helpers.get_default_ctx(memory=15,
                                        walltime='48:00',
                                        ncpus=8,
                                        disk=600),
            func='wgs.workflows.mutect.tasks.run_mutect_one_job',
            args=(mgd.TempSpace("run_mutect_temp"),
                  mgd.TempOutputFile('merged.vcf'), reference,
                  mgd.InputChunks('interval'), mgd.InputFile(normal_bam),
                  mgd.InputFile(tumour_bam)),
        )
    else:
        workflow.transform(
            name='mutect_caller',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='24:00',
            ),
            axes=('interval', ),
            func='wgs.workflows.mutect.tasks.run_mutect',
            args=(mgd.TempOutputFile('mutect.vcf', 'interval'), reference,
                  mgd.InputInstance('interval'), mgd.InputFile(normal_bam),
                  mgd.InputFile(tumour_bam),
                  mgd.TempSpace('mutect_temp', 'interval')),
        )

        workflow.transform(
            name='merge_vcfs',
            ctx=helpers.get_default_ctx(
                memory=15,
                walltime='8:00',
            ),
            func='wgs.workflows.mutect.tasks.merge_vcfs',
            args=(
                mgd.TempInputFile('mutect.vcf', 'interval'),
                mgd.TempOutputFile('merged.vcf'),
                mgd.TempSpace('merge_vcf'),
            ),
        )

    workflow.transform(name='bcftools_normalize',
                       ctx=helpers.get_default_ctx(walltime='8:00', ),
                       func='wgs.utils.vcfutils.bcftools_normalize',
                       args=(
                           mgd.TempInputFile('merged.vcf'),
                           mgd.TempOutputFile('normalized.vcf'),
                           reference,
                       ))

    workflow.transform(
        name='finalise_snvs',
        ctx=helpers.get_default_ctx(walltime='8:00', ),
        func='wgs.utils.vcf_tasks.finalise_vcf',
        args=(
            mgd.TempInputFile('normalized.vcf'),
            mgd.OutputFile(snv_vcf, extensions=['.tbi', '.csi']),
        ),
    )

    workflow.subworkflow(name="strelka_indel_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.InputFile(snv_vcf,
                                           extensions=['.tbi', '.csi']),
                             mgd.OutputFile(snv_maf),
                             reference_vep,
                         ),
                         kwargs={
                             'tumour_id': tumour_id,
                             'normal_id': normal_id
                         })

    return workflow
Example 22
def infer_haps(
    bam_file,
    haplotypes_filename,
    allele_counts_filename,
    config,
    normal=False,
):
    baseimage = {'docker_image': config['docker']['single_cell_pipeline']}

    remixt_config = config.get('extract_seqdata', {})
    remixt_ref_data_dir = config['ref_data_dir']

    chromosomes = config['chromosomes']

    ctx = dict(mem_retry_increment=2,
               disk_retry_increment=50,
               ncpus=1,
               **baseimage)
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    if isinstance(bam_file, dict):
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=list(bam_file.keys()),
        )

        # dont parallelize over chromosomes for per cell bams
        workflow.subworkflow(
            name="extract_seqdata",
            axes=('cell_id', ),
            func='single_cell.workflows.extract_seqdata.create_extract_seqdata_workflow',
            args=(
                mgd.InputFile('bam_markdups',
                              'cell_id',
                              fnames=bam_file,
                              extensions=['.bai']),
                mgd.TempOutputFile('seqdata_cell.h5', 'cell_id'),
                config.get('extract_seqdata', {}),
                config['ref_data_dir'],
                config,
            ))
        workflow.transform(
            name='merge_all_seqdata',
            func="single_cell.workflows.titan.tasks.merge_overlapping_seqdata",
            args=(
                mgd.TempOutputFile('seqdata_file.h5'),
                mgd.TempInputFile("seqdata_cell.h5", "cell_id"),
                config["chromosomes"],
            ),
        )

    else:
        # if its a single bam, then its probably whole genome
        # so parallelize over chromosomes
        workflow.subworkflow(
            name='extract_seqdata',
            func='remixt.workflow.create_extract_seqdata_workflow',
            ctx={'disk': 150},
            args=(
                mgd.InputFile(bam_file, extensions=['.bai']),
                mgd.TempOutputFile('seqdata_file.h5'),
                remixt_config,
                remixt_ref_data_dir,
            ),
        )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=chromosomes,
    )

    if normal:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_normal'
    else:
        func = 'remixt.analysis.haplotype.infer_snp_genotype_from_tumour'

    workflow.transform(
        name='infer_snp_genotype',
        axes=('chromosome', ),
        ctx=dict(mem=16, **ctx),
        func=func,
        args=(
            mgd.TempOutputFile('snp_genotype.tsv', 'chromosome'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputInstance('chromosome'),
            config,
        ),
    )

    workflow.transform(
        name='infer_haps',
        axes=('chromosome', ),
        ctx=dict(mem=16, **ctx),
        func='remixt.analysis.haplotype.infer_haps',
        args=(
            mgd.TempOutputFile('haplotypes.tsv', 'chromosome'),
            mgd.TempInputFile('snp_genotype.tsv', 'chromosome'),
            mgd.InputInstance('chromosome'),
            mgd.TempSpace('haplotyping', 'chromosome'),
            remixt_config,
            remixt_ref_data_dir,
        ),
    )

    workflow.transform(name='merge_haps',
                       ctx=dict(mem=16, **ctx),
                       func='remixt.utils.merge_tables',
                       args=(
                           mgd.OutputFile(haplotypes_filename),
                           mgd.TempInputFile('haplotypes.tsv', 'chromosome'),
                       ))

    workflow.transform(
        name='create_segments',
        ctx=dict(mem=16, **ctx),
        func='remixt.analysis.segment.create_segments',
        args=(
            mgd.TempOutputFile('segments.tsv'),
            remixt_config,
            config['ref_data_dir'],
        ),
    )

    workflow.transform(
        name='haplotype_allele_readcount',
        ctx=dict(mem=16, **ctx),
        func='remixt.analysis.readcount.haplotype_allele_readcount',
        args=(
            mgd.OutputFile(allele_counts_filename),
            mgd.TempInputFile('segments.tsv'),
            mgd.TempInputFile('seqdata_file.h5'),
            mgd.InputFile(haplotypes_filename),
            remixt_config,
        ),
    )

    return workflow
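
The isinstance check dispatches on the input type: a dict of per-cell bams is extracted per cell and merged before genotyping, while a single path is treated as a bulk whole-genome bam and parallelised inside the extraction subworkflow. Illustrative calls, with an assumed minimal config:

config = {
    'docker': {'single_cell_pipeline': 'example/pipeline:latest'},  # assumed
    'ref_data_dir': '/refdata/remixt',
    'chromosomes': [str(c) for c in range(1, 23)] + ['X'],
}

# per-cell bams: per-cell seqdata extraction, then merge
wf = infer_haps({'cell_1': '/data/c1.bam', 'cell_2': '/data/c2.bam'},
                'haplotypes.tsv', 'allele_counts.tsv', config, normal=True)

# single bulk bam: one seqdata extraction, parallelised by chromosome
wf = infer_haps('/data/normal_bulk.bam',
                'haplotypes.tsv', 'allele_counts.tsv', config, normal=True)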
Example 23
def partition_tumour(config, input_args, patient_id, results_dir, input_bams,
                     input_bais, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('tumour_id', ),
                    value=input_args['tumour_samples'])
    workflow.setobj(obj=mgd.OutputChunks('normal_id', ),
                    value=input_args['normal_samples'])

    workflow.transform(name='merge_normal',
                       func=tasks.merge_normal,
                       args=(config,
                             mgd.InputFile('normal.bam',
                                           'normal_id',
                                           fnames=input_args['normal_bams'],
                                           axes_origin=[]),
                             mgd.OutputFile(
                                 os.path.join(input_args['patient_bam_dir'],
                                              'merged_normal.bam')),
                             mgd.OutputFile(
                                 os.path.join(input_args['patient_bam_dir'],
                                              'merged_normal.bam.bai'))))

    workflow.subworkflow(
        name='analyze_tumour',
        func=analyze_tumour_normal,
        axes=('tumour_id', ),
        args=(
            config,
            input_args,
            results_dir,
            mgd.InputFile(
                os.path.join(input_args['patient_bam_dir'],
                             'merged_normal.bam')),
            mgd.InputInstance('tumour_id'),
            mgd.InputFile('tumour.bam', 'tumour_id', fnames=input_bams),
            mgd.OutputFile(
                os.path.join(results_dir, patient_id + '_{tumour_id}.snv.tsv'),
                'tumour_id'),
            mgd.OutputFile(
                os.path.join(results_dir,
                             patient_id + '_{tumour_id}.indel.tsv'),
                'tumour_id'),
            mgd.TempOutputFile('snv.vcf', 'tumour_id'),
            mgd.TempOutputFile('indel.vcf', 'tumour_id'),
        ))

    workflow.transform(name='annotate_snvs',
                       func=tasks.annotate_outputs,
                       axes=('tumour_id', ),
                       args=(
                           config,
                           mgd.TempSpace('snv_annotation_space', 'tumour_id'),
                           mgd.TempInputFile('snv.vcf', 'tumour_id'),
                           mgd.OutputFile(
                               os.path.join(
                                   results_dir,
                                   patient_id + '_{tumour_id}.snv.txt'),
                               'tumour_id'),
                       ))

    workflow.transform(name='annotate_indels',
                       func=tasks.annotate_outputs,
                       axes=('tumour_id', ),
                       args=(
                           config,
                           mgd.TempSpace('indel_annotation_space',
                                         'tumour_id'),
                           mgd.TempInputFile('indel.vcf', 'tumour_id'),
                           mgd.OutputFile(
                               os.path.join(
                                   results_dir,
                                   patient_id + '_{tumour_id}.indel.txt'),
                               'tumour_id'),
                       ))

    workflow.transform(name='vcf_annotate_indels',
                       func=tasks.vcf_annotate_outputs,
                       axes=('tumour_id', ),
                       args=(
                           config,
                           mgd.TempSpace('indel_vcf_annotation_space',
                                         'tumour_id'),
                           mgd.TempInputFile('indel.vcf', 'tumour_id'),
                           mgd.OutputFile(
                               os.path.join(
                                   results_dir,
                                   patient_id + '_{tumour_id}.indel.vcf'),
                               'tumour_id'),
                       ))

    workflow.transform(
        name='vcf_annotate_snvs',
        func=tasks.vcf_annotate_outputs,
        axes=('tumour_id', ),
        args=(
            config,
            mgd.TempSpace('snv_vcf_annotation_space', 'tumour_id'),
            mgd.TempInputFile('snv.vcf', 'tumour_id'),
            mgd.OutputFile(
                os.path.join(results_dir, patient_id + '_{tumour_id}.snv.vcf'),
                'tumour_id'),
        ))

    workflow.transform(
        name='log_patient_analysis',
        func=tasks.log_patient_analysis,
        args=(
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.snv.tsv'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.indel.tsv'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.snv.txt'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.indel.txt'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.snv.vcf'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.InputFile(os.path.join(results_dir,
                                       patient_id + '_{tumour_id}.indel.vcf'),
                          'tumour_id',
                          axes_origin=[]),
            mgd.OutputFile(output_file),
        ))

    return workflow
Example 24
def run_LoLoPicker(config, args, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('region', ),
                    value=list(map(str, list(range(1, 23)) + ['X'])))

    workflow.transform(name='create_axes_beds',
                       axes=('region', ),
                       func=tasks.create_axes_beds,
                       args=(mgd.InputFile(config["bed_file"]),
                             mgd.InputInstance('region'),
                             mgd.TempOutputFile('region.bed', 'region')))

    workflow.transform(name='LoLoPicker_somatic',
                       axes=('region', ),
                       func=tasks.LoLoPicker_somatic,
                       args=(config, mgd.InputFile(tumour_bam),
                             mgd.InputFile(normal_bam),
                             mgd.TempInputFile('region.bed', 'region'),
                             mgd.TempSpace('LoLoPicker_somatic_temp',
                                           'region'),
                             mgd.TempOutputFile("raw_somatic_varants.txt",
                                                'region')))

    workflow.transform(name='make_sample_list',
                       func=tasks.make_sample_list,
                       args=(
                           args,
                           mgd.TempOutputFile('samplelist.txt'),
                       ))

    workflow.transform(name='LoLoPicker_control',
                       axes=('region', ),
                       func=tasks.LoLoPicker_control,
                       args=(config, mgd.TempInputFile('samplelist.txt'),
                             mgd.TempSpace('LoLoPicker_control_temp',
                                           'region'),
                             mgd.TempInputFile("raw_somatic_varants.txt",
                                               'region'),
                             mgd.TempOutputFile("control_stats.txt",
                                                'region')))

    workflow.transform(name='LoLoPicker_stats',
                       axes=('region', ),
                       func=tasks.LoLoPicker_stats,
                       args=(
                           mgd.TempSpace('LoLoPicker_stats_temp', 'region'),
                           mgd.TempInputFile("raw_somatic_varants.txt",
                                             'region'),
                           mgd.TempInputFile("control_stats.txt", 'region'),
                           mgd.TempOutputFile("stats_calls.txt", 'region'),
                       ))

    workflow.transform(name='merge_LoLoPicker',
                       func=tasks.merge_LoLoPicker,
                       args=(mgd.TempSpace("merge_LoLo"),
                             mgd.TempInputFile("stats_calls.txt",
                                               'region',
                                               axes_origin=[]),
                             mgd.OutputFile(output_file)))

    return workflow
Example 25
def breakpoint_calling_workflow(args):
    pyp = pypeliner.app.Pypeline(config=args)

    inputs = helpers.load_yaml(args['input_yaml'])

    meta_yaml = os.path.join(args["out_dir"], 'metadata.yaml')
    input_yaml_blob = os.path.join(args["out_dir"], 'input.yaml')

    tumours = helpers.get_values_from_input(inputs, 'tumour')
    normals = helpers.get_values_from_input(inputs, 'normal')
    samples = list(tumours.keys())

    sv_outdir = os.path.join(args['out_dir'], 'breakpoints', '{sample_id}')
    destruct_breakpoints = os.path.join(
        sv_outdir, '{sample_id}_destruct_breakpoints.csv.gz')
    destruct_library = os.path.join(sv_outdir,
                                    '{sample_id}_destruct_library.csv.gz')
    destruct_raw_breakpoints = os.path.join(
        sv_outdir, '{sample_id}_destruct_raw_breakpoints.csv.gz')
    destruct_raw_library = os.path.join(
        sv_outdir, '{sample_id}_destruct_raw_library.csv.gz')
    destruct_reads = os.path.join(sv_outdir,
                                  '{sample_id}_destruct_reads.csv.gz')
    lumpy_vcf = os.path.join(sv_outdir, '{sample_id}_lumpy.vcf')
    parsed_csv = os.path.join(sv_outdir,
                              '{sample_id}_filtered_consensus_calls.csv.gz')

    svaba_vcf = os.path.join(sv_outdir, '{sample_id}_svaba.vcf')

    single_node = args['single_node']

    refdir_paths = config.refdir_data(args['refdir'])['paths']
    chromosomes = config.refdir_data(args['refdir'])['params']['chromosomes']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('sample_id'), value=samples)

    workflow.subworkflow(
        name='destruct',
        func=destruct_wgs.create_destruct_wgs_workflow,
        axes=('sample_id', ),
        args=(mgd.InputFile("tumour.bam",
                            'sample_id',
                            fnames=tumours,
                            extensions=['.bai'],
                            axes_origin=[]),
              mgd.InputFile("normal.bam",
                            'sample_id',
                            fnames=normals,
                            extensions=['.bai'],
                            axes_origin=[]),
              mgd.OutputFile('destruct_raw_breakpoints',
                             'sample_id',
                             template=destruct_raw_breakpoints),
              mgd.OutputFile('destruct_raw_library',
                             'sample_id',
                             template=destruct_raw_library),
              mgd.OutputFile('destruct_breakpoints',
                             'sample_id',
                             template=destruct_breakpoints),
              mgd.OutputFile('destruct_library',
                             'sample_id',
                             template=destruct_library),
              mgd.OutputFile('destruct_reads',
                             'sample_id',
                             template=destruct_reads),
              mgd.InputInstance('sample_id'), refdir_paths['reference'],
              refdir_paths['refdata_destruct'], refdir_paths['gtf'],
              refdir_paths['blacklist_destruct']),
        kwargs={'single_node': single_node})

    workflow.subworkflow(
        name='lumpy',
        func=lumpy.create_lumpy_workflow,
        axes=('sample_id', ),
        args=(mgd.OutputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf), ),
        kwargs={
            'tumour_bam': mgd.InputFile("tumour.bam",
                                        'sample_id',
                                        fnames=tumours,
                                        extensions=['.bai'],
                                        axes_origin=[]),
            'normal_bam': mgd.InputFile("normal.bam",
                                        'sample_id',
                                        fnames=normals,
                                        extensions=['.bai'],
                                        axes_origin=[]),
            'single_node': single_node,
        },
    )

    if args['svaba']:
        workflow.subworkflow(
            name='svaba',
            func=svaba.create_svaba_workflow,
            axes=('sample_id', ),
            args=(
                mgd.InputFile("tumour.bam",
                              'sample_id',
                              fnames=tumours,
                              extensions=['.bai'],
                              axes_origin=[]),
                mgd.InputFile("normal.bam",
                              'sample_id',
                              fnames=normals,
                              extensions=['.bai'],
                              axes_origin=[]),
                mgd.OutputFile('svaba_vcf', 'sample_id', template=svaba_vcf),
                refdir_paths['reference'],
            ),
        )

    workflow.subworkflow(
        name="consensus_calling",
        func=breakpoint_calling_consensus.create_consensus_workflow,
        axes=('sample_id', ),
        args=(mgd.InputFile('destruct_breakpoints',
                            'sample_id',
                            template=destruct_breakpoints),
              mgd.InputFile('lumpy_vcf', 'sample_id', template=lumpy_vcf),
              mgd.OutputFile('consensus_calls',
                             'sample_id',
                             template=parsed_csv,
                             extensions=['.yaml']), chromosomes),
    )

    filenames = [
        destruct_breakpoints, destruct_library, destruct_raw_breakpoints,
        destruct_raw_library, destruct_reads, lumpy_vcf, parsed_csv
    ]

    if args['svaba']:
        filenames.append(svaba_vcf)

    outputted_filenames = helpers.expand_list(filenames, samples, "sample_id")

    workflow.transform(name='generate_meta_files_results',
                       func=helpers.generate_and_upload_metadata,
                       args=(sys.argv[0:], args["out_dir"],
                             outputted_filenames, mgd.OutputFile(meta_yaml)),
                       kwargs={
                           'input_yaml_data': helpers.load_yaml(args['input_yaml']),
                           'input_yaml': mgd.OutputFile(input_yaml_blob),
                           'metadata': {'type': 'breakpoint_calling'}
                       })

    pyp.run(workflow)
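
helpers.expand_list is not shown here; judging from its use, it fills the '{sample_id}' placeholder in each output template once per sample to enumerate the files passed to the metadata task. A hypothetical equivalent, for illustration only:

def expand_list(templates, samples, key):
    # hypothetical stand-in for helpers.expand_list
    return [t.format(**{key: s}) for t in templates for s in samples]

print(expand_list(['/out/{sample_id}_lumpy.vcf'], ['SA1', 'SA2'], 'sample_id'))
# ['/out/SA1_lumpy.vcf', '/out/SA2_lumpy.vcf']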
Example 26
def lumpy_preprocess_cells(config, bam_files, merged_discordants,
                           merged_splitters, hist_csv, mean_stdev_obj):
    ctx = {
        'mem_retry_increment': 2,
        'disk_retry_increment': 50,
        'ncpus': 1,
    }

    histogram_settings = dict(N=10000,
                              skip=0,
                              min_elements=100,
                              mads=10,
                              X=4,
                              read_length=101)

    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(bam_files.keys()),
    )

    workflow.transform(
        name='process_tumour_cells',
        axes=('cell_id', ),
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func='single_cell.workflows.lumpy.tasks.process_bam',
        args=(
            mgd.InputFile('tumour_bam',
                          'cell_id',
                          fnames=bam_files,
                          extensions=['.bai']),
            mgd.TempOutputFile('tumour.discordants.sorted.bam', 'cell_id'),
            mgd.TempOutputFile('tumour.splitters.sorted.bam', 'cell_id'),
            mgd.TempOutputFile('hist.csv', 'cell_id'),
            mgd.TempSpace("lumpy_tumour_processing", "cell_id"),
        ),
        kwargs=dict(tag=mgd.InputInstance('cell_id'), **histogram_settings),
    )

    workflow.transform(
        name='merge_disc',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func='single_cell.workflows.lumpy.tasks.merge_bams',
        args=(
            mgd.TempInputFile('tumour.discordants.sorted.bam', 'cell_id'),
            mgd.OutputFile(merged_discordants),
            mgd.TempSpace("merge_disc_temp"),
        ),
    )

    workflow.transform(
        name='merge_split',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func='single_cell.workflows.lumpy.tasks.merge_bams',
        args=(
            mgd.TempInputFile('tumour.splitters.sorted.bam', 'cell_id'),
            mgd.OutputFile(merged_splitters),
            mgd.TempSpace("merge_split_temp"),
        ),
    )

    workflow.transform(
        name='merge_histo',
        ctx={
            'mem': 8,
            'ncpus': 1
        },
        func='single_cell.workflows.lumpy.merge_histograms.merge_histograms',
        args=(
            mgd.TempInputFile('hist.csv', 'cell_id'),
            mgd.OutputFile(hist_csv),
            mgd.OutputFile(mean_stdev_obj),
        ),
    )

    return workflow
Example 27
def alignment_workflow(args):
    inputs = helpers.load_yaml(args['input_yaml'])
    outdir = args['out_dir']

    outputs = os.path.join(outdir, '{sample_id}', '{sample_id}.bam')
    metrics_output = os.path.join(outdir, '{sample_id}',
                                  '{sample_id}_metrics.csv.gz')
    prealignment_tar = os.path.join(outdir, '{sample_id}',
                                    '{sample_id}_fastqc.tar.gz')
    postalignment_tar = os.path.join(outdir, '{sample_id}',
                                     '{sample_id}_metrics.tar.gz')

    samples = list(inputs.keys())
    fastqs_r1, fastqs_r2 = helpers.get_fastqs(inputs, samples, None)

    sample_info = helpers.get_sample_info(inputs)

    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow(ctx=helpers.get_default_ctx(
        docker_image=config.containers('alignment')))

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id', 'lane_id'),
        value=list(fastqs_r1.keys()),
    )

    workflow.subworkflow(name="prealign",
                         func=pre_alignment.pre_alignment,
                         axes=('sample_id', 'lane_id'),
                         args=(
                             mgd.InputFile('input.r1.fastq.gz',
                                           'sample_id',
                                           'lane_id',
                                           fnames=fastqs_r1),
                             mgd.InputFile('input.r2.fastq.gz',
                                           'sample_id',
                                           'lane_id',
                                           fnames=fastqs_r2),
                             mgd.Template('prealignment.tar',
                                          'sample_id',
                                          template=prealignment_tar),
                         ))

    workflow.subworkflow(
        name="align",
        func=alignment.alignment,
        args=(
            mgd.InputFile('input.r1.fastq.gz',
                          'sample_id',
                          'lane_id',
                          fnames=fastqs_r1,
                          axes_origin=[]),
            mgd.InputFile('input.r2.fastq.gz',
                          'sample_id',
                          'lane_id',
                          fnames=fastqs_r2,
                          axes_origin=[]),
            mgd.OutputFile('output.bam',
                           'sample_id',
                           template=outputs,
                           axes_origin=[]),
            args['refdir'],
            sample_info,
        ),
    )

    workflow.subworkflow(
        name="postalign",
        func=post_alignment.post_alignment,
        axes=('sample_id', ),
        args=(
            mgd.InputFile('output.bam', 'sample_id', template=outputs),
            mgd.OutputFile('metrics.csv.gz',
                           'sample_id',
                           template=metrics_output,
                           extensions=['.yaml']),
            mgd.OutputFile('metrics.tar.gz',
                           'sample_id',
                           template=postalignment_tar),
            mgd.InputInstance('sample_id'),
            args['refdir'],
        ),
    )

    pyp.run(workflow)
Example 28
def get_coverage_data(
        input_bam, output, refdir, chromosomes,
        mapping_qual, bins, single_node=False
):
    reference = config.refdir_data(refdir)['paths']['reference']

    workflow = pypeliner.workflow.Workflow()

    if single_node:
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.sample_qc.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(
                memory=5
            ),
            args=(
                reference,
                mgd.TempOutputFile('coverage_bed.bed'),
                chromosomes,
                bins,
            )
        )
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.sample_qc.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(
                memory=5
            ),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed'),
                # single-node mode covers the whole genome in one task, so
                # write the final output directly; the original referenced a
                # 'chromosome' axis that is never set up in this branch
                mgd.OutputFile(output),
                mapping_qual,
            ),
        )

    else:
        workflow.setobj(
            obj=mgd.OutputChunks('chromosome'),
            value=chromosomes
        )
        workflow.transform(
            name='generate_coverage_bed',
            func='wgs.workflows.sample_qc.tasks.generate_coverage_bed',
            ctx=helpers.get_default_ctx(
                memory=5
            ),
            axes=('chromosome',),
            args=(
                reference,
                mgd.TempOutputFile('coverage_bed.bed', 'chromosome'),
                mgd.InputInstance('chromosome'),
                bins,
            )
        )
        workflow.transform(
            name='samtools_coverage',
            func='wgs.workflows.sample_qc.tasks.samtools_coverage',
            ctx=helpers.get_default_ctx(
                memory=5
            ),
            axes=('chromosome',),
            args=(
                mgd.InputFile(input_bam),
                mgd.TempInputFile('coverage_bed.bed', 'chromosome'),
                mgd.TempOutputFile('per_interval.txt', 'chromosome'),
                mapping_qual,
            ),
        )

        workflow.transform(
            name='merge_data',
            func='wgs.utils.csvutils.concatenate_csv',
            ctx=helpers.get_default_ctx(
                memory=5
            ),
            args=(
                mgd.TempInputFile('per_interval.txt', 'chromosome', axes_origin=[]),
                mgd.OutputFile(output),
            )
        )

    return workflow
Example 29
def create_hmmcopy_workflow(
        bam_file, reads, segs, metrics, params, igv_seg_filename,
        segs_pdf, bias_pdf, plot_heatmap_ec_output,
        plot_metrics_output,
        plot_kernel_density_output, hmmcopy_data_tar,
        cell_ids, hmmparams, sample_info
):
    chromosomes = hmmparams["chromosomes"]

    baseimage = hmmparams['docker']['single_cell_pipeline']
    hmmcopy_docker = hmmparams['docker']['hmmcopy']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info)

    workflow.transform(
        name='run_hmmcopy',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.run_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_file, extensions=['.bai']),
            mgd.TempOutputFile('reads.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('segs.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('params.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_metrics.csv.gz', 'cell_id', extensions=['.yaml']),
            mgd.TempOutputFile('hmm_data.tar.gz', 'cell_id'),
            mgd.InputInstance('cell_id'),
            hmmparams,
            mgd.TempSpace('hmmcopy_temp', 'cell_id'),
            hmmcopy_docker
        ),
    )

    workflow.transform(
        name='merge_reads',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile('reads_merged.csv.gz', extensions=['.yaml']),
        ),
        kwargs={'low_memory': True}
    )

    workflow.transform(
        name='add_mappability_bool',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.get_mappability_col",
        args=(
            mgd.TempInputFile('reads_merged.csv.gz', extensions=['.yaml']),
            mgd.OutputFile(reads, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_segs',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(segs, extensions=['.yaml']),
        ),
        kwargs={'low_memory': True}
    )

    workflow.transform(
        name='merge_metrics',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempOutputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='merge_params',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.concatenate_csv",
        args=(
            mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.OutputFile(params, extensions=['.yaml']),
        ),
    )

    workflow.transform(
        name='get_max_cn',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.get_max_cn",
        ret=mgd.TempOutputObj('max_cn'),
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
        )
    )

    workflow.transform(
        name='hmmcopy_plots',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.TempInputFile('reads.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('segs.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('params.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            mgd.TempInputFile('hmm_metrics.csv.gz', 'cell_id', axes_origin=[], extensions=['.yaml']),
            hmmparams['ref_genome'],
            mgd.TempOutputFile('segments.png', 'cell_id', axes_origin=[]),
            mgd.TempOutputFile('bias.png', 'cell_id', axes_origin=[]),
            mgd.InputInstance('cell_id'),
        ),
        kwargs={
            'num_states': hmmparams['num_states'],
            'sample_info': mgd.TempInputObj('sampleinfo', 'cell_id'),
            'max_cn': mgd.TempInputObj("max_cn")
        }
    )

    workflow.transform(
        name='annotate_metrics_with_info_and_clustering',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.add_clustering_order",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.TempInputFile("hmm_metrics.csv.gz", extensions=['.yaml']),
            mgd.OutputFile(metrics, extensions=['.yaml']),
        ),
        kwargs={
            'chromosomes': hmmparams["chromosomes"],
            'sample_info': sample_info
        }
    )

    workflow.transform(
        name='merge_hmm_copy_plots',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.merge_pdf",
        args=(
            [
                mgd.TempInputFile('segments.png', 'cell_id'),
                mgd.TempInputFile('bias.png', 'cell_id'),
            ],
            [
                mgd.OutputFile(segs_pdf),
                mgd.OutputFile(bias_pdf),
            ],
            mgd.InputFile(metrics, extensions=['.yaml']),
            None,
            mgd.TempSpace("hmmcopy_plot_merge_temp"),
            ['segments', 'bias']
        )
    )

    workflow.transform(
        name='create_igv_seg',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.create_igv_seg",
        args=(
            mgd.InputFile(segs, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(igv_seg_filename),
            hmmparams,
        )
    )

    workflow.transform(
        name='plot_metrics',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_metrics",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_metrics_output),
            'QC pipeline metrics',
        )
    )

    workflow.transform(
        name='plot_kernel_density',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_kernel_density",
        args=(
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_kernel_density_output),
            ',',
            'mad_neutral_state',
            'QC pipeline metrics',
        )
    )

    workflow.transform(
        name='plot_heatmap_ec',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
        args=(
            mgd.InputFile(reads, extensions=['.yaml']),
            mgd.InputFile(metrics, extensions=['.yaml']),
            mgd.OutputFile(plot_heatmap_ec_output),
        ),
        kwargs={
            'plot_title': 'QC pipeline metrics',
            'column_name': 'state',
            'plot_by_col': 'experimental_condition',
            'color_by_col': 'cell_call',
            'chromosomes': chromosomes,
            'max_cn': hmmparams['num_states'],
            'scale_by_cells': False,
            'mappability_threshold': hmmparams["map_cutoff"]
        }
    )

    workflow.transform(
        name='merge_hmmcopy_data_tars',
        ctx={'mem': hmmparams['memory']['med'], 'ncpus': 1, 'docker_image': baseimage},
        func="single_cell.utils.helpers.tar_files",
        args=(
            mgd.TempInputFile('hmm_data.tar.gz', 'cell_id', axes_origin=[]),
            mgd.OutputFile(hmmcopy_data_tar),
            mgd.TempSpace("merge_tarballs")
        ),

    )

    return workflow
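
get_max_cn above returns a plain value that pypeliner captures through ret=mgd.TempOutputObj('max_cn') and re-injects into later tasks as mgd.TempInputObj('max_cn'). A self-contained sketch of that object-passing pattern, with illustrative task bodies:

import pypeliner.workflow
import pypeliner.managed as mgd

def compute_threshold():
    return 11  # stand-in for a computed value such as max_cn

def write_threshold(threshold, out_path):
    with open(out_path, 'w') as f:
        f.write(str(threshold))

def obj_passing_sketch(output):
    workflow = pypeliner.workflow.Workflow()
    workflow.transform(
        name='compute',
        func=compute_threshold,
        ret=mgd.TempOutputObj('threshold'),  # return value becomes a managed object
        args=(),
    )
    workflow.transform(
        name='consume',
        func=write_threshold,
        args=(mgd.TempInputObj('threshold'), mgd.OutputFile(output)),
    )
    return workflow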
Example 30
def create_aneufinder_workflow(
    bam_file,
    cell_ids,
    config,
    aneufinder_results_filename,
    aneufinder_pdf_filename,
):
    baseimage = config['docker']['single_cell_pipeline']

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(
        name='run_aneufinder_on_individual_cells',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.aneufinder.tasks.run_aneufinder",
        axes=('cell_id', ),
        args=(
            mgd.InputFile('bam_file', 'cell_id', fnames=bam_file),
            mgd.TempSpace('working_dir', 'cell_id'),
            mgd.InputInstance('cell_id'),
            mgd.TempOutputFile('segments.csv', 'cell_id'),
            mgd.TempOutputFile('reads.csv', 'cell_id'),
            mgd.TempOutputFile('dnacopy.pdf', 'cell_id'),
        ),
        kwargs={'docker_image': config['docker']['aneufinder']})

    workflow.transform(
        name='merge_outputs',
        ctx={
            'mem': config['memory']['med'],
            'ncpus': 1,
            'docker_image': baseimage
        },
        func="single_cell.workflows.aneufinder.tasks.merge_outputs_to_hdf",
        args=(
            mgd.TempInputFile('reads.csv', 'cell_id'),
            mgd.TempInputFile('segments.csv', 'cell_id'),
            mgd.OutputFile(aneufinder_results_filename),
            mgd.TempSpace("aneufinder_merge"),
        ))

    workflow.transform(name='merge_aneufinder_pdfs',
                       ctx={
                           'mem': config['memory']['med'],
                           'ncpus': 1,
                           'docker_image': baseimage
                       },
                       func="single_cell.workflows.aneufinder.tasks.merge_pdf",
                       args=(
                           [mgd.TempInputFile('dnacopy.pdf', 'cell_id')],
                           [mgd.OutputFile(aneufinder_pdf_filename)],
                       ))

    return workflow