Example #1
def _create_download_cosmic_file_subworkflow(host,
                                             host_path,
                                             user,
                                             password,
                                             out_file,
                                             local_download=False):
    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(name='download',
                       ctx={'local': local_download},
                       func=tasks.download_from_sftp,
                       args=(host, host_path,
                             mgd.TempOutputFile('file.vcf.gz'), user,
                             password))

    workflow.transform(name='decompress',
                       func=tasks.decompress,
                       args=(mgd.TempInputFile('file.vcf.gz'),
                             mgd.TempOutputFile('file.vcf')))

    workflow.transform(name='bgzip',
                       func=soil.wrappers.samtools.tasks.compress_vcf,
                       args=(mgd.TempInputFile('file.vcf'),
                             mgd.OutputFile(out_file)))

    return workflow
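# Note: the download/decompress/bgzip chain above passes files between jobs
# purely through mgd.TempOutputFile/mgd.TempInputFile names; pypeliner
# resolves each name to a concrete temp path before calling the task. A
# minimal sketch of what a tasks.decompress implementation could look like
# (hypothetical, not the soil source):
import gzip
import shutil

def decompress(in_file, out_file):
    # Stream-decompress a .gz file; both arguments arrive as resolved paths.
    with gzip.open(in_file, 'rb') as fh_in, open(out_file, 'wb') as fh_out:
        shutil.copyfileobj(fh_in, fh_out)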
Example #2
def create_db_workflow(in_file,
                       ref_proteome_fasta_file,
                       out_file,
                       genome_version='GRCh37',
                       pyensembl_cache_dir=None):

    sandbox = pypeliner.sandbox.CondaSandbox(pip_packages=['varcode'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(name='clean_ref_fasta',
                       func=tasks.clean_ref_proteome_ids,
                       args=(mgd.InputFile(ref_proteome_fasta_file),
                             mgd.TempOutputFile('ref.fasta')))

    workflow.transform(name='build_variant_table',
                       func=tasks.build_variant_table,
                       args=(mgd.InputFile(in_file),
                             mgd.TempOutputFile('variant_table.tsv.gz')),
                       kwargs={
                           'genome_version': genome_version,
                           'pyensembl_cache_dir': pyensembl_cache_dir
                       })

    workflow.transform(name='build_variant_fasta',
                       func=tasks.build_variant_fasta,
                       args=(mgd.TempInputFile('variant_table.tsv.gz'),
                             mgd.TempOutputFile('var.fasta')))

    workflow.commandline(name='build_db',
                         args=('cat', mgd.TempInputFile('ref.fasta'),
                               mgd.TempInputFile('var.fasta'), '>',
                               mgd.OutputFile(out_file)))

    return workflow
Example #3
def run_MutationSeq(config, normal_bam, tumour_bam, output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.OutputChunks('interval'),
                    value=[str(x) for x in range(1, 23)] + ['X'])

    workflow.transform(
        name='run_museq_paired',
        ctx={'mem': 8, 'ncpus': 1, 'walltime': '24:00'},
        axes=('interval',),
        func=tasks.run_museq,
        args=(
            config,
            mgd.InputFile(normal_bam),
            mgd.InputFile(tumour_bam),
            mgd.InputInstance('interval'),
            mgd.TempOutputFile('museq.vcf', 'interval'),
            mgd.TempOutputFile('museq.log', 'interval'),
            )
        )

    workflow.transform(
        name='merge_vcfs',
        func=tasks.merge_vcfs,
        args=(
            mgd.TempInputFile('museq.vcf', 'interval', axes_origin=[]),
            mgd.OutputFile(output_file),
            mgd.TempSpace('merge_vcf'),
            )
        )

    return workflow
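# Note: the 'interval' axis fans run_museq out into one job per chromosome;
# axes_origin=[] on the TempInputFile then collapses the axis, so merge_vcfs
# receives every per-interval file in one call, as an {interval: path} dict.
# A hedged sketch of such a merge task (hypothetical, not the original
# tasks.merge_vcfs; assumes bcftools is on PATH):
import os
import pypeliner.commandline

def merge_vcfs(vcf_files, out_file, tmp_space):
    os.makedirs(tmp_space, exist_ok=True)
    ordered = [vcf_files[k] for k in sorted(vcf_files)]
    pypeliner.commandline.execute('bcftools', 'concat', '-o', out_file, *ordered)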
Example #4
def run_Strelka(config, normal_bam, tumour_bam, snv_output_file,
                indel_output_file):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='configure_bed',
                       func=tasks.configure_bed,
                       args=(mgd.TempSpace('bed_space'),
                             mgd.InputFile(config['bed_file']),
                             mgd.TempOutputFile('bed.gz'),
                             mgd.TempOutputFile('bed.gz.tbi')))

    workflow.transform(name='run_strelka',
                       ctx={
                           'mem': 10,
                           'ncpus': 1,
                           'walltime': '08:00'
                       },
                       func=tasks.run_strelka,
                       args=(
                           config,
                           mgd.InputFile(normal_bam),
                           mgd.InputFile(tumour_bam),
                           mgd.TempInputFile('bed.gz'),
                           mgd.TempInputFile('bed.gz.tbi'),
                           mgd.TempSpace('strelka_workspace'),
                           mgd.OutputFile(snv_output_file),
                           mgd.OutputFile(indel_output_file),
                       ))

    return workflow
Example #5
def create_cohort_qc_report(cohort_label, out_dir, filtered_cohort_maf,
                            cna_table, report_path):

    oncoplot = os.path.join(out_dir, cohort_label, "cohort_oncoplot.png")
    somatic_interactions_plot = os.path.join(out_dir, cohort_label,
                                             "somatic_interactions.png")
    summary_plot = os.path.join(out_dir, cohort_label, "summary.png")
    burden_plot = os.path.join(out_dir, cohort_label, "mutation_burden.png")

    workflow = pypeliner.workflow.Workflow()

    non_synonymous_labels = [
        "Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site",
        "Translation_Start_Site", "Nonsense_Mutation", "Nonstop_Mutation",
        "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation"
    ]

    workflow.transform(
        name='postprocess_maf',
        func='wgs.workflows.cohort_qc.tasks.prepare_maf_for_maftools',
        args=(cohort_label, mgd.InputFile(filtered_cohort_maf),
              mgd.TempOutputFile("prepared_maf"), non_synonymous_labels,
              mgd.TempOutputFile("vcNames")),
    )

    workflow.transform(
        name='burden_plot',
        func='wgs.workflows.cohort_qc.tasks.plot_mutation_burden',
        args=(
            mgd.InputFile(filtered_cohort_maf),
            mgd.OutputFile(burden_plot),
        ),
    )

    workflow.transform(
        name='build_gene_list',
        func='wgs.workflows.cohort_qc.tasks.build_gene_list',
        args=(mgd.InputFile(cna_table), mgd.TempOutputFile("genelist")),
    )
    workflow.transform(
        name='make_cohort_plots',
        func='wgs.workflows.cohort_qc.tasks.make_R_cohort_plots',
        args=(mgd.TempInputFile("prepared_maf"), mgd.InputFile(cna_table),
              mgd.OutputFile(oncoplot),
              mgd.OutputFile(somatic_interactions_plot),
              mgd.OutputFile(summary_plot), mgd.TempInputFile("vcNames"),
              mgd.TempInputFile("genelist")))

    workflow.transform(name='make_report',
                       func='wgs.workflows.cohort_qc.tasks.make_report',
                       args=(
                           cohort_label,
                           mgd.InputFile(oncoplot),
                           mgd.InputFile(somatic_interactions_plot),
                           mgd.InputFile(summary_plot),
                           mgd.InputFile(burden_plot),
                           mgd.OutputFile(report_path),
                       ))

    return workflow
Example #6
def create_vcf_db_annotation_workflow(db_vcf_file,
                                      target_vcf_file,
                                      out_file,
                                      docker_config=None,
                                      split_size=int(1e4)):

    ctx = dict(mem=2, num_retry=3, mem_retry_increment=2,
               **(docker_config or {}))

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='split_vcf',
                       ctx=ctx,
                       func='biowrappers.components.io.vcf.tasks.split_vcf',
                       args=(mgd.InputFile(target_vcf_file),
                             mgd.TempOutputFile('split.vcf', 'split')),
                       kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='annotate_db_status',
        axes=('split',),
        ctx=ctx,
        func='biowrappers.components.variant_calling.annotated_db_status.tasks.annotate_db_status',
        args=(db_vcf_file, mgd.TempInputFile('split.vcf', 'split'),
              mgd.TempOutputFile('annotated.csv.gz',
                                 'split',
                                 extensions=['.yaml'])))

    workflow.transform(name='merge_tables',
                       ctx=ctx,
                       func='single_cell.utils.csvutils.concatenate_csv',
                       args=(mgd.TempInputFile('annotated.csv.gz', 'split'),
                             mgd.OutputFile(out_file, extensions=['.yaml'])))

    return workflow
Example #7
def create_db_annotation_workflow(in_vcf_file,
                                  out_csv_file,
                                  db_vcf_file,
                                  split_size=int(1e4)):
    workflow = pypeliner.workflow.Workflow(
        ctx=dict(mem=2, num_retry=3, mem_retry_increment=2))

    workflow.transform(name='split_vcf',
                       func='single_cell.utils.vcfutils.split_vcf',
                       args=(mgd.InputFile(in_vcf_file),
                             mgd.TempOutputFile('split.vcf', 'split')),
                       kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='annotate_db_status',
        axes=('split', ),
        func='single_cell.workflows.db_annotation.tasks.annotate_db_status',
        args=(db_vcf_file, mgd.TempInputFile('split.vcf', 'split'),
              mgd.TempOutputFile('annotated.csv.gz',
                                 'split',
                                 extensions=['.yaml'])))

    workflow.transform(name='merge_tables',
                       func='single_cell.utils.csvutils.concatenate_csv',
                       args=(mgd.TempInputFile('annotated.csv.gz',
                                               'split',
                                               extensions=['.yaml']),
                             mgd.OutputFile(out_csv_file,
                                            extensions=['.yaml'])))

    return workflow
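# Note: Examples #6 and #7 share the split/annotate/merge pattern, and
# extensions=['.yaml'] makes pypeliner track a .yaml sidecar next to each
# CSV chunk so concatenate_csv can merge the metadata too. A minimal sketch
# of the per-chunk task contract (hypothetical names and columns, not the
# single_cell source):
import gzip

def annotate_db_status(db_vcf_file, in_vcf_chunk, out_csv_gz):
    with gzip.open(out_csv_gz, 'wt') as fh:
        fh.write('chrom,coord,db_status\n')  # header only; real logic omitted
    with open(out_csv_gz + '.yaml', 'w') as fh:
        fh.write('columns: [chrom, coord, db_status]\n')  # sidecar schema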
Example #8
def create_lumpy_workflow(lumpy_vcf,
                          tumour_bam=None,
                          normal_bam=None,
                          single_node=False):
    workflow = pypeliner.workflow.Workflow()

    lumpy_job_name = 'run_lumpy'
    if normal_bam:
        normal_bam = mgd.InputFile(normal_bam)
        normal_disc = mgd.TempInputFile('normal.discordants.sorted.bam')
        normal_split = mgd.TempInputFile('normal.splitters.sorted.bam')
        lumpy_job_name += '_normal'
    else:
        normal_disc = None
        normal_split = None

    if tumour_bam:
        tumour_bam = mgd.InputFile(tumour_bam)
        tumour_disc = mgd.TempInputFile('tumour.discordants.sorted.bam')
        tumour_split = mgd.TempInputFile('tumour.splitters.sorted.bam')
        lumpy_job_name += '_tumour'
    else:
        tumour_disc = None
        tumour_split = None

    if normal_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_normal',
            func=lumpy_preprocess_workflow,
            args=(normal_bam,
                  mgd.TempOutputFile('normal.discordants.sorted.bam'),
                  mgd.TempOutputFile('normal.splitters.sorted.bam')),
            kwargs={'single_node': single_node})

    if tumour_bam:
        workflow.subworkflow(
            name='preprocess_lumpy_tumour',
            func=lumpy_preprocess_workflow,
            args=(tumour_bam,
                  mgd.TempOutputFile('tumour.discordants.sorted.bam'),
                  mgd.TempOutputFile('tumour.splitters.sorted.bam')),
            kwargs={'single_node': single_node})

    workflow.transform(
        name=lumpy_job_name,
        ctx=helpers.get_default_ctx(memory=10, disk=500, walltime='72:00'),
        func='wgs.workflows.lumpy.tasks.run_lumpyexpress',
        args=(mgd.OutputFile(lumpy_vcf),
              config.default_params('breakpoint_calling')['lumpy_paths']),
        kwargs={
            'tumour_bam': tumour_bam,
            'tumour_discordants': tumour_disc,
            'tumour_splitters': tumour_split,
            'normal_bam': normal_bam,
            'normal_discordants': normal_disc,
            'normal_splitters': normal_split,
            'docker_image': config.containers('lumpy')
        })

    return workflow
Example #9
def create_optitype_workflow(bam_file, hla_type_file, is_rna=False, threads=1):
    if check_chr_prefix(bam_file):
        chrom_str = 'chr6'
    else:
        chrom_str = '6'

    sandbox = soil.utils.workflow.get_sandbox(
        ['optitype', 'razers3', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.commandline(
        name='extract_chr6',
        args=(
            'samtools',
            'view',
            '-bh',
            '-f',
            '2',
            '-F',
            '4',
            mgd.InputFile(bam_file),
            chrom_str,
            '|',
            'samtools',
            'collate',
            '-O',
            '-',
            mgd.TempSpace('chr6_collate_temp'),
            '|',
            'samtools',
            'bam2fq',
            '-1',
            mgd.TempOutputFile('chr6_reads_1.fq'),
            '-2',
            mgd.TempOutputFile('chr6_reads_2.fq'),
            '-',
        ),
    )

    workflow.transform(name='optitype',
                       ctx={
                           'mem': 24,
                           'mem_retry_increment': 8,
                           'num_retry': 3,
                           'threads': threads
                       },
                       func=tasks.run_optitype,
                       args=(
                           mgd.TempInputFile('chr6_reads_1.fq'),
                           mgd.TempInputFile('chr6_reads_2.fq'),
                           mgd.OutputFile(hla_type_file),
                           mgd.TempSpace('optitype_temp'),
                       ),
                       kwargs={
                           'is_rna': is_rna,
                           'threads': threads,
                       })

    return workflow
Example #10
def _create_download_cosmic_workflow(ref_data_version,
                                     out_file,
                                     user,
                                     password,
                                     host='sftp-cancer.sanger.ac.uk',
                                     local_download=False):

    host_base_path = '/files/{}/cosmic/v83/VCF'.format(
        ref_data_version.lower())

    coding_host_path = '/'.join([host_base_path, 'CosmicCodingMuts.vcf.gz'])

    non_coding_host_path = '/'.join(
        [host_base_path, 'CosmicNonCodingVariants.vcf.gz'])

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('coding_host_path'),
                    value=coding_host_path)

    workflow.setobj(obj=mgd.TempOutputObj('non_coding_host_path'),
                    value=non_coding_host_path)

    workflow.subworkflow(name='download_coding',
                         func=_create_download_cosmic_file_subworkflow,
                         args=(
                             host,
                             mgd.TempInputObj('coding_host_path'),
                             user,
                             password,
                             mgd.TempOutputFile('coding.vcf.gz'),
                         ),
                         kwargs={'local_download': local_download})

    workflow.subworkflow(name='download_non_coding',
                         func=_create_download_cosmic_file_subworkflow,
                         args=(
                             host,
                             mgd.TempInputObj('non_coding_host_path'),
                             user,
                             password,
                             mgd.TempOutputFile('non_coding.vcf.gz'),
                         ),
                         kwargs={'local_download': local_download})

    workflow.transform(name='merge_files',
                       func=soil.wrappers.samtools.tasks.concatenate_vcf,
                       args=([
                           mgd.TempInputFile('coding.vcf.gz'),
                           mgd.TempInputFile('non_coding.vcf.gz')
                       ], mgd.OutputFile(out_file)),
                       kwargs={
                           'allow_overlap': True,
                           'index_file': mgd.OutputFile(out_file + '.tbi')
                       })

    return workflow
Example #11
def create_somatic_consensus_workflow(
    mutect_snv_vcf,
    strelka_snv_vcf,
    strelka_indel_vcf,
    museq_snv_vcf,
    consensus_maf,
    chromosomes,
    reference_vep,
    normal_id,
    tumour_id,
):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='snv_consensus',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.somatic_calling_consensus.consensus.main',
        args=(
            mgd.InputFile(museq_snv_vcf),
            mgd.InputFile(strelka_snv_vcf),
            mgd.InputFile(mutect_snv_vcf),
            mgd.InputFile(strelka_indel_vcf),
            mgd.TempOutputFile('consensus.vcf'),
            mgd.TempOutputFile('counts.csv'),
            chromosomes,
        ),
    )

    workflow.subworkflow(name="consensus_maf",
                         func='wgs.workflows.vcf2maf.create_vcf2maf_workflow',
                         args=(
                             mgd.TempInputFile('consensus.vcf'),
                             mgd.TempOutputFile('consensus.maf'),
                             reference_vep,
                         ),
                         kwargs={
                             'normal_id': normal_id,
                             'tumour_id': tumour_id
                         })

    workflow.transform(
        name='maf_counts',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.somatic_calling_consensus.tasks.update_maf_counts',
        args=(
            mgd.TempInputFile('consensus.maf'),
            mgd.TempInputFile('counts.csv'),
            mgd.OutputFile(consensus_maf),
        ))

    return workflow
Example #12
def create_consensus_workflow(
        destruct_breakpoints,
        lumpy_vcf,
        output,
        chromosomes
):

    params = config.default_params('breakpoint_calling')
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='parse_lumpy',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_lumpy_task',
        args=(
            mgd.InputFile(lumpy_vcf),
            mgd.TempOutputFile('lumpy.csv'),
            params["parse_lumpy"],
        ),
        kwargs={'chromosomes': chromosomes}
    )

    workflow.transform(
        name='parse_destruct',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.parse_destruct_task',
        args=(
            mgd.InputFile(destruct_breakpoints),
            mgd.TempOutputFile('destruct.csv'),
            params["parse_destruct"],
        ),
        kwargs={'chromosomes': chromosomes}
    )

    workflow.transform(
        name='consensus_breakpoint_calling',
        ctx=helpers.get_default_ctx(
            memory=15,
            walltime='8:00',
        ),
        func='wgs.workflows.breakpoint_calling_consensus.tasks.consensus_calls',
        args=(
            mgd.TempInputFile('destruct.csv'),
            mgd.TempInputFile('lumpy.csv'),
            mgd.OutputFile(output, extensions=['.yaml']),
            params['consensus']
        ),
    )

    return workflow
Example #13
def create_snpeff_annotation_workflow(
        in_vcf_file,
        out_csv_file,
        db,
        data_dir,
        split_size=int(1e3)
):
    workflow = pypeliner.workflow.Workflow(
        ctx={'num_retry': 3, 'mem_retry_increment': 2}
    )

    workflow.transform(
        name='split_vcf',
        func='single_cell.utils.vcfutils.split_vcf',
        args=(
            mgd.InputFile(in_vcf_file),
            mgd.TempOutputFile('split.vcf', 'split')
        ),
        kwargs={'lines_per_file': split_size}
    )

    workflow.transform(
        name='run_snpeff',
        axes=('split',),
        func='single_cell.workflows.snpeff_annotation.tasks.run_snpeff',
        args=(
            db,
            data_dir,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('snpeff.vcf', 'split')
        ),
        kwargs={
            'classic_mode': True
        }
    )

    workflow.transform(
        name='convert_vcf_to_csv',
        axes=('split',),
        func='single_cell.workflows.snpeff_annotation.tasks.convert_vcf_to_table',
        args=(
            mgd.TempInputFile('snpeff.vcf', 'split'),
            mgd.TempOutputFile('snpeff.csv.gz', 'split', extensions=['.yaml']),
        )
    )

    workflow.transform(
        name='concatenate_tables',
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('snpeff.csv.gz', 'split', extensions=['.yaml']),
            mgd.OutputFile(out_csv_file, extensions=['.yaml'])
        )
    )

    return workflow
Example #14
def create_pileup2snp_workflow(bam_file, ref_genome_fasta_file, out_file, chromosomes=None, split_size=int(1e7)):

    sandbox = soil.utils.workflow.get_sandbox(['bcftools', 'samtools', 'varscan'])

    workflow = pypeliner.workflow.Workflow(default_ctx=low_mem_ctx, default_sandbox=sandbox)

    workflow.setobj(
        obj=pypeliner.managed.TempOutputObj('config', 'regions'),
        value=soil.utils.genome.get_bam_regions(bam_file, split_size, chromosomes=chromosomes)
    )

    workflow.commandline(
        name='run_mpileup',
        axes=('regions',),
        args=(
            'samtools',
            'mpileup',
            '-f', mgd.InputFile(ref_genome_fasta_file),
            '-o', mgd.TempOutputFile('region.mpileup', 'regions'),
            '-r', mgd.TempInputObj('config', 'regions'),
            mgd.InputFile(bam_file),
        )
    )

    workflow.transform(
        name='run_mpileup2snp',
        axes=('regions',),
        ctx=med_mem_ctx,
        func=tasks.mpileup2snp,
        args=(
            mgd.TempInputFile('region.mpileup', 'regions'),
            mgd.TempOutputFile('region.vcf', 'regions'),
        )
    )

    workflow.transform(
        name='compress',
        axes=('regions',),
        func=soil.wrappers.samtools.tasks.compress_vcf,
        args=(
            mgd.TempInputFile('region.vcf', 'regions'),
            mgd.TempOutputFile('region.vcf.gz', 'regions'),
        ),
    )

    workflow.transform(
        name='concatenate_vcfs',
        func=soil.wrappers.samtools.tasks.concatenate_vcf,
        args=(
            mgd.TempInputFile('region.vcf.gz', 'regions'),
            mgd.OutputFile(out_file),
        ),
    )

    return workflow
Example #15
def create_snpeff_annotation_workflow(db,
                                      data_dir,
                                      target_vcf_file,
                                      out_file,
                                      base_docker=None,
                                      snpeff_docker=None,
                                      classic_mode=True,
                                      split_size=int(1e3),
                                      table_name='snpeff'):

    ctx = {'num_retry': 3, 'mem_retry_increment': 2}

    if base_docker:
        ctx.update(base_docker)

    snpeff_docker = snpeff_docker or {}

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='split_vcf',
                       ctx=dict(mem=2, **ctx),
                       func='biowrappers.components.io.vcf.tasks.split_vcf',
                       args=(mgd.InputFile(target_vcf_file),
                             mgd.TempOutputFile('split.vcf', 'split')),
                       kwargs={'lines_per_file': split_size})

    workflow.transform(
        name='run_snpeff',
        axes=('split', ),
        ctx=dict(mem=8, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.run_snpeff',
        args=(db, data_dir, mgd.TempInputFile('split.vcf', 'split'),
              mgd.TempOutputFile('snpeff.vcf', 'split')),
        kwargs={
            'classic_mode': classic_mode,
            'docker_config': snpeff_docker
        })

    workflow.transform(
        name='convert_vcf_to_csv',
        axes=('split',),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.snpeff.tasks.convert_vcf_to_table',
        args=(mgd.TempInputFile('snpeff.vcf', 'split'),
              mgd.TempOutputFile('snpeff.csv.gz',
                                 'split',
                                 extensions=['.yaml']), table_name))

    workflow.transform(name='concatenate_tables',
                       ctx=dict(mem=4, **ctx),
                       func='single_cell.utils.csvutils.concatenate_csv',
                       args=(mgd.TempInputFile('snpeff.csv.gz', 'split'),
                             mgd.OutputFile(out_file, extensions=['.yaml'])))

    return workflow
Example #16
def create_vcf_tric_nucleotide_annotation_workflow(
        ref_genome_fasta_file,
        vcf_file,
        out_file,
        docker_config=None,
        split_size=int(1e4),
        table_name='tri_nucleotide_context'):

    ctx = {'num_retry': 3, 'mem_retry_increment': 2}
    if docker_config:
        ctx.update(docker_config)

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='split_vcf',
        ctx=dict(mem=2, **ctx),
        func='biowrappers.components.io.vcf.tasks.split_vcf',
        args=(
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('split.vcf', 'split')
        ),
        kwargs={'lines_per_file': split_size}
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('split',),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.tri_nucleotide_context.tasks.get_tri_nucelotide_context',
        args=(
            ref_genome_fasta_file,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('tri_nucleotide_context.csv.gz', 'split',
                               extensions=['.yaml']),
            table_name
        )
    )

    workflow.transform(
        name='merge_tables',
        ctx=dict(mem=2, **ctx),
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('tri_nucleotide_context.csv.gz', 'split'),
            mgd.OutputFile(out_file, extensions=['.yaml']))
    )

    return workflow
Example #17
def create_eagle_ref_data_workflow(vcf_url_template,
                                   out_file,
                                   local_download=False):

    chrom_map_file = soil.utils.package_data.load_data_file(
        'ref_data/data/GRCh37/chrom_map.tsv')

    chrom_map = pd.read_csv(chrom_map_file, sep='\t')

    chrom_map = chrom_map[chrom_map['ncbi'].isin(
        [str(x) for x in range(1, 23)])]

    chrom_map['url'] = chrom_map['ncbi'].apply(
        lambda x: vcf_url_template.format(chrom=x))

    vcf_urls = chrom_map['url'].to_dict()

    sandbox = soil.utils.workflow.get_sandbox(['bcftools'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.setobj(obj=mgd.TempOutputObj('vcf_url', 'chrom'), value=vcf_urls)

    workflow.transform(name='download_vcf_files',
                       axes=('chrom', ),
                       ctx={'local': local_download},
                       func=soil.ref_data.tasks.download,
                       args=(mgd.TempInputObj('vcf_url', 'chrom'),
                             mgd.TempOutputFile('raw.vcf.gz', 'chrom')))

    workflow.transform(name='write_chrom_map',
                       func=tasks.write_chrom_map_file,
                       args=(mgd.InputFile(chrom_map_file),
                             mgd.TempOutputFile('chrom_map.tsv')))

    workflow.transform(name='rename_chroms',
                       axes=('chrom', ),
                       func=soil.wrappers.bcftools.tasks.rename_chroms,
                       args=(mgd.TempInputFile('chrom_map.tsv'),
                             mgd.TempInputFile('raw.vcf.gz', 'chrom'),
                             mgd.TempOutputFile('renamed.bcf', 'chrom')))

    workflow.transform(name='concat_vcfs',
                       func=soil.wrappers.bcftools.tasks.concatenate_vcf,
                       args=(mgd.TempInputFile('renamed.bcf', 'chrom'),
                             mgd.OutputFile(out_file)),
                       kwargs={'bcf_output': True})

    workflow.commandline(name='index',
                         args=('bcftools', 'index', mgd.InputFile(out_file),
                               '-o', mgd.OutputFile(out_file + '.csi')))

    return workflow
Example #18
def create_variant_counting_workflow(
    vcfs,
    tumour_cell_bams,
    results_h5,
    config,
):
    """ Count variant reads for multiple sets of variants across cells.
    """

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.transform(name='merge_snvs',
                       func='biowrappers.components.io.vcf.tasks.merge_vcfs',
                       args=([mgd.InputFile(vcf) for vcf in vcfs],
                             mgd.TempOutputFile('all.snv.vcf')))

    workflow.transform(name='finalise_snvs',
                       func="biowrappers.components.io.vcf.tasks.finalise_vcf",
                       args=(mgd.TempInputFile('all.snv.vcf'),
                             mgd.TempOutputFile('all.snv.vcf.gz',
                                                extensions=['.tbi'])),
                       kwargs={
                           'docker_config':
                           helpers.get_container_ctx(config['containers'],
                                                     'vcftools')
                       })

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            config,
            mgd.InputFile('tumour_cells.bam',
                          'cell_id',
                          extensions=['.bai'],
                          fnames=tumour_cell_bams),
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.OutputFile(results_h5),
        ),
        kwargs={
            'docker_config':
            helpers.get_container_ctx(config['containers'],
                                      'single_cell_pipeline')
        },
    )

    return workflow
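# Note: the fnames mapping drives the 'cell_id' axis above: each dictionary
# key becomes a chunk, and the managed file resolves to the mapped path for
# that chunk (a .bai index is expected alongside each BAM). Illustrative
# call with hypothetical paths and a placeholder config:
tumour_cell_bams = {
    'cell_001': '/data/cells/cell_001.bam',
    'cell_002': '/data/cells/cell_002.bam',
}
config = {'containers': {}}  # real pipelines populate container images here
workflow = create_variant_counting_workflow(
    ['/data/snvs/museq.vcf.gz', '/data/snvs/strelka.vcf.gz'],
    tumour_cell_bams,
    '/data/results/variant_counts.h5',
    config,
)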
Example #19
def create_vcf2maf_workflow(vcf_file,
                            maf_file,
                            reference,
                            tumour_id=None,
                            normal_id=None):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='vcf2maf',
                       func='wgs.workflows.vcf2maf.tasks.run_vcf2maf',
                       args=(mgd.InputFile(vcf_file),
                             mgd.TempOutputFile('maf_file.maf'),
                             mgd.TempSpace('vcf2maf_temp'), reference),
                       kwargs={
                           'tumour_id': tumour_id,
                           'normal_id': normal_id
                       })

    workflow.transform(name='update_ids',
                       func='wgs.workflows.vcf2maf.tasks.update_ids',
                       args=(
                           mgd.TempInputFile('maf_file.maf'),
                           tumour_id,
                           normal_id,
                           mgd.OutputFile(maf_file),
                       ))

    return workflow
Example #20
def create_workflow_1(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(name='read',
                       func=read_stuff,
                       ret=mgd.TempOutputObj('input_data'),
                       args=(mgd.InputFile(input_filename), ))

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'), ))

    # Write the object to an output file
    workflow.transform(name='write',
                       func=write_stuff,
                       args=(mgd.TempInputObj('output_data'),
                             mgd.TempOutputFile('output_file')))

    # Recursive workflow
    workflow.subworkflow(name='sub_workflow_2',
                         func=create_workflow_2,
                         args=(mgd.TempInputFile('output_file'),
                               mgd.OutputFile(output_filename)))

    return workflow
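# Note: none of these factory functions execute anything by themselves; a
# workflow only runs when handed to an executor. A minimal runner sketch,
# assuming the standard pypeliner.app.Pypeline entry point ('tmpdir' holds
# pypeliner's bookkeeping state; 'submit' selects the local job runner):
import pypeliner.app

config = {'tmpdir': './pypeliner_tmp', 'submit': 'local'}
pyp = pypeliner.app.Pypeline(config=config)

workflow = create_workflow_1('input.txt', 'output.txt')
pyp.run(workflow)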
Example #21
def _create_download_decompress_workflow(url,
                                         local_path,
                                         local_download=False):
    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(obj=mgd.TempOutputObj('url'), value=url)

    workflow.transform(
        name='download',
        ctx={'local': local_download},
        func=tasks.download,
        args=(
            mgd.TempInputObj('url'),
            mgd.TempOutputFile('download'),
        ),
    )

    workflow.transform(name='decompress',
                       func=tasks.decompress,
                       args=(
                           mgd.TempInputFile('download'),
                           mgd.OutputFile(local_path),
                       ))

    return workflow
Example #22
def create_lumpy_workflow(config, normal_bam, tumour_cell_bams,
                          lumpy_breakpoints_csv, lumpy_breakpoints_evidence,
                          lumpy_breakpoints_bed):
    ctx = {'docker_image': config['docker']['single_cell_pipeline']}
    workflow = pypeliner.workflow.Workflow(ctx=ctx)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=list(tumour_cell_bams.keys()),
    )

    workflow.subworkflow(
        name='normal_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(normal_bam, config,
              mgd.TempOutputFile('normal.discordants.sorted.bam'),
              mgd.TempOutputFile('normal.splitters.sorted.bam'),
              mgd.TempOutputFile('hist_normal_formatted.csv'),
              mgd.TempOutputFile('normal_mean_stdev.yaml')),
    )

    workflow.subworkflow(
        name='tumour_preprocess_lumpy',
        func='single_cell.workflows.lumpy.lumpy_preprocess_workflow',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        args=(mgd.InputFile('tumour_cells.bam',
                            'cell_id',
                            extensions=['.bai'],
                            fnames=tumour_cell_bams), config,
              mgd.TempOutputFile('tumour.discordants.sorted.bam'),
              mgd.TempOutputFile('tumour.splitters.sorted.bam'),
              mgd.TempOutputFile('hist_tumour_formatted.csv'),
              mgd.TempOutputFile('tumour_mean_stdev.yaml')),
    )

    workflow.subworkflow(
        name='lumpy',
        ctx={'docker_image': config['docker']['single_cell_pipeline']},
        func="single_cell.workflows.lumpy.lumpy_calling_workflow",
        args=(
            config,
            mgd.TempInputFile('normal.discordants.sorted.bam'),
            mgd.TempInputFile('normal.splitters.sorted.bam'),
            mgd.TempInputFile('hist_normal_formatted.csv'),
            mgd.TempInputFile('normal_mean_stdev.yaml'),
            mgd.TempInputFile('tumour.discordants.sorted.bam'),
            mgd.TempInputFile('tumour.splitters.sorted.bam'),
            mgd.TempInputFile('hist_tumour_formatted.csv'),
            mgd.TempInputFile('tumour_mean_stdev.yaml'),
            mgd.OutputFile(lumpy_breakpoints_bed),
            mgd.OutputFile(lumpy_breakpoints_csv, extensions=['.yaml']),
            mgd.OutputFile(lumpy_breakpoints_evidence, extensions=['.yaml']),
        ),
    )

    return workflow
Example #23
def create_fit_model_workflow(
    experiment_filename,
    results_filename,
    config,
    ref_data_dir,
    tumour_id=None,
):
    config = remixt.config.get_sample_config(config, tumour_id)
    
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 16})

    workflow.transform(
        name='init',
        func=remixt.analysis.pipeline.init,
        ret=mgd.TempOutputObj('init_params', 'init_id'),
        args=(
            mgd.TempOutputFile('init_results'),
            mgd.InputFile(experiment_filename),
            config,
        ),
    )

    workflow.transform(
        name='fit',
        axes=('init_id',),
        func=remixt.analysis.pipeline.fit_task,
        args=(
            mgd.TempOutputFile('fit_results', 'init_id'),
            mgd.InputFile(experiment_filename),
            mgd.TempInputObj('init_params', 'init_id'),
            config,
        ),
    )

    workflow.transform(
        name='collate',
        func=remixt.analysis.pipeline.collate,
        args=(
            mgd.OutputFile(results_filename),
            mgd.InputFile(experiment_filename),
            mgd.TempInputFile('init_results'),
            mgd.TempInputFile('fit_results', 'init_id'),
            config,
        ),
    )

    return workflow
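# Note: the 'init_id' axis above is never declared with setobj; it is
# created by the 'init' transform itself, because
# ret=mgd.TempOutputObj('init_params', 'init_id') expects the task to
# return a dict keyed by chunk. A hedged sketch of that contract
# (illustrative, not the remixt source):
def init(init_results_filename, experiment_filename, config):
    num_restarts = config.get('num_restarts', 4)
    with open(init_results_filename, 'w') as fh:
        fh.write('initialized\n')  # placeholder for the real init results
    # Keys become 'init_id' chunks; values are the per-chunk init params.
    return {i: {'seed': i} for i in range(num_restarts)}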
Example #24
def circos_plot(titan_calls, remixt_calls, sample_id, breakpoints,
                circos_plot_remixt, circos_plot_titan):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='prep_titan',
        func='wgs_qc_utils.reader.read_titan.make_for_circos',
        ctx=helpers.get_default_ctx(
            memory=5
        ),
        args=(
            mgd.InputFile(titan_calls),
            mgd.TempOutputFile("titan_prepped"),
        )
    )

    workflow.transform(
        name='prep_remixt',
        func='wgs_qc_utils.reader.read_remixt.make_for_circos',
        ctx=helpers.get_default_ctx(
            memory=5
        ),
        args=(
            mgd.InputFile(remixt_calls),
            sample_id,
            mgd.TempOutputFile("remixt_prepped"),
        )
    )
    workflow.transform(
        name='circos_plot',
        func='wgs.workflows.sample_qc.tasks.circos',
        ctx=helpers.get_default_ctx(
            memory=5
        ),
        args=(
            mgd.TempInputFile("titan_prepped"),
            mgd.TempInputFile("remixt_prepped"),
            sample_id,
            breakpoints,
            mgd.OutputFile(circos_plot_remixt),
            mgd.OutputFile(circos_plot_titan),
            mgd.TempSpace("circos")
        )
    )

    return workflow
Example #25
def create_museq_workflow(
        normal_bam, tumour_bam, ref_genome, snv_vcf,
        config):

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=list(normal_bam.keys()),
    )

    workflow.transform(
        name='run_museq',
        ctx=dict(mem=config["memory"]['med'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        axes=('region',),
        func='single_cell.workflows.mutationseq.tasks.run_museq',
        args=(
            mgd.InputFile('merged_bam', 'region', fnames=tumour_bam),
            mgd.InputFile('normal.split.bam', 'region', fnames=normal_bam),
            mgd.TempOutputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.log', 'region'),
            mgd.InputInstance('region'),
            config,
        ),
        kwargs={'docker_kwargs': helpers.get_container_ctx(config['containers'], 'mutationseq')}
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=config["memory"]['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('museq.vcf', 'region'),
            mgd.OutputFile(snv_vcf),
        ),
    )

    return workflow
Example #26
def create_basic_workflow(fastq_file_1, fastq_file_2, out_file, threads=1):

    sandbox = soil.utils.workflow.get_sandbox([
        'mixcr',
    ])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.commandline(name='align',
                         ctx={
                             'mem': 32,
                             'mem_retry_increment': 8,
                             'num_retry': 3,
                             'threads': threads
                         },
                         args=('mixcr', 'align', '-f', '-t', threads,
                               mgd.InputFile(fastq_file_1),
                               mgd.InputFile(fastq_file_2),
                               mgd.TempOutputFile('alignments.vdjca')))

    workflow.commandline(name='assemble',
                         ctx={
                             'mem': 16,
                             'mem_retry_increment': 8,
                             'num_retry': 3,
                             'threads': threads
                         },
                         args=('mixcr', 'assemble', '-f', '-t', 1,
                               mgd.TempInputFile('alignments.vdjca'),
                               mgd.TempOutputFile('clones.clns')))

    workflow.commandline(name='export',
                         ctx={
                             'mem': 16,
                             'mem_retry_increment': 8,
                             'num_retry': 3
                         },
                         args=('mixcr', 'exportClones', '-f',
                               mgd.TempInputFile('clones.clns'),
                               mgd.TempOutputFile('results.tsv')))

    workflow.commandline(name='compress',
                         args=('gzip', '-c', mgd.TempInputFile('results.tsv'),
                               '>', mgd.OutputFile(out_file)))

    return workflow
Example #27
def create_align_workflow(fastq_file_1,
                          fastq_file_2,
                          ref_genome_dir,
                          out_bam_file,
                          add_xs_tag=False,
                          align_threads=1,
                          read_group_info=None,
                          sort_threads=1):

    sandbox = soil.utils.workflow.get_sandbox(['star', 'samtools', 'sambamba'])

    workflow = pypeliner.workflow.Workflow(default_sandbox=sandbox)

    workflow.transform(name='star_align',
                       ctx={
                           'mem': 32,
                           'mem_retry_increment': 16,
                           'num_retry': 3,
                           'threads': align_threads
                       },
                       func=tasks.align,
                       args=(
                           mgd.InputFile(fastq_file_1),
                           mgd.InputFile(fastq_file_2),
                           ref_genome_dir,
                           mgd.TempOutputFile('aligned.bam'),
                           mgd.TempSpace('align_tmp'),
                       ),
                       kwargs={
                           'add_xs_tag': add_xs_tag,
                           'read_group_info': read_group_info,
                           'threads': align_threads,
                       })

    workflow.transform(name='sort',
                       ctx={
                           'mem': 32,
                           'mem_retry_increment': 16,
                           'num_retry': 3,
                           'threads': sort_threads
                       },
                       func=soil.wrappers.sambamba.tasks.sort,
                       args=(
                           mgd.TempInputFile('aligned.bam'),
                           mgd.OutputFile(out_bam_file),
                           mgd.TempSpace('sort_tmp'),
                       ),
                       kwargs={'threads': sort_threads})

    workflow.commandline(name='index',
                         args=(
                             'samtools',
                             'index',
                             mgd.InputFile(out_bam_file),
                             mgd.OutputFile(out_bam_file + '.bai'),
                         ))

    return workflow
Example #28
def create_svaba_workflow(
    tumour_bam,
    normal_bam,
    svaba_vcf,
    reference,
):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='run_svaba',
        ctx=helpers.get_default_ctx(memory=10,
                                    walltime='72:00',
                                    ncpus='8',
                                    disk=300),
        func='wgs.workflows.svaba.tasks.run_svaba',
        args=(mgd.InputFile(tumour_bam), mgd.InputFile(normal_bam),
              mgd.TempOutputFile('germline.indel.vcf.gz'),
              mgd.TempOutputFile('germline.sv.vcf.gz'),
              mgd.TempOutputFile('somatic.indel.vcf.gz'),
              mgd.OutputFile(svaba_vcf),
              mgd.TempOutputFile('unfiltered.germline.indel.vcf.gz'),
              mgd.TempOutputFile('unfiltered.germline.sv.vcf.gz'),
              mgd.TempOutputFile('unfiltered.somatic.indel.vcf.gz'),
              mgd.TempOutputFile('unfiltered.somatic.sv.vcf.gz'), reference,
              mgd.TempSpace('svaba_tempdir_full')),
        kwargs={
            'ncores': 8,
        })

    return workflow
Example #29
def pre_alignment(fastq_r1, fastq_r2, metrics_tar):
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name="fastqc_r1",
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        args=(
            mgd.InputFile(fastq_r1),
            mgd.TempOutputFile('R1.html'),
            mgd.TempOutputFile('R1.pdf'),
            mgd.TempSpace('fastqc_R1'),
        ),
        kwargs={
            'docker_image': config.containers("fastqc"),
        })

    workflow.transform(
        name="fastqc_r2",
        func='alignment.workflows.pre_alignment.tasks.run_fastqc',
        ctx=helpers.get_default_ctx(memory=10, walltime='48:00', disk=400),
        args=(
            mgd.InputFile(fastq_r2),
            mgd.TempOutputFile('R2.html'),
            mgd.TempOutputFile('R2.pdf'),
            mgd.TempSpace('fastqc_R2'),
        ),
        kwargs={
            'docker_image': config.containers('fastqc'),
        })

    workflow.transform(name='tar',
                       func='alignment.utils.helpers.make_tar_from_files',
                       args=(mgd.OutputFile(metrics_tar), [
                           mgd.TempInputFile('R1.html'),
                           mgd.TempInputFile('R1.pdf'),
                           mgd.TempInputFile('R2.html'),
                           mgd.TempInputFile('R2.pdf'),
                       ], mgd.TempSpace('wgs_metrics')))

    return workflow
Example #30
def create_destruct_wrapper_workflow(bam_filenames,
                                     output_filename,
                                     raw_data_dir,
                                     control_id=None,
                                     config=None,
                                     ref_data_dir=None):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.setobj(
        obj=mgd.OutputChunks('sample_id'),
        value=list(bam_filenames.keys()),
    )

    workflow.subworkflow(
        name='run_destruct',
        func=destruct.workflow.create_destruct_workflow,
        args=(
            mgd.InputFile('bam', 'sample_id', fnames=bam_filenames),
            mgd.TempOutputFile('breakpoint_table'),
            mgd.TempOutputFile('breakpoint_library_table'),
            mgd.TempOutputFile('breakpoint_read_table'),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_data_dir,
        },
    )

    workflow.transform(
        name='post_process',
        func=destruct.benchmark.wrappers.destruct.tasks.destruct_postprocess,
        args=(
            mgd.TempInputFile('breakpoint_table'),
            mgd.TempInputFile('breakpoint_library_table'),
            mgd.OutputFile(output_filename),
        ),
        kwargs={
            'control_id': control_id,
        })

    return workflow