Python make_parent_directory 예제들, biowrappers.components.utils.make_parent_directory Python 예제들

예제 #1

0

파일 보기

def build_index(index_sentinel_file,
                transcriptome_fasta_file,
                kmer_length=31,
                gencode=False,
                num_threads=1):
    """Build a salmon index and create a sentinel file afterwards.

    The index is written into the directory that contains
    ``index_sentinel_file``.  The sentinel is created (empty) only after
    ``pypeliner.commandline.execute`` has returned.

    Args:
        index_sentinel_file: Path of the marker file; its directory name is
            passed to salmon as the index directory (``-i``).
        transcriptome_fasta_file: Transcriptome FASTA passed to ``-t``.
        kmer_length: Value passed to ``-k``.
        gencode: When true, ``--gencode`` is appended to the command.
        num_threads: Value passed to ``-p``.
    """
    make_parent_directory(index_sentinel_file)

    cmd = [
        'salmon',
        'index',
        '-i',
        os.path.dirname(index_sentinel_file),
        '-k',
        kmer_length,
        '-p',
        num_threads,
        '-t',
        transcriptome_fasta_file,
    ]

    # ``gencode`` is a boolean flag defaulting to False.  The previous
    # ``is not None`` test was also true for False, so --gencode was passed
    # unconditionally; test truthiness instead.
    if gencode:
        cmd.append('--gencode')

    pypeliner.commandline.execute(*cmd)

    # Touch the sentinel to record that the index build has finished.
    with open(index_sentinel_file, 'w'):
        pass

예제 #2

0

파일 보기

def create_battenberg_workflow(
    seqdata_files,
    config,
    out_file,
    raw_data_dir,
    somatic_breakpoint_file=None,
    normal_id=None,
    **kwargs
):
    """Build the multi-sample Battenberg workflow.

    One ``create_battenberg_single_workflow`` subworkflow is scheduled per
    tumour sample (every key of ``seqdata_files`` except ``normal_id``) and
    the per-sample results are merged into ``out_file`` with
    ``hdf5_tasks.merge_hdf5``.

    Args:
        seqdata_files: Mapping of sample id to seqdata file path; must
            contain ``normal_id``.
        config: Passed through unchanged to the per-sample subworkflow.
        out_file: Path of the merged results file.
        raw_data_dir: Directory under which the per-sample
            ``results/sample_{sample_id}.h5`` files are placed.
        somatic_breakpoint_file: Optional path; when given it is wrapped as
            a managed input file before being forwarded.
        normal_id: Id of the normal sample (required).
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        The constructed ``Workflow``.

    Raises:
        ValueError: If ``normal_id`` is None.
    """
    if normal_id is None:
        # The message previously named cloneHD (copy-paste from another
        # workflow); report the tool this function actually builds.
        raise ValueError('Battenberg requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    # Copy before deleting so the caller's mapping is left untouched.
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results', 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        # Materialise the keys: on Python 3 ``keys()`` is a view, not a list.
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(somatic_breakpoint_file)

    workflow.subworkflow(
        name='run_battenberg',
        axes=('sample_id',),
        func=create_battenberg_single_workflow,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata', 'sample_id', fnames=tumour_seqdata_files),
            normal_id,
            pypeliner.managed.InputInstance('sample_id'),
            pypeliner.managed.OutputFile('results', 'sample_id', template=results_files),
            config,
        ),
        kwargs={
            'somatic_breakpoint_file': somatic_breakpoint_file,
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results', 'sample_id', template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow

예제 #3

0

파일 보기

def get_sample_out_file(cmd, ext, out_dir, variant_type):
    """Return the per-sample output path template for ``cmd``.

    The template has the form
    ``<out_dir>/<variant_type>/<cmd>/{tumour_sample_id}.<ext>``; the
    ``{tumour_sample_id}`` placeholder is left unexpanded.
    ``make_parent_directory`` is called on the template before it is
    returned.
    """
    # Doubled braces keep the placeholder literal while ``ext`` is filled in.
    file_name = '{{tumour_sample_id}}.{ext}'.format(ext=ext)
    template = os.path.join(out_dir, variant_type, cmd, file_name)

    make_parent_directory(template)

    return template

예제 #4

0

파일 보기

def create_ascat_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    """Build the ASCAT data-preparation workflow.

    Schedules ``tasks.prepare_normal_data`` once for the normal sample and
    ``tasks.prepare_tumour_data`` once per tumour sample (every key of
    ``seqdata_files`` except ``normal_id``).

    Args:
        seqdata_files: Mapping of sample id to seqdata file path; must
            contain ``normal_id``.
        config: Passed through unchanged to both preparation tasks.
        out_file: Not used by the jobs defined here.
        raw_data_dir: Directory under which the ``results`` directory is
            created.
        somatic_breakpoint_file: Not used by the jobs defined here.
        normal_id: Id of the normal sample (required).
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        The constructed ``Workflow``.

    Raises:
        ValueError: If ``normal_id`` is None.
    """
    if normal_id is None:
        raise ValueError('ASCAT requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    # Copy before deleting so the caller's mapping is left untouched.
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    # NOTE(review): only the parent directory of this template is used; no
    # job below writes to it or to ``out_file`` -- the workflow appears to
    # stop after data preparation.  Confirm whether that is intended.
    results_files = os.path.join(raw_data_dir, 'results',
                                 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        # Materialise the keys: on Python 3 ``keys()`` is a view, not a list.
        value=list(tumour_seqdata_files.keys()),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('Germline_LogR.txt'),
            pypeliner.managed.TempOutputFile('Germline_BAF.txt'),
            config,
        ),
    )

    # NOTE(review): the per-tumour outputs reuse the "Germline_*" names of
    # the normal-sample outputs (distinguished only by the sample_id axis);
    # verify the names are what downstream consumers expect.
    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.TempOutputFile('Germline_LogR.txt', 'sample_id'),
            pypeliner.managed.TempOutputFile('Germline_BAF.txt', 'sample_id'),
            config,
        ),
    )

    return workflow

예제 #5

0

파일 보기

def build_index(
        index_sentinel_file,
        ref_genome_fasta_file,
        transcript_gtf_file,
        overhang=100,
        num_threads=1):
    """Generate a STAR genome index and create a sentinel file afterwards.

    The index is generated in the directory containing
    ``index_sentinel_file``; the sentinel is created (empty) once
    ``pypeliner.commandline.execute`` has returned.

    Args:
        index_sentinel_file: Path of the marker file; its directory name is
            passed to STAR as ``--genomeDir``.
        ref_genome_fasta_file: Passed as ``--genomeFastaFiles``.
        transcript_gtf_file: Passed as ``--sjdbGTFfile``.
        overhang: Passed as ``--sjdbOverhang``.
        num_threads: Passed as ``--runThreadN``.
    """
    make_parent_directory(index_sentinel_file)

    genome_dir = os.path.dirname(index_sentinel_file)

    # (flag, value) pairs in the order they appear on the command line.
    options = (
        ('--runMode', 'genomeGenerate'),
        ('--runThreadN', num_threads),
        ('--genomeDir', genome_dir),
        ('--genomeFastaFiles', ref_genome_fasta_file),
        ('--sjdbGTFfile', transcript_gtf_file),
        ('--sjdbOverhang', overhang),
    )

    star_args = ['STAR']
    for flag, value in options:
        star_args.extend((flag, value))

    pypeliner.commandline.execute(*star_args)

    # Touch the sentinel to record that index generation has finished.
    with open(index_sentinel_file, 'w'):
        pass

예제 #6

0

파일 보기

def call_and_annotate_pipeline(
    config,
    bam_files,
    raw_data_dir,
    results_file,
    normal_id=None,
    somatic_breakpoint_file=None,
    patient_config=None,
):
    """Build the copy-number calling workflow.

    Seqdata is extracted for every sample in ``bam_files``; then one
    subworkflow is added for each of ``remixt``, ``titan``, ``clonehd`` and
    ``theta`` that has a section in ``config``.  The results of the
    configured callers are merged into ``results_file`` under
    ``/copy_number/<tool>``.

    Args:
        config: Mapping with optional per-tool sections.  The ``remixt``
            section is read unconditionally for seqdata extraction.
        bam_files: Mapping of sample id to BAM path.
        raw_data_dir: Directory for per-tool raw data and results.
        results_file: Path of the merged output file.
        normal_id: Id of the normal sample; removed from the tumour id list
            when given.
        somatic_breakpoint_file: Optional path; wrapped as a managed input
            file and forwarded to each caller when given.
        patient_config: Optional mapping merged into the ReMixT config.

    Returns:
        The constructed ``Workflow``.
    """
    # Build two independent lists.  On Python 3 ``keys()`` returns a view
    # without ``remove``; on Python 2 this is equivalent to the old code.
    sample_ids = list(bam_files.keys())

    tumour_ids = list(bam_files.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    seq_data_template = os.path.join(raw_data_dir, 'seqdata',
                                     'sample_{sample_id}.h5')

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(
            somatic_breakpoint_file)

    # NOTE(review): this step requires config['remixt'] even though the
    # ReMixT caller itself is optional below; a config without a 'remixt'
    # section raises KeyError here.
    workflow.subworkflow(
        name='extract_seqdata_workflow',
        axes=('sample_id', ),
        func=remixt.workflow.create_extract_seqdata_workflow,
        args=(
            pypeliner.managed.InputFile('bam', 'sample_id', fnames=bam_files),
            pypeliner.managed.OutputFile('seqdata',
                                         'sample_id',
                                         template=seq_data_template),
            config['remixt'].get('extract_seqdata', {}),
            config['remixt']['ref_data_dir'],
        ),
    )

    # Maps HDF5 group prefix -> managed results file of each configured tool.
    merge_inputs = {}

    if 'remixt' in config:
        remixt_raw_data = os.path.join(raw_data_dir, 'remixt')
        remixt_results_filename = os.path.join(remixt_raw_data, 'results.h5')
        make_parent_directory(remixt_results_filename)

        # Work on a shallow copy so keys merged in from ``patient_config``
        # do not leak into the caller's config object.
        remixt_config = config['remixt']['config'].copy()
        assert 'sample_specific' not in remixt_config
        # ``patient_config`` defaults to None, which ``update`` rejects.
        if patient_config is not None:
            remixt_config.update(patient_config)

        workflow.subworkflow(
            name='remixt',
            func=biowrappers.components.copy_number_calling.remixt.
            create_remixt_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata',
                                            'sample_id',
                                            template=seq_data_template),
                remixt_config,
                pypeliner.managed.OutputFile(remixt_results_filename),
                remixt_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'ref_data_dir': config['remixt']['ref_data_dir'],
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/remixt'] = pypeliner.managed.InputFile(
            remixt_results_filename)

    if 'titan' in config:
        titan_raw_data = os.path.join(raw_data_dir, 'titan')
        titan_results_filename = os.path.join(titan_raw_data, 'results.h5')
        make_parent_directory(titan_results_filename)

        workflow.subworkflow(
            name='titan',
            func=biowrappers.components.copy_number_calling.titan.
            create_titan_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata',
                                            'sample_id',
                                            template=seq_data_template),
                config['titan']['config'],
                pypeliner.managed.OutputFile(titan_results_filename),
                titan_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/titan'] = pypeliner.managed.InputFile(
            titan_results_filename)

    if 'clonehd' in config:
        clonehd_raw_data = os.path.join(raw_data_dir, 'clonehd')
        clonehd_results_filename = os.path.join(clonehd_raw_data, 'results.h5')
        make_parent_directory(clonehd_results_filename)

        workflow.subworkflow(
            name='clonehd',
            func=biowrappers.components.copy_number_calling.clonehd.
            create_clonehd_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata',
                                            'sample_id',
                                            template=seq_data_template),
                config['clonehd']['config'],
                pypeliner.managed.OutputFile(clonehd_results_filename),
                clonehd_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
            },
        )

        merge_inputs['/copy_number/clonehd'] = pypeliner.managed.InputFile(
            clonehd_results_filename)

    if 'theta' in config:
        theta_raw_data = os.path.join(raw_data_dir, 'theta')
        theta_results_filename = os.path.join(theta_raw_data, 'results.h5')
        make_parent_directory(theta_results_filename)

        workflow.subworkflow(
            name='theta',
            func=biowrappers.components.copy_number_calling.theta.
            create_theta_workflow,
            args=(
                pypeliner.managed.InputFile('seqdata',
                                            'sample_id',
                                            template=seq_data_template),
                config['theta']['config'],
                pypeliner.managed.OutputFile(theta_results_filename),
                theta_raw_data,
            ),
            kwargs={
                'somatic_breakpoint_file': somatic_breakpoint_file,
                'normal_id': normal_id,
                'num_clones': config['theta']['kwargs']['num_clones'],
            },
        )

        merge_inputs['/copy_number/theta'] = pypeliner.managed.InputFile(
            theta_results_filename)

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            merge_inputs,
            pypeliner.managed.OutputFile(results_file),
        ),
    )

    return workflow

예제 #7

0

파일 보기

def create_theta_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    """Build the multi-sample THetA workflow.

    For every tumour sample (each key of ``seqdata_files`` except
    ``normal_id``) the workflow runs ``tasks.run_bicseq2_seg`` followed by
    ``tasks.run_theta``, then merges the per-sample results into
    ``out_file`` with ``hdf5_tasks.merge_hdf5``.

    Args:
        seqdata_files: Mapping of sample id to seqdata file path; must
            contain ``normal_id``.
        config: Passed through unchanged to both per-sample tasks.
        out_file: Path of the merged results file.
        raw_data_dir: Directory under which the ``results`` and ``bicseq2``
            per-sample files are placed.
        somatic_breakpoint_file: Optional path; wrapped as a managed input
            file and forwarded as ``breakpoints_filename`` when given.
        normal_id: Id of the normal sample (required).
        **kwargs: Only ``num_clones`` is read (default None).

    Returns:
        The constructed ``Workflow``.

    Raises:
        ValueError: If ``normal_id`` is None.
    """
    if normal_id is None:
        raise ValueError('Theta requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    # Copy before deleting so the caller's mapping is left untouched.
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_template = os.path.join(raw_data_dir, 'results',
                                    'sample_{sample_id}.h5')
    bicseq2_seg_template = os.path.join(raw_data_dir, 'bicseq2',
                                        'bicseq2_{sample_id}.seg')
    utils.make_parent_directory(results_template)
    utils.make_parent_directory(bicseq2_seg_template)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        # Materialise the keys: on Python 3 ``keys()`` is a view, not a list.
        value=list(tumour_seqdata_files.keys()),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(
            somatic_breakpoint_file)

    workflow.transform(
        name='run_bicseq2',
        axes=('sample_id', ),
        ctx={'mem': 30},
        func=tasks.run_bicseq2_seg,
        args=(
            pypeliner.managed.OutputFile('bicseq2_seg',
                                         'sample_id',
                                         template=bicseq2_seg_template),
            pypeliner.managed.InputFile('normal_seqdata',
                                        template=normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            config,
            pypeliner.managed.TempSpace('bicseq2_work',
                                        'sample_id',
                                        cleanup=None),
        ),
    )

    # Consumes the segmentation written by run_bicseq2 for the same sample.
    workflow.transform(
        name='run_theta',
        axes=('sample_id', ),
        ctx={'mem': 32},
        func=tasks.run_theta,
        args=(
            pypeliner.managed.OutputFile('results',
                                         'sample_id',
                                         template=results_template),
            pypeliner.managed.InputFile('normal_seqdata',
                                        template=normal_seqdata_file),
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.InputFile('bicseq2_seg',
                                        'sample_id',
                                        template=bicseq2_seg_template),
            config,
            pypeliner.managed.TempSpace('theta_work',
                                        'sample_id',
                                        cleanup=None),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
            'num_clones': kwargs.get('num_clones', None),
        },
    )

    workflow.transform(
        name='merge_results',
        ctx={'mem': 8},
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results',
                                        'sample_id',
                                        template=results_template),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow

예제 #8

0

파일 보기

def call_and_annotate_pipeline(
    config,
    normal_bam_path,
    tumour_bam_paths,
    raw_data_dir,
    results_file,
):
    """Build the breakpoint calling workflow.

    One subworkflow is added for each of ``destruct``, ``delly`` and
    ``lumpysv`` that has a section in ``config``.  The results of the
    configured callers are merged into ``results_file`` under
    ``/breakpoints/<tool>``.

    Args:
        config: Mapping with optional ``destruct``, ``delly`` and
            ``lumpysv`` sections.
        normal_bam_path: Path of the normal sample BAM.
        tumour_bam_paths: Mapping of tumour sample id to BAM path.
        raw_data_dir: Directory for per-tool raw data and results.
        results_file: Path of the merged output file.

    Returns:
        The constructed ``Workflow``.
    """
    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_sample_id'),
        # Snapshot the ids as a list: on Python 3 ``keys()`` is a live view
        # of the caller's mapping rather than a list.
        value=list(tumour_bam_paths.keys()),
    )

    # Maps HDF5 group prefix -> managed results file of each configured tool.
    merge_inputs = {}

    if 'destruct' in config:
        destruct_raw_data = os.path.join(raw_data_dir, 'destruct')
        destruct_results_filename = os.path.join(destruct_raw_data,
                                                 'results.h5')
        make_parent_directory(destruct_results_filename)

        workflow.subworkflow(
            name='destruct',
            func=destruct.destruct_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                config['destruct']['config'],
                config['destruct']['ref_data_dir'],
                pypeliner.managed.OutputFile(destruct_results_filename),
                destruct_raw_data,
            ),
        )

        merge_inputs['/breakpoints/destruct'] = pypeliner.managed.InputFile(
            destruct_results_filename)

    if 'delly' in config:
        delly_raw_data = os.path.join(raw_data_dir, 'delly')
        delly_results_filename = os.path.join(delly_raw_data, 'results.h5')
        make_parent_directory(delly_results_filename)

        workflow.subworkflow(
            name='delly',
            func=delly.delly_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                config['delly']['ref_genome_fasta_file'],
                config['delly']['exclude_file'],
                pypeliner.managed.OutputFile(delly_results_filename),
                delly_raw_data,
            ),
        )

        merge_inputs['/breakpoints/delly'] = pypeliner.managed.InputFile(
            delly_results_filename)

    if 'lumpysv' in config:
        lumpysv_raw_data = os.path.join(raw_data_dir, 'lumpysv')
        lumpysv_results_filename = os.path.join(lumpysv_raw_data, 'results.h5')
        make_parent_directory(lumpysv_results_filename)

        # Unlike the other callers, no config values are forwarded here.
        workflow.subworkflow(
            name='lumpysv',
            func=lumpysv.lumpysv_pipeline,
            args=(
                pypeliner.managed.InputFile(normal_bam_path),
                pypeliner.managed.InputFile('tumour_bams',
                                            'tumour_sample_id',
                                            fnames=tumour_bam_paths),
                pypeliner.managed.OutputFile(lumpysv_results_filename),
                lumpysv_raw_data,
            ),
        )

        merge_inputs['/breakpoints/lumpysv'] = pypeliner.managed.InputFile(
            lumpysv_results_filename)

    workflow.transform(name='merge_results',
                       ctx={'mem': 8},
                       func=hdf5_tasks.merge_hdf5,
                       args=(
                           merge_inputs,
                           pypeliner.managed.OutputFile(results_file),
                       ))

    return workflow

예제 #9

0

파일 보기

def create_remixt_workflow(
    seqdata_files,
    config,
    out_file,
    raw_data_dir,
    ref_data_dir=None,
    somatic_breakpoint_file=None,
    normal_id=None,
):
    """Build the multi-sample ReMixT workflow.

    Runs ``remixt.workflow.create_remixt_seqdata_workflow`` over all
    samples, selects a solution per tumour sample with
    ``tasks.select_solution`` and merges the selected results into
    ``out_file`` with ``hdf5_tasks.merge_hdf5``.

    Args:
        seqdata_files: Mapping of sample id to seqdata file path.
        config: Passed through unchanged to the ReMixT subworkflow and to
            ``tasks.select_solution``.
        out_file: Path of the merged results file.
        raw_data_dir: Directory under which the ``results`` and ``selected``
            per-tumour files are placed.
        ref_data_dir: Reference data directory (required).
        somatic_breakpoint_file: Path of the breakpoint file (required).
        normal_id: Id of the normal sample; removed from the tumour id list
            when given.

    Returns:
        The constructed ``Workflow``.

    Raises:
        ValueError: If ``somatic_breakpoint_file`` or ``ref_data_dir`` is
            None.
    """
    if somatic_breakpoint_file is None:
        raise ValueError('somatic breakpoints required')

    if ref_data_dir is None:
        raise ValueError('ref data directory required')

    # Build two independent lists.  On Python 3 ``keys()`` returns a view
    # without ``remove``; on Python 2 this is equivalent to the old code.
    sample_ids = list(seqdata_files.keys())

    tumour_ids = list(seqdata_files.keys())
    if normal_id is not None:
        tumour_ids.remove(normal_id)

    results_files = os.path.join(raw_data_dir, 'results',
                                 'sample_{tumour_id}.h5')
    selected_files = os.path.join(raw_data_dir, 'selected',
                                  'sample_{tumour_id}.h5')
    utils.make_parent_directory(results_files)
    utils.make_parent_directory(selected_files)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        value=sample_ids,
    )

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('tumour_id'),
        value=tumour_ids,
    )

    workflow.subworkflow(
        name='remixt',
        func=remixt.workflow.create_remixt_seqdata_workflow,
        args=(
            pypeliner.managed.InputFile(somatic_breakpoint_file),
            pypeliner.managed.InputFile('seqdata',
                                        'sample_id',
                                        fnames=seqdata_files),
            pypeliner.managed.OutputFile('results',
                                         'tumour_id',
                                         template=results_files,
                                         axes_origin=[]),
            raw_data_dir,
            config,
            ref_data_dir,
        ),
        kwargs={
            'normal_id': normal_id,
        })

    workflow.transform(name='select_solution',
                       ctx={
                           'mem': 2,
                           'num_retry': 3,
                           'mem_retry_increment': 2
                       },
                       func=tasks.select_solution,
                       axes=('tumour_id', ),
                       args=(
                           pypeliner.managed.OutputFile(
                               'selected',
                               'tumour_id',
                               template=selected_files),
                           pypeliner.managed.InputFile('results',
                                                       'tumour_id',
                                                       template=results_files),
                           config,
                       ))

    workflow.transform(
        name='merge_results',
        ctx={
            'mem': 8,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('selected',
                                        'tumour_id',
                                        template=selected_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow

예제 #10

0

파일 보기

def create_titan_workflow(seqdata_files,
                          config,
                          out_file,
                          raw_data_dir,
                          somatic_breakpoint_file=None,
                          normal_id=None,
                          **kwargs):
    """Build the multi-sample TITAN workflow.

    Steps, in order of definition: prepare normal data once, prepare tumour
    data per sample, create initialization parameters per sample, run TITAN
    per (sample, init_param_id), select a solution per sample, plot each
    chromosome per sample with ``plot_titan_chromosome.R`` and finally merge
    the per-sample results into ``out_file``.

    Args:
        seqdata_files: Mapping of sample id to seqdata file path; must
            contain ``normal_id``.
        config: Passed to the tasks; its optional ``chromosomes`` entry
            selects the chromosomes to plot (``default_chromosomes``
            otherwise).
        out_file: Path of the merged results file.
        raw_data_dir: Directory under which ``results`` and ``output`` files
            are placed.
        somatic_breakpoint_file: Optional path; wrapped as a managed input
            file and forwarded as ``breakpoints_filename`` when given.
        normal_id: Id of the normal sample (required).
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        The constructed ``Workflow``.

    Raises:
        ValueError: If ``normal_id`` is None.
    """
    if normal_id is None:
        raise ValueError('Titan requires normal sample')

    normal_seqdata_file = seqdata_files[normal_id]
    # Copy before deleting so the caller's mapping is left untouched.
    tumour_seqdata_files = seqdata_files.copy()
    del tumour_seqdata_files[normal_id]

    results_files = os.path.join(raw_data_dir, 'results',
                                 'sample_{sample_id}.h5')
    utils.make_parent_directory(results_files)

    # Per-sample output path templates.  The loci and params templates are
    # written by select_solution and read back by plot_chromosome, so they
    # are defined once to keep producer and consumer in agreement.
    cn_loci_template = os.path.join(raw_data_dir, 'output',
                                    '{sample_id}_cn_loci.tsv')
    cn_segments_template = os.path.join(raw_data_dir, 'output',
                                        '{sample_id}_cn_segments.tsv')
    cn_igv_template = os.path.join(raw_data_dir, 'output',
                                   '{sample_id}_cn_igv.tsv')
    params_template = os.path.join(raw_data_dir, 'output',
                                   '{sample_id}_params.tsv')
    chromosome_plot_template = os.path.join(
        raw_data_dir, 'output', '{sample_id}_chr_{chromosome}.png')

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('sample_id'),
        # Materialise the keys: on Python 3 ``keys()`` is a view, not a list.
        value=list(tumour_seqdata_files.keys()),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.prepare_normal_data,
        args=(
            pypeliner.managed.InputFile(normal_seqdata_file),
            pypeliner.managed.TempOutputFile('normal.wig'),
            pypeliner.managed.TempOutputFile('het_positions.tsv'),
            config,
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('sample_id', ),
        ctx={'mem': 20},
        func=tasks.prepare_tumour_data,
        args=(
            pypeliner.managed.InputFile('tumour_seqdata',
                                        'sample_id',
                                        fnames=tumour_seqdata_files),
            pypeliner.managed.TempInputFile('het_positions.tsv'),
            pypeliner.managed.TempOutputFile('tumour.wig', 'sample_id'),
            pypeliner.managed.TempOutputFile('tumour_alleles.tsv',
                                             'sample_id'),
            config,
        ),
    )

    # The return value is split over a new 'init_param_id' axis, which the
    # run_titan step below iterates over.  (The "intialization" spelling is
    # part of the job and task names and is kept as-is.)
    workflow.transform(
        name='create_intialization_parameters',
        axes=('sample_id', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.create_intialization_parameters,
        ret=pypeliner.managed.TempOutputObj('init_params', 'sample_id',
                                            'init_param_id'),
        args=(config, ),
    )

    workflow.transform(
        name='run_titan',
        axes=('sample_id', 'init_param_id'),
        ctx={
            'mem': 16,
            'num_retry': 3,
            'mem_retry_increment': 4
        },
        func=tasks.run_titan,
        args=(
            pypeliner.managed.TempInputObj('init_params', 'sample_id',
                                           'init_param_id'),
            pypeliner.managed.TempInputFile('normal.wig'),
            pypeliner.managed.TempInputFile('tumour.wig', 'sample_id'),
            pypeliner.managed.TempInputFile('tumour_alleles.tsv', 'sample_id'),
            pypeliner.managed.TempOutputFile('cn.tsv', 'sample_id',
                                             'init_param_id'),
            pypeliner.managed.TempOutputFile('params.tsv', 'sample_id',
                                             'init_param_id'),
            config,
        ),
    )

    if somatic_breakpoint_file is not None:
        somatic_breakpoint_file = pypeliner.managed.InputFile(
            somatic_breakpoint_file)

    # Runs per sample but receives the inputs for every init_param_id.
    workflow.transform(
        name='select_solution',
        axes=('sample_id', ),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=tasks.select_solution,
        args=(
            pypeliner.managed.TempInputObj('init_params', 'sample_id',
                                           'init_param_id'),
            pypeliner.managed.TempInputFile('cn.tsv', 'sample_id',
                                            'init_param_id'),
            pypeliner.managed.TempInputFile('params.tsv', 'sample_id',
                                            'init_param_id'),
            pypeliner.managed.OutputFile('results',
                                         'sample_id',
                                         template=results_files),
            pypeliner.managed.OutputFile(cn_loci_template, 'sample_id'),
            pypeliner.managed.OutputFile(cn_segments_template, 'sample_id'),
            pypeliner.managed.OutputFile(cn_igv_template, 'sample_id'),
            pypeliner.managed.OutputFile(params_template, 'sample_id'),
            config,
            pypeliner.managed.Template('{sample_id}', 'sample_id'),
        ),
        kwargs={
            'breakpoints_filename': somatic_breakpoint_file,
        },
    )

    # Define the 'chromosome' axis beneath each sample for plotting.
    workflow.setobj(obj=pypeliner.managed.OutputChunks('sample_id',
                                                       'chromosome'),
                    value=config.get('chromosomes', default_chromosomes),
                    axes=('sample_id', ))

    workflow.commandline(
        name='plot_chromosome',
        axes=('sample_id', 'chromosome'),
        ctx={
            'mem': 4,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        args=(
            'plot_titan_chromosome.R',
            pypeliner.managed.Instance('chromosome'),
            pypeliner.managed.InputFile(cn_loci_template, 'sample_id'),
            pypeliner.managed.InputFile(params_template, 'sample_id'),
            pypeliner.managed.OutputFile(chromosome_plot_template,
                                         'sample_id', 'chromosome'),
        ),
    )

    workflow.transform(
        name='merge_results',
        ctx={
            'mem': 8,
            'num_retry': 3,
            'mem_retry_increment': 2
        },
        func=hdf5_tasks.merge_hdf5,
        args=(
            pypeliner.managed.InputFile('results',
                                        'sample_id',
                                        template=results_files),
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}',
        },
    )

    return workflow