Code Example #1
import pypeliner.managed as mgd
import pypeliner.workflow


def _create_download_decompress_concat_workflow(urls,
                                                out_file,
                                                local_download=False):
    workflow = pypeliner.workflow.Workflow()

    local_files = []

    # Fan out: one download/decompress subworkflow per URL.
    # _create_download_decompress_workflow is defined elsewhere in the
    # same module.
    for i, url in enumerate(urls):
        local_files.append(mgd.TempFile('file_{}'.format(i)))

        workflow.setobj(mgd.TempOutputObj('url_{}'.format(i)), value=url)

        workflow.subworkflow(name='download_file_{}'.format(i),
                             func=_create_download_decompress_workflow,
                             args=(
                                 mgd.TempInputObj('url_{}'.format(i)),
                                 local_files[i].as_output(),
                             ),
                             kwargs={'local_download': local_download})

    # pypeliner's commandline supports shell-style '>' redirection into a
    # managed output file.
    concat_args = ['cat'] + [x.as_input() for x in local_files]
    concat_args += ['>', mgd.OutputFile(out_file)]

    workflow.commandline(name='concat', args=concat_args)

    return workflow
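
The function above only builds and returns a workflow object; nothing executes until the workflow is handed to a pypeliner scheduler. A minimal driver sketch, assuming the usual pypeliner.app.Pypeline entry point (the URLs, output path, and config values are placeholders, not from the original project):

import pypeliner.app

if __name__ == '__main__':
    # Placeholder config; 'tmpdir' is where pypeliner keeps job state.
    config = {'tmpdir': './pipeline_tmp'}
    pyp = pypeliner.app.Pypeline(config=config)
    workflow = _create_download_decompress_concat_workflow(
        ['http://example.org/a.txt.gz', 'http://example.org/b.txt.gz'],
        'concatenated.txt',
        local_download=True,
    )
    pyp.run(workflow)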
Code Example #2
File: __init__.py  Project: DouglasAbrams/biowrappers
import pypeliner.managed as mgd
import pypeliner.workflow


def create_vcf_tric_nucleotide_annotation_workflow(
        ref_genome_fasta_file,
        vcf_file,
        out_file,
        docker_config=None,
        split_size=int(1e4),
        table_name='tri_nucleotide_context'):

    # Retry failed jobs up to three times, increasing the memory request on
    # each retry; merge in any docker settings so every job shares them.
    ctx = {'num_retry': 3, 'mem_retry_increment': 2}
    if docker_config:
        ctx.update(docker_config)

    workflow = pypeliner.workflow.Workflow()

    # func given as a dotted string is imported by pypeliner at run time.
    workflow.transform(
        name='split_vcf',
        ctx=dict(mem=2, **ctx),
        func='biowrappers.components.io.vcf.tasks.split_vcf',
        args=(
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('split.vcf', 'split')
        ),
        kwargs={'lines_per_file': split_size}
    )

    workflow.transform(
        name='annotate_db_status',
        axes=('split',),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.tri_nucleotide_context.tasks.get_tri_nucelotide_context',
        args=(
            ref_genome_fasta_file,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('tri_nucleotide_context.csv.gz', 'split',
                               extensions=['.yaml']),
            table_name
        )
    )

    workflow.transform(
        name='merge_tables',
        ctx=dict(mem=2, **ctx),
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('tri_nucleotide_context.csv.gz', 'split'),
            mgd.OutputFile(out_file, extensions=['.yaml']))
    )

    return workflow
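
In biowrappers this kind of builder is normally wired into a parent pipeline rather than executed on its own. A hedged sketch of embedding it as a subworkflow (the parent workflow and all file names here are illustrative):

parent = pypeliner.workflow.Workflow()

parent.subworkflow(
    name='tri_nucleotide_annotation',
    func=create_vcf_tric_nucleotide_annotation_workflow,
    args=(
        'ref_genome.fa',
        mgd.InputFile('calls.vcf'),
        mgd.OutputFile('tri_nucleotide_context.csv.gz'),
    ),
    kwargs={'split_size': int(1e4)},
)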
Code Example #3
import os

import destruct.defaultconfig
import destruct.tasks
import pypeliner.managed as mgd
import pypeliner.workflow


def create_destruct_workflow(
    bam_filenames,
    breakpoint_table,
    breakpoint_library_table,
    breakpoint_read_table,
    config,
    ref_data_dir,
    raw_data_dir=None,
):
    # Optionally cache raw reads for quicker rerun
    if raw_data_dir is not None:
        mgd_stats = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_stats.txt'), 'bylibrary')
        mgd_reads_1 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_reads1.fq.gz'),
            'bylibrary')
        mgd_reads_2 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_reads2.fq.gz'),
            'bylibrary')
        mgd_sample_1 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_sample1.fq.gz'),
            'bylibrary')
        mgd_sample_2 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_sample2.fq.gz'),
            'bylibrary')
    else:
        mgd_stats = mgd.TempFile('stats.txt', 'bylibrary')
        mgd_reads_1 = mgd.TempFile('reads1.fq.gz', 'bylibrary')
        mgd_reads_2 = mgd.TempFile('reads2.fq.gz', 'bylibrary')
        mgd_sample_1 = mgd.TempFile('sample1.fq.gz', 'bylibrary')
        mgd_sample_2 = mgd.TempFile('sample2.fq.gz', 'bylibrary')

    config = destruct.defaultconfig.get_config(ref_data_dir, config)

    workflow = pypeliner.workflow.Workflow()

    # Set the library ids

    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(bam_filenames.keys()),
    )

    # Retrieve discordant reads and stats from bam files

    workflow.commandline(
        name='bamdisc',
        axes=('bylibrary', ),
        ctx={
            'io': 1,
            'mem': 8
        },
        args=(
            'destruct_bamdiscordantfastq',
            '-r',
            '-c',
            config['bam_max_soft_clipped'],
            '-f',
            config['bam_max_fragment_length'],
            '-b',
            mgd.InputFile('bam', 'bylibrary', fnames=bam_filenames),
            '-s',
            mgd_stats.as_output(),
            '--fastq1',
            mgd_reads_1.as_output(),
            '--fastq2',
            mgd_reads_2.as_output(),
            '-t',
            mgd.TempSpace('bamdisc.tempspace', 'bylibrary'),
            '-n',
            config['num_read_samples'],
            '--sample1',
            mgd_sample_1.as_output(),
            '--sample2',
            mgd_sample_2.as_output(),
        ),
    )

    # create_destruct_fastq_workflow is defined elsewhere in the same module.
    workflow.subworkflow(
        name='destruct_fastq',
        func=create_destruct_fastq_workflow,
        args=(
            mgd_reads_1.as_input(),
            mgd_reads_2.as_input(),
            mgd_sample_1.as_input(),
            mgd_sample_2.as_input(),
            mgd_stats.as_input(),
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
            mgd.OutputFile(breakpoint_read_table),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_data_dir,
        },
    )

    return workflow
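
A usage sketch for the builder above. bam_filenames maps a library id to a BAM path, which is what the fnames= lookup on the 'bylibrary' axis expects; all paths and the empty config override are placeholders:

workflow = create_destruct_workflow(
    bam_filenames={'tumour': 'tumour.bam', 'normal': 'normal.bam'},
    breakpoint_table='breakpoints.tsv',
    breakpoint_library_table='breakpoint_library.tsv',
    breakpoint_read_table='breakpoint_reads.tsv',
    config={},
    ref_data_dir='/refdata/destruct',
    raw_data_dir='./raw_reads',  # cache discordant reads for faster reruns
)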
Code Example #4
import pypeliner.managed as mgd
import pypeliner.workflow

# default_chromosomes and med_ctx are module-level names defined in the
# surrounding biowrappers file (presumably a default chromosome list and a
# shared job context dict); they are used below as-is.


def create_snv_allele_counts_for_vcf_targets_workflow(
        bam_file,
        vcf_file,
        out_file,
        chromosomes=default_chromosomes,
        count_duplicates=False,
        hdf5_output=True,
        min_bqual=0,
        min_mqual=0,
        split_size=int(1e7),
        table_name='snv_allele_counts',
        vcf_to_bam_chrom_map=None):

    # When HDF5 output is requested, the merged table is written straight to
    # out_file; otherwise it goes to a temp HDF5 and is converted to TSV below.
    if hdf5_output:
        merged_file = mgd.File(out_file)
    else:
        merged_file = mgd.TempFile('merged.h5')

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='get_regions',
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        func='biowrappers.components.variant_calling.utils.get_vcf_regions',
        args=(
            mgd.InputFile(vcf_file),
            split_size,
        ),
        kwargs={
            'chromosomes': chromosomes,
        },
    )

    workflow.transform(
        name='get_snv_allele_counts_for_vcf_targets',
        axes=('regions',),
        ctx=med_ctx,
        func='biowrappers.components.snv_allele_counts.tasks.get_snv_allele_counts_for_vcf_targets',
        args=(
            mgd.InputFile(bam_file),
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('counts.h5', 'regions'),
            table_name
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'region': mgd.TempInputObj('regions_obj', 'regions'),
            'vcf_to_bam_chrom_map': vcf_to_bam_chrom_map,
        }
    )

    workflow.transform(
        name='merge_snv_allele_counts',
        ctx=med_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(
            mgd.TempInputFile('counts.h5', 'regions'),
            merged_file.as_output(),
        ),
        kwargs={
            'in_memory': False,
        }
    )

    if not hdf5_output:
        workflow.transform(
            name='convert_to_tsv',
            ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2},
            func='biowrappers.components.io.hdf5.tasks.convert_hdf5_to_tsv',
            args=(
                merged_file.as_input(),
                table_name,
                mgd.OutputFile(out_file),
            ),
            kwargs={
                'compress': True,
            }
        )

    return workflow
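
This builder follows the same split/apply/merge shape as example #2: the get_regions transform fans out over the 'regions' axis through its TempOutputObj return value, counts are computed per region, and concatenate_tables merges them. An illustrative call (file names are placeholders); with hdf5_output=False the merged HDF5 table is converted to a compressed TSV at out_file:

workflow = create_snv_allele_counts_for_vcf_targets_workflow(
    'sample.bam',
    'calls.vcf',
    'snv_allele_counts.tsv.gz',
    count_duplicates=False,
    hdf5_output=False,
)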