def _create_download_decompress_concat_workflow(urls, out_file, local_download=False):
    workflow = pypeliner.workflow.Workflow()

    local_files = []

    for i, url in enumerate(urls):
        local_files.append(mgd.TempFile('file_{}'.format(i)))

        workflow.setobj(mgd.TempOutputObj('url_{}'.format(i)), value=url)

        # Download and decompress each URL into its own managed temp file
        workflow.subworkflow(
            name='download_file_{}'.format(i),
            func=_create_download_decompress_workflow,
            args=(
                mgd.TempInputObj('url_{}'.format(i)),
                local_files[i].as_output(),
            ),
            kwargs={'local_download': local_download},
        )

    # Concatenate all downloaded files into the final output
    concat_args = ['cat'] + [x.as_input() for x in local_files] + ['>', mgd.OutputFile(out_file)]

    workflow.commandline(name='concat', args=concat_args)

    return workflow
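# Hedged usage sketch, not part of the original source: one way the download/concat
# workflow above might be driven from a script using pypeliner's command line app.
# The example URLs, output path, and argument handling are illustrative assumptions,
# and _create_download_decompress_workflow is assumed to be defined elsewhere in the
# same module.
def _example_run_download_concat_workflow():
    import argparse

    import pypeliner.app

    argparser = argparse.ArgumentParser()
    pypeliner.app.add_arguments(argparser)  # standard pypeliner options (tmpdir, maxjobs, ...)
    args = vars(argparser.parse_args())

    pyp = pypeliner.app.Pypeline(config=args)

    workflow = _create_download_decompress_concat_workflow(
        ['http://example.org/ref.part1.fa.gz', 'http://example.org/ref.part2.fa.gz'],  # illustrative URLs
        'reference.fa',  # illustrative concatenated output
        local_download=True,
    )

    pyp.run(workflow)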
def create_vcf_tric_nucleotide_annotation_workflow(
        ref_genome_fasta_file,
        vcf_file,
        out_file,
        docker_config=None,
        split_size=int(1e4),
        table_name='tri_nucleotide_context'):
    ctx = {'num_retry': 3, 'mem_retry_increment': 2}
    if docker_config:
        ctx.update(docker_config)

    merged_file = mgd.TempFile('merged.csv.gz')

    workflow = pypeliner.workflow.Workflow()

    # Split the input VCF into chunks so annotation can run in parallel over the 'split' axis
    workflow.transform(
        name='split_vcf',
        ctx=dict(mem=2, **ctx),
        func='biowrappers.components.io.vcf.tasks.split_vcf',
        args=(
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('split.vcf', 'split'),
        ),
        kwargs={'lines_per_file': split_size},
    )

    # Annotate each chunk with its trinucleotide context from the reference genome
    workflow.transform(
        name='annotate_db_status',
        axes=('split',),
        ctx=dict(mem=4, **ctx),
        func='biowrappers.components.variant_calling.tri_nucleotide_context.tasks.get_tri_nucelotide_context',
        args=(
            ref_genome_fasta_file,
            mgd.TempInputFile('split.vcf', 'split'),
            mgd.TempOutputFile('tri_nucleotide_context.csv.gz', 'split', extensions=['.yaml']),
            table_name,
        ),
    )

    # Concatenate the per-chunk annotation tables into the final output
    workflow.transform(
        name='merge_tables',
        ctx=dict(mem=2, **ctx),
        func='single_cell.utils.csvutils.concatenate_csv',
        args=(
            mgd.TempInputFile('tri_nucleotide_context.csv.gz', 'split'),
            mgd.OutputFile(out_file, extensions=['.yaml']),
        ),
    )

    return workflow
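# Hedged usage sketch, not part of the original source: the annotation workflow above is
# normally nested inside a larger pypeliner workflow as a subworkflow, with the managed
# files resolved to concrete paths when the subworkflow is constructed. The file names
# below are illustrative placeholders.
def _example_embed_tri_nucleotide_annotation(workflow, ref_genome_fasta_file):
    workflow.subworkflow(
        name='tri_nucleotide_annotation',
        func=create_vcf_tric_nucleotide_annotation_workflow,
        args=(
            ref_genome_fasta_file,
            mgd.InputFile('variants.vcf.gz'),  # illustrative input VCF
            mgd.OutputFile('tri_nucleotide_context.csv.gz', extensions=['.yaml']),  # illustrative output
        ),
        kwargs={'split_size': int(1e4)},
    )
    return workflow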
def create_destruct_workflow(
        bam_filenames,
        breakpoint_table,
        breakpoint_library_table,
        breakpoint_read_table,
        config,
        ref_data_dir,
        raw_data_dir=None,
):
    # Optionally cache raw reads for quicker rerun
    if raw_data_dir is not None:
        mgd_stats = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_stats.txt'), 'bylibrary')
        mgd_reads_1 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_reads1.fq.gz'), 'bylibrary')
        mgd_reads_2 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_reads2.fq.gz'), 'bylibrary')
        mgd_sample_1 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_sample1.fq.gz'), 'bylibrary')
        mgd_sample_2 = mgd.File(
            os.path.join(raw_data_dir, '{bylibrary}_sample2.fq.gz'), 'bylibrary')
    else:
        mgd_stats = mgd.TempFile('stats.txt', 'bylibrary')
        mgd_reads_1 = mgd.TempFile('reads1.fq.gz', 'bylibrary')
        mgd_reads_2 = mgd.TempFile('reads2.fq.gz', 'bylibrary')
        mgd_sample_1 = mgd.TempFile('sample1.fq.gz', 'bylibrary')
        mgd_sample_2 = mgd.TempFile('sample2.fq.gz', 'bylibrary')

    config = destruct.defaultconfig.get_config(ref_data_dir, config)

    workflow = pypeliner.workflow.Workflow()

    # Set the library ids
    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(bam_filenames.keys()),
    )

    # Retrieve discordant reads and stats from bam files
    workflow.commandline(
        name='bamdisc',
        axes=('bylibrary',),
        ctx={'io': 1, 'mem': 8},
        args=(
            'destruct_bamdiscordantfastq',
            '-r',
            '-c', config['bam_max_soft_clipped'],
            '-f', config['bam_max_fragment_length'],
            '-b', mgd.InputFile('bam', 'bylibrary', fnames=bam_filenames),
            '-s', mgd_stats.as_output(),
            '--fastq1', mgd_reads_1.as_output(),
            '--fastq2', mgd_reads_2.as_output(),
            '-t', mgd.TempSpace('bamdisc.tempspace', 'bylibrary'),
            '-n', config['num_read_samples'],
            '--sample1', mgd_sample_1.as_output(),
            '--sample2', mgd_sample_2.as_output(),
        ),
    )

    workflow.subworkflow(
        name='destruct_fastq',
        func=create_destruct_fastq_workflow,
        args=(
            mgd_reads_1.as_input(),
            mgd_reads_2.as_input(),
            mgd_sample_1.as_input(),
            mgd_sample_2.as_input(),
            mgd_stats.as_input(),
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
            mgd.OutputFile(breakpoint_read_table),
            config,
            ref_data_dir,
        ),
        kwargs={
            'raw_data_dir': raw_data_dir,
        },
    )

    return workflow
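# Hedged usage sketch, not part of the original source: create_destruct_workflow expects
# bam_filenames to be a dict keyed by library id (the 'bylibrary' axis is built from its
# keys). The paths, library names, and empty config override dict are illustrative.
def _example_build_destruct_workflow():
    bam_filenames = {
        'tumour': '/data/tumour.bam',  # illustrative library id -> bam path
        'normal': '/data/normal.bam',
    }

    return create_destruct_workflow(
        bam_filenames,
        'breakpoint_table.tsv',
        'breakpoint_library_table.tsv',
        'breakpoint_read_table.tsv',
        {},  # overrides merged into destruct's default config
        '/refdata/destruct',  # illustrative reference data directory
        raw_data_dir='/data/destruct_raw',  # optional cache of discordant reads for reruns
    )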
def create_snv_allele_counts_for_vcf_targets_workflow(
        bam_file,
        vcf_file,
        out_file,
        chromosomes=default_chromosomes,
        count_duplicates=False,
        hdf5_output=True,
        min_bqual=0,
        min_mqual=0,
        split_size=int(1e7),
        table_name='snv_allele_counts',
        vcf_to_bam_chrom_map=None):
    # Write the merged counts directly to out_file as HDF5, or to a temp HDF5 that is
    # converted to TSV at the end
    if hdf5_output:
        merged_file = mgd.File(out_file)
    else:
        merged_file = mgd.TempFile('merged.h5')

    workflow = pypeliner.workflow.Workflow()

    # Split the VCF targets into genomic regions to parallelise counting
    workflow.transform(
        name='get_regions',
        ret=mgd.TempOutputObj('regions_obj', 'regions'),
        func='biowrappers.components.variant_calling.utils.get_vcf_regions',
        args=(
            mgd.InputFile(vcf_file),
            split_size,
        ),
        kwargs={
            'chromosomes': chromosomes,
        },
    )

    # Count allele support in the BAM at the VCF target positions within each region
    workflow.transform(
        name='get_snv_allele_counts_for_vcf_targets',
        axes=('regions',),
        ctx=med_ctx,
        func='biowrappers.components.snv_allele_counts.tasks.get_snv_allele_counts_for_vcf_targets',
        args=(
            mgd.InputFile(bam_file),
            mgd.InputFile(vcf_file),
            mgd.TempOutputFile('counts.h5', 'regions'),
            table_name,
        ),
        kwargs={
            'count_duplicates': count_duplicates,
            'min_bqual': min_bqual,
            'min_mqual': min_mqual,
            'region': mgd.TempInputObj('regions_obj', 'regions'),
            'vcf_to_bam_chrom_map': vcf_to_bam_chrom_map,
        },
    )

    # Merge the per-region count tables
    workflow.transform(
        name='merge_snv_allele_counts',
        ctx=med_ctx,
        func='biowrappers.components.io.hdf5.tasks.concatenate_tables',
        args=(
            mgd.TempInputFile('counts.h5', 'regions'),
            merged_file.as_output(),
        ),
        kwargs={
            'in_memory': False,
        },
    )

    # If HDF5 output was not requested, convert the merged table to compressed TSV
    if not hdf5_output:
        workflow.transform(
            name='convert_to_tsv',
            ctx={'mem': 2, 'num_retry': 3, 'mem_retry_increment': 2},
            func='biowrappers.components.io.hdf5.tasks.convert_hdf5_to_tsv',
            args=(
                merged_file.as_input(),
                table_name,
                mgd.OutputFile(out_file),
            ),
            kwargs={
                'compress': True,
            },
        )

    return workflow
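# Hedged usage sketch, not part of the original source: requests compressed TSV output
# instead of HDF5 and remaps VCF chromosome names ('1') to BAM names ('chr1'). Paths and
# the chromosome naming scheme are illustrative; default_chromosomes and med_ctx are
# module-level constants assumed to be defined alongside the original function.
def _example_build_snv_allele_counts_workflow():
    return create_snv_allele_counts_for_vcf_targets_workflow(
        '/data/tumour.bam',           # illustrative BAM
        '/data/somatic_snvs.vcf.gz',  # illustrative VCF of target positions
        'snv_allele_counts.tsv.gz',   # final output, converted from HDF5 since hdf5_output=False
        hdf5_output=False,
        min_bqual=10,
        min_mqual=10,
        vcf_to_bam_chrom_map={str(c): 'chr{}'.format(c) for c in list(range(1, 23)) + ['X', 'Y']},
    )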