def align_pe(fastq1, fastq2, output, reports, metrics, tempdir, reference,
             instrument, centre, sample_info, cell_id, lane_id, library_id,
             config):
    readgroup = get_readgroup(lane_id, cell_id, library_id, centre,
                              sample_info)

    run_fastqc(fastq1, fastq2, reports, tempdir, config)

    aln_temp = os.path.join(tempdir, "temp_alignments.bam")

    if config["aligner"] == "bwa-mem":
        bwa_mem_paired_end(fastq1, fastq2, aln_temp, reference, readgroup,
                           tempdir, config['containers'])
    elif config["aligner"] == "bwa-aln":
        if instrument != "N550":
            fastq1, fastq2 = trim_fastqs(fastq1, fastq2, cell_id, tempdir,
                                         config)
        bwa_aln_paired_end(fastq1, fastq2, aln_temp, tempdir, reference,
                           readgroup, config['containers'])
    else:
        raise Exception(
            "Aligner %s not supported, pipeline supports bwa-aln and bwa-mem" %
            config["aligner"])

    container_ctx = helpers.get_container_ctx(config['containers'], 'picard',
                                              docker_only=True)
    picardutils.bam_sort(aln_temp, output, tempdir, **container_ctx)

    container_ctx = helpers.get_container_ctx(config['containers'], 'samtools',
                                              docker_only=True)
    bamutils.bam_flagstat(output, metrics, **container_ctx)
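# Illustrative config fragment consumed by align_pe (keys inferred from the
# lookups above; values are placeholders, not pipeline defaults):
#   config = {
#       'aligner': 'bwa-mem',   # or 'bwa-aln'
#       'containers': {...},    # per-tool container contexts (bwa, picard, samtools, ...)
#   }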
def merge_bams(inputs, output, output_index, config):
    container_ctx = helpers.get_container_ctx(config['containers'], 'picard',
                                              docker_only=True)
    picardutils.merge_bams(inputs, output, **container_ctx)

    container_ctx = helpers.get_container_ctx(config['containers'], 'samtools',
                                              docker_only=True)
    bamutils.bam_index(output, output_index, **container_ctx)
def bwa_aln_paired_end(fastq1, fastq2, output, tempdir, reference, readgroup,
                       config):
    container_ctx = helpers.get_container_ctx(config, 'bwa', docker_only=True)
    samfile = os.path.join(tempdir, "bwa_aln.sam")
    bamutils.bwa_aln_paired_end(fastq1, fastq2, samfile, tempdir, reference,
                                readgroup, **container_ctx)

    container_ctx = helpers.get_container_ctx(config, 'samtools',
                                              docker_only=True)
    bamutils.samtools_sam_to_bam(samfile, output, **container_ctx)
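# For orientation: the two bamutils calls above conceptually correspond to the
# standard paired-end bwa-aln idiom (illustrative only; the exact commands are
# assembled inside bamutils):
#   bwa aln ref.fa reads_1.fq > reads_1.sai
#   bwa aln ref.fa reads_2.fq > reads_2.sai
#   bwa sampe -r '<readgroup>' ref.fa reads_1.sai reads_2.sai reads_1.fq reads_2.fq > aln.sam
#   samtools view -bSh aln.sam > aln.bam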
def create_variant_counting_workflow(
        vcfs,
        tumour_cell_bams,
        results_h5,
        config,
):
    """ Count variant reads for multiple sets of variants across cells. """

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=tumour_cell_bams.keys(),
    )

    workflow.transform(
        name='merge_snvs',
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        args=(
            [mgd.InputFile(vcf) for vcf in vcfs],
            mgd.TempOutputFile('all.snv.vcf')
        )
    )

    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        args=(
            mgd.TempInputFile('all.snv.vcf'),
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi'])
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(
                config['containers'], 'vcftools')
        }
    )

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            config,
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai'],
                          fnames=tumour_cell_bams),
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.OutputFile(results_h5),
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(
                config['containers'], 'single_cell_pipeline')
        },
    )

    return workflow
def run_hmmcopy_script(corrected_reads, tempdir, cell_id, hmmparams, config):
    container_ctx = helpers.get_container_ctx(config['containers'], 'hmmcopy',
                                              docker_only=True)

    if container_ctx.get("container_type") == 'docker':
        cmd = ["hmmcopy"]
    else:
        cmd = ['Rscript', run_hmmcopy_rscript]

    # run hmmcopy
    cmd += [
        '--corrected_data=' + corrected_reads,
        '--outdir=' + tempdir,
        '--sample_id=' + cell_id
    ]

    multipliers = ','.join(map(str, hmmparams['multipliers']))

    cmd.append('--param_str=' + str(hmmparams['strength']))
    cmd.append('--param_e=' + str(hmmparams['e']))
    cmd.append('--param_mu=' + str(hmmparams['mu']))
    cmd.append('--param_l=' + str(hmmparams['lambda']))
    cmd.append('--param_nu=' + str(hmmparams['nu']))
    cmd.append('--param_k=' + str(hmmparams['kappa']))
    cmd.append('--param_m=' + str(hmmparams['m']))
    cmd.append('--param_eta=' + str(hmmparams['eta']))
    cmd.append('--param_g=' + str(hmmparams['g']))
    cmd.append('--param_s=' + str(hmmparams['s']))
    cmd.append('--param_multiplier=' + multipliers)

    pypeliner.commandline.execute(*cmd, **container_ctx)
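# Example of the assembled command in docker mode (illustrative values only),
# e.g. for hmmparams = {'multipliers': [1, 2, 3], 'strength': 1000, ...}:
#   hmmcopy --corrected_data=/tmp/corrected.csv --outdir=/tmp/hmm \
#       --sample_id=SA501-R03-C03 --param_str=1000 ... --param_multiplier=1,2,3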
def realign(input_bams, input_bais, output_bams, tempdir, config, interval):
    container_ctx = helpers.get_container_ctx(config['containers'], 'samtools',
                                              docker_only=True)

    # make the dir
    if not os.path.exists(tempdir):
        os.makedirs(tempdir)

    # copy inputs to tempdir; the inputs can share a filename, but the copies
    # must have distinct names for mapping file nwayout to work
    new_inputs = {}
    for key, bamfile in input_bams.items():
        new_bam = os.path.join(tempdir, key + '.bam')
        new_bai = os.path.join(tempdir, key + '.bam.bai')

        shutil.copy(bamfile, new_bam)
        shutil.copy(bamfile + '.bai', new_bai)

        new_inputs[key] = new_bam

    # save intervals file in tempdir
    targets = os.path.join(tempdir, 'realn_positions.intervals')
    gatkutils.generate_targets(input_bams, config, targets, interval,
                               **container_ctx)

    # run gatk realigner
    gatkutils.gatk_realigner(new_inputs, config, targets, interval, tempdir,
                             **container_ctx)

    # move generated files in temp dir to the specified output paths
    for key in input_bams.keys():
        realigned_bam = os.path.join(tempdir, key + '_indel_realigned.bam')
        realigned_bai = os.path.join(tempdir, key + '_indel_realigned.bai')

        output_bam_filename = output_bams[key]
        output_bai_filename = output_bam_filename + '.bai'

        shutil.move(realigned_bam, output_bam_filename)
        shutil.move(realigned_bai, output_bai_filename)
def run_fastqc(fastq1, fastq2, reports, tempdir, config):
    """
    run fastqc on both fastq files and bundle the
    reports into a tarfile; empty fastqs are skipped with a warning.
    """
    container_ctx = helpers.get_container_ctx(config['containers'], 'fastqc',
                                              docker_only=True)

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    out_html = os.path.join(reports_dir, 'fastqc_R1.html')
    out_plot = os.path.join(reports_dir, 'fastqc_R1.zip')
    if not os.path.getsize(fastq1) == 0:
        bamutils.produce_fastqc_report(fastq1, out_html, out_plot, tempdir,
                                       **container_ctx)
    else:
        warnings.warn("fastq file %s is empty, skipping fastqc" % fastq1)

    out_html = os.path.join(reports_dir, 'fastqc_R2.html')
    out_plot = os.path.join(reports_dir, 'fastqc_R2.zip')
    if not os.path.getsize(fastq2) == 0:
        bamutils.produce_fastqc_report(fastq2, out_html, out_plot, tempdir,
                                       **container_ctx)
    else:
        warnings.warn("fastq file %s is empty, skipping fastqc" % fastq2)

    helpers.make_tarfile(reports, reports_dir)
def create_museq_workflow(normal_bam, tumour_bam, ref_genome, snv_vcf,
                          config):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=normal_bam.keys(),
    )

    workflow.transform(
        name='run_museq',
        ctx=dict(mem=config["memory"]['med'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        axes=('region',),
        func='single_cell.workflows.mutationseq.tasks.run_museq',
        args=(
            mgd.InputFile('merged_bam', 'region', fnames=tumour_bam),
            mgd.InputFile('normal.split.bam', 'region', fnames=normal_bam),
            mgd.TempOutputFile('museq.vcf', 'region'),
            mgd.TempOutputFile('museq.log', 'region'),
            mgd.InputInstance('region'),
            config,
        ),
        kwargs={
            'docker_kwargs': helpers.get_container_ctx(
                config['containers'], 'mutationseq')
        }
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=config["memory"]['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func='biowrappers.components.io.vcf.tasks.concatenate_vcf',
        args=(
            mgd.TempInputFile('museq.vcf', 'region'),
            mgd.OutputFile(snv_vcf),
        ),
    )

    return workflow
def bam_collect_gc_metrics(bam_filename, ref_genome, metrics_filename,
                           summary_filename, chart_filename, tempdir, config):
    container_ctx = helpers.get_container_ctx(config['containers'], 'picard',
                                              docker_only=True)
    picardutils.bam_collect_gc_metrics(bam_filename, ref_genome,
                                       metrics_filename, summary_filename,
                                       chart_filename, tempdir,
                                       **container_ctx)
def create_merge_bams_workflow(input_bams, merged_bams, cell_ids, config,
                               regions):
    merged_bams = dict([(region, merged_bams[region]) for region in regions])

    ctx = {'mem_retry_increment': 2}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )
    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='merge_bams',
        ctx=dict(mem=config['memory']['high'],
                 pool_id=config['pools']['multicore'],
                 ncpus=config['max_cores'],
                 **ctx),
        func="single_cell.workflows.merge_bams.tasks.merge_bams",
        args=(
            mgd.InputFile('bam', 'cell_id', fnames=input_bams),
            mgd.OutputFile('merged.bam', "region", fnames=merged_bams,
                           axes_origin=[]),
            regions,
            helpers.get_container_ctx(config['containers'], 'samtools')
        ),
        kwargs={"ncores": config["max_cores"]}
    )

    return workflow
def postprocess_bam(infile, outfile, outfile_index, tempdir, config,
                    markdups_metrics, flagstat_metrics):
    if not os.path.exists(tempdir):
        helpers.makedirs(tempdir)

    container_ctx = helpers.get_container_ctx(config['containers'], 'picard',
                                              docker_only=True)

    sorted_bam = os.path.join(tempdir, 'sorted.bam')
    picardutils.bam_sort(infile, sorted_bam, tempdir, **container_ctx)

    picardutils.bam_markdups(sorted_bam, outfile, markdups_metrics, tempdir,
                             **container_ctx)

    container_ctx = helpers.get_container_ctx(config['containers'], 'samtools',
                                              docker_only=True)
    bamutils.bam_index(outfile, outfile_index, **container_ctx)
    bamutils.bam_flagstat(outfile, flagstat_metrics, **container_ctx)
def get_postprocess_metrics(infile, infile_bai, tempdir, config,
                            markdups_metrics, flagstat_metrics):
    if not os.path.exists(tempdir):
        helpers.makedirs(tempdir)

    outfile = os.path.join(tempdir, 'markdups.bam')
    outfile_index = outfile + '.bai'

    container_ctx = helpers.get_container_ctx(config['containers'], 'picard',
                                              docker_only=True)
    picardutils.bam_markdups(infile, outfile, markdups_metrics, tempdir,
                             **container_ctx)

    container_ctx = helpers.get_container_ctx(config['containers'], 'samtools',
                                              docker_only=True)
    bamutils.bam_index(outfile, outfile_index, **container_ctx)
    bamutils.bam_flagstat(outfile, flagstat_metrics, **container_ctx)
def bam_collect_insert_metrics(bam_filename, flagstat_metrics_filename,
                               metrics_filename, histogram_filename, tempdir,
                               config):
    container_ctx = helpers.get_container_ctx(config['containers'], 'picard',
                                              docker_only=True)
    picardutils.bam_collect_insert_metrics(bam_filename,
                                           flagstat_metrics_filename,
                                           metrics_filename,
                                           histogram_filename, tempdir,
                                           **container_ctx)
def variant_calling_workflow(args):
    config = helpers.load_config(args)

    ctx = {'num_retry': 3, 'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    meta_yaml = os.path.join(args['out_dir'], 'info.yaml')

    bam_files, bai_files = helpers.get_bams(args['input_yaml'])
    cellids = helpers.get_samples(args['input_yaml'])

    varcalls_dir = os.path.join(args['out_dir'], 'results', 'variant_calling')
    museq_vcf = os.path.join(varcalls_dir, 'museq_snv.vcf.gz')
    strelka_snv_vcf = os.path.join(varcalls_dir, 'strelka_snv.vcf.gz')
    strelka_indel_vcf = os.path.join(varcalls_dir, 'strelka_indel.vcf.gz')
    snv_h5 = os.path.join(varcalls_dir, 'snv_annotations.h5')
    raw_data_dir = os.path.join(varcalls_dir, 'raw')

    wgs_bam_template = args["tumour_template"]
    normal_bam_template = args["normal_template"]

    regions = refgenome.get_split_regions(config["split_size"])

    tumour_region_bams = {
        r: wgs_bam_template.format(region=r) for r in regions
    }
    normal_region_bams = {
        r: normal_bam_template.format(region=r) for r in regions
    }

    return create_variant_calling_workflow(
        bam_files,
        tumour_region_bams,
        normal_region_bams,
        museq_vcf,
        strelka_snv_vcf,
        strelka_indel_vcf,
        snv_h5,
        config,
        raw_data_dir,
    )
def create_extract_seqdata_workflow(
        bam_filename,
        seqdata_filename,
        remixt_config,
        remixt_ref_data_dir,
        config,
        multiprocess=False,
):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.transform(
        name='create_chromosome_seqdata',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="single_cell.workflows.extract_seqdata.tasks.create_chromosome_seqdata",
        args=(
            mgd.TempOutputFile('seqdata', 'chromosome'),
            mgd.InputFile(bam_filename),
            remixt_config,
            remixt_ref_data_dir,
        ),
        kwargs={
            'multiprocess': multiprocess,
            'ncores': config['max_cores']
        }
    )

    workflow.transform(
        name='merge_seqdata',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="remixt.seqdataio.merge_seqdata",
        args=(
            mgd.OutputFile(seqdata_filename),
            mgd.TempInputFile('seqdata', 'chromosome'),
        ),
    )

    return workflow
def run_correction_hmmcopy(bam_file, correct_reads_out, readcount_wig, config,
                           hmmparams):
    container_ctx = helpers.get_container_ctx(config['containers'], 'hmmcopy',
                                              docker_only=True)

    run_readcount_rscript = os.path.join(scripts_directory,
                                         'correct_read_count.R')

    rc = ReadCounter(bam_file, readcount_wig, hmmparams['bin_size'],
                     config['chromosomes'], hmmparams['min_mqual'],
                     excluded=hmmparams['exclude_list'])
    rc.main()

    if hmmparams["smoothing_function"] == 'loess':
        cmd = [
            'Rscript', run_readcount_rscript,
            readcount_wig,
            hmmparams['gc_wig_file'],
            hmmparams['map_wig_file'],
            correct_reads_out
        ]
        pypeliner.commandline.execute(*cmd, **container_ctx)
    elif hmmparams["smoothing_function"] == 'modal':
        CorrectReadCount(hmmparams["gc_wig_file"],
                         hmmparams['map_wig_file'],
                         readcount_wig,
                         correct_reads_out,
                         mappability=hmmparams['map_cutoff']).main()
    else:
        raise Exception(
            "smoothing function %s not supported. pipeline supports loess and modal" %
            hmmparams["smoothing_function"])

    return correct_reads_out
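# Illustrative hmmparams fragment consumed above (keys inferred from usage;
# values are placeholders, not pipeline defaults):
#   hmmparams = {
#       'bin_size': 500000,
#       'min_mqual': 20,
#       'exclude_list': None,
#       'smoothing_function': 'modal',   # or 'loess'
#       'gc_wig_file': 'GRCh37.gc.wig',
#       'map_wig_file': 'GRCh37.map.wig',
#       'map_cutoff': 0.9,
#   }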
def merge_bams_workflow(workflow, args):
    input_yaml = args["input_yaml"]
    output_template = args["merged_bam_template"]

    info_file = os.path.join(args["out_dir"], 'results', 'merge_bams',
                             "info.yaml")

    config = helpers.load_config(args)

    bam_files, bai_files = helpers.get_bams(input_yaml)
    cellids = helpers.get_samples(input_yaml)

    wgs_bam_template = output_template
    wgs_bai_template = wgs_bam_template + ".bai"

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'],
                                  'single_cell_pipeline'))

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cellids,
    )

    workflow.transform(
        name="get_regions",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.TempOutputObj('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        )
    )

    workflow.subworkflow(
        name="wgs_merge_workflow",
        func=merge_bams.create_merge_bams_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files,
                          extensions=['.bai']),
            mgd.OutputFile("merged_bam", "region", axes_origin=[],
                           template=wgs_bam_template, extensions=['.bai']),
            cellids,
            config,
            mgd.TempInputObj("region"),
        )
    )

    workflow.transform(
        name="get_files",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func='single_cell.utils.helpers.resolve_template',
        ret=pypeliner.managed.TempOutputObj('outputs'),
        args=(
            pypeliner.managed.TempInputObj('region'),
            wgs_bam_template,
            'region'
        )
    )

    inputs = {k: helpers.format_file_yaml(v) for k, v in bam_files.iteritems()}

    metadata = {
        'merge_bams': {
            'name': 'merge_bams',
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': pypeliner.managed.TempInputObj('outputs'),
            'input_datasets': inputs,
            'results': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(
            mgd.OutputFile(info_file),
            metadata
        )
    )

    return workflow
def germline_calling_workflow(workflow, args):
    config = helpers.load_config(args)

    ctx = {
        'mem_retry_increment': 2,
        'ncpus': 1,
        'mem': config["memory"]['low'],
        'pool_id': config['pools']['standard'],
    }
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    bam_files, bai_files = helpers.get_bams(args['input_yaml'])
    sampleids = helpers.get_samples(args['input_yaml'])

    normal_bam_template = args["input_template"]
    normal_bai_template = args["input_template"] + ".bai"

    if "{reads}" in normal_bam_template:
        raise ValueError(
            "input template for germline calling only supports region based splits"
        )

    varcalls_dir = os.path.join(args['out_dir'], 'results',
                                'germline_calling')
    samtools_germline_vcf = os.path.join(varcalls_dir, 'raw',
                                         'samtools_germline.vcf.gz')
    snpeff_vcf_filename = os.path.join(varcalls_dir, 'snpeff.vcf')
    normal_genotype_filename = os.path.join(varcalls_dir, 'raw',
                                            'normal_genotype.h5')
    mappability_filename = os.path.join(varcalls_dir, 'raw', 'mappability.h5')
    counts_template = os.path.join(varcalls_dir, 'counts', 'raw', 'counts.h5')
    germline_h5_filename = os.path.join(varcalls_dir, 'germline.h5')

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=bam_files.keys(),
    )

    workflow.transform(
        name="get_regions",
        ctx=ctx,
        func="single_cell.utils.pysamutils.get_regions_from_reference",
        ret=pypeliner.managed.OutputChunks('region'),
        args=(
            config["ref_genome"],
            config["split_size"],
            config["chromosomes"],
        )
    )

    workflow.subworkflow(
        name='samtools_germline',
        func=germline.create_samtools_germline_workflow,
        args=(
            mgd.InputFile("normal.split.bam", "region",
                          template=normal_bam_template),
            mgd.InputFile("normal.split.bam.bai", "region",
                          template=normal_bai_template),
            config['ref_genome'],
            mgd.OutputFile(samtools_germline_vcf, extensions=['.tbi']),
            config,
        ),
        kwargs={
            'chromosomes': config["chromosomes"],
            'base_docker': helpers.get_container_ctx(
                config['containers'], 'single_cell_pipeline'),
            'vcftools_docker': helpers.get_container_ctx(
                config['containers'], 'vcftools'),
            'samtools_docker': helpers.get_container_ctx(
                config['containers'], 'samtools'),
        }
    )

    workflow.subworkflow(
        name='annotate_mappability',
        func="biowrappers.components.variant_calling.mappability.create_vcf_mappability_annotation_workflow",
        args=(
            config['databases']['mappability']['local_path'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(mappability_filename),
        ),
        kwargs={
            'base_docker': helpers.get_container_ctx(
                config['containers'], 'single_cell_pipeline')
        }
    )

    workflow.transform(
        name='annotate_genotype',
        func="single_cell.workflows.germline.tasks.annotate_normal_genotype",
        ctx=ctx,
        args=(
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(normal_genotype_filename),
            config["chromosomes"],
        ),
    )

    workflow.subworkflow(
        name='snpeff',
        func="biowrappers.components.variant_calling.snpeff.create_snpeff_annotation_workflow",
        args=(
            config['databases']['snpeff']['db'],
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(snpeff_vcf_filename),
        ),
        kwargs={
            'hdf5_output': False,
            'base_docker': helpers.get_container_ctx(
                config['containers'], 'single_cell_pipeline'),
            'vcftools_docker': helpers.get_container_ctx(
                config['containers'], 'vcftools'),
            'snpeff_docker': helpers.get_container_ctx(
                config['containers'], 'snpeff'),
        }
    )

    workflow.subworkflow(
        name='read_counts',
        func="single_cell.variant_calling.create_snv_allele_counts_for_vcf_targets_workflow",
        args=(
            config,
            mgd.InputFile('tumour.bam', 'cell_id', fnames=bam_files),
            mgd.InputFile('tumour.bam.bai', 'cell_id', fnames=bai_files),
            mgd.InputFile(samtools_germline_vcf, extensions=['.tbi']),
            mgd.OutputFile(counts_template),
        ),
        kwargs={
            'table_name': '/germline_allele_counts',
            'docker_config': helpers.get_container_ctx(
                config['containers'], 'single_cell_pipeline')
        },
    )

    workflow.transform(
        name='build_results_file',
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        ctx=ctx,
        args=(
            [
                mgd.InputFile(counts_template),
                mgd.InputFile(mappability_filename),
                mgd.InputFile(normal_genotype_filename),
            ],
            pypeliner.managed.OutputFile(germline_h5_filename),
        ),
        kwargs={
            'drop_duplicates': True,
        }
    )

    info_file = os.path.join(args["out_dir"], 'results', 'germline_calling',
                             "info.yaml")

    results = {
        'germline_data': helpers.format_file_yaml(germline_h5_filename),
    }

    input_datasets = {
        k: helpers.format_file_yaml(v) for k, v in bam_files.iteritems()
    }

    metadata = {
        'germline_calling': {
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 mem_retry_increment=2,
                 ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(
            mgd.OutputFile(info_file),
            metadata
        )
    )

    return workflow
def create_variant_calling_workflow(
        tumour_cell_bams,
        tumour_region_bams,
        normal_region_bams,
        museq_vcf,
        strelka_snv_vcf,
        strelka_indel_vcf,
        snv_h5,
        config,
        raw_data_dir,
):
    # shared job context, defined here as in the sibling workflow builders
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'],
                                  'single_cell_pipeline'))

    workflow = pypeliner.workflow.Workflow()

    workflow.set_filenames('normal_regions.bam', 'region',
                           fnames=normal_region_bams)
    workflow.set_filenames('tumour_cells.bam', 'cell_id',
                           fnames=tumour_cell_bams)
    workflow.set_filenames('tumour_regions.bam', 'region',
                           fnames=tumour_region_bams)

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=tumour_cell_bams.keys(),
    )
    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=tumour_region_bams.keys(),
    )

    workflow.subworkflow(
        name='museq',
        func=mutationseq.create_museq_workflow,
        args=(
            mgd.InputFile('normal_regions.bam', 'region', extensions=['.bai']),
            mgd.InputFile('tumour_regions.bam', 'region', extensions=['.bai']),
            config['ref_genome'],
            mgd.OutputFile(museq_vcf),
            config,
        ),
    )

    workflow.subworkflow(
        name='strelka',
        func=strelka.create_strelka_workflow,
        args=(
            mgd.InputFile('normal_regions.bam', 'region', extensions=['.bai']),
            mgd.InputFile('tumour_regions.bam', 'region', extensions=['.bai']),
            config['ref_genome'],
            mgd.OutputFile(strelka_indel_vcf),
            mgd.OutputFile(strelka_snv_vcf),
            config,
        ),
        kwargs={"chromosomes": config["chromosomes"]}
    )

    workflow.transform(
        name='convert_museq_to_hdf5',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        args=(
            mgd.InputFile(museq_vcf),
            mgd.TempOutputFile('museq.h5'),
            '/museq/vcf/',
        ),
        kwargs={
            'score_callback': museq_callback,
        }
    )

    workflow.transform(
        name='convert_strelka_to_hdf5',
        func="biowrappers.components.io.vcf.tasks.convert_vcf_to_hdf5",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        args=(
            mgd.InputFile(strelka_snv_vcf),
            mgd.TempOutputFile('strelka_snv.h5'),
            '/strelka/vcf/',
        ),
        kwargs={
            'score_callback': strelka_snv_callback,
        }
    )

    workflow.transform(
        name='merge_snvs',
        func='biowrappers.components.io.vcf.tasks.merge_vcfs',
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        args=(
            [
                mgd.InputFile(museq_vcf),
                mgd.InputFile(strelka_snv_vcf),
            ],
            mgd.TempOutputFile('all.snv.vcf')
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(
                config['containers'], 'vcftools')
        }
    )

    workflow.transform(
        name='finalise_snvs',
        func="biowrappers.components.io.vcf.tasks.finalise_vcf",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        args=(
            mgd.TempInputFile('all.snv.vcf'),
            mgd.TempOutputFile('all.snv.vcf.gz', extensions=['.tbi'])
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(
                config['containers'], 'vcftools')
        }
    )

    workflow.subworkflow(
        name='annotate_snvs',
        axes=(),
        func="biowrappers.pipelines.snv_call_and_annotate.create_annotation_workflow",
        args=(
            config,
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.TempOutputFile('snv_annotations.h5'),
            os.path.join(raw_data_dir, 'snv'),
        ),
        kwargs={
            'variant_type': 'snv',
            'docker_config': helpers.get_container_ctx(
                config['containers'], 'single_cell_pipeline')
        }
    )

    workflow.subworkflow(
        name='count_alleles',
        func=create_snv_allele_counts_for_vcf_targets_workflow,
        args=(
            config,
            mgd.InputFile('tumour_cells.bam', 'cell_id', extensions=['.bai']),
            mgd.TempInputFile('all.snv.vcf.gz'),
            mgd.TempOutputFile('snv_counts.h5'),
        ),
        kwargs={
            'chromosomes': config['chromosomes'],
            'docker_config': helpers.get_container_ctx(
                config['containers'], 'single_cell_pipeline')
        }
    )

    workflow.transform(
        name='build_results_file',
        ctx=dict(mem=config['memory']['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="biowrappers.components.io.hdf5.tasks.concatenate_tables",
        args=(
            [
                mgd.TempInputFile('snv_counts.h5'),
                mgd.TempInputFile('snv_annotations.h5'),
                mgd.TempInputFile('museq.h5'),
                mgd.TempInputFile('strelka_snv.h5'),
            ],
            pypeliner.managed.OutputFile(snv_h5),
        ),
        kwargs={
            'drop_duplicates': True,
            'in_memory': False,
        }
    )

    # `args` is not available in this function; the results dir is the parent
    # of raw_data_dir, which callers set to <out_dir>/results/variant_calling/raw
    info_file = os.path.join(os.path.dirname(raw_data_dir), "info.yaml")

    normals = {
        k: helpers.format_file_yaml(v)
        for k, v in normal_region_bams.iteritems()
    }
    tumours = {
        k: helpers.format_file_yaml(v)
        for k, v in tumour_region_bams.iteritems()
    }
    cells = {
        k: helpers.format_file_yaml(v)
        for k, v in tumour_cell_bams.iteritems()
    }
    inputs = {'normal': normals, 'tumour': tumours, 'cells': cells}

    metadata = {
        'variant_calling': {
            'name': 'variant_calling',
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': None,
            'input_datasets': inputs,
            'results': {
                'variant_calling_data': helpers.format_file_yaml(snv_h5)
            }
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard']),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(
            mgd.OutputFile(info_file),
            metadata
        )
    )

    return workflow
def align_workflow(workflow, args):
    config = helpers.load_config(args)
    sampleinfo = helpers.get_sample_info(args['input_yaml'])
    cellids = helpers.get_samples(args['input_yaml'])
    bam_files, bai_files = helpers.get_bams(args['input_yaml'])

    lib = args["library_id"]

    outdir = os.path.join(args["out_dir"], "results", "alignment")

    info_file = os.path.join(outdir, "info.yaml")

    alignment_metrics_h5 = os.path.join(
        outdir, '{}_alignment_metrics.h5'.format(lib))

    plots_dir = os.path.join(outdir, 'plots')
    plot_metrics_output = os.path.join(
        plots_dir, '{}_plot_metrics.pdf'.format(lib))

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    ctx.update(
        helpers.get_container_ctx(config['containers'],
                                  'single_cell_pipeline'))

    if not args["metrics_only"]:
        fastq1_files, fastq2_files = helpers.get_fastqs(args['input_yaml'])
        instrumentinfo = helpers.get_instrument_info(args['input_yaml'])
        centerinfo = helpers.get_center_info(args['input_yaml'])

        workflow.setobj(
            obj=mgd.OutputChunks('cell_id', 'lane'),
            value=fastq1_files.keys(),
        )

        workflow.subworkflow(
            name='alignment_workflow',
            func=align.create_alignment_workflow,
            args=(
                mgd.InputFile('fastq_1', 'cell_id', 'lane',
                              fnames=fastq1_files, axes_origin=[]),
                mgd.InputFile('fastq_2', 'cell_id', 'lane',
                              fnames=fastq2_files, axes_origin=[]),
                mgd.OutputFile('bam_markdups', 'cell_id', fnames=bam_files,
                               axes_origin=[]),
                mgd.OutputFile('bai_markdups', 'cell_id', fnames=bai_files,
                               axes_origin=[]),
                config['ref_genome'],
                config,
                args,
                instrumentinfo,
                centerinfo,
                sampleinfo,
                cellids,
            ),
        )
    else:
        workflow.setobj(
            obj=mgd.OutputChunks('cell_id'),
            value=cellids,
        )

    workflow.subworkflow(
        name='metrics_workflow',
        func=alignment_metrics.create_alignment_metrics_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_files,
                          axes_origin=[]),
            mgd.InputFile('bai_markdups', 'cell_id', fnames=bai_files,
                          axes_origin=[]),
            mgd.OutputFile(alignment_metrics_h5),
            mgd.OutputFile(plot_metrics_output),
            config['ref_genome'],
            config,
            args,
            sampleinfo,
            cellids,
        ),
    )

    inputs = helpers.get_fastq_files(args["input_yaml"])
    outputs = {
        k: helpers.format_file_yaml(v) for k, v in bam_files.iteritems()
    }

    metadata = {
        'alignment': {
            'name': 'alignment',
            'cell_batch_realign': args["realign"],
            'metrics_table': '/alignment/metrics',
            'gc_metrics_table': '/alignment/gc_metrics',
            'aligner': config["aligner"],
            'adapter': config["adapter"],
            'adapter2': config["adapter2"],
            'picardtools_wgsmetrics_params': config['picard_wgs_params'],
            'ref_genome': config["ref_genome"],
            'version': single_cell.__version__,
            'containers': config['containers'],
            'output_datasets': outputs,
            'input_datasets': inputs,
            'results': {
                'alignment_metrics':
                    helpers.format_file_yaml(alignment_metrics_h5),
                'alignment_plots':
                    helpers.format_file_yaml(plot_metrics_output),
            },
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(
            mgd.OutputFile(info_file),
            metadata
        )
    )

    return workflow
def create_titan_workflow(normal_seqdata, tumour_seqdata, ref_genome,
                          raw_data_dir, out_file, config, args, tumour_cells,
                          normal_cells, cloneid):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    results_files = os.path.join(raw_data_dir, 'results', 'sample.h5')
    tumour_alleles_file = os.path.join(raw_data_dir, 'results',
                                       'het_counts.h5')

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=tumour_cells,
    )
    workflow.setobj(
        obj=mgd.OutputChunks('normal_cell_id'),
        value=normal_cells,
    )

    workflow.transform(
        name='merge_all_normal_seqdata',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="single_cell.workflows.titan.tasks.merge_overlapping_seqdata",
        args=(
            mgd.TempOutputFile("seqdata_normal_all_cells_merged.h5"),
            pypeliner.managed.InputFile('normal_sample.h5', 'normal_cell_id',
                                        fnames=normal_seqdata),
            config["titan_params"]["chromosomes"]
        ),
    )

    workflow.transform(
        name='prepare_normal_data',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="biowrappers.components.copy_number_calling.titan.tasks.prepare_normal_data",
        args=(
            mgd.TempInputFile("seqdata_normal_all_cells_merged.h5"),
            pypeliner.managed.TempOutputFile('normal.wig'),
            pypeliner.managed.TempOutputFile('het_positions.tsv'),
            config["titan_params"],
        ),
    )

    workflow.transform(
        name='prepare_tumour_data',
        axes=('tumour_cell_id',),
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="biowrappers.components.copy_number_calling.titan.tasks.prepare_tumour_data",
        args=(
            pypeliner.managed.InputFile('tumour_sample.h5', 'tumour_cell_id',
                                        fnames=tumour_seqdata),
            pypeliner.managed.TempInputFile('het_positions.tsv'),
            pypeliner.managed.TempOutputFile('tumour.wig', 'tumour_cell_id'),
            pypeliner.managed.TempOutputFile('tumour_alleles.tsv',
                                             'tumour_cell_id'),
            config["titan_params"],
        ),
    )

    workflow.transform(
        name='merge_tumour_alleles',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="single_cell.workflows.titan.tasks.merge_tumour_alleles",
        args=(
            pypeliner.managed.TempInputFile('tumour_alleles.tsv',
                                            'tumour_cell_id'),
            pypeliner.managed.TempOutputFile('tumour_alleles.tsv'),
        ),
    )

    workflow.transform(
        name='concat_tumour_alleles',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="single_cell.workflows.titan.tasks.concat_tumour_alleles",
        args=(
            pypeliner.managed.TempInputFile('tumour_alleles.tsv',
                                            'tumour_cell_id'),
            pypeliner.managed.OutputFile(tumour_alleles_file),
            config["titan_params"]['chromosomes']
        ),
    )

    workflow.transform(
        name='merge_wigs_tumour',
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="single_cell.workflows.titan.tasks.merge_wig_files",
        args=(
            pypeliner.managed.TempInputFile('tumour.wig', 'tumour_cell_id'),
            pypeliner.managed.TempOutputFile('tumour.wig'),
        ),
    )

    workflow.transform(
        name='create_intialization_parameters',
        ctx=dict(mem=config["memory"]['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="biowrappers.components.copy_number_calling.titan.tasks.create_intialization_parameters",
        ret=pypeliner.managed.TempOutputObj('init_params', 'init_param_id'),
        args=(config["titan_params"],),
    )

    workflow.transform(
        name='run_titan',
        axes=('init_param_id',),
        func="biowrappers.components.copy_number_calling.titan.tasks.run_titan",
        ctx=dict(mem=config["memory"]['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        args=(
            pypeliner.managed.TempInputObj('init_params', 'init_param_id'),
            pypeliner.managed.TempInputFile('normal.wig'),
            pypeliner.managed.TempInputFile('tumour.wig'),
            pypeliner.managed.TempInputFile('tumour_alleles.tsv'),
            pypeliner.managed.TempOutputFile('cn.tsv', 'init_param_id'),
            pypeliner.managed.TempOutputFile('params.tsv', 'init_param_id'),
            config["titan_params"],
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(
                config['containers'], 'titan')
        }
    )

    workflow.transform(
        name='select_solution',
        ctx=dict(mem=config["memory"]['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="biowrappers.components.copy_number_calling.titan.tasks.select_solution",
        args=(
            pypeliner.managed.TempInputObj('init_params', 'init_param_id'),
            pypeliner.managed.TempInputFile('cn.tsv', 'init_param_id'),
            pypeliner.managed.TempInputFile('params.tsv', 'init_param_id'),
            pypeliner.managed.OutputFile('results', template=results_files),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', 'cn_loci.tsv')),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', 'cn_segments.tsv')),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', 'cn_igv.tsv')),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', 'params.tsv')),
            config,
            cloneid
        ),
        kwargs={
            'docker_config': helpers.get_container_ctx(
                config['containers'], 'titan'),
            'breakpoints_filename': None,
        },
    )

    workflow.setobj(
        obj=mgd.OutputChunks('chromosome'),
        value=config['titan_params']["chromosomes"],
    )

    workflow.commandline(
        name='plot_chromosome',
        axes=('chromosome',),
        ctx=dict(mem=config["memory"]['low'],
                 pool_id=config['pools']['standard'],
                 ncpus=1,
                 num_retry=3,
                 mem_retry_increment=2,
                 **helpers.get_container_ctx(config['containers'], 'titan')),
        args=(
            'plot_titan_chromosome.R',
            pypeliner.managed.Instance('chromosome'),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output', 'cn_loci.tsv')),
            pypeliner.managed.InputFile(
                os.path.join(raw_data_dir, 'output', 'params.tsv')),
            pypeliner.managed.OutputFile(
                os.path.join(raw_data_dir, 'output', 'chr_{chromosome}.png'),
                'chromosome'),
        ),
    )

    # just leaving it here in case we parallelize by samples later.
    workflow.transform(
        name='merge_results',
        ctx=dict(mem=config["memory"]['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="biowrappers.components.io.hdf5.tasks.merge_hdf5",
        args=(
            {
                cloneid: pypeliner.managed.InputFile(
                    'results', template=results_files)
            },
            pypeliner.managed.OutputFile(out_file),
        ),
        kwargs={
            'table_names': '/sample_{}'.format(cloneid),
        },
    )

    return workflow
def create_alignment_workflow(fastq_1_filename, fastq_2_filename,
                              bam_filename, bai_filename, ref_genome, config,
                              args, instrumentinfo, centerinfo, sample_info,
                              cell_ids):
    out_dir = args['out_dir']

    merge_metrics = os.path.join(out_dir, 'metrics')
    lane_metrics = os.path.join(args['out_dir'], 'metrics_per_lane', '{lane}')

    bam_filename = dict([(cellid, bam_filename[cellid])
                         for cellid in cell_ids])
    bai_filename = dict([(cellid, bai_filename[cellid])
                         for cellid in cell_ids])

    chromosomes = config["chromosomes"]

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('chrom'),
        value=chromosomes,
    )
    workflow.setobj(
        obj=mgd.OutputChunks('cell_id', 'lane'),
        value=fastq_1_filename.keys(),
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('instrument', 'cell_id', 'lane',
                              axes_origin=[]),
        value=instrumentinfo
    )
    workflow.setobj(
        obj=mgd.TempOutputObj('center', 'cell_id', 'lane', axes_origin=[]),
        value=centerinfo
    )
    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info
    )

    fastqc_reports = os.path.join(lane_metrics, "fastqc",
                                  "{cell_id}_reports.tar.gz")
    flagstat_metrics = os.path.join(lane_metrics, 'flagstat', '{cell_id}.txt')

    workflow.transform(
        name='align_reads',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        axes=('cell_id', 'lane',),
        func="single_cell.workflows.align.tasks.align_pe",
        args=(
            mgd.InputFile('fastq_1', 'cell_id', 'lane',
                          fnames=fastq_1_filename),
            mgd.InputFile('fastq_2', 'cell_id', 'lane',
                          fnames=fastq_2_filename),
            mgd.TempOutputFile('aligned_per_cell_per_lane.sorted.bam',
                               'cell_id', 'lane'),
            mgd.OutputFile(fastqc_reports, 'cell_id', 'lane'),
            mgd.OutputFile(flagstat_metrics, 'cell_id', 'lane'),
            mgd.TempSpace('alignment_temp', 'cell_id', 'lane'),
            ref_genome,
            mgd.TempInputObj('instrument', 'cell_id', 'lane'),
            mgd.TempInputObj('center', 'cell_id', 'lane'),
            mgd.TempInputObj('sampleinfo', 'cell_id'),
            mgd.InputInstance('cell_id'),
            mgd.InputInstance('lane'),
            args['library_id'],
            config
        )
    )

    workflow.transform(
        name='merge_bams',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.align.tasks.merge_bams",
        axes=('cell_id',),
        args=(
            mgd.TempInputFile('aligned_per_cell_per_lane.sorted.bam',
                              'cell_id', 'lane'),
            mgd.TempOutputFile('merged_lanes.bam', 'cell_id'),
            mgd.TempOutputFile('merged_lanes.bam.bai', 'cell_id'),
            config
        )
    )

    if args['realign']:
        workflow.transform(
            name='realignment',
            axes=('chrom',),
            ctx=dict(mem=config['memory']['high'],
                     pool_id=config['pools']['highmem'],
                     **ctx),
            func="single_cell.workflows.align.tasks.realign",
            args=(
                mgd.TempInputFile('merged_lanes.bam', 'cell_id'),
                mgd.TempInputFile('merged_lanes.bam.bai', 'cell_id'),
                mgd.TempOutputFile('realigned.bam', 'chrom', 'cell_id'),
                mgd.TempSpace('realignment_temp', 'chrom', cleanup='before'),
                config,
                mgd.InputInstance('chrom')
            )
        )

        workflow.transform(
            name='merge_realignment',
            ctx=dict(mem=config['memory']['high'],
                     pool_id=config['pools']['highmem'],
                     **ctx),
            axes=('cell_id',),
            func="single_cell.workflows.align.tasks.merge_realignment",
            args=(
                mgd.TempInputFile('realigned.bam', 'chrom', 'cell_id'),
                mgd.TempOutputFile('merged_realign.bam', 'cell_id'),
                config,
                mgd.InputInstance('cell_id')
            )
        )

    final_bam = mgd.TempInputFile('merged_lanes.bam', 'cell_id')
    if args["realign"]:
        final_bam = mgd.TempInputFile('merged_realign.bam', 'cell_id')

    markdups_metrics = os.path.join(merge_metrics, 'markdups_metrics',
                                    '{cell_id}.markdups_metrics.txt')
    flagstat_metrics = os.path.join(merge_metrics, 'flagstat_metrics',
                                    '{cell_id}.flagstat_metrics.txt')

    workflow.transform(
        name='postprocess_bam',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        axes=('cell_id',),
        func="single_cell.workflows.align.tasks.postprocess_bam",
        args=(
            final_bam,
            mgd.OutputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.OutputFile('sorted_markdups_index', 'cell_id',
                           fnames=bai_filename),
            mgd.TempSpace('tempdir', 'cell_id'),
            config,
            mgd.OutputFile(markdups_metrics, 'cell_id'),
            mgd.OutputFile(flagstat_metrics, 'cell_id'),
        ),
    )

    return workflow
def create_hmmcopy_workflow(bam_file, bai_file, hmmcopy_data,
                            igv_seg_filename, segs_pdf, bias_pdf,
                            plot_heatmap_ec_output,
                            plot_heatmap_ec_filt_output, plot_metrics_output,
                            plot_kernel_density_output, cell_ids, config,
                            args, hmmparams, params_tag, results_dir,
                            alignment_metrics=None):
    sample_info = helpers.get_sample_info(args["input_yaml"])

    chromosomes = config["chromosomes"]

    multipliers = copy.deepcopy(hmmparams["multipliers"])
    multipliers.append(0)

    rows = [int(cellinfo["row"]) for cellinfo in sample_info.values()]
    rows = sorted(set(rows))

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )
    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info
    )
    workflow.setobj(
        obj=mgd.OutputChunks('row'),
        value=rows,
    )

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow.transform(
        name='run_hmmcopy',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.run_hmmcopy",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_markdups', 'cell_id', fnames=bam_file),
            mgd.InputFile('bai_markdups', 'cell_id', fnames=bai_file),
            mgd.TempOutputFile('reads.h5', 'cell_id'),
            mgd.TempOutputFile('segs.h5', 'cell_id'),
            mgd.TempOutputFile('params.h5', 'cell_id'),
            mgd.TempOutputFile('hmm_metrics.h5', 'cell_id'),
            mgd.TempOutputFile('segments.png', 'cell_id'),
            mgd.TempOutputFile('bias.png', 'cell_id'),
            mgd.InputInstance('cell_id'),
            config['ref_genome'],
            config,
            hmmparams,
            multipliers,
            mgd.TempSpace('hmmcopy_temp', 'cell_id'),
            mgd.TempInputObj('sampleinfo', 'cell_id'),
        ),
    )

    workflow.transform(
        name='merge_reads',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.merge_hdf_files_on_disk",
        args=(
            mgd.TempInputFile('reads.h5', 'cell_id'),
            mgd.TempOutputFile("reads.h5"),
            multipliers,
            'hmmcopy/reads'
        ),
        kwargs={
            'dtypes': {
                'valid': bool,
                'ideal': bool,
                'state': float,
                'multiplier': float
            }
        }
    )

    workflow.transform(
        name='merge_segs',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.merge_hdf_files_on_disk",
        args=(
            mgd.TempInputFile('segs.h5', 'cell_id'),
            mgd.TempOutputFile("segments.h5"),
            multipliers,
            'hmmcopy/segments'
        ),
        kwargs={'dtypes': {'end': float, 'median': float}}
    )

    workflow.transform(
        name='merge_metrics',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.merge_hdf_files_in_memory",
        args=(
            mgd.TempInputFile('hmm_metrics.h5', 'cell_id'),
            mgd.TempOutputFile("hmmcopy_metrics.h5"),
            multipliers,
            'hmmcopy/metrics'
        ),
        kwargs={'dtypes': {'mad_neutral_state': float}}
    )

    workflow.transform(
        name='merge_params',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.merge_hdf_files_in_memory",
        args=(
            mgd.TempInputFile('params.h5', 'cell_id'),
            mgd.TempOutputFile("params.h5"),
            multipliers,
            'hmmcopy/params'
        ),
    )

    annotation_input = 'hmmcopy_metrics.h5'
    if alignment_metrics:
        annotation_input = 'hmmcopy_quality_metrics.h5'

        workflow.transform(
            name="add_quality",
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['standard'],
                     **ctx),
            func="single_cell.workflows.hmmcopy.tasks.add_quality",
            args=(
                mgd.TempInputFile('hmmcopy_metrics.h5'),
                mgd.InputFile(alignment_metrics),
                multipliers,
                mgd.TempOutputFile("hmmcopy_quality_metrics.h5"),
                hmmparams['classifier_training_data'],
            ),
        )

    workflow.transform(
        name='annotate_metrics_with_info_and_clustering',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.annotate_metrics",
        args=(
            mgd.TempInputFile('reads.h5'),
            mgd.TempInputFile(annotation_input),
            mgd.TempOutputFile("annotated_metrics.h5"),
            sample_info,
            cell_ids,
            multipliers,
        ),
        kwargs={'chromosomes': config["chromosomes"]}
    )

    workflow.transform(
        name='merge_hmm_copy_plots',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.merge_pdf",
        args=(
            [
                mgd.TempInputFile('segments.png', 'cell_id'),
                mgd.TempInputFile('bias.png', 'cell_id'),
            ],
            [
                mgd.OutputFile(segs_pdf),
                mgd.OutputFile(bias_pdf),
            ],
            mgd.TempInputFile("annotated_metrics.h5"),
            None,
            mgd.TempSpace("hmmcopy_plot_merge_temp"),
            ['segments', 'bias']
        )
    )

    workflow.transform(
        name='create_igv_seg',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.create_igv_seg",
        args=(
            mgd.TempInputFile("segments.h5"),
            mgd.TempInputFile("annotated_metrics.h5"),
            mgd.OutputFile(igv_seg_filename),
            hmmparams
        )
    )

    workflow.transform(
        name='plot_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.plot_metrics",
        args=(
            mgd.TempInputFile("annotated_metrics.h5"),
            mgd.OutputFile(plot_metrics_output),
            mgd.TempSpace("plot_metrics_temp"),
            'QC pipeline metrics',
            multipliers
        )
    )

    workflow.transform(
        name='plot_kernel_density',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.plot_kernel_density",
        args=(
            mgd.TempInputFile('annotated_metrics.h5'),
            mgd.OutputFile(plot_kernel_density_output),
            mgd.TempSpace("hmmcopy_kde_plot_temp"),
            ',',
            'mad_neutral_state',
            'QC pipeline metrics',
            multipliers
        )
    )

    workflow.transform(
        name='plot_heatmap_ec',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
        args=(
            mgd.TempInputFile('reads.h5'),
            mgd.TempInputFile('annotated_metrics.h5'),
            mgd.OutputFile(plot_heatmap_ec_output),
            mgd.TempSpace("heatmap_ec_temp"),
            multipliers
        ),
        kwargs={
            'plot_title': 'QC pipeline metrics',
            'column_name': 'state',
            'plot_by_col': 'experimental_condition',
            'color_by_col': 'cell_call',
            'chromosomes': chromosomes,
            'max_cn': hmmparams['num_states'],
            'scale_by_cells': False,
            'mappability_threshold': hmmparams["map_cutoff"]
        }
    )

    workflow.transform(
        name='plot_heatmap_ec_filtered',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.plot_pcolor",
        args=(
            mgd.TempInputFile('reads.h5'),
            mgd.TempInputFile('annotated_metrics.h5'),
            mgd.OutputFile(plot_heatmap_ec_filt_output),
            mgd.TempSpace("heatmap_ec_filt_temp"),
            multipliers
        ),
        kwargs={
            'plot_title': 'QC pipeline metrics',
            'column_name': 'state',
            'plot_by_col': 'experimental_condition',
            'color_by_col': 'cell_call',
            'chromosomes': chromosomes,
            'max_cn': hmmparams['num_states'],
            'scale_by_cells': False,
            'cell_filters': config["good_cells"],
            'mappability_threshold': hmmparams["map_cutoff"]
        }
    )

    workflow.transform(
        name='merge_all_hdf5_stores',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.hmmcopy.tasks.merge_tables",
        args=(
            mgd.TempInputFile("reads.h5"),
            mgd.TempInputFile("segments.h5"),
            mgd.TempInputFile("annotated_metrics.h5"),
            mgd.TempInputFile("params.h5"),
            mgd.TempOutputFile("hmmcopy_precast.h5"),
            cell_ids
        )
    )

    workflow.transform(
        name='cast_h5',
        ctx=dict(mem=config['memory']['high'],
                 pool_id=config['pools']['highmem'],
                 **ctx),
        func="single_cell.utils.hdfutils.cast_h5_file",
        args=(
            mgd.TempInputFile("hmmcopy_precast.h5"),
            mgd.OutputFile(hmmcopy_data),
        )
    )

    return workflow
def create_strelka_workflow(normal_bam_file,
                            tumour_bam_file,
                            ref_genome_fasta_file,
                            indel_vcf_file,
                            snv_vcf_file,
                            config,
                            chromosomes=default_chromosomes,
                            split_size=int(1e7),
                            use_depth_thresholds=True):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1, 'num_retry': 3}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    regions = normal_bam_file.keys()
    assert set(tumour_bam_file.keys()) == set(regions)

    workflow = Workflow()

    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('chrom'),
        value=chromosomes,
    )
    workflow.setobj(
        obj=pypeliner.managed.OutputChunks('region'),
        value=regions,
    )

    workflow.transform(
        name='count_fasta_bases',
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.tasks.count_fasta_bases",
        args=(
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile('ref_base_counts.tsv'),
            helpers.get_container_ctx(config['containers'], 'strelka')
        )
    )

    workflow.transform(
        name="get_chrom_sizes",
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.tasks.get_known_chromosome_sizes",
        ret=pypeliner.managed.TempOutputObj('known_sizes'),
        args=(
            pypeliner.managed.TempInputFile('ref_base_counts.tsv'),
            chromosomes
        )
    )

    workflow.transform(
        name='call_somatic_variants',
        ctx=dict(mem=4, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.tasks.call_somatic_variants",
        axes=('region',),
        args=(
            pypeliner.managed.InputFile("normal.split.bam", "region",
                                        fnames=normal_bam_file),
            pypeliner.managed.InputFile("merged_bam", "region",
                                        fnames=tumour_bam_file),
            pypeliner.managed.TempInputObj('known_sizes'),
            ref_genome_fasta_file,
            pypeliner.managed.TempOutputFile(
                'somatic.indels.unfiltered.vcf', 'region'),
            pypeliner.managed.TempOutputFile(
                'somatic.indels.unfiltered.vcf.window', 'region'),
            pypeliner.managed.TempOutputFile(
                'somatic.snvs.unfiltered.vcf', 'region'),
            pypeliner.managed.TempOutputFile('strelka.stats', 'region'),
            pypeliner.managed.InputInstance("region"),
            helpers.get_container_ctx(config['containers'], 'strelka')
        ),
    )

    workflow.transform(
        name='add_indel_filters',
        axes=('chrom',),
        ctx=dict(mem=4, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.tasks.filter_indel_file_list",
        args=(
            pypeliner.managed.TempInputFile(
                'somatic.indels.unfiltered.vcf', 'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempInputFile(
                'somatic.indels.unfiltered.vcf.window', 'region'),
            pypeliner.managed.TempOutputFile(
                'somatic.indels.filtered.vcf', 'chrom'),
            pypeliner.managed.InputInstance("chrom"),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions
        ),
        kwargs={'use_depth_filter': use_depth_thresholds}
    )

    workflow.transform(
        name='add_snv_filters',
        axes=('chrom',),
        ctx=dict(mem=4, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.tasks.filter_snv_file_list",
        args=(
            pypeliner.managed.TempInputFile(
                'somatic.snvs.unfiltered.vcf', 'region'),
            pypeliner.managed.TempInputFile('strelka.stats', 'region'),
            pypeliner.managed.TempOutputFile(
                'somatic.snvs.filtered.vcf', 'chrom'),
            pypeliner.managed.InputInstance("chrom"),
            pypeliner.managed.TempInputObj('known_sizes'),
            regions,
        ),
        kwargs={'use_depth_filter': use_depth_thresholds}
    )

    workflow.transform(
        name='merge_indels',
        ctx=dict(mem=4, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile(
                'somatic.indels.filtered.vcf', 'chrom'),
            pypeliner.managed.TempOutputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_indels_temp"),
            helpers.get_container_ctx(config['containers'], 'vcftools')
        )
    )

    workflow.transform(
        name='merge_snvs',
        ctx=dict(mem=4, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.concatenate_vcf",
        args=(
            pypeliner.managed.TempInputFile(
                'somatic.snvs.filtered.vcf', 'chrom'),
            pypeliner.managed.TempOutputFile('somatic.snvs.filtered.vcf.gz'),
            pypeliner.managed.TempSpace("merge_snvs_temp"),
            helpers.get_container_ctx(config['containers'], 'vcftools')
        )
    )

    workflow.transform(
        name='filter_indels',
        ctx=dict(mem=4, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.indels.passed.vcf')
        )
    )

    workflow.transform(
        name='filter_snvs',
        ctx=dict(mem=4, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.filter_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.filtered.vcf.gz'),
            pypeliner.managed.TempOutputFile('somatic.snvs.passed.vcf')
        )
    )

    workflow.transform(
        name='finalise_indels',
        ctx=dict(mem=4, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.indels.passed.vcf'),
            pypeliner.managed.OutputFile(indel_vcf_file),
            helpers.get_container_ctx(config['containers'], 'vcftools')
        )
    )

    workflow.transform(
        name='finalise_snvs',
        ctx=dict(mem=2, pool_id=config['pools']['standard'], **ctx),
        func="single_cell.workflows.strelka.vcf_tasks.finalise_vcf",
        args=(
            pypeliner.managed.TempInputFile('somatic.snvs.passed.vcf'),
            pypeliner.managed.OutputFile(snv_vcf_file),
            helpers.get_container_ctx(config['containers'], 'vcftools')
        )
    )

    return workflow
def create_split_workflow(normal_bam, normal_bai, normal_split_bam,
                          normal_split_bai, regions, config, by_reads=False):
    ctx = {'mem_retry_increment': 2}
    docker_ctx = helpers.get_container_ctx(config['containers'],
                                           'single_cell_pipeline')
    ctx.update(docker_ctx)

    normal_split_bam = dict([(ival, normal_split_bam[ival])
                             for ival in regions])
    normal_split_bai = dict([(ival, normal_split_bai[ival])
                             for ival in regions])

    one_split_job = config["one_split_job"]

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('region'),
        value=regions,
    )

    # split by reads always runs on a single node
    if by_reads:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['multicore'],
                     ncpus=config['max_cores'],
                     **ctx),
            func="single_cell.workflows.split_bams.tasks.split_bam_file_by_reads",
            args=(
                mgd.InputFile(normal_bam),
                mgd.InputFile(normal_bai),
                mgd.OutputFile("normal.split.bam", "region",
                               fnames=normal_split_bam, axes_origin=[]),
                mgd.OutputFile("normal.split.bam.bai", "region",
                               fnames=normal_split_bai, axes_origin=[]),
                mgd.TempSpace("bam_split_by_reads"),
                regions,
                helpers.get_container_ctx(config['containers'], 'samtools')
            ),
        )
    elif one_split_job:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['multicore'],
                     ncpus=config['max_cores'],
                     **ctx),
            func="single_cell.workflows.split_bams.tasks.split_bam_file_one_job",
            args=(
                mgd.InputFile(normal_bam, extensions=['.bai']),
                mgd.OutputFile(
                    "normal.split.bam",
                    "region",
                    fnames=normal_split_bam,
                    axes_origin=[],
                    extensions=['.bai'],
                ),
                regions,
                helpers.get_container_ctx(config['containers'], 'samtools')
            ),
            kwargs={"ncores": config["max_cores"]}
        )
    else:
        workflow.transform(
            name='split_normal_bam',
            ctx=dict(mem=config['memory']['low'],
                     pool_id=config['pools']['standard'],
                     ncpus=1,
                     **ctx),
            axes=('region',),
            func="single_cell.workflows.split_bams.tasks.split_bam_file",
            args=(
                mgd.InputFile(normal_bam),
                mgd.InputFile(normal_bai),
                mgd.OutputFile("normal.split.bam", "region",
                               fnames=normal_split_bam),
                mgd.OutputFile("normal.split.bam.bai", "region",
                               fnames=normal_split_bai),
                mgd.InputInstance('region'),
                helpers.get_container_ctx(config['containers'], 'samtools')
            )
        )

    return workflow
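# Branch summary for the split above:
#   by_reads=True      -> one multicore job, bam split into equal read chunks
#   one_split_job=True -> one multicore job writes every region bam
#   otherwise          -> one single-core job per region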
def copy_number_calling_workflow(workflow, args):
    config = helpers.load_config(args)

    ctx = {'mem_retry_increment': 2, 'ncpus': 1,
           'mem': config["memory"]['low'],
           'pool_id': config['pools']['standard']}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    tumour_bam_files, tumour_bai_files = helpers.get_bams(args['tumour_yaml'])
    normal_bam_files, normal_bai_files = helpers.get_bams(args['normal_yaml'])

    tumour_cellids = helpers.get_samples(args['tumour_yaml'])
    normal_cellids = helpers.get_samples(args['normal_yaml'])

    if set(tumour_bam_files.keys()) != set(tumour_cellids):
        raise ValueError("tumour bam cell ids do not match cell ids in the tumour yaml")

    if set(normal_bam_files.keys()) != set(normal_cellids):
        raise ValueError("normal bam cell ids do not match cell ids in the normal yaml")

    copynumber_dir = os.path.join(args["out_dir"], "copynumber")
    out_file = os.path.join(copynumber_dir, "results", "results.h5")

    cloneid = args["clone_id"]

    remixt_config = config['titan_params'].get('extract_seqdata', {})

    workflow.setobj(
        obj=mgd.OutputChunks('tumour_cell_id'),
        value=tumour_cellids,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('normal_cell_id'),
        value=normal_cellids,
    )

    workflow.transform(
        name="get_snp_positions_filename",
        ctx=ctx,
        func="remixt.config.get_filename",
        ret=mgd.TempOutputObj('snp_positions_filename'),
        args=(
            remixt_config,
            config['titan_params']['ref_data_dir'],
            'snp_positions'
        )
    )

    workflow.transform(
        name="get_bam_max_fragment_length",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_fragment_length'),
        args=(
            remixt_config,
            'bam_max_fragment_length'
        )
    )

    workflow.transform(
        name="get_bam_max_soft_clipped",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_max_soft_clipped'),
        args=(
            remixt_config,
            'bam_max_soft_clipped'
        )
    )

    workflow.transform(
        name="get_bam_check_proper_pair",
        ctx=ctx,
        func="remixt.config.get_param",
        ret=mgd.TempOutputObj('bam_check_proper_pair'),
        args=(
            remixt_config,
            'bam_check_proper_pair'
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_tumour",
        axes=('tumour_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'tumour_cell_id', fnames=tumour_bam_files),
            mgd.InputFile('bam_markdups_index', 'tumour_cell_id', fnames=tumour_bai_files),
            mgd.TempOutputFile("tumour.h5", "tumour_cell_id"),
            config,
            config['titan_params'].get('extract_seqdata', {}),
            config['titan_params']['ref_data_dir'],
            mgd.TempInputObj('snp_positions_filename'),
            mgd.TempInputObj('bam_max_fragment_length'),
            mgd.TempInputObj('bam_max_soft_clipped'),
            mgd.TempInputObj('bam_check_proper_pair'),
        )
    )

    workflow.subworkflow(
        name="extract_seqdata_normal",
        axes=('normal_cell_id',),
        func=extract_seqdata.create_extract_seqdata_workflow,
        args=(
            mgd.InputFile('bam_markdups', 'normal_cell_id', fnames=normal_bam_files),
            mgd.InputFile('bam_markdups_index', 'normal_cell_id', fnames=normal_bai_files),
            mgd.TempOutputFile("normal.h5", "normal_cell_id"),
            config,
            config['titan_params'].get('extract_seqdata', {}),
            config['titan_params']['ref_data_dir'],
            mgd.TempInputObj('snp_positions_filename'),
            mgd.TempInputObj('bam_max_fragment_length'),
            mgd.TempInputObj('bam_max_soft_clipped'),
            mgd.TempInputObj('bam_check_proper_pair'),
        )
    )

    workflow.subworkflow(
        name='titan_workflow',
        func=titan.create_titan_workflow,
        args=(
            mgd.TempInputFile("normal.h5", "normal_cell_id"),
            mgd.TempInputFile("tumour.h5", "tumour_cell_id"),
            config['ref_genome'],
            copynumber_dir,
            out_file,
            config,
            args,
            tumour_cellids,
            normal_cellids,
            cloneid
        ),
    )

    info_file = os.path.join(args["out_dir"], 'results', 'copynumber_calling', "info.yaml")

    results = {
        'copynumber_data': helpers.format_file_yaml(out_file),
    }

    tumours = {k: helpers.format_file_yaml(v) for k, v in tumour_bam_files.items()}
    normals = {k: helpers.format_file_yaml(v) for k, v in normal_bam_files.items()}
    input_datasets = {'tumour': tumours, 'normal': normals}

    metadata = {
        'copynumber_calling': {
            'chromosomes': config['chromosomes'],
            'ref_genome': config['ref_genome'],
            'version': single_cell.__version__,
            'results': results,
            'containers': config['containers'],
            'input_datasets': input_datasets,
            'output_datasets': None
        }
    }

    workflow.transform(
        name='generate_meta_yaml',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 mem_retry_increment=2,
                 ncpus=1),
        func="single_cell.utils.helpers.write_to_yaml",
        args=(
            mgd.OutputFile(info_file),
            metadata
        )
    )

    return workflow
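
# Illustrative sketch only (not part of the original pipeline): one plausible
# way to drive copy_number_calling_workflow end to end with pypeliner's
# standard runner. The '_example_' function name and the exact Pypeline
# constructor arguments are assumptions; the args dict keys ('tumour_yaml',
# 'normal_yaml', 'out_dir', 'clone_id') are the ones the function above
# actually reads.
def _example_run_copy_number_calling(args):
    import pypeliner.app  # assumed importable alongside pypeliner.workflow
    pyp = pypeliner.app.Pypeline(config=args)
    workflow = pypeliner.workflow.Workflow()
    workflow = copy_number_calling_workflow(workflow, args)
    pyp.run(workflow)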
def create_aneufinder_workflow(bam_file,
                               cell_ids,
                               config,
                               aneufinder_output,
                               aneufinder_results_filename,
                               aneufinder_pdf_filename,
                               library_id,
                               ):
    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    aneufinder_docker = helpers.get_container_ctx(config['containers'], 'aneufinder', docker_only=True)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.transform(
        name='run_aneufinder_on_individual_cells',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.aneufinder.tasks.run_aneufinder",
        axes=('cell_id',),
        args=(
            mgd.InputFile('bam_file', 'cell_id', fnames=bam_file),
            mgd.TempSpace('working_dir', 'cell_id'),
            mgd.InputInstance('cell_id'),
            aneufinder_output,
            mgd.TempOutputFile('segments.csv', 'cell_id'),
            mgd.TempOutputFile('reads.csv', 'cell_id'),
            mgd.TempOutputFile('dnacopy.pdf', 'cell_id'),
        ),
        kwargs={'docker_config': aneufinder_docker}
    )

    workflow.transform(
        name='merge_outputs',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.aneufinder.tasks.merge_outputs_to_hdf",
        args=(
            mgd.TempInputFile('reads.csv', 'cell_id'),
            mgd.TempInputFile('segments.csv', 'cell_id'),
            mgd.OutputFile(aneufinder_results_filename),
            mgd.TempSpace("aneufinder_merge"),
        )
    )

    workflow.transform(
        name='merge_aneufinder_pdfs',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.aneufinder.tasks.merge_pdf",
        args=(
            [mgd.TempInputFile('dnacopy.pdf', 'cell_id')],
            [mgd.OutputFile(aneufinder_pdf_filename)],
        )
    )

    return workflow
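
# Illustrative sketch only: create_aneufinder_workflow expects 'bam_file' as a
# dict of cell_id -> bam path whose keys match 'cell_ids'. A hypothetical
# parent workflow might attach it as a subworkflow along these lines (the
# '_example_' name and the output paths below are assumptions, not part of the
# original module):
def _example_attach_aneufinder(workflow, cell_bams, config, out_dir, library_id):
    aneufinder_dir = os.path.join(out_dir, 'aneufinder')
    workflow.subworkflow(
        name='aneufinder_workflow',
        func=create_aneufinder_workflow,
        args=(
            # resolves to a dict of per-cell bam paths inside the subworkflow
            mgd.InputFile('bam_markdups', 'cell_id', fnames=cell_bams),
            list(cell_bams.keys()),
            config,
            aneufinder_dir,
            os.path.join(aneufinder_dir, 'results.h5'),
            os.path.join(aneufinder_dir, 'plots.pdf'),
            library_id,
        ),
    )
    return workflow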
def create_alignment_metrics_workflow(bam_filename,
                                      bai_filename,
                                      alignment_metrics,
                                      plot_metrics,
                                      ref_genome,
                                      config,
                                      args,
                                      sample_info,
                                      cell_ids):
    out_dir = args['out_dir']

    merge_metrics = os.path.join(out_dir, 'metrics')

    bam_filename = {cellid: bam_filename[cellid] for cellid in cell_ids}
    bai_filename = {cellid: bai_filename[cellid] for cellid in cell_ids}

    chromosomes = config["chromosomes"]

    ctx = {'mem_retry_increment': 2, 'ncpus': 1}
    docker_ctx = helpers.get_container_ctx(config['containers'], 'single_cell_pipeline')
    ctx.update(docker_ctx)

    workflow = pypeliner.workflow.Workflow()

    workflow.setobj(
        obj=mgd.OutputChunks('chrom'),
        value=chromosomes,
    )

    workflow.setobj(
        obj=mgd.OutputChunks('cell_id'),
        value=cell_ids,
    )

    workflow.setobj(
        obj=mgd.TempOutputObj('sampleinfo', 'cell_id', axes_origin=[]),
        value=sample_info,
    )

    markdups_metrics = os.path.join(merge_metrics, 'markdups_metrics', '{cell_id}.markdups_metrics.txt')
    flagstat_metrics = os.path.join(merge_metrics, 'flagstat_metrics', '{cell_id}.flagstat_metrics.txt')

    workflow.transform(
        name='postprocess_bam',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        axes=('cell_id',),
        func="single_cell.workflows.alignment_metrics.tasks.get_postprocess_metrics",
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.InputFile('sorted_markdups_index', 'cell_id', fnames=bai_filename),
            mgd.TempSpace('tempdir', 'cell_id'),
            config,
            mgd.OutputFile(markdups_metrics, 'cell_id'),
            mgd.OutputFile(flagstat_metrics, 'cell_id'),
        ),
    )

    wgs_metrics_filename = os.path.join(merge_metrics, 'wgs_metrics', '{cell_id}.wgs_metrics.txt')

    workflow.transform(
        name='bam_collect_wgs_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.alignment_metrics.tasks.bam_collect_wgs_metrics",
        axes=('cell_id',),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            ref_genome,
            mgd.OutputFile(wgs_metrics_filename, 'cell_id'),
            config,
            mgd.TempSpace('wgs_tempdir', 'cell_id'),
        ),
    )

    gc_metrics_filename = os.path.join(merge_metrics, 'gc_metrics', '{cell_id}.gc_metrics.txt')
    gc_summary_filename = os.path.join(merge_metrics, 'gc_metrics', '{cell_id}.gc_metrics.summ.txt')
    gc_chart_filename = os.path.join(merge_metrics, 'gc_metrics', '{cell_id}.gc_metrics.pdf')

    workflow.transform(
        name='bam_collect_gc_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.alignment_metrics.tasks.bam_collect_gc_metrics",
        axes=('cell_id',),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            ref_genome,
            mgd.OutputFile(gc_metrics_filename, 'cell_id'),
            mgd.OutputFile(gc_summary_filename, 'cell_id'),
            mgd.OutputFile(gc_chart_filename, 'cell_id'),
            mgd.TempSpace('gc_tempdir', 'cell_id'),
            config,
        ),
    )

    insert_metrics_filename = os.path.join(merge_metrics, 'insert_metrics', '{cell_id}.insert_metrics.txt')
    insert_histogram_filename = os.path.join(merge_metrics, 'insert_metrics', '{cell_id}.insert_metrics.pdf')

    workflow.transform(
        name='bam_collect_insert_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.alignment_metrics.tasks.bam_collect_insert_metrics",
        axes=('cell_id',),
        args=(
            mgd.InputFile('sorted_markdups', 'cell_id', fnames=bam_filename),
            mgd.InputFile(flagstat_metrics, 'cell_id'),
            mgd.OutputFile(insert_metrics_filename, 'cell_id'),
            mgd.OutputFile(insert_histogram_filename, 'cell_id'),
            mgd.TempSpace('insert_tempdir', 'cell_id'),
            config,
        ),
    )

    workflow.transform(
        name="collect_gc_metrics",
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.alignment_metrics.tasks.collect_gc",
        args=(
            mgd.InputFile(gc_metrics_filename, 'cell_id', axes_origin=[]),
            mgd.TempOutputFile("gc_metrics.h5"),
            mgd.TempSpace("temp_gc"),
        ),
    )

    workflow.transform(
        name='collect_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.alignment_metrics.tasks.collect_metrics",
        args=(
            mgd.InputFile(flagstat_metrics, 'cell_id', axes_origin=[]),
            mgd.InputFile(markdups_metrics, 'cell_id', axes_origin=[]),
            mgd.InputFile(insert_metrics_filename, 'cell_id', axes_origin=[]),
            mgd.InputFile(wgs_metrics_filename, 'cell_id', axes_origin=[]),
            mgd.TempSpace("tempdir_collect_metrics"),
            mgd.TempOutputFile("alignment_metrics.h5"),
        ),
    )

    workflow.transform(
        name='annotate_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.alignment_metrics.tasks.annotate_metrics",
        args=(
            mgd.TempInputFile("alignment_metrics.h5"),
            sample_info,
            mgd.TempOutputFile("alignment_metrics_annotated.h5"),
        ),
    )

    workflow.transform(
        name='plot_metrics',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.workflows.alignment_metrics.tasks.plot_metrics",
        args=(
            mgd.TempInputFile("alignment_metrics_annotated.h5"),
            mgd.OutputFile(plot_metrics),
            'QC pipeline metrics',
            mgd.TempInputFile("gc_metrics.h5"),
            config['gc_windows'],
        ),
    )

    workflow.transform(
        name='concatenate_all_hdf_tables',
        ctx=dict(mem=config['memory']['low'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.utils.hdfutils.concat_hdf_tables",
        args=(
            [
                mgd.TempInputFile("alignment_metrics_annotated.h5"),
                mgd.TempInputFile("gc_metrics.h5"),
            ],
            mgd.TempOutputFile("alignment_precast.h5"),
        ),
    )

    workflow.transform(
        name='cast_h5',
        ctx=dict(mem=config['memory']['med'],
                 pool_id=config['pools']['standard'],
                 **ctx),
        func="single_cell.utils.hdfutils.cast_h5_file",
        args=(
            mgd.TempInputFile("alignment_precast.h5"),
            mgd.OutputFile(alignment_metrics),
        ),
    )

    return workflow
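
# Illustrative sketch only: the concatenate_all_hdf_tables -> cast_h5 pair at
# the end of the workflow above merges the per-metric HDF5 stores into one
# file before the final cast. A minimal pandas-only analogue of the merge
# step, assuming each input store holds tables under distinct keys (the
# '_example_' name and compression settings are assumptions, not the
# pipeline's actual implementation):
def _example_concat_hdf_stores(in_files, out_file):
    import pandas as pd
    with pd.HDFStore(out_file, 'w', complevel=9, complib='blosc') as out_store:
        for in_file in in_files:
            with pd.HDFStore(in_file, 'r') as in_store:
                for key in in_store.keys():
                    # copy each table verbatim into the combined store
                    out_store.put(key, in_store[key], format='table')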