def graphmap_join_workflow(job, options, config, vg_ids, hal_ids):

    root_job = Job()
    job.addChild(root_job)

    # run clip-vg on each input
    clipped_vg_ids = []
    for vg_path, vg_id in zip(options.vg, vg_ids):
        clip_job = root_job.addChildJobFn(clip_vg, options, config, vg_path, vg_id,
                                          disk=vg_id.size * 2, memory=vg_id.size * 4)
        clipped_vg_ids.append(clip_job.rv())

    # join the ids
    join_job = root_job.addFollowOnJobFn(join_vg, options, config, clipped_vg_ids,
                                         disk=sum([f.size for f in vg_ids]))
    clipped_vg_ids = join_job.rv()

    # make a gfa for each
    gfa_root_job = Job()
    join_job.addFollowOn(gfa_root_job)
    clipped_gfa_ids = []
    for i in range(len(options.vg)):
        vg_path = options.vg[i]
        clipped_id = join_job.rv(i)
        vg_id = vg_ids[i]
        gfa_job = gfa_root_job.addChildJobFn(vg_to_gfa, options, config, vg_path, clipped_id,
                                             disk=vg_id.size * 5)
        clipped_gfa_ids.append(gfa_job.rv())

    # merge up the gfas and make the various vg indexes
    gfa_merge_job = gfa_root_job.addFollowOnJobFn(vg_indexes, options, config, clipped_gfa_ids,
                                                  cores=options.indexCores,
                                                  disk=sum(f.size for f in vg_ids) * 5)

    if hal_ids:
        merge_hal_id = job.addChildJobFn(merge_hal, options, hal_ids,
                                         disk=sum(f.size for f in hal_ids) * 2).rv()
    else:
        merge_hal_id = None

    return clipped_vg_ids, gfa_merge_job.rv(), merge_hal_id
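# --- Hedged note (not part of the original code) -----------------------------------------------
# graphmap_join_workflow above leans on the fact that a Toil promise can be indexed before it is
# resolved: join_vg presumably returns a sequence, and join_job.rv(i) is a promise for its i-th
# element, the same way job.rv(0) / job.rv(1) pick fields out of returned tuples elsewhere.
# A tiny illustration with a made-up job function:
def returns_a_list(job, n):
    # hypothetical job whose return value is a list
    return list(range(n))

# elsewhere: lst_job = some_job.addChildJobFn(returns_a_list, 3)
#            third = lst_job.rv(2)   # promise for element 2, usable as an argument to other jobs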
def minigraph_map_all(job, config, gfa_id, fa_id_map, graph_event, keep_gaf):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "cpu", typeFn=int, default=1)
    mg_cores = min(mg_cores, cpu_count())

    # doing the paf conversion is more efficient when done separately for each genome. we can get away
    # with doing this if the universal filter (which needs to process everything at once) is disabled
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    paf_per_genome = not getOptionalAttrib(xml_node, "universalMZFilter", float)

    # do the mapping
    gaf_id_map = {}
    paf_id_map = {}

    for event, fa_path_fa_id in fa_id_map.items():
        fa_path = fa_path_fa_id[0]
        fa_id = fa_path_fa_id[1]
        minigraph_map_job = top_job.addChildJobFn(minigraph_map_one, config, event, fa_path, fa_id,
                                                  gfa_id, keep_gaf or not paf_per_genome, paf_per_genome,
                                                  # todo: estimate RAM
                                                  cores=mg_cores,
                                                  disk=5 * (fa_id.size + gfa_id.size))
        gaf_id_map[event] = minigraph_map_job.rv(0)
        paf_id_map[event] = minigraph_map_job.rv(1)

    # convert to paf
    if paf_per_genome:
        paf_job = top_job.addFollowOnJobFn(merge_pafs, paf_id_map)
    else:
        paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config, gaf_id_map)

    if not keep_gaf:
        gaf_id_map = None
    else:
        gaf_id_map = paf_job.addFollowOnJobFn(compress_gafs, gaf_id_map).rv()

    return paf_job.rv(), gaf_id_map
def run_all_bam_caller(job, context, fasta_file_id, bam_file_id, bam_idx_id, sample_name,
                       chroms, offsets, out_name, bam_caller, bam_caller_opts=[]):
    """ run freebayes or platypus on a set of chromosomal regions.  this is done by sending each
    region to a child job and farming off the entire input to each (ie not splitting the input)
    """
    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    fb_vcf_ids = []
    fb_tbi_ids = []
    fb_timers = []

    assert chroms
    if not offsets:
        offsets = [None] * len(chroms)

    for chrom, offset in zip(chroms, offsets):
        fb_job = child_job.addChildJobFn(run_bam_caller, context, fasta_file_id, bam_file_id, bam_idx_id,
                                         sample_name, chrom, offset, out_name, bam_caller, bam_caller_opts,
                                         memory=context.config.calling_mem,
                                         disk=context.config.calling_disk)
        fb_vcf_ids.append(fb_job.rv(0))
        fb_tbi_ids.append(fb_job.rv(1))
        fb_timers.append([fb_job.rv(2)])

    merge_vcf_job = child_job.addFollowOnJobFn(run_concat_vcfs, context, out_name, fb_vcf_ids, fb_tbi_ids,
                                               write_to_outstore=True, call_timers_lists=fb_timers)

    return merge_vcf_job.rv()
def run_surjecting(job, context, gam_input_reads_id, output_name, interleaved, xg_file_id, paths):
    """ split the fastq, then surject each chunk.  returns the output GAMs, paired with the total
    surject time (excluding toil-vg overhead such as transferring and splitting files) """

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    if not context.config.single_reads_chunk:
        reads_chunk_ids = child_job.addChildJobFn(run_split_reads, context, None, 'aln.gam', None,
                                                  [gam_input_reads_id],
                                                  cores=context.config.misc_cores,
                                                  memory=context.config.misc_mem,
                                                  disk=context.config.misc_disk).rv()
    else:
        RealtimeLogger.info("Bypassing reads splitting because --single_reads_chunk enabled")
        reads_chunk_ids = [[r] for r in [gam_input_reads_id]]

    return child_job.addFollowOnJobFn(run_whole_surject, context, reads_chunk_ids, output_name,
                                      interleaved, xg_file_id, paths,
                                      cores=context.config.misc_cores,
                                      memory=context.config.misc_mem,
                                      disk=context.config.misc_disk).rv()
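# --- Hedged pattern sketch (not part of the original code) -------------------------------------
# Nearly every workflow function in this file repeats the Toil idiom seen in run_surjecting above:
# hang dynamically added children on an empty encapsulating Job(), collect their .rv() promises,
# and let a follow-on of that empty job consume the resolved values.  A minimal, self-contained
# illustration with made-up job functions do_piece and combine:
from toil.job import Job

def do_piece(job, item):
    # hypothetical per-item work; the return value becomes the promise seen by the follow-on
    return item * 2

def combine(job, results):
    # runs only once every do_piece child has finished, so the promises are resolved here
    return sum(results)

def encapsulated_workflow(job, items):
    # empty job that self-contains the dynamically created sub-workflow
    child_job = Job()
    job.addChild(child_job)

    promises = []
    for item in items:
        piece_job = child_job.addChildJobFn(do_piece, item)
        promises.append(piece_job.rv())

    # the follow-on sees the whole list of resolved promise values
    return child_job.addFollowOnJobFn(combine, promises).rv()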
def minigraph_map_all(job, config, gfa_id, fa_id_map):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    # do the mapping
    gaf_ids = []
    for event, fa_id in fa_id_map.items():
        RealtimeLogger.info("adding child event={} faid={} gfaid={}".format(event, fa_id, gfa_id))
        minigraph_map_job = top_job.addChildJobFn(minigraph_map_one, config, event, fa_id, gfa_id,
                                                  cores=1,
                                                  disk=5 * (fa_id.size + gfa_id.size))
        gaf_ids.append(minigraph_map_job.rv())

    # convert to paf
    paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config, gaf_ids)

    return paf_job.rv()
def graphmap_split_workflow(job, options, config, seqIDMap, gfa_id, gfa_path, paf_id, paf_path,
                            ref_contigs, other_contig):

    root_job = Job()
    job.addChild(root_job)

    # get the sizes before we overwrite below
    gfa_size = gfa_id.size
    paf_size = paf_id.size

    # use file extension to sniff out compressed input
    if gfa_path.endswith(".gz"):
        gfa_id = root_job.addChildJobFn(unzip_gz, gfa_path, gfa_id, disk=gfa_id.size * 10).rv()
        gfa_size *= 10
    if paf_path.endswith(".gz"):
        paf_id = root_job.addChildJobFn(unzip_gz, paf_path, paf_id, disk=paf_id.size * 10).rv()
        paf_size *= 10

    mask_bed_id = None
    if options.maskFilter:
        mask_bed_id = root_job.addChildJobFn(get_mask_bed, seqIDMap, options.maskFilter).rv()

    # use rgfa-split to split the gfa and paf up by contig
    split_gfa_job = root_job.addFollowOnJobFn(split_gfa, config, gfa_id, [paf_id], ref_contigs,
                                              other_contig, options.reference, mask_bed_id,
                                              disk=(gfa_size + paf_size) * 5)

    # use the output of the above splitting to do the fasta splitting
    split_fas_job = split_gfa_job.addFollowOnJobFn(split_fas, seqIDMap, split_gfa_job.rv(0))

    # gather everything up into a table
    gather_fas_job = split_fas_job.addFollowOnJobFn(gather_fas, seqIDMap, split_gfa_job.rv(0),
                                                    split_fas_job.rv())

    # try splitting the ambiguous sequences using minimap2, which is more sensitive in some cases
    remap_job = gather_fas_job.addFollowOnJobFn(split_minimap_fallback, options, config, seqIDMap,
                                                gather_fas_job.rv())

    # partition these into fasta files
    split_fallback_gfa_job = remap_job.addFollowOnJobFn(split_gfa, config, None, remap_job.rv(0),
                                                        ref_contigs, other_contig, options.reference, None,
                                                        disk=(gfa_size + paf_size) * 5)

    # use the output of the above to split the ambiguous fastas
    split_fallback_fas_job = split_fallback_gfa_job.addFollowOnJobFn(split_fas, remap_job.rv(1),
                                                                     split_fallback_gfa_job.rv(0))

    # gather the fallback contigs into a table
    gather_fallback_fas_job = split_fallback_fas_job.addFollowOnJobFn(gather_fas, remap_job.rv(1),
                                                                      split_fallback_gfa_job.rv(0),
                                                                      split_fallback_fas_job.rv())

    # combine the split sequences with the split ambiguous sequences
    combine_split_job = gather_fallback_fas_job.addFollowOnJobFn(combine_splits, options, config, seqIDMap,
                                                                 gather_fas_job.rv(),
                                                                 gather_fallback_fas_job.rv())

    # return all the files, as well as the 2 split logs
    return (seqIDMap, combine_split_job.rv(), split_gfa_job.rv(1), split_fallback_gfa_job.rv(1))
def graphmap_split_workflow(job, options, config, seqIDMap, gfa_id, gfa_path, paf_id, paf_path,
                            ref_contigs, other_contig):

    root_job = Job()
    job.addChild(root_job)

    # get the sizes before we overwrite below
    gfa_size = gfa_id.size
    paf_size = paf_id.size

    # use file extension to sniff out compressed input
    if gfa_path.endswith(".gz"):
        gfa_id = root_job.addChildJobFn(unzip_gz, gfa_path, gfa_id, disk=gfa_id.size * 10).rv()
        gfa_size *= 10
    if paf_path.endswith(".gz"):
        paf_id = root_job.addChildJobFn(unzip_gz, paf_path, paf_id, disk=paf_id.size * 10).rv()
        paf_size *= 10

    mask_bed_id = None
    if options.maskFilter:
        mask_bed_id = root_job.addChildJobFn(get_mask_bed, seqIDMap, options.maskFilter).rv()

    # use rgfa-split to split the gfa and paf up by contig
    split_gfa_job = root_job.addFollowOnJobFn(split_gfa, config, gfa_id, paf_id, ref_contigs,
                                              other_contig, options.reference, mask_bed_id,
                                              disk=(gfa_size + paf_size) * 5)

    # use the output of the above splitting to do the fasta splitting
    split_fas_job = split_gfa_job.addFollowOnJobFn(split_fas, seqIDMap, split_gfa_job.rv())

    # gather everything up into a table
    gather_fas_job = split_fas_job.addFollowOnJobFn(gather_fas, seqIDMap, split_gfa_job.rv(),
                                                    split_fas_job.rv())

    # return all the files
    return gather_fas_job.rv()
def run_cactus_align(job, configWrapper, cactusWorkflowArguments, project, checkpointInfo, doRenaming,
                     pafInput, pafSecondaries, doVG, doGFA, delay=0, eventNameAsID=False):

    # this option (--stagger) can be used in batch mode to avoid starting all the alignment jobs
    # at the same time
    time.sleep(delay)

    head_job = Job()
    job.addChild(head_job)

    # allow for input in paf format:
    if pafInput:
        # convert the paf input to lastz format, splitting out into primary and secondary files
        paf_to_lastz_job = head_job.addChildJobFn(paf_to_lastz.paf_to_lastz,
                                                  cactusWorkflowArguments.alignmentsID, True)
        cactusWorkflowArguments.alignmentsID = paf_to_lastz_job.rv(0)
        cactusWorkflowArguments.secondaryAlignmentsID = paf_to_lastz_job.rv(1) if pafSecondaries else None

    # do the name mangling cactus expects, where every fasta sequence starts with id=0|, id=1| etc
    # and the cigar files match up.  If reading cactus-blast output, the cigars are fine, just need
    # the fastas (todo: make this less hacky somehow)
    cur_job = head_job.addFollowOnJobFn(run_prepend_unique_ids, cactusWorkflowArguments, project,
                                        doRenaming, eventNameAsID)  # todo: disk=

    no_ingroup_coverage = not cactusWorkflowArguments.ingroupCoverageIDs
    cactusWorkflowArguments = cur_job.rv()

    if no_ingroup_coverage:
        # if we're not taking cactus_blast input, then we need to recompute the ingroup coverage
        cur_job = cur_job.addFollowOnJobFn(run_ingroup_coverage, cactusWorkflowArguments, project)
        cactusWorkflowArguments = cur_job.rv()

    # run cactus setup all the way through cactus2hal generation
    setup_job = cur_job.addFollowOnJobFn(run_setup_phase, cactusWorkflowArguments)

    # set up the project
    prepare_hal_export_job = setup_job.addFollowOnJobFn(run_prepare_hal_export, project, setup_job.rv())

    # create the hal
    hal_export_job = prepare_hal_export_job.addFollowOnJobFn(exportHal, prepare_hal_export_job.rv(0),
                                                             event=prepare_hal_export_job.rv(1),
                                                             checkpointInfo=checkpointInfo,
                                                             memory=configWrapper.getDefaultMemory(),
                                                             disk=configWrapper.getExportHalDisk(),
                                                             preemptable=False)

    # optionally create the VG
    if doVG or doGFA:
        vg_export_job = hal_export_job.addFollowOnJobFn(export_vg, hal_export_job.rv(), configWrapper,
                                                        doVG, doGFA, checkpointInfo=checkpointInfo)
        vg_file_id, gfa_file_id = vg_export_job.rv(0), vg_export_job.rv(1)
    else:
        vg_file_id, gfa_file_id = None, None

    return hal_export_job.rv(), vg_file_id, gfa_file_id
def run_whole_surject(job, context, reads_chunk_ids, output_name, interleaved, xg_file_id, paths):
    """
    Surject all gam chunks in parallel.

    Surject all the GAM file IDs in reads_chunk_ids, saving the merged BAM as output_name.

    If interleaved is true, expects paired-interleaved GAM input and writes paired BAM output.

    Surjects against the given collection of paths in the given XG file.
    """

    RealtimeLogger.info("Surjecting read chunks {} to BAM".format(reads_chunk_ids))

    # this will be a list of lists.
    # bam_chunk_file_ids[i][j] will correspond to the jth path (from id_ranges)
    # for the ith gam chunk (generated from fastq shard i)
    bam_chunk_file_ids = []
    bam_chunk_running_times = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        # Run graph surject on each gam chunk
        chunk_surject_job = child_job.addChildJobFn(run_chunk_surject, context, interleaved, xg_file_id,
                                                    paths, chunk_filename_ids,
                                                    '{}_chunk{}'.format(output_name, chunk_id),
                                                    cores=context.config.alignment_cores,
                                                    memory=context.config.alignment_mem,
                                                    disk=context.config.alignment_disk)
        bam_chunk_file_ids.append(chunk_surject_job.rv(0))
        bam_chunk_running_times.append(chunk_surject_job.rv(1))

    return child_job.addFollowOnJobFn(run_merge_bams, context, output_name, bam_chunk_file_ids,
                                      cores=context.config.misc_cores,
                                      memory=context.config.misc_mem,
                                      disk=context.config.misc_disk).rv()
def run_cactus_align(job, configWrapper, cactusWorkflowArguments, project, doRenaming, pafInput):

    head_job = Job()
    job.addChild(head_job)

    # allow for input in paf format:
    if pafInput:
        # convert the paf input to lastz format, splitting out into primary and secondary files
        paf_to_lastz_job = head_job.addChildJobFn(paf_to_lastz.paf_to_lastz,
                                                  cactusWorkflowArguments.alignmentsID, True)
        cactusWorkflowArguments.alignmentsID = paf_to_lastz_job.rv(0)
        cactusWorkflowArguments.secondaryAlignmentsID = paf_to_lastz_job.rv(1)

    # do the name mangling cactus expects, where every fasta sequence starts with id=0|, id=1| etc
    # and the cigar files match up.  If reading cactus-blast output, the cigars are fine, just need
    # the fastas (todo: make this less hacky somehow)
    cur_job = head_job.addFollowOnJobFn(run_prepend_unique_ids, cactusWorkflowArguments, project,
                                        doRenaming)  # todo: disk=

    no_ingroup_coverage = not cactusWorkflowArguments.ingroupCoverageIDs
    cactusWorkflowArguments = cur_job.rv()

    if no_ingroup_coverage:
        # if we're not taking cactus_blast input, then we need to recompute the ingroup coverage
        cur_job = cur_job.addFollowOnJobFn(run_ingroup_coverage, cactusWorkflowArguments, project)
        cactusWorkflowArguments = cur_job.rv()

    # run cactus setup all the way through cactus2hal generation
    setup_job = cur_job.addFollowOnJobFn(run_setup_phase, cactusWorkflowArguments)

    # set up the project
    prepare_hal_export_job = setup_job.addFollowOnJobFn(run_prepare_hal_export, project, setup_job.rv())

    # create the hal
    hal_export_job = prepare_hal_export_job.addFollowOnJobFn(exportHal, prepare_hal_export_job.rv(0),
                                                             event=prepare_hal_export_job.rv(1),
                                                             memory=configWrapper.getDefaultMemory(),
                                                             disk=configWrapper.getExportHalDisk(),
                                                             preemptable=False)

    return hal_export_job.rv()
def combine_splits(job, config, seq_id_map, original_id_map, remap_id_map):
    """ combine the output of two runs of gather_fas.  the first is the contigs determined by minigraph,
    the second from remapping the ambiguous contigs with minimap2 """

    root_job = Job()
    job.addChild(root_job)

    # no ambiguous remappings, nothing to do
    if not remap_id_map or len(remap_id_map) == 0:
        return original_id_map

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName",
                                 default="_AMBIGUOUS_")
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName",
                                    default="_MINIGRAPH_")

    # note: we're not handling the case where 100% of a given reference contig is ambiguous
    for ref_contig in original_id_map:
        if ref_contig == amb_name:
            # for ambiguous sequence, we overwrite and don't combine
            if ref_contig in remap_id_map:
                original_id_map[ref_contig] = remap_id_map[ref_contig]
            else:
                original_id_map[ref_contig] = None
        elif ref_contig in remap_id_map:
            total_size = 0
            for event in original_id_map[ref_contig]['fa']:
                total_size += original_id_map[ref_contig]['fa'][event].size
                if event in remap_id_map[ref_contig]['fa']:
                    total_size += remap_id_map[ref_contig]['fa'][event].size
            original_id_map[ref_contig] = root_job.addChildJobFn(combine_ref_contig_splits,
                                                                 original_id_map[ref_contig],
                                                                 remap_id_map[ref_contig],
                                                                 disk=total_size * 4).rv()

    return root_job.addFollowOnJobFn(combine_paf_splits, seq_id_map, original_id_map, remap_id_map,
                                     amb_name, graph_event).rv()
def preprocess_input_sequences(job, configWrapper, project, cactusWorkflowArguments, pafMaskFilter=None,
                               referenceEvent=None):
    """ update the workflow arguments in place with unzipped versions of any input fastas whose paths
    end in .gz.  if there's a pafMaskFilter, softmasked regions are extracted from each sequence into
    a bed.  Note that the beds will need unique ids prepended just like the fastas...
    """
    head_job = Job()
    job.addChild(head_job)

    graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "graphmap"), "assemblyName",
                                    default="_MINIGRAPH_")
    exp = cactusWorkflowArguments.experimentWrapper
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g)) for g in exp.getGenomesWithSequence()
                              if g not in exp.getOutgroupGenomes()]

    mask_bed_ids = {}
    events = []
    updated_seq_ids = []
    for g, seqID in ingroupsAndOriginalIDs:
        zipped = project.inputSequenceMap[g].endswith('.gz')
        do_filter = pafMaskFilter and g not in [graph_event, referenceEvent]
        if zipped or do_filter:
            prepend_id_job = head_job.addChildJobFn(preprocess_input_sequence, g, seqID,
                                                    project.inputSequenceMap[g], pafMaskFilter)
            updated_seq_id, mask_bed_id = prepend_id_job.rv(0), prepend_id_job.rv(1)
            if zipped:
                events.append(g)
                updated_seq_ids.append(updated_seq_id)
            if do_filter:
                mask_bed_ids[g] = mask_bed_id

    return head_job.addFollowOnJobFn(resolve_id_promises, events, updated_seq_ids,
                                     cactusWorkflowArguments).rv(), mask_bed_ids
def run_calleval(job, context, xg_ids, gam_ids, gam_idx_ids, bam_ids, bam_idx_ids, gam_names, bam_names,
                 vcfeval_baseline_id, vcfeval_baseline_tbi_id, caller_fasta_id, vcfeval_fasta_id,
                 bed_id, clip_only, call, sample_name, chroms, vcf_offsets, vcfeval_score_field,
                 plot_sets, surject, interleaved, freebayes, platypus, happy, sveval, recall,
                 min_sv_len, max_sv_len, sv_overlap, sv_region_overlap, normalize, ins_ref_len,
                 del_min_rol, ins_seq_comp, min_mapq=0, min_baseq=0, min_augment_coverage=0):
    """
    top-level call-eval function.  Runs the caller on every gam, and freebayes on every bam.  The
    resulting vcfs are put through vcfeval and the accuracies are tabulated in the output.

    Returns the output of run_calleval_results: a list of condition names, a list of corresponding
    called VCF.gz and index ID pairs, and dicts of vcfeval and happy result dicts, by condition name
    and clipped/unclipped status.

    plot_sets is a data structure of collections of conditions to plot against each other, as produced
    by parse_plot_sets.
    """

    # We store the name of each condition we run
    names = []
    # And we build up these result lists in sync with the name list
    vcf_tbi_id_pairs = []
    timing_results = []

    # Here we accumulate vcf_eval comparison results in a dict by condition name, then clipping status
    # ("clipped", "unclipped").  Each contained dict is the output dict from run_vcfeval
    eval_results = collections.defaultdict(dict)
    # And here we store similarly the output dicts from run_happy
    happy_results = collections.defaultdict(dict)
    # And here we store similarly the output dicts from run_sveval
    sveval_results = collections.defaultdict(dict)

    # Some prep work (surjection and truth extraction) will happen under this head job
    head_job = Job()
    job.addChild(head_job)

    # Most of our work will run under this child job
    child_job = Job()
    head_job.addFollowOn(child_job)

    # We always extract a single-sample VCF from the truth, to save time
    # picking through all its samples multiple times over later.  This should
    # also save memory.  TODO: should we define a separate disk/memory requirement set?
    sample_extract_job = head_job.addChildJobFn(run_make_control_vcfs, context, vcfeval_baseline_id,
                                                'baseline.vcf.gz', vcfeval_baseline_tbi_id, sample_name,
                                                pos_only=True,
                                                no_filter_if_sample_not_found=True,
                                                cores=context.config.vcfeval_cores,
                                                memory=context.config.vcfeval_mem,
                                                disk=context.config.vcfeval_disk)
    truth_vcf_id = sample_extract_job.rv(0)
    truth_vcf_tbi_id = sample_extract_job.rv(1)

    if not gam_idx_ids:
        gam_idx_ids = [None] * len(gam_ids)
    assert len(gam_idx_ids) == len(gam_ids)

    if surject:
        # optionally surject all the gams into bams
        for xg_id, gam_name, gam_id in zip(xg_ids, gam_names, gam_ids):
            surject_job = head_job.addChildJobFn(run_surjecting, context, gam_id, gam_name + '-surject',
                                                 interleaved, xg_id, chroms,
                                                 cores=context.config.misc_cores,
                                                 memory=context.config.misc_mem,
                                                 disk=context.config.misc_disk)
            bam_ids.append(surject_job.rv())
            bam_idx_ids.append(None)
            bam_names.append(gam_name + '-surject')

    if bam_ids:
        for bam_id, bam_idx_id, bam_name in zip(bam_ids, bam_idx_ids, bam_names):
            if not bam_idx_id:
                bam_index_job = child_job.addChildJobFn(run_bam_index, context, bam_id, bam_name,
                                                        cores=context.config.calling_cores,
                                                        memory=context.config.calling_mem,
                                                        disk=context.config.calling_disk)
                sorted_bam_id = bam_index_job.rv(0)
                sorted_bam_idx_id = bam_index_job.rv(1)
            else:
                bam_index_job = Job()
                child_job.addChild(bam_index_job)
                sorted_bam_id = bam_id
                sorted_bam_idx_id = bam_idx_id

            bam_caller_infos = []
            if freebayes:
                bam_caller_infos.append(('freebayes', ['--genotype-qualities'], '-fb'))
            if platypus:
                bam_caller_infos.append(('platypus', ['--mergeClusteredVariants=1'], '-plat'))

            for bam_caller, bam_caller_opts, bam_caller_tag in bam_caller_infos:
                bam_caller_out_name = '{}{}'.format(bam_name, bam_caller_tag)
                bam_caller_job = bam_index_job.addFollowOnJobFn(
                    run_all_bam_caller, context, caller_fasta_id, sorted_bam_id, sorted_bam_idx_id,
                    sample_name, chroms, vcf_offsets,
                    out_name=bam_caller_out_name,
                    bam_caller=bam_caller,
                    bam_caller_opts=bam_caller_opts,
                    cores=context.config.misc_cores,
                    memory=context.config.misc_mem,
                    disk=context.config.misc_disk)
                bam_caller_vcf_tbi_id_pair = (bam_caller_job.rv(0), bam_caller_job.rv(1))
                timing_result = bam_caller_job.rv(2)

                if bed_id:
                    eval_results[bam_caller_out_name]["clipped"] = \
                        bam_caller_job.addFollowOnJobFn(
                            run_vcfeval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                            truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, bed_id,
                            out_name=bam_caller_out_name,
                            score_field='GQ',
                            cores=context.config.vcfeval_cores,
                            memory=context.config.vcfeval_mem,
                            disk=context.config.vcfeval_disk).rv()
                    if happy:
                        happy_results[bam_caller_out_name]["clipped"] = \
                            bam_caller_job.addFollowOnJobFn(
                                run_happy, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, bed_id,
                                out_name=bam_caller_out_name,
                                cores=context.config.vcfeval_cores,
                                memory=context.config.vcfeval_mem,
                                disk=context.config.vcfeval_disk).rv()
                    if sveval:
                        sveval_results[bam_caller_out_name]["clipped"] = \
                            bam_caller_job.addFollowOnJobFn(
                                run_sv_eval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id,
                                min_sv_len=min_sv_len, max_sv_len=max_sv_len,
                                sv_overlap=sv_overlap, sv_region_overlap=sv_region_overlap,
                                bed_id=bed_id,
                                ins_ref_len=ins_ref_len, del_min_rol=del_min_rol,
                                ins_seq_comp=ins_seq_comp,
                                out_name=bam_caller_out_name,
                                fasta_path='ref.fasta', fasta_id=vcfeval_fasta_id,
                                normalize=normalize,
                                cores=context.config.vcfeval_cores,
                                memory=context.config.vcfeval_mem,
                                disk=context.config.vcfeval_disk).rv()

                if not clip_only:
                    # Also do unclipped
                    eval_results[bam_caller_out_name]["unclipped"] = \
                        bam_caller_job.addFollowOnJobFn(
                            run_vcfeval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                            truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, None,
                            out_name=bam_caller_out_name if not bed_id else bam_caller_out_name + '-unclipped',
                            score_field='GQ',
                            cores=context.config.vcfeval_cores,
                            memory=context.config.vcfeval_mem,
                            disk=context.config.vcfeval_disk).rv()
                    if happy:
                        happy_results[bam_caller_out_name]["unclipped"] = \
                            bam_caller_job.addFollowOnJobFn(
                                run_happy, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, None,
                                out_name=bam_caller_out_name if not bed_id else bam_caller_out_name + '-unclipped',
                                cores=context.config.vcfeval_cores,
                                memory=context.config.vcfeval_mem,
                                disk=context.config.vcfeval_disk).rv()
                    if sveval:
                        sveval_results[bam_caller_out_name]["unclipped"] = \
                            bam_caller_job.addFollowOnJobFn(
                                run_sv_eval, context, sample_name, bam_caller_vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id,
                                min_sv_len=min_sv_len, max_sv_len=max_sv_len,
                                sv_overlap=sv_overlap, sv_region_overlap=sv_region_overlap,
                                bed_id=None,
                                ins_ref_len=ins_ref_len, del_min_rol=del_min_rol,
                                ins_seq_comp=ins_seq_comp,
                                out_name=bam_caller_out_name if not bed_id else bam_caller_out_name + '-unclipped',
                                fasta_path='ref.fasta', fasta_id=vcfeval_fasta_id,
                                normalize=normalize,
                                cores=context.config.vcfeval_cores,
                                memory=context.config.vcfeval_mem,
                                disk=context.config.vcfeval_disk).rv()

                vcf_tbi_id_pairs.append(bam_caller_vcf_tbi_id_pair)
                timing_results.append(timing_result)
                names.append(bam_caller_out_name)

    if gam_ids:
        for gam_id, gam_idx_id, gam_name, xg_id in zip(gam_ids, gam_idx_ids, gam_names, xg_ids):
            if call:
                out_name = '{}{}'.format(gam_name, '-call')

                if context.config.filter_opts:
                    filter_job = Job.wrapJobFn(run_filtering, context,
                                               graph_id=xg_id, graph_basename='graph.xg',
                                               gam_id=gam_id, gam_basename='aln.gam',
                                               filter_opts=context.config.filter_opts,
                                               cores=context.config.calling_cores,
                                               memory=context.config.calling_mem,
                                               disk=context.config.calling_disk)
                    gam_id = filter_job.rv()

                call_job = Job.wrapJobFn(run_chunked_calling, context,
                                         graph_id=xg_id, graph_basename='graph.xg',
                                         gam_id=gam_id, gam_basename='aln.gam',
                                         batch_input=None,
                                         snarls_id=None,
                                         genotype_vcf_id=None, genotype_tbi_id=None,
                                         sample=sample_name,
                                         augment=not recall,
                                         connected_component_chunking=False,
                                         output_format='pg',
                                         min_augment_coverage=min_augment_coverage,
                                         expected_coverage=None,
                                         min_mapq=min_mapq, min_baseq=min_baseq,
                                         ref_paths=chroms, ref_path_chunking=False,
                                         min_call_support=None,
                                         vcf_offsets=vcf_offsets,
                                         cores=context.config.misc_cores,
                                         memory=context.config.misc_mem,
                                         disk=context.config.misc_disk)

                if context.config.filter_opts:
                    child_job.addChild(filter_job)
                    filter_job.addFollowOn(call_job)
                else:
                    child_job.addChild(call_job)

                vcf_tbi_id_pair = (call_job.rv(0), call_job.rv(1))
                #timing_result = call_job.rv(2)
                timing_result = TimeTracker()

                if not vcfeval_score_field:
                    score_field = 'QUAL'
                else:
                    score_field = vcfeval_score_field

                if bed_id:
                    eval_results[out_name]["clipped"] = \
                        call_job.addFollowOnJobFn(
                            run_vcfeval, context, sample_name, vcf_tbi_id_pair,
                            truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, bed_id,
                            out_name=out_name, score_field=score_field).rv()
                    if happy:
                        happy_results[out_name]["clipped"] = \
                            call_job.addFollowOnJobFn(
                                run_happy, context, sample_name, vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, bed_id,
                                out_name=out_name).rv()
                    if sveval:
                        sveval_results[out_name]["clipped"] = \
                            call_job.addFollowOnJobFn(
                                run_sv_eval, context, sample_name, vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id,
                                min_sv_len=min_sv_len, max_sv_len=max_sv_len,
                                sv_overlap=sv_overlap, sv_region_overlap=sv_region_overlap,
                                ins_ref_len=ins_ref_len, del_min_rol=del_min_rol,
                                ins_seq_comp=ins_seq_comp,
                                bed_id=bed_id, out_name=out_name,
                                fasta_path='ref.fasta', fasta_id=vcfeval_fasta_id,
                                normalize=normalize).rv()

                if not clip_only:
                    # Also do unclipped
                    eval_results[out_name]["unclipped"] = \
                        call_job.addFollowOnJobFn(
                            run_vcfeval, context, sample_name, vcf_tbi_id_pair,
                            truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, None,
                            out_name=out_name if not bed_id else out_name + '-unclipped',
                            score_field=score_field).rv()
                    if happy:
                        happy_results[out_name]["unclipped"] = \
                            call_job.addFollowOnJobFn(
                                run_happy, context, sample_name, vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id, 'ref.fasta', vcfeval_fasta_id, None,
                                out_name=out_name if not bed_id else out_name + '-unclipped').rv()
                    if sveval:
                        sveval_results[out_name]["unclipped"] = \
                            call_job.addFollowOnJobFn(
                                run_sv_eval, context, sample_name, vcf_tbi_id_pair,
                                truth_vcf_id, truth_vcf_tbi_id,
                                min_sv_len=min_sv_len, max_sv_len=max_sv_len,
                                sv_overlap=sv_overlap, sv_region_overlap=sv_region_overlap,
                                bed_id=None,
                                ins_ref_len=ins_ref_len, del_min_rol=del_min_rol,
                                ins_seq_comp=ins_seq_comp,
                                out_name=out_name if not bed_id else out_name + '-unclipped',
                                fasta_path='ref.fasta', fasta_id=vcfeval_fasta_id,
                                normalize=normalize).rv()

                vcf_tbi_id_pairs.append(vcf_tbi_id_pair)
                timing_results.append(timing_result)
                names.append(out_name)

    calleval_results = child_job.addFollowOnJobFn(run_calleval_results, context, names,
                                                  vcf_tbi_id_pairs, eval_results, happy_results,
                                                  sveval_results, timing_results, plot_sets,
                                                  cores=context.config.misc_cores,
                                                  memory=context.config.misc_mem,
                                                  disk=context.config.misc_disk).rv()

    return calleval_results, names, vcf_tbi_id_pairs, eval_results
def run_chunked_calling(job, context, graph_id, graph_basename, gam_id, gam_basename,
                        batch_input=None,
                        snarls_id=None,
                        genotype_vcf_id=None, genotype_tbi_id=None,
                        sample=None,
                        augment=False,
                        connected_component_chunking=False,
                        output_format=None,
                        min_augment_coverage=None,
                        expected_coverage=None,
                        min_mapq=None, min_baseq=None,
                        ref_paths=[], ref_path_chunking=True,
                        min_call_support=None,
                        vcf_offsets={},
                        gam_chunking=False):

    # simple way to keep follow-ons down the tree
    child_job = Job()
    job.addChild(child_job)

    out_vcf_name = remove_ext(graph_basename)
    if sample:
        out_vcf_name += '_' + sample

    # base case: only one input
    if batch_input is None:
        # chunk if necessary
        if connected_component_chunking or ref_path_chunking:
            chunk_job = child_job.addChildJobFn(run_chunking, context,
                                                graph_id=graph_id,
                                                graph_basename=graph_basename,
                                                chunk_paths=ref_paths,
                                                connected_component_chunking=connected_component_chunking,
                                                output_format=output_format,
                                                gam_id=gam_id if gam_chunking else None,
                                                to_outstore=False,
                                                cores=context.config.chunk_cores,
                                                memory=context.config.chunk_mem,
                                                disk=context.config.chunk_disk)
            batch_input = chunk_job.rv()

            # recurse on chunks
            recurse_job = child_job.addFollowOnJobFn(run_chunked_calling, context,
                                                     graph_id=None,
                                                     graph_basename=graph_basename,
                                                     gam_id=gam_id,
                                                     gam_basename=gam_basename,
                                                     batch_input=batch_input,
                                                     snarls_id=snarls_id,
                                                     genotype_vcf_id=genotype_vcf_id,
                                                     genotype_tbi_id=genotype_tbi_id,
                                                     sample=sample,
                                                     augment=augment,
                                                     connected_component_chunking=connected_component_chunking,
                                                     output_format=output_format,
                                                     min_augment_coverage=min_augment_coverage,
                                                     expected_coverage=expected_coverage,
                                                     min_mapq=min_mapq, min_baseq=min_baseq,
                                                     ref_paths=ref_paths,
                                                     ref_path_chunking=ref_path_chunking,
                                                     min_call_support=min_call_support,
                                                     vcf_offsets=vcf_offsets,
                                                     gam_chunking=gam_chunking)
            return recurse_job.rv()
        else:
            # convert if we're augmenting and not chunking
            if augment and os.path.splitext(graph_basename)[1] != '.' + output_format:
                convert_job = child_job.addChildJobFn(run_convert, context,
                                                      graph_id=graph_id,
                                                      graph_basename=graph_basename,
                                                      output_format=output_format,
                                                      disk=context.config.calling_disk)
                graph_id = convert_job.rv()
                graph_basename = os.path.splitext(graph_basename)[0] + '.' + output_format

                # todo: clean up
                next_job = Job()
                child_job.addFollowOn(next_job)
                child_job = next_job

            # phony up chunk output for single input
            batch_input = {'all': [graph_id, graph_basename]}
            if gam_id:
                batch_input['all'] += [gam_id, gam_basename]

    # run the calling on each chunk
    assert batch_input
    call_results = []
    in_gam_id = gam_id
    in_gam_basename = gam_basename
    for chunk_name, chunk_results in list(batch_input.items()):
        calling_root_job = Job()
        child_job.addChild(calling_root_job)

        graph_id = chunk_results[0]
        graph_basename = chunk_results[1]
        if gam_chunking:
            gam_id = chunk_results[2]
            gam_basename = chunk_results[3]
        else:
            gam_id = in_gam_id
            gam_basename = in_gam_basename

        if augment:
            augment_job = calling_root_job.addChildJobFn(run_augmenting, context,
                                                         graph_id=graph_id,
                                                         graph_basename=graph_basename,
                                                         gam_id=gam_id,
                                                         gam_basename=gam_basename,
                                                         augment_gam=True,
                                                         min_augment_coverage=min_augment_coverage,
                                                         expected_coverage=expected_coverage,
                                                         min_mapq=min_mapq, min_baseq=min_baseq,
                                                         to_outstore=True,
                                                         cores=context.config.augment_cores,
                                                         memory=context.config.augment_mem,
                                                         disk=context.config.augment_disk)
            graph_id = augment_job.rv(0)
            graph_basename = os.path.splitext(graph_basename)[0] + '-aug' + os.path.splitext(graph_basename)[1]
            gam_id = augment_job.rv(1)
            gam_basename = os.path.splitext(gam_basename)[0] + '-aug' + os.path.splitext(gam_basename)[1]

        # When path chunking, we subset our reference paths down to the current path
        if ref_path_chunking:
            ref_path = [chunk_name]
        else:
            ref_path = ref_paths

        calling_job = calling_root_job.addFollowOnJobFn(run_calling, context,
                                                        graph_id=graph_id,
                                                        graph_basename=graph_basename,
                                                        gam_id=gam_id,
                                                        gam_basename=gam_basename,
                                                        snarls_id=snarls_id,
                                                        genotype_vcf_id=genotype_vcf_id,
                                                        genotype_tbi_id=genotype_tbi_id,
                                                        sample=sample,
                                                        expected_coverage=expected_coverage,
                                                        min_mapq=min_mapq,
                                                        ref_paths=ref_path,
                                                        min_call_support=min_call_support,
                                                        vcf_offsets=vcf_offsets,
                                                        to_outstore=False,
                                                        cores=context.config.calling_cores,
                                                        memory=context.config.calling_mem,
                                                        disk=context.config.calling_disk)
        call_results.append((chunk_name, calling_job.rv()))

    concat_job = child_job.addFollowOnJobFn(run_concat_vcfs, context,
                                            out_name=out_vcf_name,
                                            vcf_ids=None,
                                            tbi_ids=None,
                                            write_to_outstore=True,
                                            call_timers_lists=[],
                                            batch_data=call_results)
    return concat_job.rv()
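# --- Hedged note (inferred, not part of the original code) -------------------------------------
# The batch_input dict that run_chunked_calling above and run_chunked_augmenting below pass
# around is only described implicitly by their "phony up chunk output" fallbacks.  The assumed
# shape, with made-up chunk names and file IDs, is one entry per chunk:
example_batch_input = {
    'chr1': ['<graph-file-id-1>', 'chunk_chr1.pg', '<gam-file-id-1>', 'chunk_chr1.gam'],
    'chr2': ['<graph-file-id-2>', 'chunk_chr2.pg', '<gam-file-id-2>', 'chunk_chr2.gam'],
    # the trailing [gam_id, gam_basename] pair is only present when the GAM was chunked too
}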
def run_whole_alignment(job, context, fastq, gam_input_reads, bam_input_reads, sample_name, interleaved,
                        mapper, indexes, reads_chunk_ids,
                        bam_output=False, surject=False,
                        gbwt_penalty=None, validate=False, fasta_dict_id=None):
    """
    align all fastq chunks in parallel

    Takes a dict from index type to index file ID.  Some indexes are extra and specifying them will
    change mapping behavior.

    Returns a list of per-contig GAMs, the total alignment runtime, and a list of per-contig BAM file
    IDs (which is only nonempty when surject is true).
    """

    # this will be a list of lists.
    # gam_chunk_file_ids[i][j] will correspond to the jth path (from id_ranges)
    # for the ith gam chunk (generated from fastq shard i)
    gam_chunk_file_ids = []
    gam_chunk_running_times = []
    # depending on bam_output and surject options, we can make bam_output too
    bam_chunk_file_ids = []

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    for chunk_id, chunk_filename_ids in enumerate(zip(*reads_chunk_ids)):
        # Run graph alignment on each fastq chunk
        chunk_alignment_job = child_job.addChildJobFn(run_chunk_alignment, context, gam_input_reads,
                                                      bam_input_reads, sample_name, interleaved, mapper,
                                                      chunk_filename_ids, chunk_id, indexes,
                                                      bam_output=bam_output,
                                                      gbwt_penalty=gbwt_penalty,
                                                      validate=validate,
                                                      fasta_dict_id=fasta_dict_id,
                                                      cores=context.config.alignment_cores,
                                                      memory=context.config.alignment_mem,
                                                      disk=context.config.alignment_disk)
        if not bam_output:
            gam_chunk_file_ids.append(chunk_alignment_job.rv(0))
        else:
            bam_chunk_file_ids.append(chunk_alignment_job.rv(0))
        gam_chunk_running_times.append(chunk_alignment_job.rv(1))

    if not bam_output:
        merge_gams_job = child_job.addFollowOnJobFn(run_merge_gams, context, sample_name,
                                                    indexes.get('id_ranges'), gam_chunk_file_ids,
                                                    gam_chunk_running_times,
                                                    cores=context.config.misc_cores,
                                                    memory=context.config.misc_mem,
                                                    disk=context.config.misc_disk)
        gam_chrom_ids = merge_gams_job.rv(0)
        gam_chunk_time = merge_gams_job.rv(1)
        bam_chrom_ids = []
    else:
        gam_chrom_ids = []
        gam_chunk_time = None
        merge_bams_job = child_job.addFollowOnJobFn(run_merge_bams, context, sample_name,
                                                    bam_chunk_file_ids,
                                                    cores=context.config.misc_cores,
                                                    memory=context.config.misc_mem,
                                                    disk=context.config.misc_disk)
        split_bams_job = merge_bams_job.addFollowOnJobFn(split_bam_into_chroms, context,
                                                         indexes.get('id_ranges'), merge_bams_job.rv(),
                                                         cores=context.config.alignment_cores,
                                                         memory=context.config.alignment_mem,
                                                         disk=context.config.alignment_disk)
        bam_chrom_ids = split_bams_job.rv()

    if surject:
        interleaved_surject = interleaved or (fastq and len(fastq) == 2)
        zip_job = child_job.addFollowOnJobFn(run_zip_surject_input, context, gam_chunk_file_ids)
        xg_id = indexes['xg-surject'] if 'xg-surject' in indexes else indexes['xg']
        bam_chrom_ids = [zip_job.addFollowOnJobFn(run_whole_surject, context, zip_job.rv(),
                                                  sample_name + '-surject', interleaved_surject,
                                                  xg_id, []).rv()]

    return gam_chrom_ids, gam_chunk_time, bam_chrom_ids
def run_chunked_calling(job, context, chunk_infos, genotype, recall, snarls_id, pack_support, old_call,
                        call_timers):
    """ spawn a calling job for each chunk then merge them together """

    # to encapsulate everything under this job
    child_job = Job()
    job.addChild(child_job)

    path_names = set()

    # If no chunking and many paths, we augment once first and not before calling
    # so we don't waste resources augmenting the same graph again and again
    # Note: should only do this when len(chunk_infos) > 1, but leaving as is so the tests hit it!
    if context.config.call_chunk_size == 0:
        chunk_info = chunk_infos[0]
        augment_job = child_job.addChildJobFn(run_vg_call, context,
                                              chunk_info['sample'],
                                              chunk_info['vg_id'],
                                              chunk_info['gam_id'],
                                              xg_id=chunk_info['xg_id'],
                                              path_names=[chunk_info['chrom']],
                                              seq_names=[chunk_info['chrom']],
                                              seq_offsets=[chunk_info['chunk_start'] + chunk_info['offset']],
                                              seq_lengths=[chunk_info['path_size']],
                                              chunk_name='chunk_{}_{}'.format(chunk_info['chrom'],
                                                                              chunk_info['chunk_start']),
                                              genotype=genotype,
                                              recall=recall,
                                              clip_info=chunk_info,
                                              augment_only=True,
                                              pack_support=pack_support,
                                              alt_gam_id=chunk_info['alt_gam_id'],
                                              old_call=old_call,
                                              cores=context.config.calling_cores,
                                              memory=context.config.calling_mem,
                                              disk=context.config.calling_disk)
        augment_results = augment_job.rv()
        next_job = Job()
        augment_job.addFollowOn(next_job)
        child_job = next_job
    else:
        augment_results = None

    clip_file_ids = []
    for chunk_info in chunk_infos:
        path_names.add(chunk_info['chrom'])

        # Run vg call
        call_job = child_job.addChildJobFn(run_vg_call, context,
                                           chunk_info['sample'],
                                           chunk_info['vg_id'],
                                           chunk_info['gam_id'],
                                           xg_id=chunk_info['xg_id'],
                                           path_names=[chunk_info['chrom']],
                                           seq_names=[chunk_info['chrom']],
                                           seq_offsets=[chunk_info['chunk_start'] + chunk_info['offset']],
                                           seq_lengths=[chunk_info['path_size']],
                                           chunk_name='chunk_{}_{}'.format(chunk_info['chrom'],
                                                                           chunk_info['chunk_start']),
                                           genotype=genotype,
                                           recall=recall,
                                           clip_info=chunk_info,
                                           alt_gam_id=chunk_info['alt_gam_id'],
                                           genotype_vcf_id=chunk_info['genotype_vcf_id'],
                                           genotype_tbi_id=chunk_info['genotype_tbi_id'],
                                           snarls_id=snarls_id,
                                           pack_support=pack_support,
                                           old_call=old_call,
                                           augment_results=augment_results,
                                           cores=context.config.calling_cores,
                                           memory=context.config.calling_mem,
                                           disk=context.config.calling_disk)
        vcf_id, call_timer = call_job.rv(0), call_job.rv(1)
        clip_file_ids.append(vcf_id)
        call_timers.append(call_timer)

    tag = list(path_names)[0] if len(path_names) == 1 else 'chroms'

    merge_job = child_job.addFollowOnJobFn(run_concat_vcfs, context, tag, clip_file_ids,
                                           cores=context.config.call_chunk_cores,
                                           memory=context.config.call_chunk_mem,
                                           disk=context.config.call_chunk_disk)
    vcf_out_file_id = merge_job.rv(0)
    tbi_out_file_id = merge_job.rv(1)

    return vcf_out_file_id, tbi_out_file_id, call_timers
def run_all_calling2(job, context, xg_file_id, chr_gam_ids, chr_gam_idx_ids, chroms, path_sizes,
                     vcf_offsets, sample_name, genotype=False, out_name=None, recall=False,
                     alt_gam_id=None, alt_gai_id=None, genotype_vcf_id=None, genotype_tbi_id=None,
                     id_ranges_id=None, snarls_id=None, pack_support=False, old_call=False):
    """
    Call all the chromosomes and return a merged up vcf/tbi pair
    """
    # we make a child job so that all calling is encapsulated in a top-level job
    child_job = Job()
    job.addChild(child_job)

    vcf_ids = []
    tbi_ids = []
    call_timers_lists = []

    assert len(chr_gam_ids) > 0
    if not chr_gam_idx_ids:
        chr_gam_idx_ids = [None] * len(chr_gam_ids)
    if not chroms:
        chroms = [name for name in path_sizes.keys() if path_sizes[name] > 0]
    assert len(chr_gam_ids) == len(chr_gam_idx_ids)

    # id ranges deactivates path chunking
    if id_ranges_id:
        context.config.call_chunk_size = (2 << 30) - 1
        context.config.overlap = 0

    for i in range(len(chr_gam_ids)):
        alignment_file_id = chr_gam_ids[i]
        alignment_index_id = chr_gam_idx_ids[i]
        if len(chr_gam_ids) > 1:
            # 1 gam per chromosome
            chr_label = [chroms[i]]
            chr_offset = [vcf_offsets[i]] if vcf_offsets else [0]
        else:
            # single gam with one or more chromosomes
            chr_label = chroms
            chr_offset = vcf_offsets if vcf_offsets else [0] * len(chroms)

        chunk_job = child_job.addChildJobFn(run_chunking, context, xg_file_id, alignment_file_id,
                                            alignment_index_id, chr_label, chr_offset, path_sizes,
                                            sample_name,
                                            genotype=genotype,
                                            recall=recall,
                                            alt_gam_id=alt_gam_id,
                                            alt_gai_id=alt_gai_id,
                                            genotype_vcf_id=genotype_vcf_id,
                                            genotype_tbi_id=genotype_tbi_id,
                                            id_ranges_id=id_ranges_id,
                                            cores=context.config.call_chunk_cores,
                                            memory=context.config.call_chunk_mem,
                                            disk=context.config.call_chunk_disk)
        call_job = chunk_job.addFollowOnJobFn(run_chunked_calling, context, chunk_job.rv(0),
                                              genotype, recall, snarls_id, pack_support, old_call,
                                              chunk_job.rv(1),
                                              cores=context.config.misc_cores,
                                              memory=context.config.misc_mem,
                                              disk=context.config.misc_disk)
        vcf_ids.append(call_job.rv(0))
        tbi_ids.append(call_job.rv(1))
        call_timers_lists.append(call_job.rv(2))

    if not out_name:
        out_name = sample_name

    return child_job.addFollowOnJobFn(run_concat_vcfs, context, out_name, vcf_ids, tbi_ids,
                                      write_to_outstore=True,
                                      call_timers_lists=call_timers_lists,
                                      cores=context.config.call_chunk_cores,
                                      memory=context.config.call_chunk_mem,
                                      disk=context.config.call_chunk_disk).rv()
def run_sim(job, context, num_reads, gam, fastq_out, seed, sim_chunks, xg_file_ids, xg_annot_file_id,
            tag_bed_ids=[], paths=[], drop_contigs_matching=[], fastq_id=None, out_name=None,
            validate=False):
    """
    run a bunch of simulation child jobs, merge up their output as a follow on
    """
    sim_out_id_infos = []

    # no seed specified, we choose one at random
    if seed is None:
        seed = random.randint(0, 2147483647)
        RealtimeLogger.info('No seed specified, choosing random value = {}'.format(seed))

    # encapsulate follow-on
    child_job = Job()
    job.addChild(child_job)

    # we can have more than one xg file if we've split our input graphs up
    # into haplotypes
    for xg_i, xg_file_id in enumerate(xg_file_ids):
        # integer division: read counts must stay whole, with the remainder added to the last file
        file_reads = num_reads // len(xg_file_ids)
        if xg_file_id == xg_file_ids[-1]:
            file_reads += num_reads % len(xg_file_ids)

        # Define a seed base for this set of chunks, leaving space for each chunk before the next seed base
        seed_base = seed + xg_i * sim_chunks

        # each element is either reads_chunk_id or (gam_chunk_id, true_pos_chunk_id)
        # if --gam not specified
        for chunk_i in range(sim_chunks):
            chunk_reads = file_reads // sim_chunks
            if chunk_i == sim_chunks - 1:
                chunk_reads += file_reads % sim_chunks
            sim_out_id_info = child_job.addChildJobFn(run_sim_chunk, context, gam, seed_base, xg_file_id,
                                                      xg_annot_file_id, chunk_reads, chunk_i, xg_i,
                                                      tag_bed_ids=tag_bed_ids,
                                                      paths=paths,
                                                      drop_contigs_matching=drop_contigs_matching,
                                                      fastq_id=fastq_id,
                                                      validate=validate,
                                                      cores=context.config.sim_cores,
                                                      memory=context.config.sim_mem,
                                                      disk=context.config.sim_disk).rv()
            sim_out_id_infos.append(sim_out_id_info)

    merge_job = child_job.addFollowOnJobFn(run_merge_sim_chunks, context, gam, sim_out_id_infos,
                                           out_name,
                                           cores=context.config.sim_cores,
                                           memory=context.config.sim_mem,
                                           disk=context.config.sim_disk)
    merged_gam_id, true_id = merge_job.rv(0), merge_job.rv(1)

    if fastq_out:
        fastq_job = merge_job.addFollowOnJobFn(run_gam_to_fastq, context, merged_gam_id, False,
                                               out_name=out_name if out_name else 'sim',
                                               out_store=True,
                                               cores=context.config.sim_cores,
                                               memory=context.config.sim_mem,
                                               disk=context.config.sim_disk)
        merged_fq_id = fastq_job.rv(0)

    return merged_gam_id, true_id
def gatk_germline_pipeline(job, samples, config):
    """
    Runs the GATK best practices pipeline for germline SNP and INDEL discovery.

    Steps in Pipeline
        0: Generate and preprocess BAM
            - Uploads processed BAM to output directory
        1: Call Variants using HaplotypeCaller
            - Uploads GVCF
        2: Genotype VCF
            - Uploads VCF
        3: Filter Variants using either "hard filters" or VQSR
            - Uploads filtered VCF

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param list[GermlineSample] samples: List of GermlineSample namedtuples
    :param Namespace config: Input parameters and reference FileStoreIDs
        Requires the following config attributes:
        config.genome_fasta     FileStoreID for reference genome fasta file
        config.genome_fai       FileStoreID for reference genome fasta index file
        config.genome_dict      FileStoreID for reference genome sequence dictionary file
        config.cores            Number of cores for each job
        config.xmx              Java heap size in bytes
        config.suffix           Suffix added to output filename
        config.output_dir       URL or local path to output directory
        config.ssec             Path to key file for SSE-C encryption
        config.joint_genotype   If True, then joint genotype and filter cohort
        config.hc_output        URL or local path to HaplotypeCaller output for testing
    :return: Dictionary of filtered VCF FileStoreIDs
    :rtype: dict
    """
    require(len(samples) > 0, 'No samples were provided!')

    # Get total size of genome reference files. This is used for configuring disk size.
    genome_ref_size = config.genome_fasta.size + config.genome_fai.size + config.genome_dict.size

    # 0: Generate processed BAM and BAI files for each sample
    # group preprocessing and variant calling steps in empty Job instance
    group_bam_jobs = Job()
    gvcfs = {}
    for sample in samples:
        # 0: Generate processed BAM and BAI files for each sample
        get_bam = group_bam_jobs.addChildJobFn(prepare_bam,
                                               sample.uuid,
                                               sample.url,
                                               config,
                                               paired_url=sample.paired_url,
                                               rg_line=sample.rg_line)

        # 1: Generate per sample gvcfs {uuid: gvcf_id}
        # The HaplotypeCaller disk requirement depends on the input bam, bai, the genome reference
        # files, and the output GVCF file. The output GVCF is smaller than the input BAM file.
        hc_disk = PromisedRequirement(lambda bam, bai, ref_size: 2 * bam.size + bai.size + ref_size,
                                      get_bam.rv(0),
                                      get_bam.rv(1),
                                      genome_ref_size)
        get_gvcf = get_bam.addFollowOnJobFn(gatk_haplotype_caller,
                                            get_bam.rv(0),
                                            get_bam.rv(1),
                                            config.genome_fasta,
                                            config.genome_fai,
                                            config.genome_dict,
                                            annotations=config.annotations,
                                            cores=config.cores,
                                            disk=hc_disk,
                                            memory=config.xmx,
                                            hc_output=config.hc_output)

        # Store cohort GVCFs in dictionary
        gvcfs[sample.uuid] = get_gvcf.rv()

        # Upload individual sample GVCF before genotyping to a sample specific output directory
        vqsr_name = '{}{}.g.vcf'.format(sample.uuid, config.suffix)
        get_gvcf.addChildJobFn(output_file_job,
                               vqsr_name,
                               get_gvcf.rv(),
                               os.path.join(config.output_dir, sample.uuid),
                               s3_key_path=config.ssec,
                               disk=PromisedRequirement(lambda x: x.size, get_gvcf.rv()))

    # VQSR requires many variants in order to train a decent model. GATK recommends a minimum of
    # 30 exomes or one large WGS sample:
    # https://software.broadinstitute.org/gatk/documentation/article?id=3225
    filtered_vcfs = {}
    if config.joint_genotype:
        # Need to configure joint genotype in a separate function to resolve promises
        filtered_vcfs = group_bam_jobs.addFollowOnJobFn(joint_genotype_and_filter,
                                                        gvcfs,
                                                        config).rv()

    # If not joint genotyping, then iterate over cohort and genotype and filter individually.
    else:
        for uuid, gvcf_id in gvcfs.iteritems():
            filtered_vcfs[uuid] = group_bam_jobs.addFollowOnJobFn(genotype_and_filter,
                                                                  {uuid: gvcf_id},
                                                                  config).rv()

    job.addChild(group_bam_jobs)
    return filtered_vcfs
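# --- Hedged usage sketch (not part of the original pipeline) -----------------------------------
# A top-level job function like gatk_germline_pipeline is normally driven through Toil's standard
# entry point.  The launcher below is an illustration only: the job-store path, sample URL and
# config attributes are made-up placeholders, and every reference file would first have to be
# imported into the job store so the pipeline sees FileStoreIDs (with .size).  The GermlineSample
# fields simply mirror what the loop above accesses (uuid, url, paired_url, rg_line).
from argparse import Namespace
from collections import namedtuple

from toil.common import Toil
from toil.job import Job

GermlineSample = namedtuple('GermlineSample', 'uuid url paired_url rg_line')

def launch_germline_pipeline():
    options = Job.Runner.getDefaultOptions('./germline-jobstore')  # hypothetical job store
    options.logLevel = 'INFO'
    samples = [GermlineSample('sample1', 'file:///data/sample1.bam', None, None)]  # placeholder
    config = Namespace()  # the attributes listed in the docstring above must be filled in
    with Toil(options) as toil:
        # e.g. config.genome_fasta = toil.importFile('file:///data/ref.fa'), and so on for
        # genome_fai, genome_dict, cores, xmx, suffix, output_dir, ssec, joint_genotype, hc_output
        root = Job.wrapJobFn(gatk_germline_pipeline, samples, config)
        filtered_vcfs = toil.start(root)
    return filtered_vcfs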
def run_chunked_augmenting(job, context, graph_id, graph_basename, gam_id, gam_basename,
                           batch_input=None,
                           all_path_components=False,
                           chunk_paths=[],
                           connected_component_chunking=False,
                           output_format=None,
                           augment_gam=False,
                           min_augment_coverage=None,
                           expected_coverage=None,
                           min_mapq=None, min_baseq=None,
                           to_outstore=False):
    """
    Run a chunking job (if desired), then augment the results
    """
    # base case: only one input
    if batch_input is None:
        # chunk if necessary
        if all_path_components or connected_component_chunking or len(chunk_paths) > 1:
            child_job = Job()
            job.addChild(child_job)
            chunk_job = child_job.addChildJobFn(run_chunking, context,
                                                graph_id=graph_id,
                                                graph_basename=graph_basename,
                                                chunk_paths=chunk_paths,
                                                connected_component_chunking=connected_component_chunking,
                                                output_format=output_format,
                                                gam_id=gam_id,
                                                to_outstore=False,
                                                cores=context.config.chunk_cores,
                                                memory=context.config.chunk_mem,
                                                disk=context.config.chunk_disk)
            batch_input = chunk_job.rv()

            # recurse on chunks
            recurse_job = child_job.addFollowOnJobFn(run_chunked_augmenting, context,
                                                     graph_id=None,
                                                     graph_basename=None,
                                                     gam_id=None,
                                                     gam_basename=None,
                                                     batch_input=batch_input,
                                                     all_path_components=all_path_components,
                                                     chunk_paths=chunk_paths,
                                                     connected_component_chunking=connected_component_chunking,
                                                     output_format=output_format,
                                                     augment_gam=augment_gam,
                                                     min_augment_coverage=min_augment_coverage,
                                                     expected_coverage=expected_coverage,
                                                     min_mapq=min_mapq, min_baseq=min_baseq,
                                                     to_outstore=to_outstore)
            return recurse_job.rv()
        else:
            # phony up chunk output for single input
            batch_input = {'all': [graph_id, graph_basename]}
            if gam_id:
                batch_input['all'] += [gam_id, gam_basename]

    # run the augmenting on each chunk
    assert batch_input
    augment_results = []
    for chunk_name, chunk_results in list(batch_input.items()):
        augment_job = job.addChildJobFn(run_augmenting, context,
                                        graph_id=chunk_results[0],
                                        graph_basename=chunk_results[1],
                                        gam_id=chunk_results[2],
                                        gam_basename=chunk_results[3],
                                        augment_gam=augment_gam,
                                        min_augment_coverage=min_augment_coverage,
                                        expected_coverage=expected_coverage,
                                        min_mapq=min_mapq, min_baseq=min_baseq,
                                        to_outstore=to_outstore,
                                        cores=context.config.augment_cores,
                                        memory=context.config.augment_mem,
                                        disk=context.config.augment_disk)
        augment_results.append((chunk_name, augment_job.rv()))

    return augment_results