def run(self, fileStore):
    self.configNode = ET.parse(fileStore.readGlobalFile(self.project.getConfigID())).getroot()
    self.configWrapper = ConfigWrapper(self.configNode)
    self.configWrapper.substituteAllPredefinedConstantsWithLiterals()

    logger.info("Progressive Up: " + self.event)

    # open up the experiment
    # note that we copy the path into the options here
    experimentFile = fileStore.readGlobalFile(self.project.expIDMap[self.event])
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = fileStore.readGlobalFile(experiment.getConfigID())
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = experiment.getTree()
    seqNames = []
    for node in tree.postOrderTraversal():
        if tree.isLeaf(node):
            name = tree.getName(node)
            seqIDMap[name] = self.project.outputSequenceIDMap[name]
            seqNames.append(name)
    logger.info("Sequences in progressive, %s: %s" % (self.event, seqNames))

    experimentFile = fileStore.getLocalTempFile()
    experiment.writeXML(experimentFile)
    self.options.experimentFileID = fileStore.writeGlobalFile(experimentFile)

    # take union of command line options and config options for hal and reference
    if self.options.buildReference == False:
        refNode = findRequiredNode(configXml, "reference")
        self.options.buildReference = getOptionalAttrib(refNode, "buildReference", bool, False)
    halNode = findRequiredNode(configXml, "hal")
    if self.options.buildHal == False:
        self.options.buildHal = getOptionalAttrib(halNode, "buildHal", bool, False)
    if self.options.buildFasta == False:
        self.options.buildFasta = getOptionalAttrib(halNode, "buildFasta", bool, False)

    # get parameters that cactus_workflow stuff wants
    configFile = fileStore.readGlobalFile(experiment.getConfigID())
    configNode = ET.parse(configFile).getroot()
    workFlowArgs = CactusWorkflowArguments(self.options, experimentFile=experimentFile, configNode=configNode, seqIDMap=seqIDMap)

    # copy over the options so we don't trail them around
    workFlowArgs.buildReference = self.options.buildReference
    workFlowArgs.buildHal = self.options.buildHal
    workFlowArgs.buildFasta = self.options.buildFasta
    workFlowArgs.globalLeafEventSet = self.options.globalLeafEventSet
    if self.options.intermediateResultsUrl is not None:
        # Give the URL prefix a special name for this particular
        # subproblem (by suffixing it with the name of the
        # internal node in the guide tree)
        workFlowArgs.intermediateResultsUrl = self.options.intermediateResultsUrl + '-' + self.event

    # Use the trimming strategy to blast ingroups vs outgroups.
    finalExpWrapper = self.addChild(CactusTrimmingBlastPhase(cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast")).rv()
    logger.info("Going to create alignments and define the cactus tree")

    return finalExpWrapper
def clip_vg(job, options, config, vg_path, vg_id):
    """ run clip-vg """
    work_dir = job.fileStore.getLocalTempDir()
    is_decoy = vg_path == options.decoyGraph
    vg_path = os.path.join(work_dir, os.path.basename(vg_path))
    job.fileStore.readGlobalFile(vg_id, vg_path)
    out_path = vg_path + '.clip'

    cmd = ['clip-vg', vg_path, '-f']
    if options.clipLength is not None and not is_decoy:
        cmd += ['-u', str(options.clipLength)]
    for rs in options.rename:
        cmd += ['-r', rs]
    if options.reference:
        cmd += ['-e', options.reference]

    if getOptionalAttrib(findRequiredNode(config.xmlRoot, "hal2vg"), "includeMinigraph", typeFn=bool, default=False):
        # our vg file has minigraph sequences -- we'll filter them out, along with any nodes
        # that don't appear in a non-minigraph path
        graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
        cmd += ['-d', graph_event]

    # sort while we're at it
    cmd = [cmd, ['vg', 'ids', '-s', '-']]

    cactus_call(parameters=cmd, outfile=out_path)

    # worth it
    cactus_call(parameters=['vg', 'validate', out_path])

    return job.fileStore.writeGlobalFile(out_path)
def minigraph_map_all(job, config, gfa_id, fa_id_map, graph_event, keep_gaf):
    """ top-level job to run the minigraph mapping in parallel, returns paf """

    # hang everything on this job, to self-contain workflow
    top_job = Job()
    job.addChild(top_job)

    mg_cores = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "cpu", typeFn=int, default=1)
    mg_cores = min(mg_cores, cpu_count())

    # doing the paf conversion is more efficient when done separately for each genome. we can get away
    # with doing this if the universal filter (which needs to process everything at once) is disabled
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    paf_per_genome = not getOptionalAttrib(xml_node, "universalMZFilter", float)

    # do the mapping
    gaf_id_map = {}
    paf_id_map = {}
    for event, fa_path_fa_id in fa_id_map.items():
        fa_path = fa_path_fa_id[0]
        fa_id = fa_path_fa_id[1]
        minigraph_map_job = top_job.addChildJobFn(minigraph_map_one, config, event, fa_path, fa_id, gfa_id,
                                                  keep_gaf or not paf_per_genome, paf_per_genome,
                                                  # todo: estimate RAM
                                                  cores=mg_cores,
                                                  disk=5 * (fa_id.size + gfa_id.size))
        gaf_id_map[event] = minigraph_map_job.rv(0)
        paf_id_map[event] = minigraph_map_job.rv(1)

    # convert to paf
    if paf_per_genome:
        paf_job = top_job.addFollowOnJobFn(merge_pafs, paf_id_map)
    else:
        paf_job = top_job.addFollowOnJobFn(merge_gafs_into_paf, config, gaf_id_map)

    if not keep_gaf:
        gaf_id_map = None
    else:
        gaf_id_map = paf_job.addFollowOnJobFn(compress_gafs, gaf_id_map).rv()

    return paf_job.rv(), gaf_id_map
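# --- Illustrative sketch (not from the Cactus source) of the Toil promise pattern used above ---
# A child job returns a tuple and the parent keeps child.rv(0) / child.rv(1); Toil resolves the
# indexed promises once the child has run, including promises nested in the workflow's return
# value. All names below (map_one, map_all, sample names, the "./example-jobstore" path) are
# made up for the example.
from toil.common import Toil
from toil.job import Job

def map_one(job, name):
    # stand-in for minigraph_map_one: pretend to produce a GAF id and a PAF id
    return "{}.gaf".format(name), "{}.paf".format(name)

def map_all(job, names):
    gafs, pafs = {}, {}
    for name in names:
        child = job.addChildJobFn(map_one, name)
        gafs[name] = child.rv(0)   # promise for the first element of the returned tuple
        pafs[name] = child.rv(1)   # promise for the second element
    return gafs, pafs

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./example-jobstore")
    with Toil(options) as toil:
        # prints the two dicts with the promises resolved to the child jobs' return values
        print(toil.start(Job.wrapJobFn(map_all, ["sampleA", "sampleB"])))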
def merge_gafs_into_paf(job, config, gaf_file_id_map, gaf_paths=[]):
    """ Merge GAF alignments into a single PAF, applying some filters """
    work_dir = job.fileStore.getLocalTempDir()
    paf_path = os.path.join(work_dir, "mz_alignments.paf")
    if not gaf_paths:
        for event, gaf_id in gaf_file_id_map.items():
            gaf_paths.append("{}.gaf".format(event))
            job.fileStore.readGlobalFile(gaf_id, os.path.join(work_dir, gaf_paths[-1]))

    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    mzgaf2paf_opts = []
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    # this must be consistent with prependUniqueIDs() in cactus_workflow.py
    mzgaf2paf_opts += ['-p', 'id={}|'.format(graph_event)]
    mz_filter = getOptionalAttrib(xml_node, "universalMZFilter", float)
    if mz_filter:
        mzgaf2paf_opts += ['-u', str(mz_filter)]
    if getOptionalAttrib(xml_node, "nodeBasedUniversal", typeFn=bool, default=False):
        mzgaf2paf_opts += ['-n']
    if getOptionalAttrib(xml_node, "strictUniversal", typeFn=bool, default=False):
        mzgaf2paf_opts += ['-i']
    min_mz = getOptionalAttrib(xml_node, "minMZBlockLength", int)
    if min_mz:
        mzgaf2paf_opts += ['-m', str(min_mz)]
    mapq = getOptionalAttrib(xml_node, "minMAPQ", int)
    if mapq:
        mzgaf2paf_opts += ['-q', str(mapq)]
    gaf_block = getOptionalAttrib(xml_node, "minGAFBlockLength", int)
    if gaf_block:
        mzgaf2paf_opts += ['-b', str(gaf_block)]
    gaf_node = getOptionalAttrib(xml_node, "minGAFNodeLength", int)
    if gaf_node:
        mzgaf2paf_opts += ['-s', str(gaf_node)]
    overlap_filter_len = getOptionalAttrib(xml_node, "minGAFQueryOverlapFilter", int)
    if overlap_filter_len:
        mzgaf2paf_opts += ['-o', str(overlap_filter_len)]

    cactus_call(outfile=paf_path, parameters=["mzgaf2paf"] + gaf_paths + mzgaf2paf_opts)

    return job.fileStore.writeGlobalFile(paf_path)
def export_vg(job, hal_id, configWrapper, doVG, doGFA, checkpointInfo=None, resource_spec=False):
    """ use hal2vg to convert the HAL to vg format """

    if not resource_spec:
        # caller couldn't figure out the resources from the hal_id promise. do that
        # now and try again
        return job.addChildJobFn(export_vg, hal_id, configWrapper, doVG, doGFA, checkpointInfo,
                                 resource_spec=True,
                                 disk=hal_id.size * 3,
                                 memory=hal_id.size * 10).rv()

    work_dir = job.fileStore.getLocalTempDir()
    hal_path = os.path.join(work_dir, "out.hal")
    job.fileStore.readGlobalFile(hal_id, hal_path)

    graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    hal2vg_opts = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "hal2vgOptions", default="")
    if hal2vg_opts:
        hal2vg_opts = hal2vg_opts.split(' ')
    else:
        hal2vg_opts = []
    ignore_events = []
    if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "includeMinigraph", typeFn=bool, default=False):
        ignore_events.append(graph_event)
    if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "includeAncestor", typeFn=bool, default=False):
        ignore_events.append(configWrapper.getDefaultInternalNodePrefix() + '0')
    if ignore_events:
        hal2vg_opts += ['--ignoreGenomes', ','.join(ignore_events)]
    if not getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "hal2vg"), "prependGenomeNames", typeFn=bool, default=True):
        hal2vg_opts += ['--onlySequenceNames']

    vg_path = os.path.join(work_dir, "out.vg")
    cmd = ['hal2vg', hal_path] + hal2vg_opts
    cactus_call(parameters=cmd, outfile=vg_path)

    if checkpointInfo:
        write_s3(vg_path, os.path.splitext(checkpointInfo[1])[0] + '.vg', region=checkpointInfo[0])

    gfa_path = os.path.join(work_dir, "out.gfa.gz")
    if doGFA:
        gfa_cmd = [['vg', 'view', '-g', vg_path], ['gzip']]
        cactus_call(parameters=gfa_cmd, outfile=gfa_path)

        if checkpointInfo:
            write_s3(gfa_path, os.path.splitext(checkpointInfo[1])[0] + '.gfa.gz', region=checkpointInfo[0])

    vg_id = job.fileStore.writeGlobalFile(vg_path) if doVG else None
    gfa_id = job.fileStore.writeGlobalFile(gfa_path) if doGFA else None

    return vg_id, gfa_id
def merge_gafs_into_paf(job, config, gaf_file_ids):
    """ Merge GAF alignments into a single PAF, applying some filters """
    work_dir = job.fileStore.getLocalTempDir()
    paf_path = os.path.join(work_dir, "mz_alignments.paf")
    gaf_paths = []
    for i, gaf_id in enumerate(gaf_file_ids):
        gaf_paths.append("mz_alignment_{}.gaf".format(i))
        job.fileStore.readGlobalFile(gaf_id, os.path.join(work_dir, gaf_paths[-1]))

    xml_node = findRequiredNode(config.xmlRoot, "refgraph")
    mzgaf2paf_opts = []
    mz_filter = getOptionalAttrib(xml_node, "universalMZFilter", float)
    if mz_filter:
        mzgaf2paf_opts += ['-u', str(mz_filter)]
    min_mz = getOptionalAttrib(xml_node, "minMZBlockLength", int)
    if min_mz:
        mzgaf2paf_opts += ['-m', str(min_mz)]
    mapq = getOptionalAttrib(xml_node, "minMAPQ", int)
    if mapq:
        mzgaf2paf_opts += ['-q', str(mapq)]
    gaf_block = getOptionalAttrib(xml_node, "minGAFBlockLength", int)
    if gaf_block:
        mzgaf2paf_opts += ['-b', str(gaf_block)]

    cactus_call(work_dir=work_dir, outfile=paf_path, parameters=["mzgaf2paf"] + gaf_paths + mzgaf2paf_opts)

    # these are big, get rid of them as soon as we can (which is now)
    for gaf_id in gaf_file_ids:
        job.fileStore.deleteGlobalFile(gaf_id)

    return job.fileStore.writeGlobalFile(paf_path)
def split_minimap_fallback(job, options, config, seqIDMap, output_id_map):
    """ take the output table from gather_fas, pull out the ambiguous sequences, remap them
    to the reference, and add them to the events where possible """

    # can't do anything without a reference
    if not options.reference:
        logger.info("Skipping minimap2 fallback as --reference was not specified")
        return None, None

    # todo: also skip if no ambiguous sequences
    ref_path, ref_id = seqIDMap[options.reference]
    mm_mem = ref_id.size * 5
    if seqIDMap[options.reference][0].endswith('.gz'):
        mm_mem *= 4
    mm_index_job = job.addChildJobFn(minimap_index, ref_path, ref_id, disk=ref_id.size * 5, memory=mm_mem)
    mm_map_root_job = Job()
    mm_index_job.addFollowOn(mm_map_root_job)

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    if amb_name not in output_id_map:
        logger.info("Skipping minimap2 fallback as no ambiguous sequences were found")
        return None, None

    # map every ambiguous sequence against the reference in parallel
    paf_ids = []
    ambiguous_seq_id_map = {}
    for event, fa_id in output_id_map[amb_name]['fa'].items():
        paf_job = mm_map_root_job.addChildJobFn(minimap_map, mm_index_job.rv(), event, fa_id, seqIDMap[event][0],
                                                disk=ref_id.size * 3,
                                                memory=mm_mem)
        paf_ids.append(paf_job.rv())
        ambiguous_seq_id_map[event] = (seqIDMap[event][0], fa_id)

    return paf_ids, ambiguous_seq_id_map
def substituteAllDivergenceContolledParametersWithLiterals(self, maxDivergence):
    constants = findRequiredNode(self.xmlRoot, "constants")
    divergences = constants.find("divergences")
    messages = []
    if divergences != None:
        useDefaultDivergences = getOptionalAttrib(divergences, attribName="useDefault", typeFn=bool, default=False)
        def replaceAllDivergenceParameters(node):
            for child in node:
                if child.tag == "divergence":
                    attribName = child.attrib["argName"]
                    arg = child.attrib["default"]
                    divergence = sys.maxint
                    if not useDefaultDivergences:
                        for i in child.attrib.keys():
                            if i in divergences.attrib.keys():
                                j = float(divergences.attrib[i])
                                if j < divergence and j >= maxDivergence:
                                    arg = child.attrib[i]
                                    divergence = j
                    messages.append("Made argument %s=%s in tag %s with divergence threshold of %s for longest path of %s (useDefaultDivergences=%s)" %
                                    (attribName, arg, node.tag, divergence, maxDivergence, useDefaultDivergences))
                    node.attrib[attribName] = arg
                else:
                    replaceAllDivergenceParameters(child)
        replaceAllDivergenceParameters(self.xmlRoot)
    return messages
def substituteAllDivergenceContolledParametersWithLiterals(self, maxDivergence):
    constants = findRequiredNode(self.xmlRoot, "constants")
    divergences = constants.find("divergences")
    messages = []
    if divergences != None:
        useDefaultDivergences = getOptionalAttrib(divergences, attribName="useDefault", typeFn=bool, default=False)
        def replaceAllDivergenceParameters(node):
            for child in node:
                if child.tag == "divergence":
                    attribName = child.attrib["argName"]
                    arg = child.attrib["default"]
                    divergence = sys.maxsize
                    if not useDefaultDivergences:
                        for i in list(child.attrib.keys()):
                            if i in list(divergences.attrib.keys()):
                                j = float(divergences.attrib[i])
                                if j < divergence and j >= maxDivergence:
                                    arg = child.attrib[i]
                                    divergence = j
                    messages.append("Made argument %s=%s in tag %s with divergence threshold of %s for longest path of %s (useDefaultDivergences=%s)" %
                                    (attribName, arg, node.tag, divergence, maxDivergence, useDefaultDivergences))
                    node.attrib[attribName] = arg
                else:
                    replaceAllDivergenceParameters(child)
        replaceAllDivergenceParameters(self.xmlRoot)
    return messages
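# --- Illustrative toy (not part of the Cactus source) ---
# A standalone re-implementation of the selection rule above, showing which literal a
# <divergence> element resolves to for a given maxDivergence: among the attributes that also
# name an entry in <divergences>, pick the one with the smallest divergence level that is
# still >= maxDivergence, otherwise fall back to "default". The XML content here is made up.
import sys
import xml.etree.ElementTree as ET

def _divergence_example():
    toy = ET.fromstring("""
    <constants>
      <divergences one="0.1" two="0.25" three="0.5"/>
      <caf>
        <divergence argName="gapGamma" default="0.9" one="0.2" three="0.4"/>
      </caf>
    </constants>
    """)
    divergences = toy.find("divergences")
    elem = toy.find("caf/divergence")

    def resolve(maxDivergence):
        arg = elem.attrib["default"]
        best = sys.maxsize
        for key, value in elem.attrib.items():
            if key in divergences.attrib:
                j = float(divergences.attrib[key])
                # smallest divergence level that still covers the tree's longest path
                if j < best and j >= maxDivergence:
                    arg, best = value, j
        return arg

    print(resolve(0.3))   # -> "0.4" (chosen via "three"=0.5)
    print(resolve(0.05))  # -> "0.2" (chosen via "one"=0.1)

if __name__ == "__main__":
    _divergence_example()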
def combine_splits(job, config, seq_id_map, original_id_map, remap_id_map):
    """ combine the output of two runs of gather_fas: the first is the contigs determined by minigraph,
    the second from remapping the ambiguous contigs with minimap2 """
    root_job = Job()
    job.addChild(root_job)

    # no ambiguous remappings, nothing to do
    if not remap_id_map or len(remap_id_map) == 0:
        return original_id_map

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")

    # note: we're not handling the case where 100% of a given reference contig's sequences are ambiguous
    for ref_contig in original_id_map:
        if ref_contig == amb_name:
            # for ambiguous sequence, we overwrite and don't combine
            if ref_contig in remap_id_map:
                original_id_map[ref_contig] = remap_id_map[ref_contig]
            else:
                original_id_map[ref_contig] = None
        elif ref_contig in remap_id_map:
            total_size = 0
            for event in original_id_map[ref_contig]['fa']:
                total_size += original_id_map[ref_contig]['fa'][event].size
                if event in remap_id_map[ref_contig]['fa']:
                    total_size += remap_id_map[ref_contig]['fa'][event].size
            original_id_map[ref_contig] = root_job.addChildJobFn(combine_ref_contig_splits,
                                                                 original_id_map[ref_contig],
                                                                 remap_id_map[ref_contig],
                                                                 disk=total_size * 4).rv()

    return root_job.addFollowOnJobFn(combine_paf_splits, seq_id_map, original_id_map, remap_id_map,
                                     amb_name, graph_event).rv()
def substituteAllPredefinedConstantsWithLiterals(self):
    constants = findRequiredNode(self.xmlRoot, "constants")
    defines = constants.find("defines")
    def replaceAllConstants(node, defines):
        for attrib in node.attrib:
            if node.attrib[attrib] in defines.attrib:
                node.attrib[attrib] = defines.attrib[node.attrib[attrib]]
        for child in node:
            replaceAllConstants(child, defines)
    if defines != None:
        replaceAllConstants(self.xmlRoot, defines)
        constants.remove(defines)
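# --- Illustrative toy (not from the Cactus source) ---
# Shows the effect of the substitution above on a made-up config fragment: any attribute whose
# value names an entry in <constants><defines .../> is replaced by that entry's literal value,
# and the <defines> element is then removed from the tree.
import xml.etree.ElementTree as ET

def _constants_example():
    root = ET.fromstring("""
    <cactusWorkflowConfig>
      <constants>
        <defines bigChunk="100000000"/>
      </constants>
      <preprocessor chunkSize="bigChunk"/>
    </cactusWorkflowConfig>
    """)
    constants = root.find("constants")
    defines = constants.find("defines")

    def replaceAllConstants(node, defines):
        for attrib in node.attrib:
            if node.attrib[attrib] in defines.attrib:
                node.attrib[attrib] = defines.attrib[node.attrib[attrib]]
        for child in node:
            replaceAllConstants(child, defines)

    if defines is not None:
        replaceAllConstants(root, defines)
        constants.remove(defines)

    print(root.find("preprocessor").attrib["chunkSize"])  # -> "100000000"

if __name__ == "__main__":
    _constants_example()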
def minigraph_map_one(job, config, event_name, fa_path, fa_file_id, gfa_file_id, gaf_output, paf_output):
    """ Run minigraph to map a Fasta file to a GFA graph, producing a GAF output """
    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    fa_dir = job.fileStore.getLocalTempDir()
    fa_path = os.path.join(fa_dir, os.path.basename(fa_path))
    gaf_path = os.path.join(work_dir, "{}.gaf".format(event_name))

    job.fileStore.readGlobalFile(gfa_file_id, gfa_path)
    job.fileStore.readGlobalFile(fa_file_id, fa_path)

    if fa_path.endswith('.gz'):
        fa_path = fa_path[:-3]
        cactus_call(parameters=['gzip', '-d', '-c', fa_path + '.gz'], outfile=fa_path)

    # prepend the unique id before mapping so the GAF has cactus-compatible event names
    fa_path = prependUniqueIDs({event_name: fa_path}, work_dir, eventNameAsID=True)[event_name]

    # parse options from the config
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    minigraph_opts = getOptionalAttrib(xml_node, "minigraphMapOptions", str, default="")
    opts_list = minigraph_opts.split()
    # add required options if not present
    if "-S" not in opts_list:
        opts_list += ["-S"]
    if "--write-mz" not in opts_list:
        opts_list += ["--write-mz"]
    if "-t" not in opts_list:
        opts_list += ["-t", str(int(job.cores))]

    cmd = ["minigraph", os.path.basename(gfa_path), os.path.basename(fa_path), "-o", os.path.basename(gaf_path)] + opts_list

    mask_filter = getOptionalAttrib(xml_node, "maskFilter", int, default=-1)
    if mask_filter >= 0:
        cmd[2] = '-'
        cmd = [['cactus_softmask2hardmask', os.path.basename(fa_path), '-m', str(mask_filter)], cmd]

    cactus_call(work_dir=work_dir, parameters=cmd)

    paf_id, gaf_id = None, None
    if paf_output:
        # optional gaf->paf step.  we are not piping directly out of minigraph because mzgaf2paf's overlap filter
        # (which is usually on) requires 2 passes so it won't read stdin when it's enabled
        paf_id = merge_gafs_into_paf(job, config, None, [gaf_path])
    if gaf_output:
        gaf_id = job.fileStore.writeGlobalFile(gaf_path)

    return gaf_id, paf_id
def preprocess_input_sequences(job, configWrapper, project, cactusWorkflowArguments, pafMaskFilter=None, referenceEvent=None):
    """ update the workflow arguments in place with unzipped versions of any input fastas whose paths end in .gz;
    if there's a pafMaskFilter, softmasked regions are extracted from each sequence into a bed.  Note that
    the beds will need unique ids prepended just like the fastas... """
    head_job = Job()
    job.addChild(head_job)
    graph_event = getOptionalAttrib(findRequiredNode(configWrapper.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    exp = cactusWorkflowArguments.experimentWrapper
    ingroupsAndOriginalIDs = [(g, exp.getSequenceID(g)) for g in exp.getGenomesWithSequence() if g not in exp.getOutgroupGenomes()]
    mask_bed_ids = {}
    events = []
    updated_seq_ids = []
    for g, seqID in ingroupsAndOriginalIDs:
        zipped = project.inputSequenceMap[g].endswith('.gz')
        do_filter = pafMaskFilter and g not in [graph_event, referenceEvent]
        if zipped or do_filter:
            prepend_id_job = head_job.addChildJobFn(preprocess_input_sequence, g, seqID, project.inputSequenceMap[g], pafMaskFilter)
            updated_seq_id, mask_bed_id = prepend_id_job.rv(0), prepend_id_job.rv(1)
            if zipped:
                events.append(g)
                updated_seq_ids.append(updated_seq_id)
            if do_filter:
                mask_bed_ids[g] = mask_bed_id

    return head_job.addFollowOnJobFn(resolve_id_promises, events, updated_seq_ids, cactusWorkflowArguments).rv(), mask_bed_ids
def minigraph_map_one(job, config, event_name, fa_file_id, gfa_file_id, ignore_softmasked):
    """ Run minigraph to map a Fasta file to a GFA graph, producing a GAF output """
    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "minigraph.gfa")
    fa_path = os.path.join(work_dir, "{}.fa".format(event_name))
    gaf_path = os.path.join(work_dir, "{}.gaf".format(event_name))

    job.fileStore.readGlobalFile(gfa_file_id, gfa_path)
    job.fileStore.readGlobalFile(fa_file_id, fa_path)

    # parse options from the config
    xml_node = findRequiredNode(config.xmlRoot, "graphmap")
    minigraph_opts = getOptionalAttrib(xml_node, "minigraphMapOptions", str, default="")
    opts_list = minigraph_opts.split()
    # add required options if not present
    if "-S" not in opts_list:
        opts_list += ["-S"]
    if "--write-mz" not in opts_list:
        opts_list += ["--write-mz"]
    if "-t" not in opts_list:
        opts_list += ["-t", str(int(job.cores))]

    cmd = ["minigraph", os.path.basename(gfa_path), os.path.basename(fa_path), "-o", os.path.basename(gaf_path)] + opts_list

    if ignore_softmasked:
        cmd[2] = '-'
        cmd = [['cactus_softmask2hardmask', os.path.basename(fa_path)], cmd]

    # todo: pipe into gzip directly as these files can be huge!!! (requires gzip support be added to mzgaf2paf)
    cactus_call(work_dir=work_dir, parameters=cmd)

    return job.fileStore.writeGlobalFile(gaf_path)
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile", type=str, nargs='?', default=None, help="Input Seq file")
    parser.add_argument("outSeqFile", type=str, nargs='?', default=None, help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--inputNames", nargs='*', help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument("--inPaths", nargs='*', help='Space-separated list of input fasta paths (to be used in place of --inSeqFile)')
    parser.add_argument("--outPaths", nargs='*', help='Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)')
    parser.add_argument("--maskAlpha", action='store_true', help='Use dna-brnn instead of lastz for repeatmasking')
    parser.add_argument("--clipAlpha", action='store_true', help='use dna-brnn instead of lastz for repeatmasking. Also, clip sequence using given minimum length instead of softmasking')
    parser.add_argument("--ignore", nargs='*', help='Space-separated list of genomes from inSeqFile to ignore', default=[])
    parser.add_argument("--maskPAF", type=str, help='Incorporate coverage gaps from given PAF when masking.  Only implemented for dna-brnn masking')
    parser.add_argument("--brnnCores", type=int, help='Specify number of cores for each dna-brnn job (overriding default value from the config)')
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"], help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # we have two modes: operate directly on paths or rely on the seqfiles.  they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError('--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths')
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile:
            raise RuntimeError('--inPaths must be used in conjunction with --outPaths and not with --inSeqFile nor --outSeqFile')
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError('--inPaths and --outPaths must have the same number of arguments')
    else:
        raise RuntimeError('--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input')
    if options.maskAlpha and options.clipAlpha:
        raise RuntimeError('--maskAlpha and --clipAlpha cannot be used together')
    if options.clipAlpha:
        options.maskAlpha = True
    if options.maskPAF and not options.inputNames and not options.inSeqFile:
        raise RuntimeError('--maskPAF requires event names specified either with an input seqfile or with --inputNames')
    if options.ignore and options.clipAlpha is None:
        raise RuntimeError('--ignore can only be used with --clipAlpha')

    inSeqPaths = []
    outSeqPaths = []
    inNames = options.inputNames
    eventNames = []

    #load cactus config
    configNode = ET.parse(options.configFile).getroot()
    #we never want to preprocess minigraph sequences
    graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    options.ignore.append(graph_event)

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        if not inNames:
            inNames = [inSeqFile.tree.getName(node) for node in inSeqFile.tree.getLeaves()]

        for inName in inNames:
            if inName in options.ignore:
                # "convenience" functionality: we let the --ignore option update the output seqfile
                # to reflect the fact that we're not touching the original input
                outSeqFile.pathMap[inName] = inSeqFile.pathMap[inName]
                continue
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                raise RuntimeError('{} not present in input and output Seq files'.format(inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                try:
                    os.makedirs(outPath)
                except:
                    pass
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in os.listdir(inPath)]
                outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in os.listdir(inPath)]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]
                eventNames.append(inName)

        if options.ignore:
            # see comment above
            with open(options.outSeqFile, 'w') as outSF:
                outSF.write(str(outSeqFile))

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None, configFile=options.configFile, inputSequences=inSeqPaths, toil=toil,
                      restart=options.restart, outputSequences=outSeqPaths, maskAlpha=options.maskAlpha,
                      clipAlpha=options.clipAlpha, maskPAF=options.maskPAF, inputEventNames=eventNames,
                      brnnCores=options.brnnCores)
def turnAllModesOn(self):
    """Switches on check, normalisation etc. to use when debugging/testing """
    findRequiredNode(self.xmlRoot, "check").attrib["runCheck"] = "1"
    findRequiredNode(self.xmlRoot, "normal").attrib["iterations"] = "2"
def combine_paf_splits(job, options, config, seq_id_map, original_id_map, orig_amb_entry, remap_id_map, amb_name, graph_event):
    """ pull out PAF entries for contigs that were ambiguous in the first round but assigned by minimap2,
    then add them to the chromosome PAFs """

    if amb_name not in original_id_map:
        return original_id_map

    work_dir = job.fileStore.getLocalTempDir()
    amb_paf_path = os.path.join(work_dir, 'amb.paf')
    job.fileStore.readGlobalFile(orig_amb_entry['paf'], amb_paf_path, mutable=True)

    # use_minimap_paf = True: return the minimap2 mappings for ambiguous contigs in final output
    # use_minimap_paf = False: ambiguous contigs are assigned to chromosomes based on minimap2, but their minigraph
    #                          alignments are returned in the final paf
    use_minimap_paf = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "useMinimapPAF", typeFn=bool, default=False)

    # it's simpler not to support both codepaths right now.  the main issue is that -u can cause contigs to be split,
    # in which case they get renamed, so pulling them in from the existing PAF would require a pass to resolve all the
    # offsets
    if not use_minimap_paf and '-u' in getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remapSplitOptions", default=""):
        raise RuntimeError("useMinimapPAF must be set when -u present in remapSplitOptions")

    for ref_contig in remap_id_map.keys():
        if ref_contig != amb_name and ref_contig in original_id_map:
            # make a set of all minigraph nodes in this contig
            mg_fa_path = os.path.join(work_dir, '{}.{}.fa'.format(graph_event, ref_contig))
            if seq_id_map[graph_event][0].endswith('.gz'):
                mg_fa_path += '.gz'
            mg_contigs_path = os.path.join(work_dir, '{}.contigs'.format(graph_event))
            job.fileStore.readGlobalFile(original_id_map[ref_contig]['fa'][graph_event], mg_fa_path, mutable=True)
            cactus_call(parameters=[['zcat' if mg_fa_path.endswith('.gz') else 'cat', mg_fa_path],
                                    ['grep', '>'], ['cut', '-c', '2-']],
                        outfile=mg_contigs_path)
            mg_contig_set = set()
            with open(mg_contigs_path, 'r') as mg_contigs_file:
                for line in mg_contigs_file:
                    mg_contig_set.add('id={}|{}'.format(graph_event, line.strip()))
            os.remove(mg_fa_path)
            os.remove(mg_contigs_path)

            # make a set of all the query contigs that we want to remove from ambiguous and add to this contig
            query_contig_set = set()
            for event in remap_id_map[ref_contig]['fa']:
                if event != graph_event and remap_id_map[ref_contig]['fa'][event].size > 0:
                    # read the contigs assigned to this sample for this chromosome by scanning fasta headers
                    tmp_fa_path = os.path.join(work_dir, 'tmp.fa')
                    if seq_id_map[event][0].endswith('.gz'):
                        tmp_fa_path += '.gz'
                    if os.path.isfile(tmp_fa_path):
                        os.remove(tmp_fa_path)
                    job.fileStore.readGlobalFile(remap_id_map[ref_contig]['fa'][event], tmp_fa_path, mutable=True)
                    contigs_path = os.path.join(work_dir, '{}.contigs'.format(event))
                    cactus_call(parameters=[['zcat' if tmp_fa_path.endswith('.gz') else 'cat', tmp_fa_path],
                                            ['grep', '>'], ['cut', '-c', '2-']],
                                outfile=contigs_path)
                    # add them to the grep
                    with open(contigs_path, 'r') as contigs_file:
                        for line in contigs_file:
                            query_contig_set.add('id={}|{}'.format(event, line.strip()))

            if query_contig_set:
                # pull out remapped contigs into this path
                new_contig_path = os.path.join(work_dir, '{}.remap.paf'.format(ref_contig))
                do_append = False
                if ref_contig in original_id_map and 'paf' in original_id_map[ref_contig]:
                    job.fileStore.readGlobalFile(original_id_map[ref_contig]['paf'], new_contig_path, mutable=True)
                    do_append = True
                # make an updated ambiguous paf with the contigs removed in this path
                temp_contig_path = os.path.join(work_dir, amb_paf_path + '.temp.remove')
                with open(new_contig_path, 'a' if do_append else 'w') as new_contig_file, \
                     open(amb_paf_path, 'r') as amb_paf_file, \
                     open(temp_contig_path, 'w') as temp_contig_file:
                    # scan the ambiguous paf from minigraph
                    for line in amb_paf_file:
                        toks = line.split('\t')
                        if len(toks) > 5 and toks[0] in query_contig_set:
                            if toks[5] in mg_contig_set and not use_minimap_paf:
                                # move the contig if both the query and target belong to reference contig
                                new_contig_file.write(line)
                            else:
                                # leave the contig in ambiguous
                                temp_contig_file.write(line)
                    if use_minimap_paf:
                        # if we're taking the contigs from minigraph, append them here (as they weren't added in
                        # the loop above)
                        minimap_paf_path = os.path.join(work_dir, '{}.minimap.paf'.format(ref_contig))
                        job.fileStore.readGlobalFile(remap_id_map[ref_contig]['paf'], minimap_paf_path)
                        with open(minimap_paf_path, 'r') as minimap_paf_file:
                            for line in minimap_paf_file:
                                toks = line.split('\t')
                                if len(toks) > 5:
                                    toks[5] = 'id={}|{}'.format(options.reference, toks[5])
                                new_contig_file.write('\t'.join(toks))

                # update the map
                original_id_map[ref_contig]['paf'] = job.fileStore.writeGlobalFile(new_contig_path)

                # update the ambiguous paf
                cactus_call(parameters=['mv', temp_contig_path, amb_paf_path])

    # update the ambiguous paf
    if amb_name in original_id_map and original_id_map[amb_name]:
        original_id_map[amb_name]['paf'] = job.fileStore.writeGlobalFile(amb_paf_path)
    else:
        assert os.path.getsize(amb_paf_path) == 0

    return original_id_map
def split_gfa(job, config, gfa_id, paf_id, ref_contigs, other_contig, reference_event):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """
    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    job.fileStore.readGlobalFile(gfa_id, gfa_path)
    job.fileStore.readGlobalFile(paf_id, paf_path)

    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryCoverage", default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryUniqueness", default="0")
    amb_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    cmd = ['rgfa-split',
           '-i', 'id={}|'.format(mg_id),
           '-G',
           '-g', gfa_path,
           '-p', paf_path,
           '-b', out_prefix,
           '-n', query_coverage,
           '-Q', query_uniqueness,
           '-a', amb_event]
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [".gfa", ".paf", ".fa_contigs"]:
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(os.path.join(work_dir, out_name))

    return output_id_map
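# --- Illustrative only (not from the Cactus source) ---
# The approximate shape of the nested dict returned above, with made-up contig names and
# placeholder strings standing in for the Toil FileIDs written by writeGlobalFile. Per the
# export logic elsewhere in this module, the ambiguous "contig" has no GFA entry.
#
# {
#     "chr1": {
#         "gfa":        <FileID for split_chr1.gfa>,
#         "paf":        <FileID for split_chr1.paf>,
#         "fa_contigs": <FileID for split_chr1.fa_contigs>,
#     },
#     "_AMBIGUOUS_": {
#         "paf":        <FileID for split__AMBIGUOUS_.paf>,
#         "fa_contigs": <FileID for split__AMBIGUOUS_.fa_contigs>,
#     },
# }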
def getDoTrimStrategy(self):
    trimBlastNode = findRequiredNode(self.xmlRoot, "trimBlast")
    if "doTrimStrategy" in trimBlastNode.attrib:
        return trimBlastNode.attrib["doTrimStrategy"] == "1"
    return False
def cactusPrepare(options, project):
    """ annotate a SeqFile with ancestral names as well as paths for output sequences."""

    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    if not options.wdl:
        # prepare output sequence directory
        # todo: support remote (ie s3) output directory
        try:
            os.makedirs(options.outDir)
        except:
            pass
        if not os.path.isdir(options.outDir):
            raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outDir))
        if not os.access(options.outDir, os.W_OK):
            logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outDir))

    if options.preprocessOnly or options.gpu:
        if options.preprocessOnly:
            # hack the configfile to skip preprocessing and write it to the output dir
            config.removePreprocessors()
        if options.gpu:
            # hack the configfile to toggle on gpu lastz
            cafNode = findRequiredNode(config.xmlRoot, "caf")
            cafNode.attrib["gpuLastz"] = "true"
            # realigning doesn't mix well with lastz so we make sure it's off
            # https://github.com/ComparativeGenomicsToolkit/cactus/issues/271
            cafNode.attrib["realign"] = "0"
        options.configFile = os.path.join(options.outDir, 'config-prepared.xml')
        sys.stderr.write("configuration saved in {}\n".format(options.configFile))
        config.writeXML(options.configFile)

    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml") and not options.wdl:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        if leaf or (not leaf and name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outDir, os.path.basename(out_basename))
            if options.wdl:
                # uniquify name in wdl to prevent collisions
                outSeqFile.pathMap[name] += '.pp'

    # write the output
    if options.outSeqFile:
        with open(options.outSeqFile, 'w') as out_sf:
            out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, seqFile, outSeqFile))
def runCactusGraphMap(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides.  this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile).  one way to fix would be to add support
            # for s3 paths and force wdl to use it.  a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(configNode, "refgraph"), "assemblyName", default="__MINIGRAPH_SEQUENCES__")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)
            logger.info("Genomes for graphmap, {}".format(seqFile.pathMap))

            if not options.outputFasta and graph_event not in seqFile.pathMap:
                raise RuntimeError("{} assembly not found in seqfile so it must be specified with --outputFasta".format(graph_event))

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            for genome, seq in seqFile.pathMap.items():
                if genome != graph_event:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    seqIDMap[genome] = toil.importFile(seq)

            # run the workflow
            paf_id, gfa_fa_id = toil.start(Job.wrapJobFn(minigraph_workflow, options, config, seqIDMap, gfa_id, graph_event))

            #export the paf
            toil.exportFile(paf_id, makeURL(options.outputPAF))
            if gfa_fa_id:
                toil.exportFile(gfa_fa_id, makeURL(options.outputFasta))

            # update the input seqfile (in place!)
            add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta), graph_event)
def make_align_job(options, toil):
    options.cactusDir = getTempDirectory()

    # apply path overrides.  this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile).  one way to fix would be to add support
    # for s3 paths and force wdl to use it.  a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames, options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError("Genome specified with --acyclic, {}, not found in tree leaves".format(options.acyclic))

    #to be consistent with all-in-one cactus, we make sure the project
    #isn't limiting itself to the subtree (todo: parameterize so root can
    #be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    #Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options, proj_options.configFile, ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName, '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()
    seqIDMap = dict()

    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    #import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)

            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    #import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)

    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["alignmentFilter"] = "singleCopyEvent:{}".format(options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignmentMaskFilter"] = str(options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file, pg_file, region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(pg_file))

    workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile, configNode=configNode, seqIDMap=project.inputSequenceIDMap)

    #import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(makeURL(get_input_path('.secondary')))
        except:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(toil.importFile(makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job
def disableRecoverableChains(self):
    """Make sure the filter is off in caf """
    cafNode = findRequiredNode(self.xmlRoot, "caf")
    cafNode.attrib["removeRecoverableChains"] = "0"
def disableCafMegablockFilter(self):
    """Make sure the filter is off in caf """
    cafNode = findRequiredNode(self.xmlRoot, "caf")
    cafNode.attrib["minimumBlockHomologySupport"] = "0"
    cafNode.attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
def split_gfa(job, config, gfa_id, paf_ids, ref_contigs, other_contig, reference_event, mask_bed_id):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """

    if not paf_ids:
        # we can bypass when, ex, doing the second pass on ambiguous sequences but none are present
        return [None, None]

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    bed_path = os.path.join(work_dir, "mask.bed")
    log_path = os.path.join(work_dir, "split.log")
    if (mask_bed_id):
        job.fileStore.readGlobalFile(mask_bed_id, bed_path)

    if gfa_id:
        job.fileStore.readGlobalFile(gfa_id, gfa_path)

    paf_paths = []
    for i, paf_id in enumerate(paf_ids):
        paf_paths.append('{}.{}'.format(paf_path, i) if len(paf_ids) > 1 else paf_path)
        job.fileStore.readGlobalFile(paf_id, paf_paths[-1])
    if len(paf_paths) > 1:
        catFiles(paf_paths, paf_path)

    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryCoverage", default="0")
    small_query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallCoverage", default="0")
    small_coverage_threshold = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallThreshold", default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryUniqueness", default="0")
    max_gap = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "maxGap", default="0")
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    cmd = ['rgfa-split',
           '-p', paf_path,
           '-b', out_prefix,
           '-n', query_coverage,
           '-N', small_query_coverage,
           '-T', small_coverage_threshold,
           '-Q', query_uniqueness,
           '-P', max_gap,
           '-a', amb_name,
           '-L', log_path]
    if gfa_id:
        cmd += ['-g', gfa_path, '-G']
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    if mask_bed_id:
        cmd += ['-B', bed_path]
    min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ")
    if min_mapq:
        cmd += ['-A', min_mapq]
    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [".gfa", ".paf", ".fa_contigs"]:
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(os.path.join(work_dir, out_name))

    return output_id_map, job.fileStore.writeGlobalFile(log_path)
def export_split_data(toil, input_seq_id_map, output_id_map, split_log_ids, output_dir, config):
    """ download all the split data locally """

    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    chrom_file_map = {}

    for ref_contig in output_id_map.keys():
        ref_contig_path = os.path.join(output_dir, ref_contig)
        if not os.path.isdir(ref_contig_path) and not ref_contig_path.startswith('s3://'):
            os.makedirs(ref_contig_path)

        # GFA: <output_dir>/<contig>/<contig>.gfa
        if 'gfa' in output_id_map[ref_contig]:
            # we do this check because no gfa is made for the ambiguous sequences "contig"
            toil.exportFile(output_id_map[ref_contig]['gfa'], makeURL(os.path.join(ref_contig_path, '{}.gfa'.format(ref_contig))))

        # PAF: <output_dir>/<contig>/<contig>.paf
        paf_path = os.path.join(ref_contig_path, '{}.paf'.format(ref_contig))
        toil.exportFile(output_id_map[ref_contig]['paf'], makeURL(paf_path))

        # Fasta: <output_dir>/<contig>/fasta/<event>_<contig>.fa ..
        seq_file_map = {}
        for event, ref_contig_fa_id in output_id_map[ref_contig]['fa'].items():
            fa_base = os.path.join(ref_contig_path, 'fasta')
            if not os.path.isdir(fa_base) and not fa_base.startswith('s3://'):
                os.makedirs(fa_base)
            fa_path = makeURL(os.path.join(fa_base, '{}_{}.fa'.format(event, ref_contig)))
            if input_seq_id_map[event][0].endswith('.gz'):
                fa_path += '.gz'
            seq_file_map[event] = fa_path
            toil.exportFile(ref_contig_fa_id, fa_path)

        # Seqfile: <output_dir>/seqfiles/<contig>.seqfile
        seq_file_path = os.path.join(output_dir, 'seqfiles', '{}.seqfile'.format(ref_contig))
        if seq_file_path.startswith('s3://'):
            seq_file_temp_path = getTempFile()
        else:
            seq_file_temp_path = seq_file_path
            if not os.path.isdir(os.path.dirname(seq_file_path)):
                os.makedirs(os.path.dirname(seq_file_path))
        with open(seq_file_temp_path, 'w') as seq_file:
            for event, fa_path in seq_file_map.items():
                # cactus can't handle empty fastas.  if there are no sequences for a sample for this
                # contig, just don't add it.
                if output_id_map[ref_contig]['fa'][event].size > 0:
                    seq_file.write('{}\t{}\n'.format(event, fa_path))
        if seq_file_path.startswith('s3://'):
            write_s3(seq_file_temp_path, seq_file_path)

        # Top-level seqfile
        chrom_file_map[ref_contig] = seq_file_path, paf_path

    # Chromfile: <output_dir>/chromfile.txt
    chrom_file_path = os.path.join(output_dir, 'chromfile.txt')
    if chrom_file_path.startswith('s3://'):
        chrom_file_temp_path = getTempFile()
    else:
        chrom_file_temp_path = chrom_file_path
    with open(chrom_file_temp_path, 'w') as chromfile:
        for ref_contig, seqfile_paf in chrom_file_map.items():
            if ref_contig != amb_name:
                seqfile, paf = seqfile_paf[0], seqfile_paf[1]
                if seqfile.startswith('s3://'):
                    # no use having an absolute s3 reference as cactus-align requires seqfiles passed locally
                    seqfile = 'seqfiles/{}'.format(os.path.basename(seqfile))
                chromfile.write('{}\t{}\t{}\n'.format(ref_contig, seqfile, paf))
    if chrom_file_path.startswith('s3://'):
        write_s3(chrom_file_temp_path, chrom_file_path)

    toil.exportFile(split_log_ids[0], makeURL(os.path.join(output_dir, 'minigraph.split.log')))
    if split_log_ids[1]:
        toil.exportFile(split_log_ids[1], makeURL(os.path.join(output_dir, 'minimap2.ambiguous.split.log')))
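# --- For orientation only ---
# A rough sketch of the on-disk layout produced by the exports above, assembled from the path
# patterns in the comments. Contig and sample names ("chr1", "sampleA") are made up.
#
# <output_dir>/
#   chromfile.txt                      # "<contig>\t<seqfile>\t<paf>" per non-ambiguous contig
#   seqfiles/chr1.seqfile              # "<event>\t<fasta path>" per sample with sequence
#   chr1/chr1.gfa
#   chr1/chr1.paf
#   chr1/fasta/sampleA_chr1.fa[.gz]
#   minigraph.split.log
#   minimap2.ambiguous.split.log       # only present when the minimap2 fallback ran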
def runCactusGraphMapSplit(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            #load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # load up the contigs if any
            ref_contigs = set(options.refContigs)
            # todo: use import?
            if options.refContigsFile:
                with open(options.refContigsFile, 'r') as rc_file:
                    for line in rc_file:
                        if len(line.strip()):
                            ref_contigs.add(line.strip().split()[0])

            if options.otherContig:
                assert options.otherContig not in ref_contigs

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"), "assemblyName", default="_MINIGRAPH_")

            # load the seqfile
            seqFile = SeqFile(options.seqFile)

            #import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            #import the paf
            paf_id = toil.importFile(makeURL(options.graphmapPAF))

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = set([seqFile.tree.getName(node) for node in seqFile.tree.getLeaves()])

            if graph_event not in leaves:
                raise RuntimeError("Minigraph name {} not found in seqfile".format(graph_event))
            if options.reference and options.reference not in leaves:
                raise RuntimeError("Name given with --reference {} not found in seqfile".format(options.reference))

            for genome, seq in seqFile.pathMap.items():
                if genome in leaves:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    seqIDMap[genome] = (seq, toil.importFile(seq))

            # run the workflow
            wf_output = toil.start(Job.wrapJobFn(graphmap_split_workflow, options, config, seqIDMap,
                                                 gfa_id, options.minigraphGFA,
                                                 paf_id, options.graphmapPAF,
                                                 ref_contigs, options.otherContig))

        #export the split data
        export_split_data(toil, wf_output[0], wf_output[1], wf_output[2:], options.outDir, config)
def split_gfa(job, config, gfa_id, paf_ids, ref_contigs, other_contig, reference_event, mask_bed_id):
    """ Use rgfa-split to divide a GFA and PAF into chromosomes.  The GFA must be in minigraph RGFA output using
    the desired reference. """

    if not paf_ids:
        # we can bypass when, ex, doing the second pass on ambiguous sequences but none are present
        return [None, None]

    if not gfa_id and not getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remap", typeFn=bool, default=False):
        # also bypass if remapping is off in the config (we know it's the second pass because gfa_id is None)
        return [None, None]

    work_dir = job.fileStore.getLocalTempDir()
    gfa_path = os.path.join(work_dir, "mg.gfa")
    paf_path = os.path.join(work_dir, "mg.paf")
    out_prefix = os.path.join(work_dir, "split_")
    bed_path = os.path.join(work_dir, "mask.bed")
    log_path = os.path.join(work_dir, "split.log")
    if (mask_bed_id):
        job.fileStore.readGlobalFile(mask_bed_id, bed_path)

    if gfa_id:
        job.fileStore.readGlobalFile(gfa_id, gfa_path)

    paf_paths = []
    for i, paf_id in enumerate(paf_ids):
        paf_paths.append('{}.{}'.format(paf_path, i) if len(paf_ids) > 1 else paf_path)
        job.fileStore.readGlobalFile(paf_id, paf_paths[-1])
    if len(paf_paths) > 1:
        catFiles(paf_paths, paf_path)

    # get the minigraph "virtual" assembly name
    graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    # and look up its unique id prefix.  this will be needed to pick its contigs out of the list
    mg_id = graph_event

    # get the specificity filters
    query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryCoverage", default="0")
    small_query_coverage = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallCoverage", default="0")
    small_coverage_threshold = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQuerySmallThreshold", default="0")
    query_uniqueness = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "minQueryUniqueness", default="0")
    max_gap = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "maxGap", default="0")
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "ambiguousName", default="_AMBIGUOUS_")

    cmd = ['rgfa-split',
           '-p', paf_path,
           '-b', out_prefix,
           '-n', query_coverage,
           '-N', small_query_coverage,
           '-T', small_coverage_threshold,
           '-Q', query_uniqueness,
           '-P', max_gap,
           '-a', amb_name,
           '-L', log_path]
    if gfa_id:
        cmd += ['-g', gfa_path, '-G']
    if other_contig:
        cmd += ['-o', other_contig]
    if reference_event:
        cmd += ['-r', 'id={}|'.format(reference_event)]
    if mask_bed_id:
        cmd += ['-B', bed_path]
    min_mapq = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"), "minMAPQ")
    if min_mapq:
        cmd += ['-A', min_mapq]
    # optional stuff added to second pass:
    if not gfa_id:
        remap_opts = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"), "remapSplitOptions", default=None)
        if remap_opts:
            cmd += remap_opts.split(' ')
    for contig in ref_contigs:
        cmd += ['-c', contig]

    cactus_call(parameters=cmd, work_dir=work_dir)

    output_id_map = {}
    for out_name in os.listdir(work_dir):
        file_name, ext = os.path.splitext(out_name)
        if file_name.startswith(os.path.basename(out_prefix)) and ext in [".gfa", ".paf", ".fa_contigs"] and \
           os.path.isfile(os.path.join(work_dir, file_name + ".fa_contigs")):
            name = file_name[len(os.path.basename(out_prefix)):]
            if name not in output_id_map:
                output_id_map[name] = {}
            if ext == '.paf':
                # apply the hacky naming correction so that subpaths have no special characters in the hal (to make hubs happy)
                # this gets undone by hal2vg
                cactus_call(parameters=['sed', '-i', '-e', 's/\([^:]*\):\([0-9]*\)-\([0-9]*\)/echo "\\1_sub_$((\\2-1))_\\3"/e', '-e', 's/ /\t/g',
                                        os.path.join(work_dir, out_name)])
            output_id_map[name][ext[1:]] = job.fileStore.writeGlobalFile(os.path.join(work_dir, out_name))

    return output_id_map, job.fileStore.writeGlobalFile(log_path)