def stageWorkflow(outputSequenceDir, configFile, inputSequences, toil, restart=False, outputSequences=[]):
    # Replace any constants
    configNode = ET.parse(configFile).getroot()
    if not outputSequences:
        outputSequences = CactusPreprocessor.getOutputSequenceFiles(inputSequences, outputSequenceDir)
    else:
        assert len(outputSequences) == len(inputSequences)
    if configNode.find("constants") != None:
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()
    if not restart:
        inputSequenceIDs = [toil.importFile(makeURL(seq)) for seq in inputSequences]
        outputSequenceIDs = toil.start(CactusPreprocessor(inputSequenceIDs, configNode))
    else:
        outputSequenceIDs = toil.restart()
    for seqID, path in zip(outputSequenceIDs, outputSequences):
        toil.exportFile(seqID, makeURL(path))

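# Usage sketch for stageWorkflow above. Illustrative only: the job-store path,
# config path, and FASTA names are hypothetical. The caller owns the Toil
# context; stageWorkflow just drives import -> preprocess -> export inside it.
from toil.common import Toil
from toil.job import Job

def preprocess_demo():
    opts = Job.Runner.getDefaultOptions("./preprocess_jobstore")  # hypothetical job store
    with Toil(opts) as toil:
        stageWorkflow(outputSequenceDir="./out_seqs",           # hypothetical output dir
                      configFile="config.xml",                  # hypothetical config
                      inputSequences=["human.fa", "mouse.fa"],  # hypothetical inputs
                      toil=toil,
                      restart=opts.restart)
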
def export_join_data(toil, options, clip_ids, idx_map, merge_hal_id):
    """ download all the output data """
    # download the clip vgs
    clip_base = os.path.join(options.outDir, 'clip')
    if not clip_base.startswith('s3://') and not os.path.isdir(clip_base):
        os.makedirs(clip_base)
    for vg_path, vg_id in zip(options.vg, clip_ids):
        toil.exportFile(vg_id, makeURL(os.path.join(clip_base, os.path.basename(vg_path))))

    # download everything else
    for ext, idx_id in idx_map.items():
        toil.exportFile(idx_id, makeURL(os.path.join(options.outDir,
                                                     '{}.{}'.format(options.outName, ext))))

    # download the merged hal
    if merge_hal_id:
        toil.exportFile(merge_hal_id, makeURL(os.path.join(options.outDir,
                                                           '{}.hal'.format(options.outName))))

def runCactusBlast(sequenceFiles, alignmentsFile, toilDir, chunkSize=None, overlapSize=None,
                   logLevel=None, compressFiles=None, lastzMemory=None, targetSequenceFiles=None):
    options = Job.Runner.getDefaultOptions(toilDir)
    options.logLevel = "CRITICAL"
    blastOptions = BlastOptions(chunkSize=chunkSize, overlapSize=overlapSize,
                                compressFiles=compressFiles, memory=lastzMemory)
    with Toil(options) as toil:
        seqIDs = [toil.importFile(makeURL(seqFile)) for seqFile in sequenceFiles]
        if targetSequenceFiles:
            targetSeqIDs = [toil.importFile(makeURL(seqFile)) for seqFile in targetSequenceFiles]
            rootJob = BlastSequencesAgainstEachOther(sequenceFileIDs1=seqIDs,
                                                     sequenceFileIDs2=targetSeqIDs,
                                                     blastOptions=blastOptions)
        else:
            rootJob = BlastSequencesAllAgainstAll(seqIDs, blastOptions)
        alignmentsID = toil.start(rootJob)
        toil.exportFile(alignmentsID, makeURL(alignmentsFile))

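# Usage sketch for runCactusBlast above (all paths hypothetical). Unlike
# stageWorkflow, this helper opens its own Toil context, so a plain call
# suffices; the chunk/overlap values mirror the defaults used by the
# runCactusBlastIngroupsAndOutgroups variant further down.
runCactusBlast(sequenceFiles=["human.fa", "chimp.fa"],
               alignmentsFile="human_chimp.cigar",
               toilDir="./blast_jobstore",
               chunkSize=250000,
               overlapSize=10000)
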
def runCactusBlastIngroupsAndOutgroups(ingroups, outgroups, alignmentsFile, toilDir,
                                       outgroupFragmentPaths=None, ingroupCoveragePaths=None,
                                       chunkSize=None, overlapSize=None, logLevel=None,
                                       compressFiles=None, lastzMemory=None):
    options = Job.Runner.getDefaultOptions(toilDir)
    options.disableCaching = True
    options.logLevel = "CRITICAL"
    blastOptions = BlastOptions(chunkSize=chunkSize, overlapSize=overlapSize,
                                compressFiles=compressFiles, memory=lastzMemory)
    with Toil(options) as toil:
        ingroupIDs = [toil.importFile(makeURL(ingroup)) for ingroup in ingroups]
        outgroupIDs = [toil.importFile(makeURL(outgroup)) for outgroup in outgroups]
        rootJob = BlastIngroupsAndOutgroups(blastOptions, ingroupIDs, outgroupIDs)
        blastResults = toil.start(rootJob)
        alignmentsID = blastResults[0]
        toil.exportFile(alignmentsID, makeURL(alignmentsFile))
        outgroupFragmentIDs = blastResults[1]
        ingroupCoverageIDs = blastResults[2]
        if outgroupFragmentPaths:
            assert len(outgroupFragmentIDs) == len(outgroupFragmentPaths)
            for outgroupFragmentID, outgroupFragmentPath in zip(outgroupFragmentIDs, outgroupFragmentPaths):
                toil.exportFile(outgroupFragmentID, makeURL(outgroupFragmentPath))
        if ingroupCoveragePaths:
            assert len(ingroupCoverageIDs) == len(ingroupCoveragePaths)
            for ingroupCoverageID, ingroupCoveragePath in zip(ingroupCoverageIDs, ingroupCoveragePaths):
                toil.exportFile(ingroupCoverageID, makeURL(ingroupCoveragePath))

def runToilPipeline(self, alignmentsFile, alpha=0.001):
    # Tests the toil pipeline
    options = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "toil"))
    options.logLevel = self.logLevelString
    with Toil(options) as toil:
        # Import the input file into the job store
        inputAlignmentFileID = toil.importFile(makeURL(alignmentsFile))
        rootJob = Job.wrapJobFn(mappingQualityRescoring, inputAlignmentFileID,
                                minimumMapQValue=0, maxAlignmentsPerSite=1, alpha=alpha,
                                logLevel=self.logLevelString)
        primaryOutputAlignmentsFileID, secondaryOutputAlignmentsFileID = toil.start(rootJob)
        toil.exportFile(primaryOutputAlignmentsFileID, makeURL(self.simpleOutputCigarPath))
        toil.exportFile(secondaryOutputAlignmentsFileID, makeURL(self.simpleOutputCigarPath2))
        # Check output
        with open(self.simpleOutputCigarPath, 'r') as fh:
            primaryOutputCigars = [cigar[:-1] for cigar in fh.readlines()]  # Remove new lines
        with open(self.simpleOutputCigarPath2, 'r') as fh:
            secondaryOutputCigars = [cigar[:-1] for cigar in fh.readlines()]  # Remove new lines
        return primaryOutputCigars + secondaryOutputCigars

def runCactusBlastIngroupsAndOutgroups(ingroups, outgroups, alignmentsFile, toilDir,
                                       outgroupFragmentPaths=None, ingroupCoveragePaths=None,
                                       chunkSize=250000, overlapSize=10000, logLevel=None,
                                       compressFiles=None, lastzMemory=None):
    options = Job.Runner.getDefaultOptions(toilDir)
    options.disableCaching = True
    options.logLevel = "CRITICAL"
    blastOptions = BlastOptions(chunkSize=chunkSize, overlapSize=overlapSize,
                                compressFiles=compressFiles, memory=lastzMemory)
    with Toil(options) as toil:
        ingroupIDs = [toil.importFile(makeURL(ingroup)) for ingroup in ingroups]
        outgroupIDs = [toil.importFile(makeURL(outgroup)) for outgroup in outgroups]
        rootJob = BlastIngroupsAndOutgroups(blastOptions, ingroups, ingroupIDs,
                                            outgroups, outgroupIDs)
        blastResults = toil.start(rootJob)
        alignmentsID = blastResults[0]
        toil.exportFile(alignmentsID, makeURL(alignmentsFile))
        outgroupFragmentIDs = blastResults[1]
        ingroupCoverageIDs = blastResults[2]
        if outgroupFragmentPaths:
            assert len(outgroupFragmentIDs) == len(outgroupFragmentPaths)
            for outgroupFragmentID, outgroupFragmentPath in zip(outgroupFragmentIDs, outgroupFragmentPaths):
                toil.exportFile(outgroupFragmentID, makeURL(outgroupFragmentPath))
        if ingroupCoveragePaths:
            assert len(ingroupCoverageIDs) == len(ingroupCoveragePaths)
            for ingroupCoverageID, ingroupCoveragePath in zip(ingroupCoverageIDs, ingroupCoveragePaths):
                toil.exportFile(ingroupCoverageID, makeURL(ingroupCoveragePath))

def runCactusProgressive(options):
    with Toil(options) as toil:
        importSingularityImage()
        # Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            # Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()
            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)
            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)
            # import the sequences
            for genome, seq in project.inputSequenceMap.items():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                project.inputSequenceIDMap[genome] = toil.importFile(seq)
            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)
            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()
            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(
                options, project, memory=configWrapper.getDefaultMemory()))
        toil.exportFile(halID, makeURL(options.outputHal))

def stageWorkflow(outputSequenceDir, configFile, inputSequences, toil, restart=False,
                  outputSequences=[], maskAlpha=False, clipAlpha=None):
    # Replace any constants
    configNode = ET.parse(configFile).getroot()
    if not outputSequences:
        outputSequences = CactusPreprocessor.getOutputSequenceFiles(inputSequences, outputSequenceDir)
    else:
        assert len(outputSequences) == len(inputSequences)

    # Make sure we have the dna-brnn model in the filestore if we need it
    loadDnaBrnnModel(toil, ET.parse(configFile).getroot(), maskAlpha=maskAlpha)

    if configNode.find("constants") != None:
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()
    if maskAlpha or clipAlpha:
        ConfigWrapper(configNode).setPreprocessorActive("lastzRepeatMask", False)
        ConfigWrapper(configNode).setPreprocessorActive("dna-brnn", True)
    if clipAlpha:
        for node in configNode.findall("preprocessor"):
            if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
                node.attrib["action"] = "clip"
                node.attrib["minLength"] = clipAlpha
                node.attrib["mergeLength"] = clipAlpha

    if not restart:
        inputSequenceIDs = []
        for seq in inputSequences:
            logger.info("Importing {}".format(seq))
            inputSequenceIDs.append(toil.importFile(makeURL(seq)))
        unzip_job = Job.wrapJobFn(unzip_then_pp, configNode, inputSequences, inputSequenceIDs)
        outputSequenceIDs = toil.start(unzip_job)
    else:
        outputSequenceIDs = toil.restart()
    for seqID, path in zip(outputSequenceIDs, outputSequences):
        try:
            iter(seqID)
            # dna-brnn will output a couple of bed files. we scrape those out here
            toil.exportFile(seqID[0], makeURL(path))
            toil.exportFile(seqID[1], makeURL(path) + '.bed')
            toil.exportFile(seqID[2], makeURL(path) + '.mask.bed')
        except:
            toil.exportFile(seqID, makeURL(path))

def stageWorkflow(outputSequenceDir, configFile, inputSequences, toil, restart=False):
    # Replace any constants
    configNode = ET.parse(configFile).getroot()
    outputSequences = CactusPreprocessor.getOutputSequenceFiles(inputSequences, outputSequenceDir)
    if configNode.find("constants") != None:
        ConfigWrapper(configNode).substituteAllPredefinedConstantsWithLiterals()
    if not restart:
        inputSequenceIDs = [toil.importFile(makeURL(seq)) for seq in inputSequences]
        outputSequenceIDs = toil.start(CactusPreprocessor(inputSequenceIDs, configNode))
    else:
        outputSequenceIDs = toil.restart()
    for seqID, path in zip(outputSequenceIDs, outputSequences):
        toil.exportFile(seqID, makeURL(path))

def runCactusGraphMapJoin(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        # Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            # load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()
            # load up the vgs
            vg_ids = []
            for vg_path in options.vg:
                logger.info("Importing {}".format(vg_path))
                vg_ids.append(toil.importFile(makeURL(vg_path)))
            # tack on the decoys
            if options.decoyGraph:
                logger.info("Importing decoys {}".format(options.decoyGraph))
                vg_ids.append(toil.importFile(makeURL(options.decoyGraph)))
                # we'll treat it like any other graph downstream, except clipping
                # where we'll check first using the path name
                options.vg.append(options.decoyGraph)
            # load up the hals
            hal_ids = []
            for hal_path in options.hal:
                logger.info("Importing {}".format(hal_path))
                hal_ids.append(toil.importFile(makeURL(hal_path)))
            # run the workflow
            wf_output = toil.start(Job.wrapJobFn(graphmap_join_workflow,
                                                 options, config, vg_ids, hal_ids))
        # export the split data
        export_join_data(toil, options, wf_output[0], wf_output[1], wf_output[2])

def loadDnaBrnnModel(toil, configNode, maskAlpha=False):
    """ store the model in a toil file id so it can be used in any workflow """
    for prepXml in configNode.findall("preprocessor"):
        if prepXml.attrib["preprocessJob"] == "dna-brnn":
            if maskAlpha or getOptionalAttrib(prepXml, "active", typeFn=bool, default=False):
                # tokenize the options so '-i' matches a whole flag and the
                # following token is the model path (indexing the raw string
                # would return a single character)
                dnabrnnOpts = getOptionalAttrib(prepXml, "dna-brnnOpts", default="").split()
                if '-i' in dnabrnnOpts:
                    model_path = dnabrnnOpts[dnabrnnOpts.index('-i') + 1]
                else:
                    model_path = os.path.join(cactusRootPath(), 'attcc-alpha.knm')
                os.environ["CACTUS_DNA_BRNN_MODEL_ID"] = toil.importFile(makeURL(model_path))

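# Companion sketch: loadDnaBrnnModel publishes the imported model's file ID via
# the CACTUS_DNA_BRNN_MODEL_ID environment variable rather than threading it
# through job arguments. A hypothetical job function could then fetch the model
# onto a worker with the standard Toil file-store call:
import os

def run_dna_brnn_job(job):
    # assumes loadDnaBrnnModel() ran on the leader before the workflow started
    model_id = os.environ["CACTUS_DNA_BRNN_MODEL_ID"]
    local_model = job.fileStore.readGlobalFile(model_id)
    # ... invoke dna-brnn with ['-i', local_model] here ...
    return local_model
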
def get_asms_from_seqfile(seqFile, workflow):
    """Import every assembly listed in a seqfile into the Toil jobstore.

    Args:
        seqFile: path to a cactus seqfile mapping genome names to sequence paths.
        workflow: an open Toil workflow used to import the sequences.

    Returns:
        An OrderedDict mapping each genome name to its imported Toil file ID.
    """
    seqFile = SeqFile(seqFile)
    seqDict = col.OrderedDict(seqFile.pathMap)
    print(seqDict)
    for name, seqURL in seqDict.items():
        seqDict[name] = workflow.importFile(makeURL(seqURL))
    return seqDict

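# Usage sketch for get_asms_from_seqfile above (job-store and seqfile paths
# hypothetical): import every assembly once at the top of a workflow and pass
# the resulting name -> file-ID map to downstream jobs.
from toil.common import Toil
from toil.job import Job

opts = Job.Runner.getDefaultOptions("./refalign_jobstore")  # hypothetical job store
with Toil(opts) as workflow:
    asms = get_asms_from_seqfile("evolver_mammals.txt", workflow)  # hypothetical seqfile
    for name, file_id in asms.items():
        print("imported", name, "->", file_id)
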
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    # Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument("--root", dest="root",
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specified then the root"
                        " of the tree is used. ",
                        default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    with Toil(options) as toil:
        importSingularityImage()
        # Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            # Create the progressive cactus project
            projWrapper = ProjectWrapper(options)
            projWrapper.writeXml()
            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)
            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)
            # import the sequences
            seqIDs = []
            print("Importing %s sequences" % len(project.getInputSequencePaths()))
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)
            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)
            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()
            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(
                options, project, memory=configWrapper.getDefaultMemory()))
        toil.exportFile(halID, makeURL(options.outputHal))

def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        # Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides. this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile). one way to fix would be to add support
            # for s3 paths and force wdl to use it. a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            # to be consistent with all-in-one cactus, we make sure the project
            # isn't limiting itself to the subtree (todo: parameterize so root can
            # be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None
            # Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options, proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()
            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)
            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)
            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()
            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(options.root, list(genome_set)))
            print(str(project.inputSequenceMap))
            # import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]
            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)
            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()
            workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                                   configNode=configNode,
                                                   seqIDMap=project.inputSequenceIDMap)
            outWorkFlowArgs = toil.start(CactusTrimmingBlastPhase(
                standAlone=True, cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast"))
            # export the alignments
            toil.exportFile(outWorkFlowArgs.alignmentsID, makeURL(options.outputFile))
            # optional secondary alignments
            if outWorkFlowArgs.secondaryAlignmentsID:
                toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                                makeURL(options.outputFile) + '.secondary')
            # outgroup fragments and coverage are necessary for cactus-align, as the
            # sequence names got changed in the above alignments
            for i, outgroupFragmentID in enumerate(outWorkFlowArgs.outgroupFragmentIDs):
                toil.exportFile(outgroupFragmentID,
                                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
            # cactus-align can recompute coverage on the fly, but we save them because we have them
            for i, ingroupCoverageID in enumerate(outWorkFlowArgs.ingroupCoverageIDs):
                toil.exportFile(ingroupCoverageID,
                                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))

def runCactusAfterBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        # Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            # Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options, options.configFile, ignoreSeqPaths=options.root)
            projWrapper.writeXml()
            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)
            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)
            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()
            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            # import the outgroups
            outgroupIDs = []
            cactus_blast_input = not options.nonBlastInput
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(makeURL(options.blastOutput) +
                                                 '.og_fragment_{}'.format(i))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                except:
                    if cactus_blast_input:
                        raise
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break
            # import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not cactus_blast_input and genome in outgroups):
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))
            if not cactus_blast_input:
                outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]
            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)
            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)
            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()
            workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                                   configNode=configNode,
                                                   seqIDMap=project.inputSequenceIDMap)
            # import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(makeURL(options.blastOutput))
            try:
                workFlowArgs.secondaryAlignmentsID = toil.importFile(
                    makeURL(options.blastOutput) + '.secondary')
            except:
                workFlowArgs.secondaryAlignmentsID = None
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if cactus_blast_input and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(toil.importFile(
                        makeURL(options.blastOutput) + '.ig_coverage_{}'.format(i)))
            halID = toil.start(Job.wrapJobFn(run_cactus_align, configWrapper, workFlowArgs,
                                             project, cactus_blast_input))
            # export the hal
            toil.exportFile(halID, makeURL(options.outputHal))

def testCactusPreprocessor(self):
    # Demo sequences
    sequenceNames = ["%s.ENm001.fa" % species for species in ['human', 'hedgehog']]
    sequenceFiles = [os.path.join(self.encodePath, self.encodeRegion, sequenceName)
                     for sequenceName in sequenceNames]
    # Make config file
    configFile = os.path.join(self.tempDir, "config.xml")
    rootElem = ET.Element("preprocessor")
    # <preprocessor chunkSize="10000" proportionToSample="0.2" memory="littleMemory"
    #  preprocessorString="cactus_lastzRepeatMask.py --proportionSampled=PROPORTION_SAMPLED
    #  --minPeriod=1 --lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped' IN_FILE OUT_FILE "/>
    preprocessor = ET.SubElement(rootElem, "preprocessor")
    preprocessor.attrib["chunkSize"] = "100000"
    preprocessor.attrib["proportionToSample"] = "0.2"
    preprocessor.attrib["preprocessJob"] = "lastzRepeatMask"
    preprocessor.attrib["minPeriod"] = "1"
    preprocessor.attrib["lastzOpts"] = "--step=1 --ambiguous=iupac,100 --ungapped"
    preprocessor.attrib["fragment"] = "200"
    with open(configFile, "w") as fileHandle:
        fileHandle.write(ET.tostring(rootElem, encoding="unicode"))

    # Run preprocessor
    tmpToil = os.path.join(self.tempDir, "toil")
    runCactusPreprocessor(outputSequenceDir=self.tempDir, configFile=configFile,
                          inputSequences=sequenceFiles, toilDir=tmpToil)

    for sequenceFile, processedSequenceFile in zip(
            sequenceFiles, CactusPreprocessor.getOutputSequenceFiles(sequenceFiles, self.tempDir)):
        print("sequenceFile: %s" % sequenceFile)
        print("output sequence file: %s" % processedSequenceFile)
        # Parse sequences into dictionary
        originalSequences = getSequences(sequenceFile)
        # Load the new sequences
        processedSequences = getSequences(processedSequenceFile)
        # Check they are the same modulo masking
        self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, processedSequences)
        # Compare the proportion of bases masked by lastz with original repeat masking
        maskedBasesOriginal = getMaskedBases(originalSequences)
        maskedBasesLastzMasked = getMaskedBases(processedSequences)
        # Total bases
        totalBases = sum([len(i) for i in originalSequences.values()])
        # Calculate number of hard masked bases
        totalNBases = len([(header, i, base) for (header, i, base) in maskedBasesOriginal
                           if base.upper() == "N"])
        print(" For the sequence file ", sequenceFile,
              " the total number of sequences is ", len(originalSequences),
              " the total number of bases ", totalBases,
              " the number of bases originally masked was: ", len(maskedBasesOriginal),
              " the number of bases masked after running lastz repeat masking is: ",
              len(maskedBasesLastzMasked),
              " the intersection of these masked sets is: ",
              len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)),
              " the total number of bases that are Ns ", totalNBases)

        # Now compare to running lastz on its own
        toilOptions = Job.Runner.getDefaultOptions(os.path.join(self.tempDir, "lastzRepeatMaskToil"))
        toilOptions.logLevel = "CRITICAL"
        with Toil(toilOptions) as toil:
            queryID = toil.importFile(makeURL(sequenceFile))
            targetIDs = [queryID]
            repeatMaskedID = toil.start(LastzRepeatMaskJob(
                queryID=queryID, targetIDs=targetIDs,
                repeatMaskOptions=RepeatMaskOptions(
                    lastzOpts='--step=1 --ambiguous=iupac,100 --ungapped --queryhsplimit=keep,nowarn:30',
                    minPeriod=1, proportionSampled=0.2, fragment=200)))
            toil.exportFile(repeatMaskedID, makeURL(self.tempOutputFile))
        lastzSequencesFast = getSequences(self.tempOutputFile)
        maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast)
        i = float(len(maskedBasesLastzMaskedFast.intersection(maskedBasesLastzMasked)))
        print(" The number of bases masked after running lastz repeat masking without the preprocessor is: ",
              len(maskedBasesLastzMaskedFast),
              " the recall of the fast vs. the new is: ", i / len(maskedBasesLastzMasked),
              " the precision of the fast vs. the new is: ", i / len(maskedBasesLastzMaskedFast))

def main_batch():
    """ this is a bit like cactus-align --batch except it will use toil-in-toil to assign
    each chromosome to a machine.
    pros: much less chance of a problem with one chromosome affecting anything else
          more forgiving for inexact resource specs
          could be ported to Terra
    cons: less efficient use of resources
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("chromFile", help="chroms file")
    parser.add_argument("outHal", type=str,
                        help="Output directory (can be s3://)")
    parser.add_argument("--alignOptions", type=str,
                        help="Options to pass through to cactus-align (don't forget to wrap in quotes)")
    parser.add_argument("--alignCores", type=int,
                        help="Number of cores per align job")
    parser.add_argument("--alignCoresOverrides", nargs="*",
                        help="Override align job cores for a chromosome. "
                        "Space-separated list of chrom,cores pairs expected")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))

    options = parser.parse_args()

    options.containerImage = None
    options.binariesMode = None
    options.root = None
    options.latest = None
    options.database = "kyoto_tycoon"

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # Turn the overrides into a dict
    cores_overrides = {}
    if options.alignCoresOverrides:
        for o in options.alignCoresOverrides:
            try:
                chrom, cores = o.split(',')
                cores_overrides[chrom] = int(cores)
            except:
                raise RuntimeError("Error parsing alignCoresOverrides \"{}\"".format(o))
    options.alignCoresOverrides = cores_overrides

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            config_id = toil.importFile(makeURL(options.configFile))
            # load the chromfile into memory
            chrom_dict = {}
            with open(options.chromFile, 'r') as chrom_file:
                for line in chrom_file:
                    toks = line.strip().split()
                    if len(toks):
                        assert len(toks) == 3
                        chrom, seqfile, alnFile = toks[0], toks[1], toks[2]
                        chrom_dict[chrom] = (toil.importFile(makeURL(seqfile)),
                                             toil.importFile(makeURL(alnFile)))
            results_dict = toil.start(Job.wrapJobFn(align_toil_batch, chrom_dict,
                                                    config_id, options))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement)
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(results[0], makeURL(os.path.join(
                        options.outHal, '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(results[1], makeURL(os.path.join(
                            options.outHal, '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(results[2], makeURL(os.path.join(
                            options.outHal, '{}.gfa.gz'.format(chrom))))
                    toil.exportFile(results[3], makeURL(os.path.join(
                        options.outHal, '{}.hal.log'.format(chrom))))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align-batch has finished after {} seconds".format(run_time))

def runCactusGraphMapSplit(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        # Run the workflow
        if options.restart:
            wf_output = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            # load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()
            # load up the contigs if any
            ref_contigs = set(options.refContigs)
            # todo: use import?
            if options.refContigsFile:
                with open(options.refContigsFile, 'r') as rc_file:
                    for line in rc_file:
                        if len(line.strip()):
                            ref_contigs.add(line.strip().split()[0])
            if options.otherContig:
                assert options.otherContig not in ref_contigs
            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"),
                                            "assemblyName", default="_MINIGRAPH_")
            # load the seqfile
            seqFile = SeqFile(options.seqFile)
            # import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))
            # import the paf
            paf_id = toil.importFile(makeURL(options.graphmapPAF))
            # import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            leaves = set([seqFile.tree.getName(node) for node in seqFile.tree.getLeaves()])
            if graph_event not in leaves:
                raise RuntimeError("Minigraph name {} not found in seqfile".format(graph_event))
            if options.reference and options.reference not in leaves:
                raise RuntimeError("Name given with --reference {} not found in seqfile".format(
                    options.reference))
            for genome, seq in seqFile.pathMap.items():
                if genome in leaves:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    logger.info("Importing {}".format(seq))
                    seqIDMap[genome] = (seq, toil.importFile(seq))
            # run the workflow
            wf_output = toil.start(Job.wrapJobFn(graphmap_split_workflow, options, config,
                                                 seqIDMap, gfa_id, options.minigraphGFA,
                                                 paf_id, options.graphmapPAF, ref_contigs,
                                                 options.otherContig))
        # export the split data
        export_split_data(toil, wf_output[0], wf_output[1], wf_output[2:], options.outDir, config)

def testLastzRepeatMask(self):
    # Demo sequences
    sequenceFiles = [os.path.join(self.encodePath, self.encodeRegion, "%s.ENm001.fa" % species)
                     for species in ('human', 'hedgehog')]
    # Max occurrences of a repeat within the sequence
    maxOccurrence = 1
    for sequenceFile in sequenceFiles:
        # Parse sequences into dictionary
        originalSequences = getSequences(sequenceFile)
        # Get the masked bases
        maskedBasesOriginal = getMaskedBases(originalSequences)
        # Total bases
        totalBases = sum([len(i) for i in list(originalSequences.values())])
        # Calculate number of hard masked bases
        totalNBases = len([(header, i, base) for (header, i, base) in maskedBasesOriginal
                           if base.upper() == "N"])

        # Run lastz repeat masker
        startTime = time.time()
        with Toil(self.toilOptions) as toil:
            sequenceID = toil.importFile(makeURL(sequenceFile))
            repeatMaskOptions = RepeatMaskOptions(
                proportionSampled=1.0,
                minPeriod=maxOccurrence,
                lastzOpts="--step=1 --ambiguous=iupac,100,100 --ydrop=3000",
                fragment=200)
            outputID = toil.start(LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                                     queryID=sequenceID, targetIDs=[sequenceID]))
            toil.exportFile(outputID, makeURL(self.tempOutputFile))
        print("It took %s seconds to run lastzMasking" % (time.time() - startTime))

        # Parse lastz masked sequences into dictionary
        lastzSequences = getSequences(self.tempOutputFile)
        # Check the sequences are the same modulo masking
        self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, lastzSequences)
        # Compare the proportion of bases masked by lastz with original repeat masking
        maskedBasesOriginal = getMaskedBases(originalSequences)
        maskedBasesLastzMasked = getMaskedBases(lastzSequences)
        print(" For the sequence file ", sequenceFile,
              " the total number of sequences is ", len(originalSequences),
              " the total number of bases ", totalBases,
              " the number of bases originally masked was: ", len(maskedBasesOriginal),
              " the number of bases masked after running lastz repeat masking is: ",
              len(maskedBasesLastzMasked),
              " the intersection of these masked sets is: ",
              len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)),
              " the total number of bases that are Ns ", totalNBases,
              " lastz was filtered for max-occurrences of more than: ", maxOccurrence)
        # self.assertGreater(len(maskedBasesLastzMasked), len(maskedBasesOriginal))

        # Run lastz repeat masker using heuristic settings for comparison with the slower settings
        startTime = time.time()
        with Toil(self.toilOptions) as toil:
            sequenceID = toil.importFile(makeURL(sequenceFile))
            repeatMaskOptions = RepeatMaskOptions(
                proportionSampled=1.0,
                minPeriod=maxOccurrence,
                lastzOpts="--step=3 --ambiguous=iupac,100,100 --ungapped "
                          "--queryhsplimit=keep,nowarn:%i" % (int(maxOccurrence) * 20),
                fragment=200)
            outputID = toil.start(LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                                     queryID=sequenceID, targetIDs=[sequenceID]))
            toil.exportFile(outputID, makeURL(self.tempOutputFile))
        print("It took %s seconds to run lastzMasking fast" % (time.time() - startTime))
        lastzSequencesFast = getSequences(self.tempOutputFile)
        maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast)
        self.assertGreater(len(maskedBasesLastzMaskedFast), len(maskedBasesOriginal))

        i = float(len(maskedBasesLastzMaskedFast.intersection(maskedBasesLastzMasked)))
        precision = i / len(maskedBasesLastzMasked)
        recall = i / len(maskedBasesLastzMaskedFast)
        self.assertGreater(precision, 0.93)
        self.assertGreater(recall, 0.93)

def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    # Progressive Cactus Options
    parser.add_argument("--database", dest="database",
                        help="Database type: tokyo_cabinet or kyoto_tycoon"
                        " [default: %(default)s]",
                        default="kyoto_tycoon")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=None)
    parser.add_argument("--root", dest="root",
                        help="Name of ancestral node (which"
                        " must appear in NEWICK tree in <seqfile>) to use as a "
                        "root for the alignment. Any genomes not below this node "
                        "in the tree may be used as outgroups but will never appear"
                        " in the output. If no root is specified then the root"
                        " of the tree is used. ",
                        default=None)
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest, locally-built docker container "
                        "rather than pulling from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    options.cactusDir = getTempDirectory()

    setupBinaries(options)
    setLoggingFromOptions(options)

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None:
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    # Create the progressive cactus project
    projWrapper = ProjectWrapper(options)
    projWrapper.writeXml()
    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)
    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    with Toil(options) as toil:
        importSingularityImage()
        # Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            project.readXML(pjPath)
            # import the sequences
            seqIDs = []
            for seq in project.getInputSequencePaths():
                if os.path.isdir(seq):
                    tmpSeq = getTempFile()
                    catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                    seq = tmpSeq
                seq = makeURL(seq)
                seqIDs.append(toil.importFile(seq))
            project.setInputSequenceIDs(seqIDs)
            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            logger.info("Setting config id to: %s" % cactusConfigID)
            project.setConfigID(cactusConfigID)
            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()
            project.writeXML(pjPath)
            halID = toil.start(RunCactusPreprocessorThenProgressiveDown(
                options, project, memory=configWrapper.getDefaultMemory()))
        toil.exportFile(halID, makeURL(options.outputHal))

def runCactusBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        # Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()
            # Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(options, options.configFile, ignoreSeqPaths=options.root)
            projWrapper.writeXml()
            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)
            project = MultiCactusProject()
            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)
            project.readXML(pjPath)
            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            logger.info("Experiment {}".format(ET.tostring(expXml)))
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()
            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = tree.getChildNames(tree.getRootName())
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)
            logger.info("Genomes in blastonly, {}: {}".format(options.root, list(genome_set)))
            # import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in genome_set:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    project.inputSequenceIDMap[genome] = toil.importFile(seq)
                else:
                    # out-of-scope sequences will only cause trouble later on
                    del project.inputSequenceMap[genome]
            # import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)
            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()
            workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                                   configNode=configNode,
                                                   seqIDMap=project.inputSequenceIDMap)
            outWorkFlowArgs = toil.start(CactusTrimmingBlastPhase(
                standAlone=True, cactusWorkflowArguments=workFlowArgs, phaseName="trimBlast"))
            # export the alignments
            toil.exportFile(outWorkFlowArgs.alignmentsID, makeURL(options.outputFile))
            # optional secondary alignments
            if outWorkFlowArgs.secondaryAlignmentsID:
                toil.exportFile(outWorkFlowArgs.secondaryAlignmentsID,
                                makeURL(options.outputFile) + '.secondary')
            # outgroup fragments and coverage are necessary for cactus-align, as the
            # sequence names got changed in the above alignments
            for i, outgroupFragmentID in enumerate(outWorkFlowArgs.outgroupFragmentIDs):
                toil.exportFile(outgroupFragmentID,
                                makeURL(options.outputFile) + '.og_fragment_{}'.format(i))
            # cactus-align can recompute coverage on the fly, but we save them because we have them
            for i, ingroupCoverageID in enumerate(outWorkFlowArgs.ingroupCoverageIDs):
                toil.exportFile(ingroupCoverageID,
                                makeURL(options.outputFile) + '.ig_coverage_{}'.format(i))

def make_align_job(options, toil):
    options.cactusDir = getTempDirectory()

    # apply path overrides. this was necessary for wdl which doesn't take kindly to
    # text files of local paths (ie seqfile). one way to fix would be to add support
    # for s3 paths and force wdl to use it. a better way would be a more fundamental
    # interface shift away from files of paths throughout all of cactus
    if options.pathOverrides:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        tree = MultiCactusTree(seqFile.tree)
        tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
        for name, override in zip(options.pathOverrideNames, options.pathOverrides):
            seqFile.pathMap[name] = override
        override_seq = os.path.join(options.cactusDir, 'seqFile.override')
        with open(override_seq, 'w') as out_sf:
            out_sf.write(str(seqFile))
        options.seqFile = override_seq

    if not options.root:
        seqFile = SeqFile(options.seqFile)
        configNode = ET.parse(options.configFile).getroot()
        config = ConfigWrapper(configNode)
        mcTree = MultiCactusTree(seqFile.tree)
        mcTree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
        options.root = mcTree.getRootName()

    if options.acyclic:
        seqFile = SeqFile(options.seqFile)
        tree = MultiCactusTree(seqFile.tree)
        leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
        if options.acyclic not in leaves:
            raise RuntimeError("Genome specified with --acyclic, {}, not found in tree leaves"
                               .format(options.acyclic))

    # to be consistent with all-in-one cactus, we make sure the project
    # isn't limiting itself to the subtree (todo: parameterize so root can
    # be passed through from prepare to blast/align)
    proj_options = copy.deepcopy(options)
    proj_options.root = None
    # Create the progressive cactus project (as we do in runCactusProgressive)
    projWrapper = ProjectWrapper(proj_options, proj_options.configFile, ignoreSeqPaths=options.root)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)
    project.readXML(pjPath)

    # open up the experiment (as we do in ProgressiveUp.run)
    # note that we copy the path into the options here
    experimentFile = project.expMap[options.root]
    expXml = ET.parse(experimentFile).getroot()
    experiment = ExperimentWrapper(expXml)
    configPath = experiment.getConfigPath()
    configXml = ET.parse(configPath).getroot()

    seqIDMap = dict()
    tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
    leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
    outgroups = experiment.getOutgroupGenomes()
    genome_set = set(leaves + outgroups)

    # this is a hack to allow specifying all the input on the command line, rather than using suffix lookups
    def get_input_path(suffix=''):
        base_path = options.cigarsFile[0]
        for input_path in options.cigarsFile:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    # import the outgroups
    outgroupIDs = []
    outgroup_fragment_found = False
    for i, outgroup in enumerate(outgroups):
        try:
            outgroupID = toil.importFile(makeURL(get_input_path('.og_fragment_{}'.format(i))))
            outgroupIDs.append(outgroupID)
            experiment.setSequenceID(outgroup, outgroupID)
            outgroup_fragment_found = True
            assert not options.pangenome
        except:
            # we assume that input is not coming from cactus blast, so we'll treat output
            # sequences normally and not go looking for fragments
            outgroupIDs = []
            break

    # import the sequences (that we need to align for the given event, ie leaves and outgroups)
    for genome, seq in list(project.inputSequenceMap.items()):
        if genome in leaves or (not outgroup_fragment_found and genome in outgroups):
            if os.path.isdir(seq):
                tmpSeq = getTempFile()
                catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                seq = tmpSeq
            seq = makeURL(seq)
            logger.info("Importing {}".format(seq))
            experiment.setSequenceID(genome, toil.importFile(seq))

    if not outgroup_fragment_found:
        outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]

    # write back the experiment, as CactusWorkflowArguments wants a path
    experiment.writeXML(experimentFile)

    # import cactus config
    if options.configFile:
        cactusConfigID = toil.importFile(makeURL(options.configFile))
    else:
        cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
    project.setConfigID(cactusConfigID)
    project.syncToFileStore(toil)
    configNode = ET.parse(project.getConfigPath()).getroot()
    configWrapper = ConfigWrapper(configNode)
    configWrapper.substituteAllPredefinedConstantsWithLiterals()

    if options.singleCopySpecies:
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["alignmentFilter"] = \
            "singleCopyEvent:{}".format(options.singleCopySpecies)

    if options.barMaskFilter:
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignmentMaskFilter"] = \
            str(options.barMaskFilter)

    if options.pangenome:
        # turn off the megablock filter as it ruins non-all-to-all alignments
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockHomologySupport"] = "0"
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["minimumBlockDegreeToCheckSupport"] = "9999999999"
        # turn off mapq filtering
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["runMapQFiltering"] = "0"
        # more iterations here helps quite a bit to reduce underalignment
        findRequiredNode(configWrapper.xmlRoot, "caf").attrib["maxRecoverableChainsIterations"] = "50"
        # turn down minimum block degree to get a fat ancestor
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["minimumBlockDegree"] = "1"
        # turn on POA
        findRequiredNode(configWrapper.xmlRoot, "bar").attrib["partialOrderAlignment"] = "1"
        # save it
        if not options.batch:
            pg_file = options.outHal + ".pg-conf.xml"
            if pg_file.startswith('s3://'):
                pg_temp_file = getTempFile()
            else:
                pg_temp_file = pg_file
            configWrapper.writeXML(pg_temp_file)
            if pg_file.startswith('s3://'):
                write_s3(pg_temp_file, pg_file, region=get_aws_region(options.jobStore))
            logger.info("pangenome configuration overrides saved in {}".format(pg_file))

    workFlowArgs = CactusWorkflowArguments(options, experimentFile=experimentFile,
                                           configNode=configNode,
                                           seqIDMap=project.inputSequenceIDMap)

    # import the files that cactus-blast made
    workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
    workFlowArgs.secondaryAlignmentsID = None
    if not options.pafInput:
        try:
            workFlowArgs.secondaryAlignmentsID = toil.importFile(makeURL(get_input_path('.secondary')))
        except:
            pass
    workFlowArgs.outgroupFragmentIDs = outgroupIDs
    workFlowArgs.ingroupCoverageIDs = []
    if outgroup_fragment_found and len(outgroups) > 0:
        for i in range(len(leaves)):
            workFlowArgs.ingroupCoverageIDs.append(
                toil.importFile(makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

    align_job = Job.wrapJobFn(run_cactus_align,
                              configWrapper,
                              workFlowArgs,
                              project,
                              checkpointInfo=options.checkpointInfo,
                              doRenaming=options.nonCactusInput,
                              pafInput=options.pafInput,
                              pafSecondaries=options.usePafSecondaries,
                              doVG=options.outVG,
                              doGFA=options.outGFA,
                              delay=options.stagger,
                              eventNameAsID=options.eventNameAsID,
                              acyclicEvent=options.acyclic)
    return align_job

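# Usage sketch: make_align_job only constructs the root job. Mirroring the
# runCactusAfterBlastOnly pattern above, the caller starts it inside an open
# Toil context and exports the result. Treating the return value as a single
# HAL file ID is an assumption here (with --outVG/--outGFA the job may return
# more), and `options` is assumed to come from cactus-align's argument parser.
with Toil(options) as toil:
    if options.restart:
        halID = toil.restart()
    else:
        align_job = make_align_job(options, toil)
        halID = toil.start(align_job)
    toil.exportFile(halID, makeURL(options.outHal))
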
def runCactusGraphMap(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        # Run the workflow
        if options.restart:
            alignmentID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides. this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile). one way to fix would be to add support
            # for s3 paths and force wdl to use it. a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            # load cactus config
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            config.substituteAllPredefinedConstantsWithLiterals()

            # get the minigraph "virtual" assembly name
            graph_event = getOptionalAttrib(findRequiredNode(configNode, "refgraph"),
                                            "assemblyName", default="__MINIGRAPH_SEQUENCES__")
            # load the seqfile
            seqFile = SeqFile(options.seqFile)
            logger.info("Genomes for graphmap, {}".format(seqFile.pathMap))
            if not options.outputFasta and graph_event not in seqFile.pathMap:
                raise RuntimeError("{} assembly not found in seqfile so it must be specified "
                                   "with --outputFasta".format(graph_event))

            # import the graph
            gfa_id = toil.importFile(makeURL(options.minigraphGFA))

            # import the sequences (that we need to align for the given event, ie leaves and outgroups)
            seqIDMap = {}
            for genome, seq in seqFile.pathMap.items():
                if genome != graph_event:
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    seqIDMap[genome] = toil.importFile(seq)

            # run the workflow
            paf_id, gfa_fa_id = toil.start(Job.wrapJobFn(minigraph_workflow, options, config,
                                                         seqIDMap, gfa_id, graph_event))

            # export the paf
            toil.exportFile(paf_id, makeURL(options.outputPAF))
            if gfa_fa_id:
                toil.exportFile(gfa_fa_id, makeURL(options.outputFasta))

            # update the input seqfile (in place!)
            add_genome_to_seqfile(options.seqFile, makeURL(options.outputFasta), graph_event)

def export_split_data(toil, input_seq_id_map, output_id_map, split_log_ids, output_dir, config):
    """ download all the split data locally """
    amb_name = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap_split"),
                                 "ambiguousName", default="_AMBIGUOUS_")

    chrom_file_map = {}

    for ref_contig in output_id_map.keys():
        ref_contig_path = os.path.join(output_dir, ref_contig)
        if not os.path.isdir(ref_contig_path) and not ref_contig_path.startswith('s3://'):
            os.makedirs(ref_contig_path)

        # GFA: <output_dir>/<contig>/<contig>.gfa
        if 'gfa' in output_id_map[ref_contig]:
            # we do this check because no gfa made for ambiguous sequences "contig"
            toil.exportFile(output_id_map[ref_contig]['gfa'],
                            makeURL(os.path.join(ref_contig_path, '{}.gfa'.format(ref_contig))))

        # PAF: <output_dir>/<contig>/<contig>.paf
        paf_path = os.path.join(ref_contig_path, '{}.paf'.format(ref_contig))
        toil.exportFile(output_id_map[ref_contig]['paf'], makeURL(paf_path))

        # Fasta: <output_dir>/<contig>/fasta/<event>_<contig>.fa ..
        seq_file_map = {}
        for event, ref_contig_fa_id in output_id_map[ref_contig]['fa'].items():
            fa_base = os.path.join(ref_contig_path, 'fasta')
            if not os.path.isdir(fa_base) and not fa_base.startswith('s3://'):
                os.makedirs(fa_base)
            fa_path = makeURL(os.path.join(fa_base, '{}_{}.fa'.format(event, ref_contig)))
            if input_seq_id_map[event][0].endswith('.gz'):
                fa_path += '.gz'
            seq_file_map[event] = fa_path
            toil.exportFile(ref_contig_fa_id, fa_path)

        # Seqfile: <output_dir>/seqfiles/<contig>.seqfile
        seq_file_path = os.path.join(output_dir, 'seqfiles', '{}.seqfile'.format(ref_contig))
        if seq_file_path.startswith('s3://'):
            seq_file_temp_path = getTempFile()
        else:
            seq_file_temp_path = seq_file_path
            if not os.path.isdir(os.path.dirname(seq_file_path)):
                os.makedirs(os.path.dirname(seq_file_path))
        with open(seq_file_temp_path, 'w') as seq_file:
            for event, fa_path in seq_file_map.items():
                # cactus can't handle empty fastas. if there are no sequences for a sample for this
                # contig, just don't add it.
                if output_id_map[ref_contig]['fa'][event].size > 0:
                    seq_file.write('{}\t{}\n'.format(event, fa_path))
        if seq_file_path.startswith('s3://'):
            write_s3(seq_file_temp_path, seq_file_path)

        # Top-level seqfile
        chrom_file_map[ref_contig] = seq_file_path, paf_path

    # Chromfile: <output_dir>/chromfile.txt
    chrom_file_path = os.path.join(output_dir, 'chromfile.txt')
    if chrom_file_path.startswith('s3://'):
        chrom_file_temp_path = getTempFile()
    else:
        chrom_file_temp_path = chrom_file_path
    with open(chrom_file_temp_path, 'w') as chromfile:
        for ref_contig, seqfile_paf in chrom_file_map.items():
            if ref_contig != amb_name:
                seqfile, paf = seqfile_paf[0], seqfile_paf[1]
                if seqfile.startswith('s3://'):
                    # no use to have absolute s3 reference as cactus-align requires seqfiles passed locally
                    seqfile = 'seqfiles/{}'.format(os.path.basename(seqfile))
                chromfile.write('{}\t{}\t{}\n'.format(ref_contig, seqfile, paf))
    if chrom_file_path.startswith('s3://'):
        write_s3(chrom_file_temp_path, chrom_file_path)

    toil.exportFile(split_log_ids[0], makeURL(os.path.join(output_dir, 'minigraph.split.log')))
    if split_log_ids[1]:
        toil.exportFile(split_log_ids[1],
                        makeURL(os.path.join(output_dir, 'minimap2.ambiguous.split.log')))

def main():
    options = get_options()

    with Toil(options) as workflow:
        setupBinaries(options)
        importSingularityImage(options)

        ## Preprocessing:
        if (options.pathOverrides or options.pathOverrideNames):
            if not options.pathOverrides or not options.pathOverrideNames or \
               len(options.pathOverrideNames) != len(options.pathOverrides):
                raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

        # apply path overrides. this was necessary for wdl which doesn't take kindly to
        # text files of local paths (ie seqfile). one way to fix would be to add support
        # for s3 paths and force wdl to use it. a better way would be a more fundamental
        # interface shift away from files of paths throughout all of cactus
        if options.pathOverrides:
            seqFile = SeqFile(options.seqFile)
            configNode = ET.parse(options.configFile).getroot()
            config = ConfigWrapper(configNode)
            tree = MultiCactusTree(seqFile.tree)
            tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
            for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                seqFile.pathMap[name] = override
            override_seq = os.path.join(options.cactusDir, 'seqFile.override')
            with open(override_seq, 'w') as out_sf:
                out_sf.write(str(seqFile))
            options.seqFile = override_seq

        # Import asms; by default, prepends unique IDs in the technique used in cactus-blast.
        asms = get_asms_from_seqfile(options.seqFile, workflow)

        ## Perform alignments:
        if not workflow.options.restart:
            alignments = workflow.start(
                Job.wrapJobFn(run_cactus_reference_align, asms, options.refID,
                              options.debug_export, options.dipcall_bed_filter,
                              options.dipcall_vcf_filter))
        else:
            alignments = workflow.restart()

        if options.debug_export:
            # first, ensure the debug dir exists.
            if not os.path.isdir(options.debug_export_dir):
                os.mkdir(options.debug_export_dir)

            print(alignments)
            # Then return value is: (all_primary, all_secondary, ref_mappings, primary_mappings, secondary_mappings)
            for asm, mapping_file in alignments[2].items():
                workflow.exportFile(mapping_file, 'file://' + os.path.abspath("mappings_for_" + asm + ".paf"))
            for asm, mapping_file in alignments[3].items():
                workflow.exportFile(mapping_file, 'file://' + os.path.abspath("mappings_for_" + asm + ".cigar"))
            for asm, mapping_file in alignments[4].items():
                workflow.exportFile(mapping_file, 'file://' + os.path.abspath("mappings_for_" + asm + ".cigar.secondary"))

        ## Save alignments:
        if options.dipcall_vcf_filter:
            # this is substantially less restrictive than the dipcall_bed_filter.
            dipcall_filtered = workflow.start(Job.wrapJobFn(apply_dipcall_vcf_filter, alignments[0]))
            workflow.exportFile(dipcall_filtered, makeURL(options.outputFile))
            workflow.exportFile(alignments[1], makeURL(options.outputFile + ".unfiltered.secondary"))
        else:
            workflow.exportFile(alignments[0], makeURL(options.outputFile))
            workflow.exportFile(alignments[1], makeURL(options.outputFile + ".secondary"))
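# Hedged readability note: the tuple unpacked by index above is documented in
# the debug-export branch as (all_primary, all_secondary, ref_mappings,
# primary_mappings, secondary_mappings). One illustrative way (not part of
# cactus) to make the indexing self-describing:
from collections import namedtuple

RefAlignResults = namedtuple('RefAlignResults',
                             ['all_primary', 'all_secondary', 'ref_mappings',
                              'primary_mappings', 'secondary_mappings'])
# e.g. results = RefAlignResults(*alignments); results.ref_mappings.items()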
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("cigarsFile", nargs="*",
                        help="Pairwise alignments (from cactus-blast, cactus-refmap or cactus-graphmap)")
    parser.add_argument("outHal", type=str,
                        help="Output HAL file (or directory in --batch mode)")
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --paths) of path overrides")

    #Pangenome Options
    parser.add_argument("--pangenome", action="store_true",
                        help="Activate pangenome mode (suitable for star trees of closely related samples)"
                        " by overriding several configuration settings."
                        " The overridden configuration will be saved in <outHal>.pg-conf.xml")
    parser.add_argument("--pafInput", action="store_true",
                        help="'cigarsFile' argument is in PAF format, rather than lastz cigars.")
    parser.add_argument("--usePafSecondaries", action="store_true",
                        help="use the secondary alignments from the PAF input. They are ignored by default.")
    parser.add_argument("--singleCopySpecies", type=str,
                        help="Filter out all self-alignments in given species")
    parser.add_argument("--barMaskFilter", type=int, default=None,
                        help="BAR's POA aligner will ignore softmasked regions greater than this length."
                        " (overrides partialOrderAlignmentMaskFilter in config)")
    parser.add_argument("--outVG", action="store_true",
                        help="export pangenome graph in VG (.vg) in addition to HAL")
    parser.add_argument("--outGFA", action="store_true",
                        help="export pangenome graph in GFA (.gfa.gz) in addition to HAL")
    parser.add_argument("--batch", action="store_true",
                        help="Launch batch of alignments. Input seqfile is expected to be a chromfile"
                        " as generated by cactus-graphmap-split")
    parser.add_argument("--stagger", type=int, default=0,
                        help="Stagger alignment jobs in batch mode by this many seconds"
                        " (to avoid starting them all at once)")
    parser.add_argument("--acyclic", type=str,
                        help="Ensure that given genome is acyclic by deleting all paralogy edges in postprocessing")

    #Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", default=None,
                        help="Name of ancestral node (which must appear in NEWICK tree in <seqfile>)"
                        " to use as a root for the alignment. Any genomes not below this node in the"
                        " tree may be used as outgroups but will never appear in the output."
                        " If no root is specified then the root of the tree is used.")
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container"
                        " rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image"
                        " rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonCactusInput", action="store_true",
                        help="Input lastz cigars do not come from cactus-blast or cactus-refmap:"
                        " prepend ids in cigars")
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if (options.pathOverrides or options.pathOverrideNames):
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    options.buildHal = True
    options.buildFasta = True

    if options.outHal.startswith('s3://'):
        if not has_s3:
            raise RuntimeError("S3 support requires toil to be installed with [aws]")
        # write a little something to the bucket now to catch any glaring problems asap
        test_file = os.path.join(getTempDirectory(), 'check')
        with open(test_file, 'w') as test_o:
            test_o.write("\n")
        region = get_aws_region(options.jobStore) if options.jobStore.startswith('aws:') else None
        write_s3(test_file,
                 options.outHal if options.outHal.endswith('.hal') else os.path.join(options.outHal, 'test'),
                 region=region)
        options.checkpointInfo = (get_aws_region(options.jobStore), options.outHal)
    else:
        options.checkpointInfo = None

    if options.batch:
        # the output hal is a directory, make sure it's there
        if not os.path.isdir(options.outHal):
            os.makedirs(options.outHal)
        assert len(options.cigarsFile) == 0
    else:
        assert len(options.cigarsFile) > 0

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # We set which type of unique ids to expect: Numeric (from cactus-blast) or
    # Eventname (cactus-refmap or cactus-graphmap).
    # This is a bit ugly, since we don't have a good way to differentiate refmap from blast,
    # and use --pangenome as a proxy.
    # But I don't think there's a real use case yet of making a separate parameter
    options.eventNameAsID = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID')
    if options.eventNameAsID is not None:
        options.eventNameAsID = bool(options.eventNameAsID) and options.eventNameAsID != '0'
    else:
        options.eventNameAsID = options.pangenome or options.pafInput
    os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str(int(options.eventNameAsID))

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            align_jobs = make_batch_align_jobs(options, toil)
            results_dict = toil.start(Job.wrapJobFn(run_batch_align_jobs, align_jobs))

        # when using s3 output urls, things get checkpointed as they're made, so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement)
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(results[0],
                                    makeURL(os.path.join(options.outHal, '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(results[1],
                                        makeURL(os.path.join(options.outHal, '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(results[2],
                                        makeURL(os.path.join(options.outHal, '{}.gfa.gz'.format(chrom))))
            else:
                assert len(results_dict) == 1 and None in results_dict
                halID, vgID, gfaID = results_dict[None][0], results_dict[None][1], results_dict[None][2]
                # export the hal
                toil.exportFile(halID, makeURL(options.outHal))
                # export the vg
                if options.outVG:
                    toil.exportFile(vgID, makeURL(os.path.splitext(options.outHal)[0] + '.vg'))
                if options.outGFA:
                    toil.exportFile(gfaID, makeURL(os.path.splitext(options.outHal)[0] + '.gfa.gz'))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
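# The CACTUS_EVENT_NAME_AS_UNIQUE_ID handling above falls back to
# --pangenome/--pafInput when the variable is unset, and otherwise treats ''
# and '0' as false and any other value as true. A minimal standalone sketch of
# that parsing rule (the helper name is ours, not part of cactus):
def _env_flag_is_set(value):
    """Interpret an environment-variable string as a boolean flag."""
    return bool(value) and value != '0'

# e.g. _env_flag_is_set('1') -> True, _env_flag_is_set('0') -> False, _env_flag_is_set('') -> False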
def runCactusAfterBlastOnly(options):
    with Toil(options) as toil:
        importSingularityImage(options)
        #Run the workflow
        if options.restart:
            halID = toil.restart()
        else:
            options.cactusDir = getTempDirectory()

            # apply path overrides. this was necessary for wdl which doesn't take kindly to
            # text files of local paths (ie seqfile). one way to fix would be to add support
            # for s3 paths and force wdl to use it. a better way would be a more fundamental
            # interface shift away from files of paths throughout all of cactus
            if options.pathOverrides:
                seqFile = SeqFile(options.seqFile)
                configNode = ET.parse(options.configFile).getroot()
                config = ConfigWrapper(configNode)
                tree = MultiCactusTree(seqFile.tree)
                tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())
                for name, override in zip(options.pathOverrideNames, options.pathOverrides):
                    seqFile.pathMap[name] = override
                override_seq = os.path.join(options.cactusDir, 'seqFile.override')
                with open(override_seq, 'w') as out_sf:
                    out_sf.write(str(seqFile))
                options.seqFile = override_seq

            #to be consistent with all-in-one cactus, we make sure the project
            #isn't limiting itself to the subtree (todo: parameterize so root can
            #be passed through from prepare to blast/align)
            proj_options = copy.deepcopy(options)
            proj_options.root = None

            #Create the progressive cactus project (as we do in runCactusProgressive)
            projWrapper = ProjectWrapper(proj_options, proj_options.configFile,
                                         ignoreSeqPaths=options.root)
            projWrapper.writeXml()

            pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                                  '%s_project.xml' % ProjectWrapper.alignmentDirName)
            assert os.path.exists(pjPath)

            project = MultiCactusProject()

            if not os.path.isdir(options.cactusDir):
                os.makedirs(options.cactusDir)

            project.readXML(pjPath)

            # open up the experiment (as we do in ProgressiveUp.run)
            # note that we copy the path into the options here
            experimentFile = project.expMap[options.root]
            expXml = ET.parse(experimentFile).getroot()
            experiment = ExperimentWrapper(expXml)
            configPath = experiment.getConfigPath()
            configXml = ET.parse(configPath).getroot()

            seqIDMap = dict()
            tree = MultiCactusTree(experiment.getTree()).extractSubTree(options.root)
            leaves = [tree.getName(leaf) for leaf in tree.getLeaves()]
            outgroups = experiment.getOutgroupGenomes()
            genome_set = set(leaves + outgroups)

            # this is a hack to allow specifying all the input on the command line,
            # rather than using suffix lookups
            def get_input_path(suffix=''):
                base_path = options.cigarsFile[0]
                for input_path in options.cigarsFile:
                    if suffix and input_path.endswith(suffix):
                        return input_path
                    if os.path.basename(base_path).startswith(os.path.basename(input_path)):
                        base_path = input_path
                return base_path + suffix

            # import the outgroups
            outgroupIDs = []
            outgroup_fragment_found = False
            for i, outgroup in enumerate(outgroups):
                try:
                    outgroupID = toil.importFile(makeURL(get_input_path('.og_fragment_{}'.format(i))))
                    outgroupIDs.append(outgroupID)
                    experiment.setSequenceID(outgroup, outgroupID)
                    outgroup_fragment_found = True
                    assert not options.pangenome
                except Exception:
                    # we assume that input is not coming from cactus blast, so we'll treat output
                    # sequences normally and not go looking for fragments
                    outgroupIDs = []
                    break

            #import the sequences (that we need to align for the given event, ie leaves and outgroups)
            for genome, seq in list(project.inputSequenceMap.items()):
                if genome in leaves or (not outgroup_fragment_found and genome in outgroups):
                    if os.path.isdir(seq):
                        tmpSeq = getTempFile()
                        catFiles([os.path.join(seq, subSeq) for subSeq in os.listdir(seq)], tmpSeq)
                        seq = tmpSeq
                    seq = makeURL(seq)
                    experiment.setSequenceID(genome, toil.importFile(seq))

            if not outgroup_fragment_found:
                outgroupIDs = [experiment.getSequenceID(outgroup) for outgroup in outgroups]

            # write back the experiment, as CactusWorkflowArguments wants a path
            experiment.writeXML(experimentFile)

            #import cactus config
            if options.configFile:
                cactusConfigID = toil.importFile(makeURL(options.configFile))
            else:
                cactusConfigID = toil.importFile(makeURL(project.getConfigPath()))
            project.setConfigID(cactusConfigID)

            project.syncToFileStore(toil)
            configNode = ET.parse(project.getConfigPath()).getroot()
            configWrapper = ConfigWrapper(configNode)
            configWrapper.substituteAllPredefinedConstantsWithLiterals()

            if options.pangenome:
                # turn off the megablock filter as it ruins non-all-to-all alignments
                configWrapper.disableCafMegablockFilter()
                # the recoverable chains parameter does not seem to play nicely with star-like alignments either
                #configWrapper.disableRecoverableChains()

            workFlowArgs = CactusWorkflowArguments(options,
                                                   experimentFile=experimentFile,
                                                   configNode=configNode,
                                                   seqIDMap=project.inputSequenceIDMap)

            #import the files that cactus-blast made
            workFlowArgs.alignmentsID = toil.importFile(makeURL(get_input_path()))
            workFlowArgs.secondaryAlignmentsID = None
            if not options.pafInput:
                try:
                    workFlowArgs.secondaryAlignmentsID = toil.importFile(makeURL(get_input_path('.secondary')))
                except Exception:
                    pass
            workFlowArgs.outgroupFragmentIDs = outgroupIDs
            workFlowArgs.ingroupCoverageIDs = []
            if outgroup_fragment_found and len(outgroups) > 0:
                for i in range(len(leaves)):
                    workFlowArgs.ingroupCoverageIDs.append(
                        toil.importFile(makeURL(get_input_path('.ig_coverage_{}'.format(i)))))

            halID = toil.start(Job.wrapJobFn(run_cactus_align, configWrapper, workFlowArgs, project,
                                             doRenaming=options.nonCactusInput,
                                             pafInput=options.pafInput))

        # export the hal
        toil.exportFile(halID, makeURL(options.outputHal))
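# A hedged illustration of the get_input_path() suffix lookup defined above:
# it walks cigarsFile keeping the entry whose basename is a prefix of the
# current base's basename, and a suffix argument either matches an existing
# entry or is appended to that base. The file names below are hypothetical.
def _demo_suffix_lookup():
    cigars = ['aln.paf', 'aln.paf.secondary']

    def get_input_path(suffix=''):
        base_path = cigars[0]
        for input_path in cigars:
            if suffix and input_path.endswith(suffix):
                return input_path
            if os.path.basename(base_path).startswith(os.path.basename(input_path)):
                base_path = input_path
        return base_path + suffix

    assert get_input_path() == 'aln.paf'                              # base path wins
    assert get_input_path('.secondary') == 'aln.paf.secondary'        # exact suffix match
    assert get_input_path('.og_fragment_0') == 'aln.paf.og_fragment_0'  # fall back to base + suffix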
def testLastzRepeatMask(self):
    #Demo sequences
    sequenceFiles = [os.path.join(self.encodePath, self.encodeRegion, "%s.ENm001.fa" % species)
                     for species in ("human", "hedgehog")]
    #Max occurrences of a repeat within the sequence
    maxOccurrence = 1

    for sequenceFile in sequenceFiles:
        #Parse sequences into dictionary
        originalSequences = getSequences(sequenceFile)
        #Get the masked bases
        maskedBasesOriginal = getMaskedBases(originalSequences)
        #Total bases
        totalBases = sum([len(i) for i in originalSequences.values()])
        #Calculate number of hard masked bases
        totalNBases = len([(header, i, base) for (header, i, base) in maskedBasesOriginal
                           if base.upper() == "N"])

        #Run lastz repeat masker
        startTime = time.time()
        with Toil(self.toilOptions) as toil:
            sequenceID = toil.importFile(makeURL(sequenceFile))
            repeatMaskOptions = RepeatMaskOptions(
                proportionSampled=1.0,
                minPeriod=maxOccurrence,
                lastzOpts="--step=1 --ambiguous=iupac,100,100 --ydrop=3000",
                fragment=200)
            outputID = toil.start(LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                                     queryID=sequenceID,
                                                     targetIDs=[sequenceID]))
            toil.exportFile(outputID, makeURL(self.tempOutputFile))
        print("It took %s seconds to run lastzMasking" % (time.time() - startTime))

        #Parse lastz masked sequences into dictionary
        lastzSequences = getSequences(self.tempOutputFile)

        #Check the sequences are the same modulo masking
        self.checkSequenceSetsEqualModuloSoftMasking(originalSequences, lastzSequences)

        #Compare the proportion of bases masked by lastz with original repeat masking
        maskedBasesOriginal = getMaskedBases(originalSequences)
        maskedBasesLastzMasked = getMaskedBases(lastzSequences)
        print(" For the sequence file ", sequenceFile,
              " the total number of sequences is ", len(originalSequences),
              " the total number of bases ", totalBases,
              " the number of bases originally masked was: ", len(maskedBasesOriginal),
              " the number of bases masked after running lastz repeat masking is: ", len(maskedBasesLastzMasked),
              " the intersection of these masked sets is: ", len(maskedBasesLastzMasked.intersection(maskedBasesOriginal)),
              " the total number of bases that are Ns ", totalNBases,
              " lastz was filtered for max-occurrences of more than: ", maxOccurrence)
        #self.assertGreater(len(maskedBasesLastzMasked), len(maskedBasesOriginal))

        #Run lastz repeat masker using heuristic settings for comparison with the slower settings
        startTime = time.time()
        with Toil(self.toilOptions) as toil:
            sequenceID = toil.importFile(makeURL(sequenceFile))
            repeatMaskOptions = RepeatMaskOptions(
                proportionSampled=1.0,
                minPeriod=maxOccurrence,
                lastzOpts="--step=3 --ambiguous=iupac,100,100 --ungapped"
                          " --queryhsplimit=keep,nowarn:%i" % (int(maxOccurrence) * 20),
                fragment=200)
            outputID = toil.start(LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                                     queryID=sequenceID,
                                                     targetIDs=[sequenceID]))
            toil.exportFile(outputID, makeURL(self.tempOutputFile))
        print("It took %s seconds to run lastzMasking fast" % (time.time() - startTime))

        lastzSequencesFast = getSequences(self.tempOutputFile)
        maskedBasesLastzMaskedFast = getMaskedBases(lastzSequencesFast)
        self.assertGreater(len(maskedBasesLastzMaskedFast), len(maskedBasesOriginal))

        #Compare the fast masking to the slow masking via set overlap
        i = float(len(maskedBasesLastzMaskedFast.intersection(maskedBasesLastzMasked)))
        precision = i / len(maskedBasesLastzMasked)
        recall = i / len(maskedBasesLastzMaskedFast)
        self.assertGreater(precision, 0.93)
        self.assertGreater(recall, 0.93)
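# For clarity, the precision/recall asserted at the end of the test above is a
# plain set-overlap measure between the slow and fast masked-base sets. A tiny
# standalone sketch of that scoring (the helper name is ours, not part of the
# test suite):
def _mask_overlap(slow_masked, fast_masked):
    """Return (precision, recall) of the fast mask, scored as in the test above."""
    overlap = float(len(fast_masked & slow_masked))
    return overlap / len(slow_masked), overlap / len(fast_masked)

# e.g. _mask_overlap({1, 2, 3, 4}, {2, 3, 4, 5}) == (0.75, 0.75)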