def getConfigPath(self):
    config = self.xmlRoot.attrib["config"]
    if config == 'default':
        config = os.path.join(cactusRootPath(), "cactus_config.xml")
    if config == 'defaultProgressive':
        config = os.path.join(cactusRootPath(), "cactus_progressive_config.xml")
    return config
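# A stand-alone sketch of the sentinel resolution above: the two named
# configs map to files under the cactus root, and anything else is treated
# as a literal path. The root directory below is a placeholder for
# cactusRootPath(), not the real install location.
import os

def _resolve_config_sentinel(config, root="/opt/cactus"):
    named = {'default': "cactus_config.xml",
             'defaultProgressive': "cactus_progressive_config.xml"}
    return os.path.join(root, named[config]) if config in named else config

assert _resolve_config_sentinel('default').endswith("cactus_config.xml")
assert _resolve_config_sentinel('/my/config.xml') == '/my/config.xml'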
def processConfig(self):
    # read in the default right out of cactus
    if self.options.configFile is not None:
        configPath = self.options.configFile
    else:
        dir = cactusRootPath()
        configPath = os.path.join(dir, "cactus_progressive_config.xml")
    configXml = ET.parse(configPath).getroot()
    self.configWrapper = ConfigWrapper(configXml)
    # here we can go through the options and apply some to the config
    self.configWrapper.setBuildHal(True)
    self.configWrapper.setBuildFasta(True)
    if self.options.outputMaf is not None:
        self.configWrapper.setBuildMaf(True)
        self.configWrapper.setJoinMaf(True)
    # pre-emptively turn down maxParallelSubtree for singleMachine
    # mode if not enough threads are provided to support it. Probably
    # need to do something for other "combined" batch systems?
    if self.options.batchSystem == 'singleMachine' and \
       self.options.database == 'kyoto_tycoon':
        if int(self.options.maxThreads) < \
           self.configWrapper.getMaxParallelSubtrees() * 3:
            self.configWrapper.setMaxParallelSubtrees(
                max(1, int(self.options.maxThreads) // 3))
    # this is a little hack to effectively toggle back to the
    # non-progressive version of cactus (as published in Genome Research 2011)
    # from the high-level interface.
    if self.options.legacy is True:
        self.configWrapper.setSubtreeSize(sys.maxsize)
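# Worked example of the thread-budgeting rule above (a sketch, not part of
# Cactus): with kyoto_tycoon each parallel subtree is assumed to need about
# 3 threads, so the subtree count is capped at maxThreads // 3, floor 1.
def _capped_parallel_subtrees(max_threads, configured):
    if max_threads < configured * 3:
        return max(1, max_threads // 3)
    return configured

assert _capped_parallel_subtrees(4, 3) == 1    # too few threads: fall back to 1
assert _capped_parallel_subtrees(16, 3) == 3   # enough headroom: keep config value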
def run(self, fileStore):
    """ mask alpha satellites with dna-brnn """
    fastaFile = fileStore.readGlobalFile(self.fastaID)
    cmd = ['dna-brnn', fastaFile] + self.dnabrnnOpts.split()
    if '-i' not in self.dnabrnnOpts:
        # pull up the model
        # todo: is there a more robust way?
        cmd += ['-i', os.path.join(cactusRootPath(), 'attcc-alpha.knm')]
    if self.cores:
        cmd += ['-t', str(self.cores)]
    bedFile = fileStore.getLocalTempFile()
    # run dna-brnn to make a bed file
    cactus_call(outfile=bedFile, parameters=cmd)
    maskedFile = fileStore.getLocalTempFile()
    mask_cmd = ['cactus_fasta_softmask_intervals.py', '--origin=zero',
                '--minLength={}'.format(self.minLength), bedFile]
    # do the softmasking
    cactus_call(infile=fastaFile, outfile=maskedFile, parameters=mask_cmd)
    return fileStore.writeGlobalFile(maskedFile)
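# Illustration (hypothetical data) of what the --minLength option in the
# softmasking step above is assumed to do: BED intervals shorter than the
# threshold are dropped before masking.
def _keep_maskable(intervals, min_length):
    return [(start, end) for (start, end) in intervals if end - start >= min_length]

assert _keep_maskable([(0, 500), (600, 5000)], 1000) == [(600, 5000)]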
def wdl_workflow_start(options, in_seq_file):
    s = 'version 1.0\n\n'
    s += wdl_task_preprocess(options) + '\n'
    s += wdl_task_blast(options) + '\n'
    s += wdl_task_align(options) + '\n'
    s += wdl_task_hal_append(options) + '\n'
    s += 'workflow cactus_prepared {\n\n'
    # we need to explicitly import local files
    s += '    input {\n'
    s += '        File seq_file'
    if not options.noLocalInputs:
        s += '=\"{}\"'.format(os.path.abspath(options.seqFile))
    s += '\n'
    s += '        File? config_file'
    if not options.noLocalInputs and options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml"):
        s += '=\"{}\"'.format(os.path.abspath(options.configFile))
    s += '\n'
    for name, fa_path in in_seq_file.pathMap.items():
        # todo: replace with check from toil
        if '://' not in fa_path:
            s += '        File {}'.format(input_fa_name(name))
            if not options.noLocalInputs:
                s += '=\"{}\"'.format(os.path.abspath(fa_path))
            s += '\n'
    s += '    }\n'
    return s
def getRandomConfigFile():
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(), "cactus_config.xml")).getroot()
    cafNode = config.find("caf")
    assert len(config.findall("caf")) == 1
    annealingRounds = 1 + int(random.random() * 10)
    cafNode.attrib["annealingRounds"] = " ".join(
        [str(1 + int(random.random() * 10)) for i in range(annealingRounds)])
    deannealingRounds = list(set(
        [1 + int(random.random() * 10) for i in range(int(random.random() * 10))]))
    deannealingRounds.sort()
    cafNode.attrib["deannealingRounds"] = " ".join([str(i) for i in deannealingRounds])
    cafNode.attrib["trim"] = " ".join(
        [str(1 + int(random.random() * 5)) for i in range(annealingRounds)])
    cafNode.attrib["alignRepeatsAtLoop"] = str(random.random() * annealingRounds)
    cafNode.attrib["minimumTreeCoverage"] = str(random.random())
    cafNode.attrib["blockTrim"] = str(int(random.random() * 5))
    cafNode.attrib["ignoreAllChainsLessThanMinimumTreeCoverage"] = str(random.choice([0, 1]))
    cafNode.attrib["minimumBlockDegree"] = str(random.choice([0, 5]))
    checkNode = config.find("check")
    checkNode.attrib["runCheck"] = "1"
    checkNode = config.find("normal")
    checkNode.attrib["iterations"] = "2"
    # Now print the file..
    fileHandle = open(tempConfigFile, 'w')
    ET.ElementTree(config).write(fileHandle)
    fileHandle.close()
    if getLogLevelString() == "DEBUG":
        system("cat %s" % tempConfigFile)
    return tempConfigFile
def getConfigFile(matchingAlgorithm="greedy"):
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(), "cactus_progressive_config.xml")).getroot()
    config.find("reference").attrib["matching_algorithm"] = matchingAlgorithm
    ET.ElementTree(config).write(tempConfigFile)
    return os.path.abspath(tempConfigFile)
def setUp(self):
    self.batchSystem = "singleMachine"
    if getBatchSystem() is not None:
        self.batchSystem = getBatchSystem()
    unittest.TestCase.setUp(self)
    self.configFile = os.path.join(cactusRootPath(), "cactus_config.xml")
    self.configNode = ET.parse(self.configFile).getroot()
    self.barNode = self.configNode.find("bar")
    assert self.barNode is not None
def getConfigFile(matchingAlgorithm="greedy"):
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(), "cactus_progressive_config.xml")).getroot()
    # Set the matching algorithm
    config.find("reference").attrib["matching_algorithm"] = matchingAlgorithm
    # Now print the file..
    fileHandle = open(tempConfigFile, 'w')
    ET.ElementTree(config).write(fileHandle)
    fileHandle.close()
    return os.path.abspath(tempConfigFile)
def cactusPrepare(options, project):
    """ annotate a SeqFile with ancestral names as well as paths for output sequences."""
    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    # prepare output sequence directory
    # todo: support remote (ie s3) output directory
    try:
        os.makedirs(options.outSeqDir)
    except:
        pass
    if not os.path.isdir(options.outSeqDir):
        raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outSeqDir))
    if not os.access(options.outSeqDir, os.W_OK):
        logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outSeqDir))

    # hack the configfile to skip preprocessing and write it to the output dir
    if options.preprocessOnly:
        config.removePreprocessors()
        options.configFile = os.path.join(options.outSeqDir, 'config.xml')
        config.writeXML(options.configFile)

    # pass through the config file to the options
    # todo (don't like second hard-code check of .xml path)
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml"):
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = seqFile.pathMap
    outSeqFile.outgroups = seqFile.outgroups

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        if leaf or (not leaf and name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outSeqDir, os.path.basename(out_basename))

    # write the output
    with open(options.outSeqFile, 'w') as out_sf:
        out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, outSeqFile))
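# Distilled version of the path-update rule in the traversal above (names
# and paths are made up): a genome already in the seqFile keeps its
# basename under outSeqDir, while a newly named ancestor gets '<name>.fa'.
import os

def _out_path(name, path_map, out_seq_dir):
    base = path_map[name] if name in path_map else '{}.fa'.format(name)
    return os.path.join(out_seq_dir, os.path.basename(base))

assert _out_path('human', {'human': '/data/hg38.fa'}, '/out') == '/out/hg38.fa'
assert _out_path('Anc01', {'human': '/data/hg38.fa'}, '/out') == '/out/Anc01.fa'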
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("outputSequenceDir", help='Directory where the processed sequences will be placed')
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("inputSequences", nargs='+', help='input FASTA file(s)')
    options = parser.parse_args()
    setLoggingFromOptions(options)

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=options.outputSequenceDir, configFile=options.configFile,
                      inputSequences=options.inputSequences, toil=toil, restart=options.restart)
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", default=None,
                        help="Name of ancestral node (which must appear in NEWICK tree "
                        "in <seqfile>) to use as a root for the alignment. Any genomes "
                        "not below this node in the tree may be used as outgroups but "
                        "will never appear in the output. If no root is specified then "
                        "the root of the tree is used.")
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusProgressive(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("Cactus has finished after {} seconds".format(run_time))
def processConfig(self):
    # read in the default right out of cactus
    if self.options.configFile is not None:
        configPath = self.options.configFile
    else:
        dir = cactusRootPath()
        configPath = os.path.join(dir, "cactus_progressive_config.xml")
    configXml = ET.parse(configPath).getroot()
    self.configWrapper = ConfigWrapper(configXml)
    # here we can go through the options and apply some to the config
    self.configWrapper.setBuildHal(True)
    self.configWrapper.setBuildFasta(True)
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file (will be modified if necessary to include graph Fasta sequence)")
    parser.add_argument("minigraphGFA", help="Minigraph-compatible reference graph in GFA format (can be gzipped)")
    parser.add_argument("outputPAF", type=str, help="Output pairwise alignment file in PAF format")
    parser.add_argument("--outputFasta", type=str,
                        help="Output graph sequence file in FASTA format (required if not present in seqFile)")
    parser.add_argument("--maskFilter", type=int,
                        help="Ignore softmasked sequence intervals > Nbp (overrides config option of same name)")
    parser.add_argument("--outputGAFDir", type=str,
                        help="Output GAF alignments (raw minigraph output before PAF conversion) to this directory")
    parser.add_argument("--refFromGFA", type=str,
                        help="Do not align given genome from seqfile, and instead extract its alignment from the rGFA tags "
                        "(must have been used as reference for minigraph GFA construction)")

    # WDL hacks
    parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outputGAFDir:
        if not os.path.isdir(options.outputGAFDir):
            os.makedirs(options.outputGAFDir)

    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMap(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap has finished after {} seconds".format(run_time))
def setUp(self):
    self.batchSystem = "singleMachine"
    if getBatchSystem() is not None:
        self.batchSystem = getBatchSystem()
    unittest.TestCase.setUp(self)
    self.useOutgroup = False
    self.doSelfAlignment = False
    # Load the config file, turn on the checks.
    configWrapper = ConfigWrapper(ET.parse(os.path.join(cactusRootPath(),
                                                        "cactus_progressive_config.xml")).getroot())
    configWrapper.turnAllModesOn()
    self.tempDir = getTempDirectory(os.getcwd())
    self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
    configWrapper.writeXML(self.configFile)
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("--vg", required=True, nargs='+', help="Input vg files (PackedGraph or HashGraph format)")
    parser.add_argument("--outDir", required=True, type=str, help="Output directory")
    parser.add_argument("--outName", required=True, type=str, help="Basename of all output files")
    parser.add_argument("--reference", required=True, type=str, help="Reference event name")
    parser.add_argument("--vcfReference", type=str, help="Reference event for VCF (if different from --reference)")
    parser.add_argument("--rename", nargs='+', default=[],
                        help="Path renaming, each of form src>dest (see clip-vg -r)")
    parser.add_argument("--clipLength", type=int, default=None,
                        help="clip out unaligned sequences longer than this")
    parser.add_argument("--wlineSep", type=str, help="wline separator for vg convert")
    parser.add_argument("--indexCores", type=int, default=1, help="cores for indexing processes")
    parser.add_argument("--decoyGraph", help="decoy sequences vg graph to add (PackedGraph or HashGraph format)")
    parser.add_argument("--hal", nargs='+', default=[], help="Input hal files (for merging)")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)

    if options.hal and len(options.hal) != len(options.vg):
        raise RuntimeError("--hal and --vg must specify the same number of files")

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapJoin(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap-join has finished after {} seconds".format(run_time))
def processConfig(self):
    # read in the default right out of cactus
    if self.options.configFile is not None:
        configPath = self.options.configFile
    else:
        dir = cactusRootPath()
        configPath = os.path.join(dir, "cactus_progressive_config.xml")
    log.info("Using config from path %s." % configPath)
    configXml = ET.parse(configPath).getroot()
    self.configWrapper = ConfigWrapper(configXml)
    # here we can go through the options and apply some to the config
    self.configWrapper.setBuildHal(True)
    self.configWrapper.setBuildFasta(True)
def setUp(self):
    self.batchSystem = "singleMachine"
    if getBatchSystem() is not None:
        self.batchSystem = getBatchSystem()
    unittest.TestCase.setUp(self)
    self.useOutgroup = False
    self.doSelfAlignment = False
    # Load the config file, turn on the checks.
    configWrapper = ConfigWrapper(ET.parse(os.path.join(cactusRootPath(),
                                                        "cactus_progressive_config.xml")).getroot())
    configWrapper.turnAllModesOn()
    configWrapper.turnOffHeaderChecks()
    self.tempDir = getTempDirectory(os.getcwd())
    self.configFile = os.path.join(self.tempDir, "tempConfig.xml")
    configWrapper.writeXML(self.configFile)
def loadDnaBrnnModel(toil, configNode, maskAlpha=False):
    """ store the model in a toil file id so it can be used in any workflow """
    for prepXml in configNode.findall("preprocessor"):
        if prepXml.attrib["preprocessJob"] == "dna-brnn":
            if maskAlpha or getOptionalAttrib(prepXml, "active", typeFn=bool, default=False):
                dnabrnnOpts = getOptionalAttrib(prepXml, "dna-brnnOpts", default="")
                # tokenize before looking for -i: indexing the raw string
                # would return a single character, not the model path
                opts = dnabrnnOpts.split()
                if '-i' in opts:
                    model_path = opts[opts.index('-i') + 1]
                else:
                    model_path = os.path.join(cactusRootPath(), 'attcc-alpha.knm')
                os.environ["CACTUS_DNA_BRNN_MODEL_ID"] = toil.importFile(makeURL(model_path))
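# The flag lookup above has to operate on tokens, not on the raw option
# string; a minimal stand-alone version of that parse (flag and values
# are illustrative):
def _flag_value(opt_string, flag):
    toks = opt_string.split()
    return toks[toks.index(flag) + 1] if flag in toks else None

assert _flag_value("-A -i model.knm -t 4", "-i") == "model.knm"
assert _flag_value("-A -t 4", "-i") is None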
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file (gzipped fastas supported)")
    parser.add_argument("minigraphGFA", help="Minigraph-compatible reference graph in GFA format (can be gzipped)")
    parser.add_argument("graphmapPAF", type=str, help="Output pairwise alignment file in PAF format (can be gzipped)")
    parser.add_argument("--outDir", required=True, type=str, help="Output directory")
    parser.add_argument("--refContigs", nargs="*", default=[],
                        help="Subset to these reference contigs (multiple allowed)")
    parser.add_argument("--refContigsFile", type=str,
                        help="Subset to (newline-separated) reference contigs in this file")
    parser.add_argument("--otherContig", type=str,
                        help="Lump all reference contigs unselected by above options into single one with this name")
    parser.add_argument("--reference", type=str,
                        help="Name of reference (in seqFile). Ambiguity filters will not be applied to it")
    parser.add_argument("--maskFilter", type=int, help="Ignore softmasked sequence intervals > Nbp")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.outDir and not options.outDir.startswith('s3://'):
        if not os.path.isdir(options.outDir):
            os.makedirs(options.outDir)

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusGraphMapSplit(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-graphmap-split has finished after {} seconds".format(run_time))
def main():
    parser = ArgumentParser()
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outSeqDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed')
    parser.add_argument("outSeqFile", help="Path for annotated Seq file output")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3,
                        help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--jobStore", type=str, default="$JOBSTORE", help="jobstore to use in suggested commands")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo",
                        help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true",
                        help="only decompose into preprocessor and cactus jobs")
    options = parser.parse_args()
    options.database = 'kyoto_tycoon'
    # todo support root option
    options.root = None

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    # Create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()

    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)

    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("cigarsFile", nargs="+",
                        help="Pairwise alignments (from cactus-blast, cactus-refmap or cactus-graphmap)")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", default=None, required=True,
                        help="Name of ancestral node (which must appear in NEWICK tree "
                        "in <seqfile>) to use as a root for the alignment. Any genomes "
                        "not below this node in the tree may be used as outgroups but "
                        "will never appear in the output. If no root is specified then "
                        "the root of the tree is used.")
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonCactusInput", action="store_true",
                        help="Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars")
    parser.add_argument("--pangenome", action="store_true",
                        help="Override some CAF settings whose defaults are not suited to star trees")
    parser.add_argument("--pafInput", action="store_true",
                        help="'cigarsFile' argument is in PAF format, rather than lastz cigars.")
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    if options.pafInput:
        # cactus-graphmap does not do any prepending to simplify interface with minigraph node names
        # so it must be done here
        options.nonCactusInput = True

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile", type=str, nargs='?', default=None, help="Input Seq file")
    parser.add_argument("outSeqFile", type=str, nargs='?', default=None,
                        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--inputNames", nargs='*',
                        help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument("--inPaths", nargs='*',
                        help='Space-separated list of input fasta paths (to be used in place of --inSeqFile)')
    parser.add_argument("--outPaths", nargs='*',
                        help='Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)')
    parser.add_argument("--maskAlpha", action='store_true',
                        help='Use dna-brnn instead of lastz for repeatmasking')
    parser.add_argument("--clipAlpha", action='store_true',
                        help='use dna-brnn instead of lastz for repeatmasking. Also, clip sequence using given minimum '
                        'length instead of softmasking')
    parser.add_argument("--ignore", nargs='*', default=[],
                        help='Space-separated list of genomes from inSeqFile to ignore')
    parser.add_argument("--maskPAF", type=str,
                        help='Incorporate coverage gaps from given PAF when masking. Only implemented for dna-brnn masking')
    parser.add_argument("--brnnCores", type=int,
                        help='Specify number of cores for each dna-brnn job (overriding default value from the config)')
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # we have two modes: operate directly on paths or rely on the seqfiles. they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError('--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths')
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile:
            raise RuntimeError('--inPaths must be used in conjunction with --outPaths and not with --inSeqFile nor --outSeqFile')
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError('--inPaths and --outPaths must have the same number of arguments')
    else:
        raise RuntimeError('--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input')
    if options.maskAlpha and options.clipAlpha:
        raise RuntimeError('--maskAlpha and --clipAlpha cannot be used together')
    if options.clipAlpha:
        options.maskAlpha = True
    if options.maskPAF and not options.inputNames and not options.inSeqFile:
        raise RuntimeError('--maskPAF requires event names specified either with an input seqfile or with --inputNames')
    if options.ignore and options.clipAlpha is None:
        raise RuntimeError('--ignore can only be used with --clipAlpha')

    inSeqPaths = []
    outSeqPaths = []
    inNames = options.inputNames
    eventNames = []

    # load cactus config
    configNode = ET.parse(options.configFile).getroot()
    # we never want to preprocess minigraph sequences
    graph_event = getOptionalAttrib(findRequiredNode(configNode, "graphmap"), "assemblyName", default="_MINIGRAPH_")
    options.ignore.append(graph_event)

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        if not inNames:
            inNames = [inSeqFile.tree.getName(node) for node in inSeqFile.tree.getLeaves()]

        for inName in inNames:
            if inName in options.ignore:
                # "convenience" functionality: we let the --ignore option update the output seqfile
                # to reflect the fact that we're not touching the original input
                outSeqFile.pathMap[inName] = inSeqFile.pathMap[inName]
                continue
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                raise RuntimeError('{} not present in input and output Seq files'.format(inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                try:
                    os.makedirs(outPath)
                except:
                    pass
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in os.listdir(inPath)]
                outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in os.listdir(inPath)]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]
            eventNames.append(inName)

        if options.ignore:
            # see comment above
            with open(options.outSeqFile, 'w') as outSF:
                outSF.write(str(outSeqFile))

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None, configFile=options.configFile, inputSequences=inSeqPaths,
                      toil=toil, restart=options.restart, outputSequences=outSeqPaths,
                      maskAlpha=options.maskAlpha, clipAlpha=options.clipAlpha, maskPAF=options.maskPAF,
                      inputEventNames=eventNames, brnnCores=options.brnnCores)
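# The two input modes above are mutually exclusive; a distilled sketch of
# that validation logic (argument names shortened for illustration):
def _input_mode(in_seq, out_seq, in_paths, out_paths):
    if in_seq or out_seq:
        if not (in_seq and out_seq) or in_paths or out_paths:
            raise ValueError('seqfile mode needs both seqfiles and no direct paths')
        return 'seqfile'
    if in_paths or out_paths:
        if not (in_paths and out_paths) or len(in_paths) != len(out_paths):
            raise ValueError('path mode needs matched --inPaths/--outPaths')
        return 'paths'
    raise ValueError('no input specified')

assert _input_mode('in.txt', 'out.txt', None, None) == 'seqfile'
assert _input_mode(None, None, ['a.fa'], ['b.fa']) == 'paths'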
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("seqFile", help="Input Seq file")
    parser.add_argument("outSeqFile", help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--inputNames", nargs='*',
                        help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    inSeqFile = SeqFile(options.seqFile)
    outSeqFile = SeqFile(options.outSeqFile)

    inNames = options.inputNames
    if not inNames:
        inNames = [inSeqFile.tree.getName(node) for node in inSeqFile.tree.getLeaves()]

    inSeqPaths = []
    outSeqPaths = []

    for inName in inNames:
        if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
            raise RuntimeError('{} not present in input and output Seq files'.format(inName))
        inPath = inSeqFile.pathMap[inName]
        outPath = outSeqFile.pathMap[inName]
        if os.path.isdir(inPath):
            try:
                os.makedirs(outPath)
            except:
                pass
            assert os.path.isdir(inPath) == os.path.isdir(outPath)
            inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in os.listdir(inPath)]
            outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in os.listdir(inPath)]
        else:
            inSeqPaths += [inPath]
            outSeqPaths += [outPath]

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None, configFile=options.configFile, inputSequences=inSeqPaths,
                      toil=toil, restart=options.restart, outputSequences=outSeqPaths)
def get_options():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    # addCactusWorkflowOptions(parser)

    # ### For quick debugging of apply_dipcall_bed_filter:
    # parser.add_argument('paf', type=str,
    #                     help='For quick debugging of apply_dipcall_bed_filter.')

    # options for basic input/output
    parser.add_argument('seqFile', type=str,
                        help='A file containing all the information specified by cactus in construction. '
                        'This aligner ignores the newick tree.')
    parser.add_argument('refID', type=str,
                        help='Specifies which asm in seqFile should be treated as the reference.')
    parser.add_argument("outputFile", type=str, help="Output pairwise alignment file")
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")

    # dipcall-like filters
    parser.add_argument('--dipcall_bed_filter', action='store_true',
                        help="Applies filters & minimap2 arguments used to make the bedfile in dipcall. "
                        "Only affects the primary mappings file. Secondary mappings aren't used in dipcall.")
    parser.add_argument('--dipcall_vcf_filter', action='store_true',
                        help="Applies filters & minimap2 arguments used to make the vcf in dipcall. "
                        "Only affects the primary mappings file. Secondary mappings aren't used in dipcall.")

    # Progressive Cactus Options:
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")

    ## options for importing assemblies:
    # following arguments are only useful under --non_blast_output
    # parser.add_argument('--non_blast_output', action='store_true',
    #                     help="Instead of using cactus-blast-style prepended ids, use an alternative import method that only alters contig ids if absolutely necessary.")
    # parser.add_argument('--all_unique_ids', action='store_true',
    #                     help="Only takes effect when called with --non_blast_output. Prevents the program from touching the assembly files; the user promises that they don't contain any duplicate contig ids. In reality, there should never be contig renamings if there are no duplicate fasta ids.")
    # parser.add_argument('--overwrite_assemblies', action='store_true',
    #                     help="When cleaning the assembly files to make sure there are no duplicate contig ids, overwrite the assembly files. Copy them to a neighboring folder with the affix '_edited_for_duplicate_contig_ids' instead.")

    # # Useful in normal asms import
    # parser.add_argument('--assembly_save_dir', type=str, default='./unique_id_assemblies/',
    #                     help='While deduplicating contig ids in the input fastas, save the assemblies in this directory. Ignored when used in conjunction with --overwrite_assemblies.')

    # for debugging:
    parser.add_argument('--debug_export', action='store_true',
                        help='Export several other files for debugging inspection.')
    parser.add_argument('--debug_export_dir', type=str, default='./debug_export_dir/',
                        help='Location of the exported debug files.')

    options = parser.parse_args()
    return options
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("cigarsFile", nargs="*",
                        help="Pairwise alignments (from cactus-blast, cactus-refmap or cactus-graphmap)")
    parser.add_argument("outHal", type=str, help="Output HAL file (or directory in --batch mode)")
    parser.add_argument("--pathOverrides", nargs="*",
                        help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")

    # Pangenome Options
    parser.add_argument("--pangenome", action="store_true",
                        help="Activate pangenome mode (suitable for star trees of closely related samples) "
                        "by overriding several configuration settings. "
                        "The overridden configuration will be saved in <outHal>.pg-conf.xml")
    parser.add_argument("--pafInput", action="store_true",
                        help="'cigarsFile' argument is in PAF format, rather than lastz cigars.")
    parser.add_argument("--usePafSecondaries", action="store_true",
                        help="use the secondary alignments from the PAF input. They are ignored by default.")
    parser.add_argument("--singleCopySpecies", type=str,
                        help="Filter out all self-alignments in given species")
    parser.add_argument("--barMaskFilter", type=int, default=None,
                        help="BAR's POA aligner will ignore softmasked regions greater than this length. "
                        "(overrides partialOrderAlignmentMaskFilter in config)")
    parser.add_argument("--outVG", action="store_true",
                        help="export pangenome graph in VG (.vg) in addition to HAL")
    parser.add_argument("--outGFA", action="store_true",
                        help="export pangenome graph in GFA (.gfa.gz) in addition to HAL")
    parser.add_argument("--batch", action="store_true",
                        help="Launch batch of alignments. Input seqfile is expected to be chromfile "
                        "as generated by cactus-graphmap-split")
    parser.add_argument("--stagger", type=int, default=0,
                        help="Stagger alignment jobs in batch mode by this many seconds (to avoid starting all at once)")
    parser.add_argument("--acyclic", type=str,
                        help="Ensure that given genome is acyclic by deleting all paralogy edges in postprocessing")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", default=None,
                        help="Name of ancestral node (which must appear in NEWICK tree "
                        "in <seqfile>) to use as a root for the alignment. Any genomes "
                        "not below this node in the tree may be used as outgroups but "
                        "will never appear in the output. If no root is specified then "
                        "the root of the tree is used.")
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonCactusInput", action="store_true",
                        help="Input lastz cigars do not come from cactus-blast or cactus-refmap: Prepend ids in cigars")
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    options.buildHal = True
    options.buildFasta = True

    if options.outHal.startswith('s3://'):
        if not has_s3:
            raise RuntimeError("S3 support requires toil to be installed with [aws]")
        # write a little something to the bucket now to catch any glaring problems asap
        test_file = os.path.join(getTempDirectory(), 'check')
        with open(test_file, 'w') as test_o:
            test_o.write("\n")
        region = get_aws_region(options.jobStore) if options.jobStore.startswith('aws:') else None
        write_s3(test_file, options.outHal if options.outHal.endswith('.hal')
                 else os.path.join(options.outHal, 'test'), region=region)
        options.checkpointInfo = (get_aws_region(options.jobStore), options.outHal)
    else:
        options.checkpointInfo = None

    if options.batch:
        # the output hal is a directory, make sure it's there
        if not os.path.isdir(options.outHal):
            os.makedirs(options.outHal)
        assert len(options.cigarsFile) == 0
    else:
        assert len(options.cigarsFile) > 0

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # We set which type of unique ids to expect: Numeric (from cactus-blast) or Eventname (cactus-refmap or cactus-graphmap).
    # This is a bit ugly, since we don't have a good way to differentiate refmap from blast, and use --pangenome as a proxy
    # But I don't think there's a real use case yet of making a separate parameter
    env_val = os.environ.get('CACTUS_EVENT_NAME_AS_UNIQUE_ID')
    if env_val is not None:
        options.eventNameAsID = env_val not in ('', '0')
    else:
        options.eventNameAsID = options.pangenome or options.pafInput
    os.environ['CACTUS_EVENT_NAME_AS_UNIQUE_ID'] = str(int(options.eventNameAsID))

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            align_jobs = make_batch_align_jobs(options, toil)
            results_dict = toil.start(Job.wrapJobFn(run_batch_align_jobs, align_jobs))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement)
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(results[0], makeURL(os.path.join(options.outHal, '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(results[1], makeURL(os.path.join(options.outHal, '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(results[2], makeURL(os.path.join(options.outHal, '{}.gfa.gz'.format(chrom))))
            else:
                assert len(results_dict) == 1 and None in results_dict
                halID, vgID, gfaID = results_dict[None][0], results_dict[None][1], results_dict[None][2]
                # export the hal
                toil.exportFile(halID, makeURL(options.outHal))
                # export the vg
                if options.outVG:
                    toil.exportFile(vgID, makeURL(os.path.splitext(options.outHal)[0] + '.vg'))
                if options.outGFA:
                    toil.exportFile(gfaID, makeURL(os.path.splitext(options.outHal)[0] + '.gfa.gz'))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
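# Stand-alone form of the environment-variable toggle fixed above (the
# original referenced an undefined name): any value other than empty or
# '0' enables event-name IDs; when the variable is unset, pangenome/PAF
# input decides.
def _event_name_as_id(env_val, pangenome, paf_input):
    if env_val is not None:
        return env_val not in ('', '0')
    return pangenome or paf_input

assert _event_name_as_id('1', False, False) is True
assert _event_name_as_id('0', True, True) is False
assert _event_name_as_id(None, True, False) is True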
def main_batch():
    """ this is a bit like cactus-align --batch except it will use toil-in-toil to assign each chromosome to a machine.
    pros: much less chance of a problem with one chromosome affecting anything else
          more forgiving for inexact resource specs
          could be ported to Terra
    cons: less efficient use of resources
    """
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("chromFile", help="chroms file")
    parser.add_argument("outHal", type=str, help="Output directory (can be s3://)")
    parser.add_argument("--alignOptions", type=str,
                        help="Options to pass through to cactus-align (don't forget to wrap in quotes)")
    parser.add_argument("--alignCores", type=int, help="Number of cores per align job")
    parser.add_argument("--alignCoresOverrides", nargs="*",
                        help="Override align job cores for a chromosome. Space-separated list of chrom,cores pairs expected")
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))

    options = parser.parse_args()

    options.containerImage = None
    options.binariesMode = None
    options.root = None
    options.latest = None
    options.database = "kyoto_tycoon"

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    # Turn the overrides into a dict
    cores_overrides = {}
    if options.alignCoresOverrides:
        for o in options.alignCoresOverrides:
            try:
                chrom, cores = o.split(',')
                cores_overrides[chrom] = int(cores)
            except:
                raise RuntimeError("Error parsing alignCoresOverrides \"{}\"".format(o))
    options.alignCoresOverrides = cores_overrides

    start_time = timeit.default_timer()
    with Toil(options) as toil:
        importSingularityImage(options)
        if options.restart:
            results_dict = toil.restart()
        else:
            config_id = toil.importFile(makeURL(options.configFile))
            # load the chromfile into memory
            chrom_dict = {}
            with open(options.chromFile, 'r') as chrom_file:
                for line in chrom_file:
                    toks = line.strip().split()
                    if len(toks):
                        assert len(toks) == 3
                        chrom, seqfile, alnFile = toks[0], toks[1], toks[2]
                        chrom_dict[chrom] = toil.importFile(makeURL(seqfile)), toil.importFile(makeURL(alnFile))
            results_dict = toil.start(Job.wrapJobFn(align_toil_batch, chrom_dict, config_id, options))

        # when using s3 output urls, things get checkpointed as they're made so no reason to export
        # todo: make a more unified interface throughout cactus for this
        # (see toil-vg's outstore logic which, while not perfect, would be an improvement)
        if not options.outHal.startswith('s3://'):
            if options.batch:
                for chrom, results in results_dict.items():
                    toil.exportFile(results[0], makeURL(os.path.join(options.outHal, '{}.hal'.format(chrom))))
                    if options.outVG:
                        toil.exportFile(results[1], makeURL(os.path.join(options.outHal, '{}.vg'.format(chrom))))
                    if options.outGFA:
                        toil.exportFile(results[2], makeURL(os.path.join(options.outHal, '{}.gfa.gz'.format(chrom))))
                    toil.exportFile(results[3], makeURL(os.path.join(options.outHal, '{}.hal.log'.format(chrom))))

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align-batch has finished after {} seconds".format(run_time))
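# Distilled version of the --alignCoresOverrides parsing above; each entry
# is a "chrom,cores" pair (the values here are made up).
def _parse_core_overrides(pairs):
    overrides = {}
    for entry in pairs:
        chrom, cores = entry.split(',')
        overrides[chrom] = int(cores)
    return overrides

assert _parse_core_overrides(['chr1,32', 'chrX,16']) == {'chr1': 32, 'chrX': 16}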
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("inSeqFile", type=str, nargs='?', default=None, help="Input Seq file")
    parser.add_argument("outSeqFile", type=str, nargs='?', default=None,
                        help="Output Seq file (ex generated with cactus-prepare)")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--inputNames", nargs='*',
                        help='input genome names (not paths) to preprocess (all leaves from Input Seq file if none specified)')
    parser.add_argument("--inPaths", nargs='*',
                        help='Space-separated list of input fasta paths (to be used in place of --inSeqFile)')
    parser.add_argument("--outPaths", nargs='*',
                        help='Space-separated list of output fasta paths (one for each inPath, used in place of --outSeqFile)')
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()
    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # we have two modes: operate directly on paths or rely on the seqfiles. they cannot be mixed
    if options.inSeqFile or options.outSeqFile:
        if not options.inSeqFile or not options.outSeqFile or options.inPaths or options.outPaths:
            raise RuntimeError('--inSeqFile must be used in conjunction with --outSeqFile and not with --inPaths nor --outPaths')
    elif options.inPaths or options.outPaths:
        if not options.inPaths or not options.outPaths or options.inSeqFile or options.outSeqFile or options.inputNames:
            raise RuntimeError('--inPaths must be used in conjunction with --outPaths and not with --inSeqFile, --outSeqFile nor --inputNames')
        if len(options.inPaths) != len(options.outPaths):
            raise RuntimeError('--inPaths and --outPaths must have the same number of arguments')
    else:
        raise RuntimeError('--inSeqFile/--outSeqFile/--inputNames or --inPaths/--outPaths required to specify input')

    inSeqPaths = []
    outSeqPaths = []

    # mine the paths out of the seqfiles
    if options.inSeqFile:
        inSeqFile = SeqFile(options.inSeqFile)
        outSeqFile = SeqFile(options.outSeqFile)

        inNames = options.inputNames
        if not inNames:
            inNames = [inSeqFile.tree.getName(node) for node in inSeqFile.tree.getLeaves()]

        for inName in inNames:
            if inName not in inSeqFile.pathMap or inName not in outSeqFile.pathMap:
                raise RuntimeError('{} not present in input and output Seq files'.format(inName))
            inPath = inSeqFile.pathMap[inName]
            outPath = outSeqFile.pathMap[inName]
            if os.path.isdir(inPath):
                try:
                    os.makedirs(outPath)
                except:
                    pass
                assert os.path.isdir(inPath) == os.path.isdir(outPath)
                inSeqPaths += [os.path.join(inPath, seqPath) for seqPath in os.listdir(inPath)]
                outSeqPaths += [os.path.join(outPath, seqPath) for seqPath in os.listdir(inPath)]
            else:
                inSeqPaths += [inPath]
                outSeqPaths += [outPath]

    # we got path names directly from the command line
    else:
        inSeqPaths = options.inPaths
        outSeqPaths = options.outPaths

    with Toil(options) as toil:
        stageWorkflow(outputSequenceDir=None, configFile=options.configFile, inputSequences=inSeqPaths,
                      toil=toil, restart=options.restart, outputSequences=outSeqPaths)
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("outputHal", type=str, help="Output HAL file")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", default=None,
                        help="Name of ancestral node (which must appear in NEWICK tree "
                        "in <seqfile>) to use as a root for the alignment. Any genomes "
                        "not below this node in the tree may be used as outgroups but "
                        "will never appear in the output. If no root is specified then "
                        "the root of the tree is used.")
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    options.database = "kyoto_tycoon"

    # Mess with some toil options to create useful defaults.

    # Caching generally slows down the cactus workflow, plus some
    # methods like readGlobalFileStream don't support forced
    # reads directly from the job store rather than from cache.
    options.disableCaching = True
    # Job chaining breaks service termination timing, causing unused
    # databases to accumulate and waste memory for no reason.
    options.disableChaining = True
    # The default deadlockWait is currently 60 seconds. This can cause
    # issues if the database processes take a while to actually begin
    # after they're issued. Change it to at least an hour so that we
    # don't preemptively declare a deadlock.
    if options.deadlockWait is None or options.deadlockWait < 3600:
        options.deadlockWait = 3600
    if options.retryCount is None and options.batchSystem != 'singleMachine':
        # If the user didn't specify a retryCount value, make it 5
        # instead of Toil's default (1).
        options.retryCount = 5

    start_time = timeit.default_timer()
    runCactusProgressive(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("Cactus has finished after {} seconds".format(run_time))
def main(): parser = ArgumentParser() parser.add_argument("seqFile", help = "Seq file") parser.add_argument("--outDir", help='Directory where the processed leaf sequence and ancestral sequences will be placed.' ' Required when not using --wdl') parser.add_argument("--outSeqFile", help="Path for annotated Seq file output [default: outDir/seqFile]") parser.add_argument("--outHal", help="Output HAL file [default: outDir/out.hal]") parser.add_argument("--wdl", action="store_true", help="output wdl workflow instead of list of commands") parser.add_argument("--noLocalInputs", action="store_true", help="dont embed local input paths in WDL script (as they will need" " to be respecified when running on Terra") parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml")) parser.add_argument("--preprocessBatchSize", type=int, default=3, help="size (number of genomes) of suggested preprocessing jobs") parser.add_argument("--jobStore", type=str, default="./jobstore", help="base directory of jobStores to use in suggested commands") parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command") parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo", help="options for every cactus command") parser.add_argument("--preprocessOnly", action="store_true", help="only decompose into preprocessor and cactus jobs") parser.add_argument("--dockerImage", type=str, help="docker image to use as wdl runtime") parser.add_argument("--gpu", action="store_true", help="use gpu-enabled lastz in cactus-blast") parser.add_argument("--gpuType", default="nvidia-tesla-v100", help="GPU type (to set in WDL runtime parameters)") parser.add_argument("--gpuCount", default=1, help="GPU count (to set in WDL runtime parameters)") parser.add_argument("--nvidiaDriver", default="440.64.00", help="Nvidia driver version") parser.add_argument("--gpuZone", default="us-central1-c", help="zone used for gpu task") parser.add_argument("--zone", default="us-west2-a", help="zone used for all but gpu tasks") parser.add_argument("--defaultCores", type=int, help="Number of cores for each job unless otherwise specified") parser.add_argument("--preprocessCores", type=int, help="Number of cores for each cactus-preprocess job") parser.add_argument("--blastCores", type=int, help="Number of cores for each cactus-blast job") parser.add_argument("--alignCores", type=int, help="Number of cores for each cactus-align job") parser.add_argument("--defaultMem", type=float, help="Memory in GB for each job unless otherwise specified") parser.add_argument("--preprocessMem", type=float, help="Memory in GB for each cactus-preprocess job") parser.add_argument("--blastMem", type=float, help="Memory in GB for each cactus-blast job") parser.add_argument("--alignMem", type=float, help="Memory in GB for each cactus-align job") parser.add_argument("--defaultDisk", type=int, help="Disk in GB for each job unless otherwise specified") parser.add_argument("--preprocessDisk", type=int, help="Disk in GB for each cactus-preprocess job") parser.add_argument("--blastDisk", type=int, help="Disk in GB for each cactus-blast job") parser.add_argument("--alignDisk", type=int, help="Disk in GB for each cactus-align job") parser.add_argument("--halAppendDisk", type=int, help="Disk in GB for each halAppendSubtree job") parser.add_argument("--preprocessPreemptible", type=int, help="Preemptible in GB for each cactus-preprocess job [default=2]", default=2) 
parser.add_argument("--blastPreemptible", type=int, help="Preemptible in GB for each cactus-blast job [default=1]", default=1) parser.add_argument("--alignPreemptible", type=int, help="Preemptible in GB for each cactus-align job [default=1]", default=1) parser.add_argument("--halAppendPreemptible", type=int, help="Preemptible in GB for each halAppendSubtree job [default=1]", default=1) options = parser.parse_args() options.database = 'kyoto_tycoon' #todo support root option options.root = None if not options.wdl: if not options.outDir: raise RuntimeError("--outDir option required when not using --wdl") if not options.outSeqFile: options.outSeqFile = os.path.join(options.outDir, os.path.basename(options.seqFile)) if os.path.abspath(options.seqFile) == os.path.abspath(options.outSeqFile): options.outSeqFile += '.1' if (not options.wdl or not options.gpu) and (options.gpuCount > 1 or options.gpuType != "nvidia-tesla-v100"): raise RuntimeError("--gpuType and gpuCount can only be used with --wdl --gpu") if not options.outHal: options.outHal = os.path.join(options.outDir if options.outDir else '', 'out.hal') if options.wdl: if options.preprocessBatchSize != 1: if options.preprocessBatchSize != 3: # hacky way to only warn for non-default sys.stderr.write("Warning: --preprocessBatchSize reset to 1 for --wdl support\n") options.preprocessBatchSize = 1 # wdl handles output file structure if options.outDir: sys.stderr.write("Warning: --outDir option ignored with --wdl\n") options.outDir = "." if options.outSeqFile: sys.stderr.write("Warning: --outSeqFile option ignored with --wdl\n") options.outSeqFile = None if options.preprocessOnly: raise RuntimeError('--preprocessOnly cannot be used in conjunction with --wdl') if not options.dockerImage: options.dockerImage = getDockerImage() # apply defaults if options.defaultCores: if not options.preprocessCores: options.preprocessCores = options.defaultCores if not options.blastCores: options.blastCores = options.defaultCores if not options.alignCores: options.alignCores = options.defaultCores if options.defaultMem: if not options.preprocessMem: options.preprocessMem = options.defaultMem if not options.blastMem: options.blastMem = options.defaultMem if not options.alignMem: options.alignMem = options.defaultMem if not options.alignCores or options.alignCores == 1: if options.alignCores == 1: sys.stderr.write("Warning: --alignCores changed from 1 to 2\n") options.alignCores = 2 if options.defaultDisk: if not options.preprocessDisk: options.preprocessDisk = options.defaultDisk if not options.blastDisk: options.blastDisk = options.defaultDisk if not options.alignDisk: options.alignDisk = options.defaultDisk if not options.halAppendDisk: options.halAppendDisk = options.defaultDisk # https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#gpucount-gputype-and-nvidiadriverversion # note: k80 not included as WGA_GPU doesn't run on it. acceptable_gpus = ['nvidia-tesla-v100', 'nvidia-tesla-p100', 'nvidia-tesla-p4', 'nvidia-tesla-t4'] if options.gpuType not in acceptable_gpus: raise RuntimeError('--gpuType {} not supported by Terra. 
Acceptable types are {}'.format( options.gpuType, acceptable_gpus)) # need to go through this garbage (copied from the main() in progressive_cactus) to # come up with the project options.cactusDir = getTempDirectory() #Create the progressive cactus project projWrapper = ProjectWrapper(options, options.configFile) projWrapper.writeXml() # used to unique jobstore options.jobStoreCount = 0 pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName, '%s_project.xml' % ProjectWrapper.alignmentDirName) assert os.path.exists(pjPath) project = MultiCactusProject() if not os.path.isdir(options.cactusDir): os.makedirs(options.cactusDir) project.readXML(pjPath) enableDumpStack() cactusPrepare(options, project)
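# A usage sketch for the entrypoint above (assuming it is wired up as the
# `cactus-prepare` console script; the paths are hypothetical):
#
#   cactus-prepare ./evolverMammals.txt --outDir ./steps \
#       --outSeqFile ./steps/evolverMammals.txt --outHal ./steps/out.hal \
#       --jobStore ./jobstore
#
# This prints a plan of cactus-preprocess / cactus-blast / cactus-align /
# halAppendSubtree commands to stdout and writes the annotated seq file to
# --outSeqFile; with --wdl it emits a WDL workflow instead.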
def cactusPrepare(options, project):
    """ Annotate a SeqFile with ancestral names as well as paths for output sequences. """

    # read the input
    seqFile = SeqFile(options.seqFile)
    configNode = ET.parse(options.configFile).getroot()
    config = ConfigWrapper(configNode)

    if not options.wdl:
        # prepare the output sequence directory
        # todo: support remote (i.e. s3) output directory
        try:
            os.makedirs(options.outDir)
        except OSError:
            # the directory may already exist
            pass
        if not os.path.isdir(options.outDir):
            raise RuntimeError('Unable to create output sequence directory \'{}\''.format(options.outDir))
        if not os.access(options.outDir, os.W_OK):
            logger.warning('Output sequence directory is not writeable: \'{}\''.format(options.outDir))

    if options.preprocessOnly or options.gpu:
        if options.preprocessOnly:
            # hack the config file to skip preprocessing and write it to the output dir
            config.removePreprocessors()
        if options.gpu:
            # hack the config file to toggle on gpu lastz
            cafNode = findRequiredNode(config.xmlRoot, "caf")
            cafNode.attrib["gpuLastz"] = "true"
            # realigning doesn't mix well with lastz so we make sure it's off
            # https://github.com/ComparativeGenomicsToolkit/cactus/issues/271
            cafNode.attrib["realign"] = "0"
        options.configFile = os.path.join(options.outDir, 'config-prepared.xml')
        sys.stderr.write("configuration saved in {}\n".format(options.configFile))
        config.writeXML(options.configFile)

    # pass the config file through to the options
    # todo: don't like this second hard-coded check of the .xml path
    if options.configFile != os.path.join(cactusRootPath(), "cactus_progressive_config.xml") and not options.wdl:
        options.cactusOptions += ' --configFile {}'.format(options.configFile)

    # get the ancestor names
    tree = MultiCactusTree(seqFile.tree)
    tree.nameUnlabeledInternalNodes(prefix=config.getDefaultInternalNodePrefix())

    # make the output
    outSeqFile = SeqFile()
    outSeqFile.tree = tree
    outSeqFile.pathMap = copy.deepcopy(seqFile.pathMap)
    outSeqFile.outgroups = copy.deepcopy(seqFile.outgroups)

    # update paths for preprocessed leaves or inferred ancestors
    for node in outSeqFile.tree.breadthFirstTraversal():
        name = outSeqFile.tree.getName(node)
        leaf = outSeqFile.tree.isLeaf(node)
        if leaf or (not leaf and name not in seqFile.pathMap and not options.preprocessOnly):
            out_basename = seqFile.pathMap[name] if name in seqFile.pathMap else '{}.fa'.format(name)
            outSeqFile.pathMap[name] = os.path.join(options.outDir, os.path.basename(out_basename))
            if options.wdl:
                # uniquify the name in wdl to prevent collisions
                outSeqFile.pathMap[name] += '.pp'

    # write the output
    if options.outSeqFile:
        with open(options.outSeqFile, 'w') as out_sf:
            out_sf.write(str(outSeqFile))

    # write the instructions
    print(get_plan(options, project, seqFile, outSeqFile))
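# To illustrate the path rewriting above (all names and paths hypothetical):
# given an input seqFile containing
#
#   ((human:0.1,chimp:0.1):0.2,gorilla:0.3);
#   human    ./human.fa
#   chimp    ./chimp.fa
#   gorilla  ./gorilla.fa
#
# nameUnlabeledInternalNodes() labels the unnamed ancestor (e.g. "Anc0",
# assuming the default internal-node prefix from the config), and the
# annotated output maps every leaf to a preprocessed copy under --outDir,
# plus a FASTA path for each inferred ancestor, roughly:
#
#   human    ./steps/human.fa
#   chimp    ./steps/chimp.fa
#   gorilla  ./steps/gorilla.fa
#   Anc0     ./steps/Anc0.fa
#
# Under --wdl each rewritten path additionally gets a '.pp' suffix to avoid
# name collisions.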
def main():
    parser = ArgumentParser()
    Job.Runner.addToilOptions(parser)
    addCactusWorkflowOptions(parser)

    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("blastOutput", nargs="+", help="Blast output (from cactus-blast)")
    parser.add_argument("outputHal", type=str, help="Output HAL file")
    parser.add_argument("--pathOverrides", nargs="*", help="paths (multiple allowed) to override from seqFile")
    parser.add_argument("--pathOverrideNames", nargs="*",
                        help="names (must be same number as --pathOverrides) of path overrides")

    # Progressive Cactus Options
    parser.add_argument("--configFile", dest="configFile",
                        help="Specify cactus configuration file",
                        default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--root", dest="root", default=None, required=True,
                        help="Name of ancestral node (which must appear in NEWICK tree in <seqfile>) "
                        "to use as a root for the alignment. Any genomes not below this node in the "
                        "tree may be used as outgroups but will never appear in the output.")
    parser.add_argument("--latest", dest="latest", action="store_true",
                        help="Use the latest version of the docker container "
                        "rather than pulling one matching this version of cactus")
    parser.add_argument("--containerImage", dest="containerImage", default=None,
                        help="Use the specified pre-built container image "
                        "rather than pulling one from quay.io")
    parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"],
                        help="The way to run the Cactus binaries", default=None)
    parser.add_argument("--nonBlastInput", action="store_true",
                        help="Input does not come from cactus-blast: do not append ids to fasta names")
    parser.add_argument("--nonBlastMegablockFilter", action="store_true",
                        help="By default, the megablock filter is off for --nonBlastInput, as it does not play "
                        "nicely with reference-based alignments. This flag will turn it back on")
    parser.add_argument("--pafInput", action="store_true",
                        help="'blastOutput' input is in PAF format, rather than lastz cigars.")

    options = parser.parse_args()

    setupBinaries(options)
    setLoggingFromOptions(options)
    enableDumpStack()

    if options.pathOverrides or options.pathOverrideNames:
        if not options.pathOverrides or not options.pathOverrideNames or \
           len(options.pathOverrideNames) != len(options.pathOverrides):
            raise RuntimeError('same number of values must be passed to --pathOverrides and --pathOverrideNames')

    # cactus doesn't run with 1 core
    if options.batchSystem == 'singleMachine':
        if options.maxCores is not None:
            if int(options.maxCores) < 2:
                raise RuntimeError('Cactus requires --maxCores > 1')
        else:
            # is there a way to get this out of Toil? That would be more consistent
            if cpu_count() < 2:
                raise RuntimeError('Only 1 CPU detected. Cactus requires at least 2')

    # tokyo_cabinet is no longer supported
    options.database = "kyoto_tycoon"

    options.buildHal = True
    options.buildFasta = True

    # Mess with some toil options to create useful defaults.
    cactus_override_toil_options(options)

    start_time = timeit.default_timer()
    runCactusAfterBlastOnly(options)
    end_time = timeit.default_timer()
    run_time = end_time - start_time
    logger.info("cactus-align has finished after {} seconds".format(run_time))
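# A usage sketch for the entrypoint above (assuming it is wired up as the
# `cactus-align` console script; the paths and root name are hypothetical):
#
#   cactus-align ./jobstore ./steps/evolverMammals.txt ./blast-results \
#       ./steps/Anc0.hal --root Anc0
#
# i.e. the Toil job store, the (annotated) seq file, one or more cactus-blast
# output files, and the output HAL, with --root naming the subtree to align.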
def main(toil_mode=False):
    parser = ArgumentParser()
    if toil_mode:
        Job.Runner.addToilOptions(parser)
        parser.add_argument("--latest", dest="latest", action="store_true",
                            help="Use the latest version of the docker container "
                            "rather than pulling one matching this version of cactus")
        parser.add_argument("--containerImage", dest="containerImage", default=None,
                            help="Use the specified pre-built container image "
                            "rather than pulling one from quay.io")
        parser.add_argument("--binariesMode", choices=["docker", "local", "singularity"], default=None,
                            help="The way to run the Cactus binaries (at top level; use "
                            "--cactusOptions to set it in nested calls)")
    parser.add_argument("seqFile", help="Seq file")
    parser.add_argument("--outDir", help='Directory where the processed leaf sequences and ancestral '
                        'sequences will be placed. Required when not using --wdl')
    parser.add_argument("--outSeqFile", help="Path for annotated Seq file output [default: outDir/seqFile]")
    parser.add_argument("--outHal", help="Output HAL file [default: outDir/out.hal]", required=toil_mode)
    if not toil_mode:
        parser.add_argument("--wdl", action="store_true", help="output WDL workflow instead of list of commands")
        parser.add_argument("--noLocalInputs", action="store_true",
                            help="don't embed local input paths in WDL script (as they will need "
                            "to be respecified when running on Terra)")
        parser.add_argument("--jobStore", type=str, default="./jobstore",
                            help="base directory of jobStores to use in suggested commands")
    parser.add_argument("--configFile", default=os.path.join(cactusRootPath(), "cactus_progressive_config.xml"))
    parser.add_argument("--preprocessBatchSize", type=int, default=3,
                        help="size (number of genomes) of suggested preprocessing jobs")
    parser.add_argument("--halOptions", type=str, default="--hdf5InMemory", help="options for every hal command")
    parser.add_argument("--cactusOptions", type=str, default="--realTimeLogging --logInfo --retryCount 0",
                        help="options for every cactus command")
    parser.add_argument("--preprocessOnly", action="store_true",
                        help="only decompose into preprocessor and cactus jobs")
    parser.add_argument("--dockerImage", type=str, help="docker image to use as WDL runtime")
    parser.add_argument("--gpu", action="store_true", help="use gpu-enabled lastz in cactus-blast")
    parser.add_argument("--gpuType", default="nvidia-tesla-v100", help="GPU type (to set in WDL runtime parameters)")
    parser.add_argument("--gpuCount", type=int, default=1, help="GPU count (to set in WDL runtime parameters)")
    parser.add_argument("--nvidiaDriver", default="440.64.00", help="Nvidia driver version")
    parser.add_argument("--gpuZone", default="us-central1-c", help="zone used for gpu task")
    parser.add_argument("--zone", default="us-west2-a", help="zone used for all but gpu tasks")
    if not toil_mode:
        parser.add_argument("--defaultCores", type=int, help="Number of cores for each job unless otherwise specified")
    parser.add_argument("--preprocessCores", type=int, help="Number of cores for each cactus-preprocess job")
    parser.add_argument("--blastCores", type=int, help="Number of cores for each cactus-blast job")
    parser.add_argument("--alignCores", type=int, help="Number of cores for each cactus-align job")
    if not toil_mode:
        parser.add_argument("--defaultMemory", type=human2bytesN,
                            help="Memory for each job unless otherwise specified. Standard suffixes "
                            "like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessMemory", type=human2bytesN,
                        help="Memory for each cactus-preprocess job. Standard suffixes "
                        "like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--blastMemory", type=human2bytesN,
                        help="Memory for each cactus-blast job. Standard suffixes "
                        "like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--alignMemory", type=human2bytesN,
                        help="Memory for each cactus-align job. Standard suffixes "
                        "like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    if not toil_mode:
        parser.add_argument("--defaultDisk", type=human2bytesN,
                            help="Disk for each job unless otherwise specified. Standard suffixes "
                            "like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessDisk", type=human2bytesN,
                        help="Disk for each cactus-preprocess job. Standard suffixes "
                        "like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--blastDisk", type=human2bytesN,
                        help="Disk for each cactus-blast job. Standard suffixes "
                        "like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--alignDisk", type=human2bytesN,
                        help="Disk for each cactus-align job. Standard suffixes "
                        "like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--halAppendDisk", type=human2bytesN,
                        help="Disk for each halAppendSubtree job. Standard suffixes "
                        "like K, Ki, M, Mi, G or Gi are supported (default=bytes)")
    parser.add_argument("--preprocessPreemptible", type=int, default=2,
                        help="Preemptible attempt count for each cactus-preprocess job [default=2]")
    parser.add_argument("--blastPreemptible", type=int, default=1,
                        help="Preemptible attempt count for each cactus-blast job [default=1]")
    parser.add_argument("--alignPreemptible", type=int, default=1,
                        help="Preemptible attempt count for each cactus-align job [default=1]")
    parser.add_argument("--halAppendPreemptible", type=int, default=1,
                        help="Preemptible attempt count for each halAppendSubtree job [default=1]")
    parser.add_argument("--database", choices=["kyoto_tycoon", "redis"],
                        help="The type of database", default="kyoto_tycoon")

    options = parser.parse_args()

    # todo: support root option
    options.root = None

    if toil_mode:
        options.wdl = False
        options.noLocalInputs = False
        options.outDir = '.'
        setupBinaries(options)
        # need to avoid nested container calls, so set toil-inside-toil jobs to local by default
        if "--binariesMode" not in options.cactusOptions:
            options.cactusOptions += " --binariesMode local"
        if options.jobStore.startswith('aws'):
            if not options.outHal.startswith('s3://'):
                raise RuntimeError("--outHal must be an s3:// address when using an s3 job store")
            if not has_s3:
                raise RuntimeError("S3 support requires toil to be installed with [aws]")
    options.toil = toil_mode

    if not options.wdl and not options.toil:
        if not options.outDir:
            raise RuntimeError("--outDir option required when not using --wdl")
        if not options.outSeqFile:
            options.outSeqFile = os.path.join(options.outDir, os.path.basename(options.seqFile))
            if os.path.abspath(options.seqFile) == os.path.abspath(options.outSeqFile):
                options.outSeqFile += '.1'

    if (not options.wdl or not options.gpu) and \
       (options.gpuCount > 1 or options.gpuType != "nvidia-tesla-v100"):
        raise RuntimeError("--gpuType and --gpuCount can only be used with --wdl --gpu")

    if not options.outHal:
        options.outHal = os.path.join(options.outDir if options.outDir else '', 'out.hal')

    if options.wdl:
        # wdl handles output file structure
        if options.outDir:
            sys.stderr.write("Warning: --outDir option ignored with --wdl\n")
        options.outDir = "."
        if options.outSeqFile:
            sys.stderr.write("Warning: --outSeqFile option ignored with --wdl\n")
            options.outSeqFile = None
        if options.preprocessOnly:
            raise RuntimeError('--preprocessOnly cannot be used in conjunction with --wdl')
        if not options.dockerImage:
            options.dockerImage = getDockerImage()

    # apply defaults
    if options.defaultCores:
        if not options.preprocessCores:
            options.preprocessCores = options.defaultCores
        if not options.blastCores:
            options.blastCores = options.defaultCores
        if not options.alignCores:
            options.alignCores = options.defaultCores
    if options.defaultMemory:
        if not options.preprocessMemory:
            options.preprocessMemory = options.defaultMemory
        if not options.blastMemory:
            options.blastMemory = options.defaultMemory
        if not options.alignMemory:
            options.alignMemory = options.defaultMemory
    if not options.alignCores or options.alignCores == 1:
        if options.alignCores == 1:
            sys.stderr.write("Warning: --alignCores changed from 1 to 2\n")
        options.alignCores = 2
    if options.defaultDisk:
        if not options.preprocessDisk:
            options.preprocessDisk = options.defaultDisk
        if not options.blastDisk:
            options.blastDisk = options.defaultDisk
        if not options.alignDisk:
            options.alignDisk = options.defaultDisk
        if not options.halAppendDisk:
            options.halAppendDisk = options.defaultDisk

    # todo: no reason not to support a batch size other than 1, but mirror the wdl logic for now
    if options.toil:
        if options.preprocessBatchSize != 1:
            if options.preprocessBatchSize != 3:
                # hacky way to only warn when a non-default value was given
                sys.stderr.write("Warning: --preprocessBatchSize reset to 1 in Toil mode\n")
            options.preprocessBatchSize = 1
        # todo: could also support this
        assert not options.preprocessOnly

    # https://cromwell.readthedocs.io/en/stable/RuntimeAttributes/#gpucount-gputype-and-nvidiadriverversion
    # note: k80 not included as WGA_GPU doesn't run on it.
    acceptable_gpus = ['nvidia-tesla-v100', 'nvidia-tesla-p100', 'nvidia-tesla-p4', 'nvidia-tesla-t4']
    if options.gpuType not in acceptable_gpus:
        raise RuntimeError('--gpuType {} not supported by Terra. Acceptable types are {}'.format(
            options.gpuType, acceptable_gpus))

    # need to go through this garbage (copied from the main() in progressive_cactus) to
    # come up with the project
    options.cactusDir = getTempDirectory()
    # create the progressive cactus project
    projWrapper = ProjectWrapper(options, options.configFile)
    projWrapper.writeXml()
    # used to uniquify the jobstore
    options.jobStoreCount = 0

    pjPath = os.path.join(options.cactusDir, ProjectWrapper.alignmentDirName,
                          '%s_project.xml' % ProjectWrapper.alignmentDirName)
    assert os.path.exists(pjPath)

    project = MultiCactusProject()
    if not os.path.isdir(options.cactusDir):
        os.makedirs(options.cactusDir)
    project.readXML(pjPath)

    enableDumpStack()
    cactusPrepare(options, project)
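# When called with toil_mode=True, this main() presumably backs the
# `cactus-prepare-toil` entrypoint: rather than printing a command plan, it
# runs the whole decomposed workflow inside a single Toil job store, e.g.
# (hypothetical paths):
#
#   cactus-prepare-toil ./jobstore ./evolverMammals.txt --outHal ./out.hal
#
# Note the aws check above: with an aws: job store, --outHal must be an
# s3:// URL and Toil must be installed with the [aws] extra.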