class LogicalStep(Target):
    '''
    Defines a single logical step of the pipeline (like alignment) that has
    control over its own temp directory.

    A step registers itself with its Analysis on construction.  run() creates
    the step directory and log, changes into the step directory, calls the
    analysis' onRun() hook and then the step's own onRun(), and finally
    reports either success() or onFail() back to the analysis.
    '''

    @property
    def ana(self):
        '''Short alias for the owning analysis.'''
        return self._analysis

    @property
    def analysis(self):
        '''The owning analysis.'''
        return self._analysis

    @property
    def dir(self):
        '''Returns the temporary logical step directory'''
        if self._dir is None:
            raise Exception("Trying to retrieve directory before its been created")
        return self._dir

    @property
    def name(self):
        '''The step name given at construction.'''
        return self._stepName

    @property
    def version(self):
        '''Analysis version and step version joined by a dot.'''
        return self.ana.version + '.' + str(self._stepVersion)

    @property
    def err(self):
        '''Current error code of the step (-1 until set, 0 after success).'''
        return self._err

    @err.setter
    def err(self, value):
        self._err = value

    @property
    def status(self):
        '''Returns status of step: Init/Running/Success/Fail.'''
        return self._status

    def __init__(self, analysis, stepName, ram=1000000000, cpus=1):
        '''
        Initializes the step state and registers the step with `analysis`.

        `ram` and `cpus` are forwarded to Target.__init__ as `memory` and `cpu`.
        '''
        Target.__init__(self, time=0.00025, memory=ram, cpu=cpus)
        self._stepVersion = 1
        self._analysis = analysis
        self.interimFiles = {}
        self.targetFiles = {}
        self._garbageFiles = {}
        self.metaFiles = {}  # ???
        self.json = {}
        self.log = Log()  # Before logfile is declared, log print to stdout
        self._stepName = stepName  # descendent classes MUST fill in the _stepName
        self._err = -1  # descendent classes should set this to returns from granular steps
        self._status = 'Init'  # Init/Running/Success/Fail
        self._dir = None  # needs to make temp directory for itself
        self._prevDir = None  # working directory to return to once the step is over
        self._toolBegan = None
        self._stepBegan = None
        self.ana.registerStep(self)  # Analysis may manage multiple steps simultaneously

    def __str__(self):
        return pprint.pformat(self)

    @staticmethod
    def _elapsed(began, ended):
        '''Returns `ended - began` rounded to whole seconds, or 'unknown' if `began` is None.'''
        if began is None:
            return 'unknown'
        return str(ended - began + timedelta(seconds=0.5)).split('.')[0]

    def _restoreDir(self):
        '''Changes back to the directory that was current when run() started, if any.'''
        prevDir = getattr(self, '_prevDir', None)
        if prevDir is not None:
            os.chdir(prevDir)

    def run(self):
        '''
        Runs the step.  Returns 0 on success, otherwise whatever onFail() returns.
        '''
        self._status = 'Running'
        self.createDir()
        self.declareLogFile()  # Ensures that the logical step dir and log exist
        self._stepBegan = datetime.now()
        self.log.out("--- Beginning '" + self._stepName + "' [version: " + self.version +
                     "] [" + self._stepBegan.strftime("%Y-%m-%d %X (%A)") + '] ---')
        self._prevDir = os.getcwd()
        os.chdir(self.dir)
        self.log.out("> cd " + self.dir)
        try:
            self.ana.onRun(self)
            self.onRun()  # now this calls child onRun directly
        except StepError as e:
            return self.onFail(e)
        except Exception as e:
            return self.onFail(e, logTrace=True)
        self.success()  # success() must be outside of try or else we lose any exceptions
        return 0

    def onRun(self):
        '''this part would likely be overridden for each logical step'''
        raise Exception('children need to override this')

    def encodeDebug(self, message):
        '''Forwards a debug message to logToMaster().'''
        self.logToMaster(message)

    def success(self):
        '''
        Marks the step successful: mocks up results on a dry run, writes the
        metadata files, restores the working directory, logs the duration and
        notifies the analysis through onSucceed().
        '''
        self._status = 'Success'
        if self.ana.dryRun:
            self.mockUpResults()
        for fileName in self.metaFiles:
            self.metaFiles[fileName].write()
        self._err = 0  # by definition
        self._restoreDir()
        stepEnded = datetime.now()
        stepTook = self._elapsed(self._stepBegan, stepEnded)
        self.log.out("\n>> Successfully completed '" + self._stepName + "' [" +
                     stepEnded.strftime("%Y-%m-%d %X (%A)") + ' duration:' + stepTook + "]\n")
        self.ana.onSucceed(self)

    def fail(self, message):
        '''Aborts the step by raising StepError(message).'''
        raise StepError(message)

    def onFail(self, e, logTrace=False):
        '''
        Records a failure caused by exception `e`, optionally logging the
        current traceback, and returns the result of the analysis' onFail().
        '''
        self._status = 'Fail'
        stepEnded = datetime.now()
        stepTook = self._elapsed(self._stepBegan, stepEnded)
        # Not every exception carries a 'message' attribute, and it need not be a string.
        message = getattr(e, 'message', None)
        if message is not None and message != "":
            self.log.out(">>> FAILURE: '%s" % (message,))
        self.log.out(">>> Failure during '" + self._stepName + ': ' + str(e) + "' [" +
                     stepEnded.strftime("%Y-%m-%d %X (%A)") + ' duration:' + stepTook + "]\n")
        if logTrace:
            self.log.out(traceback.format_exc())
        if self._err == 0:
            self._err = 1  # Make sure this error is noticed!
        # Leave the step directory on failure too, exactly as success() does.
        self._restoreDir()
        return self.ana.onFail(self)

    def createDir(self):
        '''Creates logical step directory'''
        self._dir = self.ana.createTempDir(self.name, clean=True)

    def fileNameOrFullPath(self, filePath):
        '''Returns just the file name or the full path as appropriate'''
        if filePath.endswith('/') or not filePath.startswith(self.dir):
            return filePath
        # Since file is in step dir, and so is execution, just return name
        return os.path.split(filePath)[1]

    def declareTargetFile(self, key, name=None, ext=''):
        '''
        Reserves name for a file we want to keep permanently, and returns the
        file name (or the full path when fileNameOrFullPath() keeps it).
        '''
        self.targetFiles[key] = self.makeFilePath(key, name, ext)
        return self.fileNameOrFullPath(self.targetFiles[key])

    def declareInterimFile(self, key, name=None, ext=''):
        '''
        Reserves name for a file we want to keep during the life of the
        analysis, and returns the file name (or the full path when
        fileNameOrFullPath() keeps it).
        '''
        self.interimFiles[key] = self.makeFilePath(key, name, ext)
        return self.fileNameOrFullPath(self.interimFiles[key])

    def declareGarbageFile(self, key, name=None, ext=''):
        '''
        Reserves name for a file we do not care about, and returns the file
        name (or the full path when fileNameOrFullPath() keeps it).
        '''
        self._garbageFiles[key] = self.makeFilePath(key, name, ext)
        return self.fileNameOrFullPath(self._garbageFiles[key])

    def declareLogFile(self, name=None):
        '''
        Gets or sets the filename for the log that will be created by this
        logical step.

        When a log file is already declared and `name` is given, the existing
        file is returned unchanged.  Otherwise the log is (re)declared in the
        step directory, named after `name` or the step name, and emptied.
        '''
        # NOTE(review): with name=None an already declared log is re-declared and
        # emptied; that is what the condition below does -- confirm it is intended.
        if self.log is not None and self.log.file() is not None and name is not None:
            return self.log.file()
        if name is None:
            if self._stepName is None:
                raise Exception("This 'logical step' has not been named.")
            name = self._stepName
        self.log.declareFile(self.dir + name.replace(' ', '_') + '.log')
        self.log.empty()  # Logical step log always starts empty!
        return self.log.file()

    def mockUpPath(self, path, show=False):
        '''Touch a file or make a directory'''
        if path.endswith('/'):
            self.ana.runCmd('mkdir -p ' + path, logOut=show, logErr=show,
                            dryRun=False, log=self.log)
        else:
            parentDir = os.path.split(path)[0]
            self.ana.runCmd('mkdir -p ' + parentDir, logOut=show, logErr=show,
                            dryRun=False, log=self.log)
            self.ana.runCmd('touch -a ' + path, logOut=show, logErr=show,
                            dryRun=False, log=self.log)

    def mockUpResults(self):
        '''
        For each result file, will create it empty if it does not exist.
        This is used to mock up results in a dry run.
        '''
        for path in self.targetFiles.values():
            self.mockUpPath(path, True)
        for path in self.interimFiles.values():
            self.mockUpPath(path, True)

    def deliverTargetFile(self, name, pathToTarget):
        '''
        Links (or copies) the step 'target' file keyed `name` to `pathToTarget`
        through the analysis' linkOrCopy().  This is expected when a logical
        step succeeds.  Raises KeyError if `name` is not a declared target file.
        '''
        # Because dryRun should mock up result files, we should set dryRun to False
        # to actually make links to the mocked up files.
        return self.ana.linkOrCopy(self.targetFiles[name], pathToTarget,
                                   logOut=True, dryRun=False, log=self.log)

    def deliverInterimFile(self, name, pathToInterim):
        '''
        Links (or copies) the step 'interim' file keyed `name` to
        `pathToInterim` through the analysis' linkOrCopy().  This is expected
        when a logical step succeeds.  Raises KeyError if `name` is not a
        declared interim file.
        '''
        # Because dryRun should mock up result files, we should set dryRun to False
        # to actually make links to the mocked up files.
        return self.ana.linkOrCopy(self.interimFiles[name], pathToInterim,
                                   logOut=True, dryRun=False, log=self.log)

    def deliverResultFile(self, name, pathToTarget):
        '''
        Delivers a step 'result' file: tries the 'target' file keyed `name`
        first and falls back to the 'interim' file with the same key.  This is
        expected when a logical step succeeds.
        '''
        try:
            return self.deliverTargetFile(name, pathToTarget)
        except Exception:  # not a bare except: let KeyboardInterrupt/SystemExit through
            return self.deliverInterimFile(name, pathToTarget)

    def cleanup(self):
        '''
        Removes the temporary 'step' directory and all of its contents.
        This is expected when a logical step succeeds.
        '''
        if self._dir is not None:
            self.ana.runCmd('rm -rf ' + self._dir)

    def makeFilePath(self, key, name=None, ext=''):
        '''
        Returns a fully qualified file/dir name inside the step directory.

        `name` defaults to `key`.  An `ext` of 'dir' (any case) makes the path
        end in exactly one '/'; any other non-empty `ext` gets a leading '.'
        when it lacks one.
        '''
        if name is None:
            name = key
        if len(ext) > 0:
            if ext.lower() == 'dir':  # directories are also supported!
                # Never append the literal text 'dir'; only add the missing slash.
                ext = '' if name.endswith('/') else '/'
            elif not ext.startswith('.'):
                ext = '.' + ext
        return self.dir + name + ext

    def toolBegins(self, toolName):
        '''Standardized message before tool commandline'''
        self._toolBegan = datetime.now()
        self.log.out("\n# [" + self._toolBegan.strftime("%Y-%m-%d %X") + "] '" + toolName +
                     "' begins...")

    def toolEnds(self, toolName, retVal, raiseError=True):
        '''
        Standardized message after tool commandline.
        Raises StepError (via fail()) for non-zero retVal unless raiseError is False.
        '''
        toolEnded = datetime.now()
        # 'unknown' is reported when toolBegins() was never called.
        toolTook = self._elapsed(self._toolBegan, toolEnded)
        self.log.out("# [" + toolEnded.strftime("%Y-%m-%d %X") + ' duration:' + toolTook +
                     "] '" + toolName + "' returned " + str(retVal))
        if raiseError and not retVal == 0:
            self._err = retVal
            self.fail(toolName + " returned " + str(self._err))

    def writeVersions(self, raFile=None):
        '''Writes versions to the log and, when given, to `raFile`.'''
        # Each logical step is expected to extend or replace this to record the
        # actual tool versions
        if raFile is not None:
            raFile.add("ENCODE_Analysis_pipeline", self.ana.version)
        self.log.out("# ENCODE Analysis pipeline [version: " + self.ana.version + "]")
        if raFile is not None:
            raFile.add(self.name + '_Step', self.version)
        self.log.out("# " + self.name + "_Step [version: " + self.version + "]")

    def createAndWriteJsonFile(self, name=None, target=False, jsonObj=None):
        '''
        Declares a target (target=True) or interim file keyed as {name}.json
        and writes `jsonObj` (default: self.json) to it.  Nothing is written
        when the object is empty.
        '''
        if name is None:
            name = self.name
        if target:
            filePath = self.declareTargetFile(name + '.json')
        else:
            filePath = self.declareInterimFile(name + '.json')
        if jsonObj is None:
            jsonObj = self.json
        if jsonObj:
            # 'with' guarantees the file is closed even if json.dump() raises.
            with open(filePath, 'w') as fp:
                json.dump(jsonObj, fp, sort_keys=True, indent=4, separators=(',', ': '))
                fp.write('\n')  # Hopefully this makes galaxy happy that the json is plain txt.

    def createMetadataFile(self, name, target=False):
        '''
        Creates a metadata file as a target (target=True) or interim file
        keyed as {name}.ra and returns its RaFile object.
        Raises Exception if a metadata file of that name already exists.
        '''
        if name in self.metaFiles:
            raise Exception('metadata file already exists')
        if target:
            filePath = self.declareTargetFile(name + '.ra')
        else:
            filePath = self.declareInterimFile(name + '.ra')
        self.metaFiles[name] = RaFile(filePath)
        return self.metaFiles[name]

    def printPaths(self, tab, log=None):
        '''Used in debugging; prints step's paths, etc.'''
        if log is None:
            log = self.log
        if self._dir is not None:
            title = self.name + ' dir:'
            log.out(title.ljust(tab) + self.dir)
        if self.log.file() is not None:
            title = self.name + ' log:'
            log.out(title.ljust(tab) + self.log.file())
        for key in sorted(self._garbageFiles.keys()):
            title = "garbage[%s]:" % (key)
            log.out(title.ljust(tab) + self._garbageFiles[key])
        for key in sorted(self.interimFiles.keys()):
            title = "interimFile[%s]:" % (key)
            log.out(title.ljust(tab) + self.interimFiles[key])
        for key in sorted(self.targetFiles.keys()):
            title = "targetFile[%s]:" % (key)
            log.out(title.ljust(tab) + self.targetFiles[key])

    def getToolVersion(self, executable, logOut=True):
        '''
        Retrieves tool version, but if there is no success, then the version
        is the md5sum.

        Raises Exception when the tool is in the tool database without any
        version, or when the analysis is strict and the version reported by
        the tool's 'versionCommand' differs from the database.
        '''
        if executable.find('/') == -1:  # Not a path, then look for this on the path!
            toolId = self.ana.getCmdOut("md5sum `which " + executable + "` | awk '{print $1}'",
                                        dryRun=False, logCmd=False)
            if toolId.startswith('which: no'):  # failed to find executable: might be perl script
                toolId = self.ana.getCmdOut("md5sum " + self.ana.toolsDir + executable +
                                            " | awk '{print $1}'", dryRun=False, logCmd=False)
            if toolId.startswith('md5sum: '):  # failed to find executable
                toolId = ""
            toolName = executable
        else:
            toolId = self.ana.getCmdOut("md5sum " + executable + " | awk '{print $1}'",
                                        dryRun=False, logCmd=False)
            if toolId.startswith('md5sum: '):  # failed to find executable
                toolId = ""
            toolName = os.path.split(executable)[1]

        toolData = self.ana.getToolData(toolId, toolName)
        if toolData is None:
            version = 'md5sum:' + toolId  # when all else fails
            if logOut:
                self.log.out("# tool: " + toolName + " [version: " + version + "]")
            return version

        try:
            version = toolData['version']
        except Exception:
            try:
                version = toolData['packageVersion']
            except Exception:
                raise Exception("Tool '" + executable + "' was found in tool database, " +
                                "but 'version' was not!")

        # Try to get actual version and compare the two
        if 'versionCommand' in toolData:
            actual = self.ana.getCmdOut(toolData['versionCommand'], dryRun=False, logCmd=False)
            if version != actual:
                if self.ana.strict:
                    raise Exception("Expecting " + executable + " [version: " + version + "], " +
                                    "but found [version: " + actual + "]")
                version = actual  # Use actual rather than expected.

        # If the toolData was found by name, then the toolIds may not match, so:
        if toolId != toolData['toolId'] and toolId != "":
            version += ' (md5sum:' + toolId + ')'

        # If there is a package name or version that differs, then use it
        intro = "tool: "
        if 'packageName' in toolData and 'packageVersion' in toolData:
            package = toolData['packageName']
            packageVersion = toolData['packageVersion']
            if packageVersion != version or package.lower() != toolName.lower():
                toolName = package + '(' + toolName + ')'
                intro = "package(tool): "
                if packageVersion != version:
                    version = packageVersion + '(' + version + ')'

        if logOut:
            self.log.out("# " + intro + toolName + " [version: " + version + "]")
        return version
class Analysis(object):
    """
    This is the interface for an instantiation of the Encode Analysis pipeline
    on a single analysis, which has two specific implementations:
    - EncodeAnalysis: for use in the official pipeline run by ENCODE
    - GalaxyAnalysis: code run by the Galaxy system for end-users
    """

    @property
    def dir(self):
        """Returns the analysis directory"""
        if self._analysisDir is None:
            raise Exception("Trying to retrieve directory before its been created")
        return self._analysisDir

    @property
    def version(self):
        """The pipeline version as a string."""
        return str(self._pipelineVersion)

    def __init__(self, settingsFile, analysisId=None, genome="hg19"):
        """
        Takes in a settings file which contains various paths to tools, a temp
        directory and other configuration setting for all analyses.
        Optionally a manifest file for analysis specific details (relevant
        input files and analysis ID) may be provided.  If no manifest file is
        provided, those details will have to be "registered" to the analysis,
        one by one.
        """
        self._pipelineVersion = 1
        self._variables = {}
        self._variables["analysisId"] = analysisId
        self.genome = genome
        self.log = Log()  # Before logfile is declared, log print to stdout
        self._analysisDir = None
        self._tmpDirs = {}  # Note: these should be replaced with _steps[0].stepDir()
        self._steps = []  # Keep track of ordered logical steps?
        self._inputFiles = {}
        self._interimFiles = {}
        self._targetOutput = {}
        self.strict = False
        self._deliveryKeys = None
        self._toolsDb = None
        self._toolsDir = None
        self._refDir = None
        self._settingsFile = os.path.abspath(settingsFile)
        self._settings = Settings(self._settingsFile)
        self.setupEnv()
        self._dryRun = self._settings.getBoolean("dryRun", default=False)

    def onRun(self, step):
        """Hook called by a step before its own onRun(); does nothing here."""
        pass

    @property
    def dryRun(self):
        return self._dryRun

    @dryRun.setter
    def dryRun(self, value):
        """Sets the dryRun variable."""
        self._dryRun = value

    @property
    def readType(self):
        return self._variables["readType"]

    @readType.setter
    def readType(self, value):
        """Sets the readType variable; only 'paired' or 'single' are accepted."""
        if value == "paired" or value == "single":
            self._variables["readType"] = value
        else:
            raise ValueError("readType must be either 'paired' or 'single'")

    @property
    def type(self):
        return self._variables["analysisType"]

    @type.setter
    def type(self, value):
        """Sets the analysis type variable ('DNase', 'ChIPseq' or 'RNAseq-long')."""
        if value == "DNase" or value == "ChIPseq" or value == "RNAseq-long":
            self._variables["analysisType"] = value
        else:
            # The message names the value that is actually accepted ('RNAseq-long').
            raise ValueError("Analysis type must be one of 'ChIPseq', 'DNase' or 'RNAseq-long'")

    @property
    def id(self):
        return self._variables["analysisId"]

    @id.setter
    def id(self, value):
        """
        Sets the analysis ID variable.  Raises ValueError if an ID is already set.
        """
        # __init__ always creates the key (possibly holding None), so "already
        # set" has to be judged by the stored value, not by key presence.
        current = self._variables.get("analysisId")
        if current is not None:
            raise ValueError("Analysis ID already set to '%s'" % (current,))
        self._variables["analysisId"] = value

    @property
    def genome(self):
        return self._variables["genome"]

    @genome.setter
    def genome(self, value):
        """
        Sets the genome variable.  Raises ValueError if it is already set or
        the genome is not supported.
        """
        if "genome" in self._variables:
            raise ValueError("Genome already set to '%s'" % (self._variables["genome"],))
        if value not in ["hg19"]:  # Add hg38, mm10, etc. when those are supported
            raise ValueError("Unsupported genome '%s'" % (value,))
        self._variables["genome"] = value

    @property
    def gender(self):
        if "gender" not in self._variables:
            return "unspecified"
        return self._variables["gender"]

    @gender.setter
    def gender(self, value):
        """
        Sets the gender variable.  Raises ValueError if it is already set or
        the value is not one of 'unspecified', 'female', 'male'.
        """
        if "gender" in self._variables:
            raise ValueError("Gender already set to '%s'" % (self._variables["gender"],))
        if value not in ["unspecified", "female", "male"]:
            raise ValueError("Unsupported gender '%s'" % (value,))
        self._variables["gender"] = value

    @property
    def toolsDir(self):
        """
        Retrieves the EAP_TOOLS_DIR environment variable.
        If not found (unset or empty), fall back to settings.
        """
        if self._toolsDir is not None:
            return self._toolsDir
        # os.environ.get() returns None for an unset variable, not "".
        envDir = os.environ.get("EAP_TOOLS_DIR")
        if not envDir:
            self._toolsDir = self.getDir("toolsDir")
        else:
            self._toolsDir = envDir + "/"  # normalize dirs to always end in /
        return self._toolsDir

    @property
    def refDir(self):
        """
        Retrieves the EAP_REF_DIR environment variable.
        If not found (unset or empty), fall back to settings.
        """
        if self._refDir is not None:
            return self._refDir
        # os.environ.get() returns None for an unset variable, not "".
        envDir = os.environ.get("EAP_REF_DIR")
        if not envDir:
            self._refDir = self.getDir("refDir")
        else:
            self._refDir = envDir + "/"  # normalize dirs to always end in /
        return self._refDir

    def setupEnv(self):
        """Ensures the toolsDir is in the path."""
        path = os.environ.get("PATH")
        if path is not None and path.find(self.toolsDir) == -1:
            os.environ["PATH"] = self.toolsDir + os.pathsep + path

    def getVar(self, varName, default=None):
        """Retrieves variable for the Analysis variables."""
        if varName in self._variables:
            return self._variables[varName]
        return default

    def setVar(self, varName, val):
        """
        Sets an Analysis variable.  If set to None, will be removed
        (removing a variable that was never set is a no-op).
        """
        if val is None:
            self._variables.pop(varName, None)
        else:
            self._variables[varName] = val

    def _requireSettings(self):
        """Raises ValueError unless a settings file has been loaded."""
        if self._settingsFile is None or self._settings is None:
            raise ValueError("ENCODE3 settings file is unknown!")

    def getSetting(self, settingName, default=None, alt=None):
        """Retrieves setting from the settings file"""
        self._requireSettings()
        return self._settings.get(settingName, default, alt)

    def getDir(self, settingName, default=None, alt=None):
        """
        Retrieves full path to a directory from the settings file
        (always ending with '/')
        """
        self._requireSettings()
        return self._settings.getDir(settingName, default, alt)

    def getTool(self, toolName, orInPath=False):
        """
        Retrieves full path to tool from the settings file
        """
        # NOTE: set orInPath=True then missing full path will default to execution path
        # Example: if toolName is 'bwa' and 'bwaTool' not in settings file, and
        #          if orInPath==True then 'bwa' will be returned, and if bwa is found on
        #          execution path, no error will occur.
        self._requireSettings()
        try:
            toolPath = self._settings.get(toolName + "Tool")
            return os.path.abspath(toolPath)
        except Exception:
            if orInPath:
                return toolName
        # Too clever by half: we know there is no toolName+'Tool' so if the tool dir
        # is missing too, then the exception will already have the correct message.
        return self.getDir(toolName + "Dir", self.toolsDir) + toolName

    def getToolData(self, toolId, name=None):
        """
        Retrieves tool data as a dictionary from the toolDb, looked up by
        `toolId` and, failing that, by `name`.  Returns None if not found.
        """
        if self._toolsDb is None:
            toolDbFile = self.getSetting("toolDbFile", "")
            if toolDbFile == "":
                toolDbFile = self.toolsDir + "tools.ra"
            self._toolsDb = Stanzas(toolDbFile)
            if self._toolsDb is None:
                return None
        toolData = self._toolsDb.getStanza(toolId)
        # If tool not found by id, see if it can be found by name
        if toolData is None and name is not None:
            self._toolsDb.altIndex("name", unique=False)
            self._toolsDb.setSortOrder(["name", "version", "toolId"])
            stanza = None
            while True:  # With sort order, the last shall have the latest version
                stanza = self._toolsDb.getStanzaFromAlt(name, stanza)
                if stanza is None:
                    break
                toolData = stanza
        return toolData

    def createAnalysisDir(self):
        """creates analysis level directory"""
        if self.id is None:
            raise Exception("This analysis has not been registered or defined in manifest")
        if self._analysisDir is not None:
            raise Exception("The directory for this analysis has already been created")
        self._analysisDir = self.getDir("tmpDir") + self.id.replace(" ", "_") + "/"
        if not os.path.isdir(self._analysisDir):
            os.makedirs(self._analysisDir)
        return self._analysisDir

    def createTempDir(self, name, clean=False):
        """
        Returns a named temporary directory, creating it if necessary.
        With clean=True an existing directory is removed and recreated.
        Raises Exception if `name` was already created through this analysis.
        """
        # Used for logicalStep dirs.  Since steps could run in parallel, tmpDirs are in dict.
        if name in self._tmpDirs:
            raise Exception(name + " already exists as a temporary directory in this analysis")
        tmpdir = self.dir + name.replace(" ", "_") + "/"
        if clean and os.path.isdir(tmpdir):
            os.system("rm -rf " + tmpdir)
            os.mkdir(tmpdir)
        elif not os.path.isdir(tmpdir):
            os.mkdir(tmpdir)
        self._tmpDirs[name] = tmpdir
        return tmpdir

    def registerInputFile(self, name, fileWithPath=None):
        """
        Registers a single input file by name.  Retrieve again by name.
        Input files reside outside the analysis directory and are input to steps.
        """
        if fileWithPath is not None:
            self._inputFiles[name] = fileWithPath
        return self._inputFiles[name]

    def inputFile(self, name):
        return self._inputFiles[name]

    def registerInterimOutput(self, name, fileNoPath=None):
        """
        Registers a single interim output file by name.  Retrieve again by name.
        Interim outputs are generated by some steps to be used by other steps.
        They reside in the analysis directory and should be deleted when the
        analysis concludes.
        """
        if fileNoPath is not None:
            self._interimFiles[name] = self.dir + fileNoPath
        return self._interimFiles[name]

    def interimOutput(self, name):
        return self._interimFiles[name]

    def registerTargetOutput(self, name, outputNoPath=None):
        """
        Registers a single target output (typically a file) by name.  Retrieve
        again by name.  Target outputs are the result of successful steps.
        They are written to the analysis directory and are expected to be
        hard-linked outside of it when the analysis completes.
        """
        if outputNoPath is not None:
            self._targetOutput[name] = self.dir + outputNoPath
        return self._targetOutput[name]

    def targetOutput(self, name):
        return self._targetOutput[name]

    def targetName(self, name):
        """Returns the targetFile Name, stripped of the path."""
        return os.path.split(self.targetOutput(name))[1]

    def linkOrCopy(self, fromLoc, toLoc, soft=False, logOut=True, dryRun=None, log=None):
        """
        Standard call for all cases of moving files/dirs into position:
        links `fromLoc` to `toLoc` (symbolically when soft=True) and falls back
        to copying when the link fails.  Raises Exception if both fail.
        """
        if dryRun is None:
            dryRun = self._dryRun
        linkCmd = "ln -sf " if soft else "ln -f "
        err = self.runCmd(linkCmd + fromLoc + " " + toLoc, logOut=logOut, dryRun=dryRun, log=log)
        if err != 0:
            if os.path.isdir(fromLoc):
                # If dir then remove old and then copy contents recursively
                self.runCmd("rm -rf " + toLoc, logOut=logOut, dryRun=dryRun, log=log)
                err = self.runCmd("cp -rf " + fromLoc + " " + toLoc,
                                  logOut=logOut, dryRun=dryRun, log=log)
            else:
                err = self.runCmd("cp -f " + fromLoc + " " + toLoc,
                                  logOut=logOut, dryRun=dryRun, log=log)
            if err != 0:
                raise Exception("Unable to ln or cp '" + fromLoc + "' to '" + toLoc + "'")
        # special case for bam files that may be paired with bai files!
        if fromLoc.endswith(".bam") and toLoc.endswith(".bam"):
            if os.path.exists(fromLoc + ".bai"):
                return self.linkOrCopy(fromLoc + ".bai", toLoc + ".bai",
                                       soft, logOut, dryRun, log)
        return err

    def getFile(self, name, io="input"):
        """
        gets the filename to a file we created previously through either
        registerInputFile/registerTargetOutput OR passed as input in a
        manifest file.
        """
        if io == "input":
            return self._inputFiles[name]
        return self._targetOutput[name]

    def declareLogFile(self, name=None):
        """
        Gets or sets the filename for the log that might be created at the
        analysis level.  `name` defaults to the analysis id.
        """
        if self.log is not None and self.log.file() is not None:
            return self.log.file()  # Could check that name matches log
        if name is None:
            if self.id is None:
                raise Exception("This 'analysis' has not been registered or defined in manifest.")
            name = self.id
        self.log.declareFile(self.dir + name.replace(" ", "") + ".log")
        # Analysis log is a running log except when explicitly emptied
        return self.log.file()

    def registerStep(self, step):
        """Multiple logical steps can be managed by an analysis simultaneously"""
        self._steps.append(step)

    def removeStep(self, step):
        """Forgets a registered step; an unknown step is silently ignored."""
        try:
            self._steps.remove(step)
        except ValueError:  # list.remove() on a step that is not registered
            pass

    ### Processing support ###

    def _deliverMatching(self, step, registry, label):
        """Delivers every wanted key of `registry` from `step`; returns the failure text."""
        fails = ""
        wanted = registry.keys() if self._deliveryKeys is None else self._deliveryKeys
        for key in registry.keys():
            if key not in wanted:
                continue
            try:
                step.deliverResultFile(key, registry[key])
            except Exception:
                fails = fails + "Failed to find " + label + " result for '" + key + "'\n"
        return fails

    def deliverFiles(self, step):
        """
        Delivers interim and target files based upon matching keys.
        Raises Exception listing every key that could not be delivered.
        """
        # Because we do not want to stop the loop for an exception
        # we record exceptions and raise one at the end.
        fails = self._deliverMatching(step, self._interimFiles, "interim")
        fails = fails + self._deliverMatching(step, self._targetOutput, "target")
        if len(fails) > 0:
            raise Exception(fails)

    def deliveryKeys(self, justThisSet):
        """
        Register certain keys to be delivered in deliverFiles.
        Without setting this, all keys in interim and target files will be delivered.
        """
        self._deliveryKeys = justThisSet

    def onSucceed(self, step):
        """
        pipeline will handle all success steps, like copying out files we care
        about and maybe trashing the directory as well?
        """
        # deliver the files from step to analysis directory
        try:
            self.deliverFiles(step)
        except Exception:
            pass  # descendent classes should consider this an exception
        step.log.out("\n--- End of step ---")
        step.log.dump(self.log.file())  # to stdout if no runningLog
        if not self._dryRun:
            step.cleanup()  # Removes step.stepDir()
        else:
            self.log.out("")  # skip a line
            self.runCmd("ls -l " + step.dir, dryRun=False)
            self.log.out("")
        self.removeStep(step)  # Do we want to do this?
        return 0

    def onFail(self, step):
        """
        pipeline will handle failure of logical steps like sweeping the log to
        the running log.  Returns the step's error code, never 0.
        """
        step.log.out("\n--- End of step ---")
        step.log.dump(self.log.file())  # to stdout if no runningLog
        if self.log.file() is not None:
            # If analysis log, be sure to just print step log to stdout
            step.log.dump()
        if self._dryRun:
            self.log.out("")  # skip a line
            self.runCmd("ls -l " + step.dir, dryRun=False)
            self.log.out("")
        retVal = step.err
        self.removeStep(step)  # Do we want to do this?
        if retVal == 0:
            retVal = 1  # Must fail!
        return retVal

    def runCmd(self, cmd, logOut=True, logErr=True, dryRun=None, log=None):
        """
        Runs the provided command and returns error code.  Does NOT trigger onFail.
        Note that you can pass in a log object if you don't want to use the
        analysis log.  On a dry run the command is only logged and 0 is returned.
        """
        if dryRun is None:
            dryRun = self._dryRun
        if log is None:
            log = self.log
        if logOut or logErr:
            # Always log command itself; '*>' marks a command that is not executed
            log.out(("*> " if dryRun else "> ") + cmd)
        if dryRun:
            return 0
        log.close()  # Ensure log is closed so that command redirect can be tacked on
        logFile = log.file()
        if logFile is not None and logOut and logErr:
            return os.system(cmd + " >> " + logFile + " 2>&1")
        if logFile is not None and logErr:
            return os.system(cmd + " 2>>" + logFile)
        return os.system(cmd)

    def getCmdOut(self, cmd, dryRun=None, logCmd=True, logResult=False, default="",
                  log=None, errOk=False):
        """
        Runs the provided command and returns the stdout (or `default` when
        the output is empty or on a dry run).  Raises Exception on a non-zero
        exit status unless errOk is True.
        Note that you can pass in a log object if you don't want to use the
        analysis log.
        """
        if dryRun is None:
            dryRun = self._dryRun
        if log is None:
            log = self.log
        if logCmd:
            log.out("> " + cmd)
        if dryRun:
            return default
        err, out = commands.getstatusoutput(cmd)
        if logResult:
            log.out(out)
        if err != 0 and not errOk:
            raise Exception("Running [" + cmd + "] returned '" + str(err) + "'")
        if len(out) == 0:
            out = default
        return out
class Analysis(object):
    '''
    This is the interface for an instantiation of the Encode Analysis pipeline
    on a single analysis, which has two specific implementations:
        - EncodeAnalysis: for use in the official pipeline run by ENCODE
        - GalaxyAnalysis: code run by the Galaxy system for end-users

    An Analysis owns the analysis directory, the analysis-level log, the
    settings, the registries of input/interim/target files, and the list of
    logical steps currently registered with it.
    '''

    @property
    def dir(self):
        '''Returns the analysis directory; raises if it has not been created.'''
        if self._analysisDir is None:
            raise Exception(
                "Trying to retrieve directory before its been created")
        return self._analysisDir

    @property
    def version(self):
        '''Returns the pipeline version as a string.'''
        return str(self._pipelineVersion)

    def __init__(self, settingsFile, analysisId=None, genome='hg19'):
        '''
        Takes in a settings file which contains various paths to tools,
        a temp directory and other configuration setting for all analyses.
        The analysis ID and genome may be given here; other analysis specific
        details (input files, read type, ...) are "registered" to the
        analysis one by one.
        '''
        self._pipelineVersion = 1
        self._variables = {}
        self._variables['analysisId'] = analysisId
        self.genome = genome  # goes through the validating property setter
        self.log = Log()  # Before logfile is declared, log print to stdout
        self._analysisDir = None
        self._tmpDirs = {}  # Note: these should be replaced with _steps[0].stepDir()
        self._steps = []  # Keep track of ordered logical steps?
        self._inputFiles = {}
        self._interimFiles = {}
        self._targetOutput = {}
        self.strict = False
        self._deliveryKeys = None
        self._toolsDb = None
        self._toolsDir = None
        self._refDir = None
        self._settingsFile = os.path.abspath(settingsFile)
        self._settings = Settings(self._settingsFile)
        # setupEnv() reads toolsDir, which may consult self._settings,
        # so it has to come after the settings are loaded.
        self.setupEnv()
        self._dryRun = self._settings.getBoolean('dryRun', default=False)

    def onRun(self, step):
        '''Hook called by a step when it starts running; no-op by default.'''
        pass

    @property
    def dryRun(self):
        return self._dryRun

    @dryRun.setter
    def dryRun(self, value):
        '''
        Sets the dryRun variable.
        '''
        self._dryRun = value

    @property
    def readType(self):
        return self._variables['readType']

    @readType.setter
    def readType(self, value):
        '''
        Sets the readType variable ('paired' or 'single').
        '''
        if value not in ('paired', 'single'):
            raise ValueError("readType must be either 'paired' or 'single'")
        self._variables['readType'] = value

    @property
    def type(self):
        return self._variables['analysisType']

    @type.setter
    def type(self, value):
        '''
        Sets the analysis type variable.
        '''
        if value not in ('DNase', 'ChIPseq', 'RNAseq-long'):
            # Message names the values that are actually accepted.
            raise ValueError(
                "Analysis type must be one of 'ChIPseq', 'DNase' or 'RNAseq-long'")
        self._variables['analysisType'] = value

    @property
    def id(self):
        return self._variables['analysisId']

    @id.setter
    def id(self, value):
        '''
        Sets the analysis ID variable.  Raises ValueError if an ID is already set.
        '''
        # __init__ always creates the 'analysisId' key (possibly as None), so
        # test the stored value rather than mere presence of the key.
        current = self._variables.get('analysisId')
        if current is not None:
            raise ValueError("Analysis ID already set to '" + str(current) + "'")
        self._variables['analysisId'] = value

    @property
    def genome(self):
        return self._variables['genome']

    @genome.setter
    def genome(self, value):
        '''
        Sets the genome variable.  May only be set once and must be supported.
        '''
        if 'genome' in self._variables:
            raise ValueError("Genome already set to '" +
                             self._variables['genome'] + "'")
        if value not in ['hg19']:  # Add hg38, mm10, etc. when those are supported
            raise ValueError("Unsupported genome '" + value + "'")
        self._variables['genome'] = value

    @property
    def gender(self):
        '''Returns the gender variable, or 'unspecified' if never set.'''
        if 'gender' not in self._variables:
            return 'unspecified'
        return self._variables['gender']

    @gender.setter
    def gender(self, value):
        '''
        Sets the gender variable.  May only be set once.
        '''
        if 'gender' in self._variables:
            raise ValueError("Gender already set to '" +
                             self._variables['gender'] + "'")
        if value not in ['unspecified', 'female', 'male']:
            raise ValueError("Unsupported gender '" + value + "'")
        self._variables['gender'] = value

    def _envOrSettingDir(self, envName, settingName):
        '''
        Returns the directory named by environment variable envName, always
        ending in '/'.  If the variable is unset or empty, falls back to
        settingName from the settings file.
        '''
        envDir = os.environ.get(envName)  # None when the variable is unset
        if not envDir:
            return self.getDir(settingName)
        if not envDir.endswith('/'):
            envDir = envDir + '/'  # normalize dirs to always end in /
        return envDir

    @property
    def toolsDir(self):
        '''
        Retrieves the EAP_TOOLS_DIR environment variable.  If not found, fall
        back to settings.  The result is cached after the first lookup.
        '''
        if self._toolsDir is None:
            self._toolsDir = self._envOrSettingDir('EAP_TOOLS_DIR', 'toolsDir')
        return self._toolsDir

    @property
    def refDir(self):
        '''
        Retrieves the EAP_REF_DIR environment variable.  If not found, fall
        back to settings.  The result is cached after the first lookup.
        '''
        if self._refDir is None:
            self._refDir = self._envOrSettingDir('EAP_REF_DIR', 'refDir')
        return self._refDir

    def setupEnv(self):
        '''
        Ensures the toolsDir is in the path.
        '''
        path = os.environ.get('PATH')
        if path is not None and self.toolsDir not in path:
            os.environ['PATH'] = self.toolsDir + os.pathsep + path

    def getVar(self, varName, default=None):
        '''
        Retrieves variable for the Analysis variables, or default if not set.
        '''
        return self._variables.get(varName, default)

    def setVar(self, varName, val):
        '''
        Sets an Analysis variable.  If set to None, will be removed.
        '''
        if val is None:
            # pop() so that removing a variable that was never set is harmless
            self._variables.pop(varName, None)
        else:
            self._variables[varName] = val

    def getSetting(self, settingName, default=None, alt=None):
        '''
        Retrieves setting from the settings file
        '''
        if self._settingsFile is None or self._settings is None:
            raise ValueError('ENCODE3 settings file is unknown!')
        return self._settings.get(settingName, default, alt)

    def getDir(self, settingName, default=None, alt=None):
        '''
        Retrieves full path to a directory from the settings file (always ending with '/')
        '''
        if self._settingsFile is None or self._settings is None:
            raise ValueError('ENCODE3 settings file is unknown!')
        return self._settings.getDir(settingName, default, alt)

    def getTool(self, toolName, orInPath=False):
        '''
        Retrieves full path to tool from the settings file
        '''
        # NOTE: set orInPath=True then missing full path will default to execution path
        # Example: if toolName is 'bwa' and 'bwaTool' not in settings file, and
        #          if orInPath==True then 'bwa' will be returned, and if bwa is found on
        #          execution path, no error will occur.
        if self._settingsFile is None or self._settings is None:
            raise ValueError('ENCODE3 settings file is unknown!')
        try:
            toolPath = self._settings.get(toolName + 'Tool')
            return os.path.abspath(toolPath)
        except Exception:
            # Setting missing or unusable: fall through to the alternatives.
            if orInPath:
                return toolName
        # Last resort: <toolName>Dir from settings (defaulting to toolsDir)
        # plus the bare tool name.
        return self.getDir(toolName + 'Dir', self.toolsDir) + toolName

    def getToolData(self, toolId, name=None):
        '''
        Retrieves tool data as a dictionary from the toolDb.
        Looks up by toolId first; if that fails and name is given, the last
        stanza returned for that name under the name/version/toolId sort
        order is used.
        '''
        if self._toolsDb is None:
            toolDbFile = self.getSetting('toolDbFile', '')
            if toolDbFile == '':
                toolDbFile = self.toolsDir + 'tools.ra'
            self._toolsDb = Stanzas(toolDbFile)
            if self._toolsDb is None:
                return None
        toolData = self._toolsDb.getStanza(toolId)

        # If tool not found by id, see if it can be found by name
        if toolData is None and name is not None:
            self._toolsDb.altIndex('name', unique=False)
            self._toolsDb.setSortOrder(['name', 'version', 'toolId'])
            stanza = None
            while True:
                # With sort order, the last shall have the latest version
                stanza = self._toolsDb.getStanzaFromAlt(name, stanza)
                if stanza is None:
                    break
                toolData = stanza
        return toolData

    def createAnalysisDir(self):
        '''creates analysis level directory under the 'tmpDir' setting'''
        if self.id is None:
            raise Exception(
                'This analysis has not been registered or defined in manifest')
        if self._analysisDir is not None:
            raise Exception(
                'The directory for this analysis has already been created')
        self._analysisDir = self.getDir('tmpDir') + self.id.replace(' ', '_') + '/'
        if not os.path.isdir(self._analysisDir):
            os.makedirs(self._analysisDir)
        return self._analysisDir

    def createTempDir(self, name, clean=False):
        '''
        Returns a named temporary directory, creating it if necessary.
        With clean=True an existing directory is removed and recreated.
        '''
        # Used for logicalStep dirs.  Since steps could run in parallel, tmpDirs are in dict.
        if name in self._tmpDirs:
            raise Exception(
                name + ' already exists as a temporary directory in this analysis')
        tmpdir = self.dir + name.replace(' ', '_') + '/'
        if clean and os.path.isdir(tmpdir):
            os.system("rm -rf " + tmpdir)
            os.mkdir(tmpdir)
        elif not os.path.isdir(tmpdir):
            os.mkdir(tmpdir)
        self._tmpDirs[name] = tmpdir
        return tmpdir

    def registerInputFile(self, name, fileWithPath=None):
        '''
        Registers a single input file by name.  Retrieve again by name.
        Input files reside outside the analysis directory and are input to steps.
        '''
        if fileWithPath is not None:
            self._inputFiles[name] = fileWithPath
        return self._inputFiles[name]

    def inputFile(self, name):
        return self._inputFiles[name]

    def registerInterimOutput(self, name, fileNoPath=None):
        '''
        Registers a single interim output file by name.  Retrieve again by name.
        Interim outputs are generated by some steps to be used by other steps.
        They reside in the analysis directory and should be deleted when the
        analysis concludes.
        '''
        if fileNoPath is not None:
            self._interimFiles[name] = self.dir + fileNoPath
        return self._interimFiles[name]

    def interimOutput(self, name):
        return self._interimFiles[name]

    def registerTargetOutput(self, name, outputNoPath=None):
        '''
        Registers a single target output (typically a file) by name.  Retrieve again by name.
        Target outputs are the result of successful steps.  They are written to the analysis
        directory and are expected to be hard-linked outside of it when the analysis completes.
        '''
        if outputNoPath is not None:
            self._targetOutput[name] = self.dir + outputNoPath
        return self._targetOutput[name]

    def targetOutput(self, name):
        return self._targetOutput[name]

    def targetName(self, name):
        '''
        Returns the targetFile Name, stripped of the path.
        '''
        return os.path.split(self.targetOutput(name))[1]

    def linkOrCopy(self, fromLoc, toLoc, soft=False, logOut=True, dryRun=None,
                   log=None):
        '''
        Standard call for all cases of moving files/dirs into position.
        Tries a (hard or soft) link first and falls back to copying.
        Raises Exception if neither works.  A '.bam' file that has a
        neighbouring '.bai' gets the '.bai' linked/copied as well.
        '''
        if dryRun is None:
            dryRun = self._dryRun
        linkCmd = 'ln -sf ' if soft else 'ln -f '
        err = self.runCmd(linkCmd + fromLoc + ' ' + toLoc,
                          logOut=logOut, dryRun=dryRun, log=log)
        if err != 0:
            if os.path.isdir(fromLoc):
                # If dir then remove old and then copy contents recursively
                self.runCmd('rm -rf ' + toLoc,
                            logOut=logOut, dryRun=dryRun, log=log)
                err = self.runCmd('cp -rf ' + fromLoc + ' ' + toLoc,
                                  logOut=logOut, dryRun=dryRun, log=log)
            else:
                err = self.runCmd('cp -f ' + fromLoc + ' ' + toLoc,
                                  logOut=logOut, dryRun=dryRun, log=log)
        if err != 0:
            raise Exception("Unable to ln or cp '" + fromLoc + "' to '" +
                            toLoc + "'")
        # special case for bam files that may be paired with bai files!
        if fromLoc.endswith('.bam') and toLoc.endswith('.bam'):
            if os.path.exists(fromLoc + '.bai'):
                return self.linkOrCopy(fromLoc + '.bai', toLoc + '.bai', soft,
                                       logOut, dryRun, log)
        return err

    def getFile(self, name, io='input'):
        '''
        gets the filename to a file we created previously through either
        registerInputFile/registerTargetOutput OR passed as input in a manifest file.
        Any io value other than 'input' looks the name up in the target outputs.
        '''
        if io == 'input':
            return self._inputFiles[name]
        return self._targetOutput[name]

    def declareLogFile(self, name=None):
        '''
        Gets or sets the filename for the log that might be created at the analysis level.
        If no name is given the analysis ID is used.
        '''
        if self.log is not None and self.log.file() is not None:
            return self.log.file()  # Could check that name matches log
        if name is None:
            if self.id is None:
                raise Exception(
                    "This 'analysis' has not been registered or defined in manifest."
                )
            name = self.id
        self.log.declareFile(self.dir + name.replace(' ', '') + '.log')
        #self.log.empty()  # Analysis log is a running log except when explicitly emptied
        return self.log.file()

    def registerStep(self, step):
        '''
        Multiple logical steps can be managed by an analysis simultaneously
        '''
        self._steps.append(step)

    def removeStep(self, step):
        '''
        Stops tracking a step.  Removing a step that is not registered is not an error.
        '''
        try:
            self._steps.remove(step)
        except ValueError:
            pass  # step was never registered or has already been removed

    ### Proccessing support ###

    def _deliverByKeys(self, step, fileMap, label):
        '''
        Asks step to deliver each selected entry of fileMap and returns the
        accumulated failure messages ('' if none).  label is 'interim' or
        'target' and only affects the message text.
        '''
        if self._deliveryKeys is None:
            keys = list(fileMap)
        else:
            # Honor the registered set AND its order (see deliveryKeys()).
            keys = [key for key in self._deliveryKeys if key in fileMap]
        fails = ''
        for key in keys:
            try:
                step.deliverResultFile(key, fileMap[key])
            except Exception:
                fails = fails + "Failed to find " + label + \
                        " result for '" + key + "'\n"
        return fails

    def deliverFiles(self, step):
        '''
        Delivers interim and target files based upon matching keys.
        Raises Exception listing every key that could not be delivered.
        '''
        # Because we do not want to stop the loop for an exception
        # we record exceptions and raise one at the end.
        fails = self._deliverByKeys(step, self._interimFiles, 'interim')
        fails = fails + self._deliverByKeys(step, self._targetOutput, 'target')
        if len(fails) > 0:
            raise Exception(fails)

    def deliveryKeys(self, justThisSet):
        '''
        Register certain keys to be delivered in deliverFiles and in this order.
        Without setting this, all keys in interim and target files will be delivered.
        '''
        self._deliveryKeys = justThisSet

    def onSucceed(self, step):
        '''
        pipeline will handle all success steps, like copying out files we care
        about and removing the step directory.  Always returns 0.
        '''
        # deliver the files from step to analysis directory
        try:
            self.deliverFiles(step)
        except Exception:
            # Deliberately best-effort at this level;
            # descendent classes should consider this an exception
            pass
        step.log.out("\n--- End of step ---")
        step.log.dump(self.log.file())  # to stdout if no runningLog
        # Morgan, do you want the step log going to stdout even if there is an analysis log?
        #if self.log.file() != None: # If analysis log, be sure to just print step log to stdout
        #    step.log.dump()
        if not self._dryRun:
            step.cleanup()  # Removes step.stepDir()
        else:
            self.log.out('')  # skip a line
            self.runCmd('ls -l ' + step.dir, dryRun=False)
            self.log.out('')
        self.removeStep(step)  # Do we want to do this?
        return 0

    def onFail(self, step):
        '''
        pipeline will handle failure of logical steps like sweeping the log to
        the running log.  Returns the step's error code, forced to be non-zero.
        '''
        step.log.out("\n--- End of step ---")
        step.log.dump(self.log.file())  # to stdout if no runningLog
        if self.log.file() is not None:
            # If analysis log, be sure to just print step log to stdout
            step.log.dump()
        if self._dryRun:
            self.log.out('')  # skip a line
            self.runCmd('ls -l ' + step.dir, dryRun=False)
            self.log.out('')
        retVal = step.err
        self.removeStep(step)  # Do we want to do this?
        if retVal == 0:
            retVal = 1  # Must fail!
        return retVal

    def runCmd(self, cmd, logOut=True, logErr=True, dryRun=None, log=None):
        '''
        Runs the provided command and returns error code.  Does NOT trigger onFail.
        Note that you can pass in a log object if you don't want to use the analysis log.
        In a dry run the command is only logged (prefixed with '*> ') and 0 is returned.
        '''
        if dryRun is None:
            dryRun = self._dryRun
        if log is None:
            log = self.log
        if logOut or logErr:
            # Always log command itself
            if dryRun:
                log.out('*> ' + cmd)
            else:
                log.out('> ' + cmd)
        if dryRun:
            return 0
        log.close()  # Ensure log is closed so that command redirect can be tacked on
        logFile = log.file()
        # NOTE(review): logOut=True with logErr=False is not redirected at all;
        # confirm whether a stdout-only redirect was intended.
        if logFile is not None and logOut and logErr:
            err = os.system(cmd + ' >> ' + logFile + ' 2>&1')
        elif logFile is not None and logErr:
            err = os.system(cmd + ' 2>>' + logFile)
        else:
            err = os.system(cmd)
        return err

    def getCmdOut(self, cmd, dryRun=None, logCmd=True, logResult=False,
                  default='', log=None, errOk=False):
        '''
        Runs the provided command and returns the stdout.
        Note that you can pass in a log object if you don't want to use the analysis log.
        Returns default in a dry run or when the command produces no output.
        Raises Exception on a non-zero exit status unless errOk is set.
        '''
        if dryRun is None:
            dryRun = self._dryRun
        if log is None:
            log = self.log
        if logCmd:
            log.out('> ' + cmd)
        if dryRun:
            return default
        err, out = commands.getstatusoutput(cmd)
        if logResult:
            log.out(out)
        if err != 0 and not errOk:
            raise Exception("Running [" + cmd + "] returned '" + str(err) + "'")
        if len(out) == 0:
            out = default
        return out