def test_equal(self):
    v1 = Version(1, 0)
    v2 = Version(1, 0)
    self.assertEqual(v1, v2)
    self.assertTrue(v1.isCompatible(v2))
class LHCbDataset(GangaDataset): '''Class for handling LHCb data sets (i.e. inputdata for LHCb jobs). Example Usage: ds = LHCbDataset(["lfn:/some/lfn.file","pfn:/some/pfn.file"]) ds[0] # DiracFile("/some/lfn.file") - see DiracFile docs for usage ds[1] # PhysicalFile("/some/pfn.file")- see PhysicalFile docs for usage len(ds) # 2 (number of files) ds.getReplicas() # returns replicas for *all* files in the data set ds.replicate("CERN-USER") # replicate *all* LFNs to "CERN-USER" SE ds.getCatalog() # returns XML catalog slice ds.optionsString() # returns Gaudi-sytle options [...etc...] ''' schema = {} docstr = 'List of PhysicalFile and DiracFile objects' schema['files'] = GangaFileItem(defvalue=[], typelist=['str', 'Ganga.GPIDev.Adapters.IGangaFile.IGangaFile'], sequence=1, doc=docstr) docstr = 'Ancestor depth to be queried from the Bookkeeping' schema['depth'] = SimpleItem(defvalue=0, doc=docstr) docstr = 'Use contents of file rather than generating catalog.' schema['XMLCatalogueSlice'] = GangaFileItem(defvalue=None, doc=docstr) docstr = 'Specify the dataset persistency technology' schema['persistency'] = SimpleItem( defvalue=None, typelist=['str', 'type(None)'], doc=docstr) schema['treat_as_inputfiles'] = SimpleItem(defvalue=False, doc="Treat the inputdata as inputfiles, i.e. copy the inputdata to the WN") _schema = Schema(Version(3, 0), schema) _category = 'datasets' _name = "LHCbDataset" _exportmethods = ['getReplicas', '__len__', '__getitem__', 'replicate', 'hasLFNs', 'append', 'extend', 'getCatalog', 'optionsString', 'getLFNs', 'getFileNames', 'getFullFileNames', 'difference', 'isSubset', 'isSuperset', 'intersection', 'symmetricDifference', 'union', 'bkMetadata', 'isEmpty', 'hasPFNs', 'getPFNs'] # ,'pop'] def __init__(self, files=None, persistency=None, depth=0, fromRef=False): super(LHCbDataset, self).__init__() if files is None: files = [] self.files = GangaList() process_files = True if fromRef: self.files._list.extend(files) process_files = False elif isinstance(files, GangaList): def isFileTest(_file): return isinstance(_file, IGangaFile) areFiles = all([isFileTest(f) for f in files._list]) if areFiles: self.files._list.extend(files._list) process_files = False elif isinstance(files, LHCbDataset): self.files._list.extend(files.files._list) process_files = False if process_files: if isType(files, LHCbDataset): for this_file in files: self.files.append(deepcopy(this_file)) elif isType(files, IGangaFile): self.files.append(deepcopy(this_file)) elif isType(files, (list, tuple, GangaList)): new_list = [] for this_file in files: if type(this_file) is str: new_file = string_datafile_shortcut_lhcb(this_file, None) elif isType(this_file, IGangaFile): new_file = stripProxy(this_file) else: new_file = strToDataFile(this_file) new_list.append(new_file) self.files.extend(new_list) elif type(files) is str: self.files.append(string_datafile_shortcut_lhcb(this_file, None), False) else: raise GangaException("Unknown object passed to LHCbDataset constructor!") self.files._setParent(self) logger.debug("Processed inputs, assigning files") # Feel free to turn this on again for debugging but it's potentially quite expensive #logger.debug( "Creating dataset with:\n%s" % self.files ) logger.debug("Assigned files") self.persistency = persistency self.depth = depth logger.debug("Dataset Created") def __getitem__(self, i): '''Proivdes scripting (e.g. 
ds[2] returns the 3rd file) ''' #this_file = self.files[i] # print type(this_file) # return this_file # return this_file # return this_file if type(i) == type(slice(0)): ds = LHCbDataset(files=self.files[i]) ds.depth = self.depth #ds.XMLCatalogueSlice = self.XMLCatalogueSlice return ds else: return self.files[i] def getReplicas(self): 'Returns the replicas for all files in the dataset.' lfns = self.getLFNs() cmd = 'getReplicas(%s)' % str(lfns) result = get_result(cmd, 'LFC query error', 'Could not get replicas.') return result['Value']['Successful'] def hasLFNs(self): 'Returns True is the dataset has LFNs and False otherwise.' for f in self.files: if isDiracFile(f): return True return False def hasPFNs(self): 'Returns True is the dataset has PFNs and False otherwise.' for f in self.files: if not isDiracFile(f): return True return False def replicate(self, destSE=''): '''Replicate all LFNs to destSE. For a list of valid SE\'s, type ds.replicate().''' if not destSE: from GangaDirac.Lib.Files.DiracFile import DiracFile DiracFile().replicate('') return if not self.hasLFNs(): raise GangaException('Cannot replicate dataset w/ no LFNs.') retry_files = [] for f in self.files: if not isDiracFile(f): continue try: result = f.replicate( destSE=destSE ) except Exception as err: msg = 'Replication error for file %s (will retry in a bit).' % f.lfn logger.warning(msg) logger.warning("Error: %s" % str(err)) retry_files.append(f) for f in retry_files: try: result = f.replicate( destSE=destSE ) except Exception as err: msg = '2nd replication attempt failed for file %s. (will not retry)' % f.lfn logger.warning(msg) logger.warning(str(err)) def extend(self, files, unique=False): '''Extend the dataset. If unique, then only add files which are not already in the dataset.''' from Ganga.GPIDev.Base import ReadOnlyObjectError if self._parent is not None and self._parent._readonly(): raise ReadOnlyObjectError('object Job#%s is read-only and attribute "%s/inputdata" cannot be modified now' % (self._parent.id, getName(self))) _external_files = [] if type(files) is str or isType(files, IGangaFile): _external_files = [files] elif type(files) in [list, tuple]: _external_files = files elif isType(files, LHCbDataset): _external_files = files.files else: if not hasattr(files, "__getitem__") or not hasattr(files, '__iter__'): _external_files = [files] # just in case they extend w/ self _to_remove = [] for this_file in _external_files: if hasattr(this_file, 'subfiles'): if len(this_file.subfiles) > 0: _external_files = makeGangaListByRef(this_file.subfiles) _to_remove.append(this_file) if type(this_file) is str: _external_files.append(string_datafile_shortcut_lhcb(this_file, None)) _to_remove.append(this_file) for _this_file in _to_remove: _external_files.pop(_external_files.index(_this_file)) for this_f in _external_files: _file = getDataFile(this_f) if _file is None: _file = this_f myName = _file.namePattern from GangaDirac.Lib.Files.DiracFile import DiracFile if isType(_file, DiracFile): myName = _file.lfn if unique and myName in self.getFileNames(): continue self.files.append(stripProxy(_file)) def removeFile(self, input_file): try: self.files.remove(input_file) except: raise GangaException('Dataset has no file named %s' % input_file.namePattern) def getLFNs(self): 'Returns a list of all LFNs (by name) stored in the dataset.' 
lfns = [] if not self: return lfns for f in self.files: if isDiracFile(f): subfiles = f.getSubFiles() if len(subfiles) == 0: lfns.append(f.lfn) else: for file in subfiles: lfns.append(file.lfn) #logger.debug( "Returning LFNS:\n%s" % str(lfns) ) logger.debug("Returning #%s LFNS" % str(len(lfns))) return lfns def getPFNs(self): 'Returns a list of all PFNs (by name) stored in the dataset.' pfns = [] if not self: return pfns for f in self.files: if isPFN(f): pfns.append(f.namePattern) return pfns def getFullFileNames(self): 'Returns all file names w/ PFN or LFN prepended.' names = [] from GangaDirac.Lib.Files.DiracFile import DiracFile for f in self.files: if isType(f, DiracFile): names.append('LFN:%s' % f.lfn) else: try: names.append('PFN:%s' % f.namePattern) except: logger.warning("Cannot determine filename for: %s " % f) raise GangaException("Cannot Get File Name") return names def getCatalog(self, site=''): '''Generates an XML catalog from the dataset (returns the XML string). Note: site defaults to config.LHCb.LocalSite Note: If the XMLCatalogueSlice attribute is set, then it returns what is written there.''' if hasattr(self.XMLCatalogueSlice, 'name'): if self.XMLCatalogueSlice.name: f = open(self.XMLCatalogueSlice.name) xml_catalog = f.read() f.close() return xml_catalog if not site: site = getConfig('LHCb')['LocalSite'] lfns = self.getLFNs() depth = self.depth tmp_xml = tempfile.NamedTemporaryFile(suffix='.xml') cmd = 'getLHCbInputDataCatalog(%s,%d,"%s","%s")' \ % (str(lfns), depth, site, tmp_xml.name) result = get_result(cmd, 'LFN->PFN error', 'XML catalog error.') xml_catalog = tmp_xml.read() tmp_xml.close() return xml_catalog def optionsString(self, file=None): 'Returns the Gaudi-style options string for the dataset (if a filename' \ ' is given, the file is created and output is written there).' if not self or len(self) == 0: return '' snew = '' if self.persistency == 'ROOT': snew = '\n#new method\nfrom GaudiConf import IOExtension\nIOExtension(\"%s\").inputFiles([' % self.persistency elif self.persistency == 'POOL': snew = '\ntry:\n #new method\n from GaudiConf import IOExtension\n IOExtension(\"%s\").inputFiles([' % self.persistency elif self.persistency == None: snew = '\ntry:\n #new method\n from GaudiConf import IOExtension\n IOExtension().inputFiles([' else: logger.warning( "Unknown LHCbDataset persistency technology... 
reverting to None") snew = '\ntry:\n #new method\n from GaudiConf import IOExtension\n IOExtension().inputFiles([' sold = '\nexcept ImportError:\n #Use previous method\n from Gaudi.Configuration import EventSelector\n EventSelector().Input=[' sdatasetsnew = '' sdatasetsold = '' dtype_str_default = getConfig('LHCb')['datatype_string_default'] dtype_str_patterns = getConfig('LHCb')['datatype_string_patterns'] for f in self.files: dtype_str = dtype_str_default for this_str in dtype_str_patterns: matched = False for pat in dtype_str_patterns[this_str]: if fnmatch.fnmatch(f.namePattern, pat): dtype_str = this_str matched = True break if matched: break sdatasetsnew += '\n ' sdatasetsold += '\n ' if isDiracFile(f): sdatasetsnew += """ \"LFN:%s\",""" % f.lfn sdatasetsold += """ \"DATAFILE='LFN:%s' %s\",""" % (f.lfn, dtype_str) else: sdatasetsnew += """ \"PFN:%s\",""" % f.namePattern sdatasetsold += """ \"DATAFILE='PFN:%s' %s\",""" % (f.namePattern, dtype_str) if sdatasetsold.endswith(","): if self.persistency == 'ROOT': sdatasetsnew = sdatasetsnew[:-1] + """\n], clear=True)""" else: sdatasetsnew = sdatasetsnew[:-1] + """\n ], clear=True)""" sdatasetsold = sdatasetsold[:-1] sdatasetsold += """\n ]""" if(file): f = open(file, 'w') if self.persistency == 'ROOT': f.write(snew) f.write(sdatasetsnew) else: f.write(snew) f.write(sdatasetsnew) f.write(sold) f.write(sdatasetsold) f.close() else: if self.persistency == 'ROOT': return snew + sdatasetsnew else: return snew + sdatasetsnew + sold + sdatasetsold def _checkOtherFiles(self, other ): if isType(other, GangaList) or isType(other, []): other_files = LHCbDataset(other).getFullFileNames() elif isType(other, LHCbDataset): other_files = other.getFullFileNames() else: raise GangaException("Unknown type for difference") return other_files def difference(self, other): '''Returns a new data set w/ files in this that are not in other.''' other_files = self._checkOtherFiles(other) files = set(self.getFullFileNames()).difference(other_files) data = LHCbDataset() data.extend([list(files)]) data.depth = self.depth return data def isSubset(self, other): '''Is every file in this data set in other?''' other_files = self._checkOtherFiles(other) return set(self.getFileNames()).issubset(other_files) def isSuperset(self, other): '''Is every file in other in this data set?''' other_files = self._checkOtherFiles(other) return set(self.getFileNames()).issuperset(other_files) def symmetricDifference(self, other): '''Returns a new data set w/ files in either this or other but not both.''' other_files = other.checkOtherFiles(other) files = set(self.getFullFileNames()).symmetric_difference(other_files) data = LHCbDataset() data.extend([list(files)]) data.depth = self.depth return data def intersection(self, other): '''Returns a new data set w/ files common to this and other.''' other_files = other._checkOtherFiles(other) files = set(self.getFullFileNames()).intersection(other_files) data = LHCbDataset() data.extend([list(files)]) data.depth = self.depth return data def union(self, other): '''Returns a new data set w/ files from this and other.''' other_files = self._checkOtherFiles(other) files = set(self.getFullFileNames()).union(other_files) data = LHCbDataset() data.extend([list(files)]) data.depth = self.depth return data def bkMetadata(self): 'Returns the bookkeeping metadata for all LFNs. 
'
        logger.info("Using BKQuery(bkpath).getDatasetMetadata() with bkpath = the bookkeeping path will yield more metadata, such as 'TCK' info...")
        cmd = 'bkMetaData(%s)' % self.getLFNs()
        b = get_result(cmd, 'Bookkeeping error.', 'Could not get bookkeeping metadata.')
        return b
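

# --- Usage sketch (illustration only) ---------------------------------------
# A short restatement of the intended LHCbDataset usage from the class
# docstring above, assuming an interactive Ganga session in which
# LHCbDataset is exported.  The LFN/PFN strings are placeholders, not real
# files in the bookkeeping.

def _lhcb_dataset_usage_example():
    ds = LHCbDataset(['lfn:/lhcb/user/a/b/someFile.dst',
                      'pfn:/afs/cern.ch/user/a/b/otherFile.dst'])

    len(ds)             # 2 files in total
    ds.getLFNs()        # names of the LFN entries only
    ds.getPFNs()        # names of the PFN entries only
    ds.optionsString()  # Gaudi-style options string covering the whole dataset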
def test_different(self):
    v1 = Version(1, 0)
    v2 = Version(1, 2)
    self.assertNotEqual(v1, v2)
    self.assertTrue(v2.isCompatible(v1))
    self.assertFalse(v1.isCompatible(v2))
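

# --- Illustration only -------------------------------------------------------
# A minimal sketch of the Version semantics the two tests above rely on:
# equality is an exact (major, minor) match, and v2.isCompatible(v1) holds
# when v2 can read data written with v1, i.e. the same major number and
# v2.minor >= v1.minor.  This is an assumption drawn from the assertions,
# not the actual Ganga.GPIDev.Schema implementation.

class _VersionSketch(object):

    def __init__(self, major, minor):
        self.major = major
        self.minor = minor

    def __eq__(self, other):
        return (self.major, self.minor) == (other.major, other.minor)

    def __ne__(self, other):
        return not self == other

    def isCompatible(self, other):
        # 'self' understands anything written with an equal or older minor
        # version of the same major series.
        return self.major == other.major and self.minor >= other.minor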
class IGangaFile(GangaObject): """IGangaFile represents base class for output files, such as MassStorageFile, LCGSEFile, DiracFile, LocalFile, etc """ _schema = Schema(Version(1, 1), {'namePattern': SimpleItem( defvalue="", doc='pattern of the file name')}) _category = 'gangafiles' _name = 'IGangaFile' _hidden = 1 __slots__ = list() def __init__(self): super(IGangaFile, self).__init__() def setLocation(self): """ Sets the location of output files that were uploaded from the WN """ raise NotImplementedError def location(self): """ Return list with the locations of the post processed files (if they were configured to upload the output somewhere) """ raise NotImplementedError def get(self): """ Retrieves locally all files that were uploaded before that Order of priority about where a file is going to be placed are: 1) The localDir as defined in the schema. (Exceptions thrown if this doesn't exist) 2) The Job outpudir of the parent job if the localDir is not defined. 3) raise an exception if neither are defined correctly. """ if self.localDir: if not os.path.isdir(self.localDir): msg = "Folder '%s' doesn't exist. Please construct this before 'get'-ing a file." % self.localDir raise GangaFileError(msg) to_location = self.localDir else: try: to_location = self.getJobObject().outputdir except AssertionError: msg = "%s: Failed to get file object. Please set the `localDir` parameter and try again. e.g. file.localDir=os.getcwd();file.get()" % getName(self) logger.debug("localDir value: %s" % self.localDir) logger.debug("parent: %s" % self._getParent()) raise GangaFileError(msg) # FIXME CANNOT perform a remote globbing here in a nice way so have to just perform a copy when dealing with wildcards if not os.path.isfile(os.path.join(to_location, self.namePattern)): returnable = self.copyTo(to_location) if not self.localDir: self.localDir = to_location return returnable else: logger.debug("File: %s already exists, not performing copy" % (os.path.join(to_location, self.namePattern), )) return True def getSubFiles(self, process_wildcards=False): """ Returns the sub files if wildcards are used """ # should we process wildcards? Used for inputfiles if process_wildcards: self.processWildcardMatches() # if we have subfiles, return that if hasattr(self, 'subfiles'): return self.subfiles return [] def getFilenameList(self): """ Returns the filenames of all associated files through a common interface """ raise NotImplementedError def getWNScriptDownloadCommand(self, indent): """ Gets the command used to download already uploaded file """ raise NotImplementedError def put(self): """ Postprocesses (upload) output file to the desired destination from the client """ raise NotImplementedError def copyTo(self, targetPath): """ Copy a the file to the local storage using the appropriate file-transfer mechanism This will raise an exception if targetPath isn't set to something sensible. 
Args: targetPath (str): Target path where the file is to copied to """ if not isinstance(targetPath, str) and targetPath: raise GangaFileError("Cannot perform a copyTo with no given targetPath!") if regex.search(self.namePattern) is None\ and os.path.isfile(os.path.join(self.localDir, self.namePattern)): if not os.path.isfile(os.path.join(targetPath, self.namePattern)): shutil.copy(os.path.join(self.localDir, self.namePattern), os.path.join(targetPath, self.namePattern)) else: logger.debug("Already found file: %s" % os.path.join(targetPath, self.namePattern)) return True # Again, cannot perform a remote glob here so have to ignore wildcards else: return self.internalCopyTo(targetPath) def internalCopyTo(self, targetPath): """ Internal method for implementing the actual copy mechanism for each IGangaFile Args: targetPath (str): Target path where the file is to copied to """ raise NotImplementedError def getWNInjectedScript(self, outputFiles, indent, patternsToZip, postProcessLocationsFP): """ Returns script that have to be injected in the jobscript for postprocessing on the WN """ raise NotImplementedError def processWildcardMatches(self): """ If namePattern contains a wildcard, populate the subfiles property """ raise NotImplementedError def _auto_remove(self): """ Remove called when job is removed as long as config option allows """ self.remove() def _readonly(self): return False def _list_get__match__(self, to_match): if isinstance(to_match, str): return fnmatch(self.namePattern, to_match) # Note: type(DiracFile) = ObjectMetaclass # type(ObjectMetaclass) = type # hence checking against a class type not an instance if isinstance(type(to_match), type): return issubclass(self.__class__, to_match) return to_match == self def execSyscmdSubprocess(self, cmd): import subprocess exitcode = -999 mystdout = '' mystderr = '' try: child = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) (mystdout, mystderr) = child.communicate() exitcode = child.returncode finally: pass return (exitcode, mystdout, mystderr) def remove(self): """ Objects should implement something to overload this! """ raise NotImplementedError def accessURL(self): """ Return the URL including the protocol used to access a file on a certain storage element """ raise NotImplementedError def hasMatchedFiles(self): """ Return if this file has got valid matched files. 
Default implementation checks for subfiles and locations """ # check for subfiles if (hasattr(self, 'subfiles') and len(self.subfiles) > 0): # we have subfiles so we must have actual files associated return True # check for locations if (hasattr(self, 'locations') and len(self.locations) > 0): return True return False def containsWildcards(self): """ Return if the name has got wildcard characters """ if regex.search(self.namePattern) != None: return True return False def cleanUpClient(self): """ This method cleans up the client space after performing a put of a file after a job has completed """ # For all other file types (not LocalFile) The file in the outputdir is temporary waiting for Ganga to pass it to the storage solution job = self.getJobObject() for f in glob.glob(os.path.join(job.outputdir, self.namePattern)): try: os.remove(f) except OSError as err: if err.errno != errno.ENOENT: logger.error('failed to remove temporary/intermediary file: %s' % f) logger.debug("Err: %s" % err) raise err def expandString(self, inputStr, fileName=''): """ This method deals with the automatic string replacement in the string notation for IGangaFile objects Args: inputStr(str): This is the input string which is being evaluated/expanded fileName(str): This is an optional filename used to replace {fname} Returns: str: This new string is the result of fully expanding the inputStr object """ outputStr = inputStr if self._getParent() is not None: jobfqid = self.getJobObject().fqid jobid = jobfqid subjobid = '' split = jobfqid.split('.') if len(split) > 1: jobid = split[0] subjobid = split[1] outputStr = outputStr.replace('{jid}', jobid) outputStr = outputStr.replace('{sjid}', subjobid) if fileName: outputStr = outputStr.replace('{fname}', fileName) else: outputStr = outputStr.replace('{fname}', os.path.basename(self.namePattern)) return outputStr
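

# --- Illustration only -------------------------------------------------------
# A standalone sketch of the substitution contract implemented by
# IGangaFile.expandString() above: '{jid}', '{sjid}' and '{fname}' are
# replaced by the job id, the subjob id (empty when there is none) and the
# file name.  This helper is illustrative; the real method derives jid/sjid
# from the parent job's fqid and falls back to the namePattern for {fname}.

def expand_name_pattern(pattern, jobfqid, fname):
    parts = jobfqid.split('.')
    jid = parts[0]
    sjid = parts[1] if len(parts) > 1 else ''
    return (pattern.replace('{jid}', jid)
                   .replace('{sjid}', sjid)
                   .replace('{fname}', fname))

# e.g. expand_name_pattern('out_{jid}_{sjid}_{fname}', '42.3', 'hist.root')
# gives 'out_42_3_hist.root'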
class DiracBase(IBackend): """The backend that submits jobs to the Grid via DIRAC. The backend for jobs to be submitted to the Grid. Jobs are submitted through the DIRAC WMS system and then in turn submitted to the Grid. A few examples of usage are given below # Create Dirac backend object b = Dirac() # Create and submit job. j = Job(application=app,backend=b) j.submit() # Run a Root job on the Grid if in LHCb VO # Create a Root application object. See Root help text for instructions # on how to configure this. app = Root() # Create and submit job to Dirac using default options j = Job(application=app,backend=Dirac()) j.submit() # Using the 'settings' attribute j.backend.settings['BannedSites'] = ['LCG.CERN.ch'] j.resubmit() # settings can be set at any time but are only 'respected' during # submit and resubmit. """ dirac_monitoring_is_active = True _schema = Schema(Version(3, 2), { 'id': SimpleItem(defvalue=None, protected=1, copyable=0, typelist=['int', 'type(None)'], doc='The id number assigned to the job by the DIRAC WMS. If seeking help' ' on jobs with the Dirac backend, please always report this id ' 'number in addition to a full description of your problem. The id ' 'can also be used to further inspect the job at ' 'https://lhcbweb.pic.es/DIRAC/info/general/diracOverview'), 'status': SimpleItem(defvalue=None, protected=1, copyable=0, typelist=['str', 'type(None)'], doc='The detailed status as reported by the DIRAC WMS'), 'actualCE': SimpleItem(defvalue=None, protected=1, copyable=0, typelist=['str', 'type(None)'], doc='The location where the job ran'), 'normCPUTime': SimpleItem(defvalue=None, protected=1, copyable=0, typelist=['str', 'type(None)'], doc='The normalized CPU time reported by the DIRAC WMS'), 'statusInfo': SimpleItem(defvalue='', protected=1, copyable=0, typelist=['str', 'type(None)'], doc='Minor status information from Dirac'), 'extraInfo': SimpleItem(defvalue='', protected=1, copyable=0, typelist=['str', 'type(None)'], doc='Application status information from Dirac'), 'diracOpts': SimpleItem(defvalue='', doc='DIRAC API commands to add the job definition script. Only edit ' 'if you *really* know what you are doing'), 'settings': SimpleItem(defvalue={'CPUTime': 2 * 86400}, doc='Settings for DIRAC job (e.g. 
CPUTime, BannedSites, etc.)') }) _exportmethods = ['getOutputData', 'getOutputSandbox', 'removeOutputData', 'getOutputDataLFNs', 'peek', 'reset', 'debug'] _packed_input_sandbox = True _category = "backends" _name = 'DiracBase' _hidden = True def _setup_subjob_dataset(self, dataset): """ This method is used for constructing datasets on a per subjob basis when submitting parametric jobs Args: Dataset (Dataset): This is a GangaDataset object, todo check this isn't a list """ return None def _setup_bulk_subjobs(self, dirac_ids, dirac_script): """ This is the old bulk submit method which is used to construct the subjobs for a parametric job Args: dirac_ids (list): This is a list of the Dirac ids which have been created dirac_script (str): Name of the dirac script which contains the job jdl """ f = open(dirac_script, 'r') parametric_datasets = get_parametric_datasets(f.read().split('\n')) f.close() if len(parametric_datasets) != len(dirac_ids): raise BackendError('Dirac', 'Missmatch between number of datasets defines in dirac API script and those returned by DIRAC') from Ganga.GPIDev.Lib.Job.Job import Job master_job = self.getJobObject() master_job.subjobs = [] for i in range(len(dirac_ids)): j = Job() j.copyFrom(master_job) j.splitter = None j.backend.id = dirac_ids[i] j.id = i j.inputdata = self._setup_subjob_dataset(parametric_datasets[i]) j.status = 'submitted' j.time.timenow('submitted') master_job.subjobs.append(j) return True def _common_submit(self, dirac_script): '''Submit the job via the Dirac server. Args: dirac_script (str): filename of the JDL which is to be submitted to DIRAC ''' j = self.getJobObject() self.id = None self.actualCE = None self.status = None self.extraInfo = None self.statusInfo = '' j.been_queued = False dirac_cmd = """execfile(\'%s\')""" % dirac_script result = execute(dirac_cmd) # Could use the below code instead to submit on a thread # If submitting many then user may terminate ganga before # all jobs submitted # def submit_checker(result, job, script): # err_msg = 'Error submitting job to Dirac: %s' % str(result) # if not result_ok(result) or 'Value' not in result: # logger.error(err_msg) # raise BackendError('Dirac',err_msg) # # idlist = result['Value'] # if type(idlist) is list: # return job._setup_bulk_subjobs(idlist, script) # job.id = idlist # server.execute_nonblocking(dirac_cmd, callback_func=submit_checker, args=(self, dirac_script)) # return True err_msg = 'Error submitting job to Dirac: %s' % str(result) if not result_ok(result) or 'Value' not in result: logger.error(err_msg) logger.error("\n\n===\n%s\n===\n" % dirac_script) logger.error("\n\n====\n") with open(dirac_script, 'r') as file_in: logger.error("%s" % file_in.read()) logger.error("\n====\n") raise BackendError('Dirac', err_msg) idlist = result['Value'] if type(idlist) is list: return self._setup_bulk_subjobs(idlist, dirac_script) self.id = idlist return type(self.id) == int def _addition_sandbox_content(self, subjobconfig): '''any additional files that should be sent to dirac Args: subjobcofig (unknown): This is the config for this subjob (I think)''' return [] def submit(self, subjobconfig, master_input_sandbox): """Submit a DIRAC job Args: subjobconfig (unknown): master_input_sandbox (list): file names which are in the master sandbox of the master sandbox (if any) """ j = self.getJobObject() sboxname = j.createPackedInputSandbox(subjobconfig.getSandboxFiles()) input_sandbox = master_input_sandbox[:] input_sandbox += sboxname input_sandbox += self._addition_sandbox_content(subjobconfig) ## 
Add LFN to the inputfiles section of the file input_sandbox_userFiles = [] for this_file in j.inputfiles: if isType(this_file, DiracFile): input_sandbox_userFiles.append('LFN:'+str(this_file.lfn)) if j.master: for this_file in j.master.inputfiles: if isType(this_file, DiracFile): input_sandbox_userFiles.append('LFN:'+str(this_file.lfn)) for this_file in input_sandbox_userFiles: input_sandbox.append(this_file) logger.debug("dirac_script: %s" % str(subjobconfig.getExeString())) logger.debug("sandbox_cont:\n%s" % str(input_sandbox)) dirac_script = subjobconfig.getExeString().replace('##INPUT_SANDBOX##', str(input_sandbox)) dirac_script_filename = os.path.join(j.getInputWorkspace().getPath(), 'dirac-script.py') f = open(dirac_script_filename, 'w') f.write(dirac_script) f.close() return self._common_submit(dirac_script_filename) def master_auto_resubmit(self, rjobs): '''Duplicate of the IBackend.master_resubmit but hooked into auto resubmission such that the monitoring server is used rather than the user server Args: rjobs (list): This is a list of jobs which are to be auto-resubmitted''' from Ganga.Core import IncompleteJobSubmissionError, GangaException from Ganga.Utility.logging import log_user_exception incomplete = 0 def handleError(x): if incomplete: raise x else: return 0 try: for sj in rjobs: fqid = sj.getFQID('.') logger.info("resubmitting job %s to %s backend", fqid, getName(sj.backend)) try: b = sj.backend sj.updateStatus('submitting') result = b._resubmit() if result: sj.updateStatus('submitted') # sj._commit() # PENDING: TEMPORARY DISABLED incomplete = 1 else: return handleError(IncompleteJobSubmissionError(fqid, 'resubmission failed')) except Exception as x: log_user_exception(logger, debug=isType(x, GangaException)) return handleError(IncompleteJobSubmissionError(fqid, str(x))) finally: master = self.getJobObject().master if master: master.updateMasterJobStatus() return 1 def resubmit(self): """Resubmit a DIRAC job""" return self._resubmit() def _resubmit(self): """Resubmit a DIRAC job""" j = self.getJobObject() parametric = False script_path = os.path.join(j.getInputWorkspace().getPath(), 'dirac-script.py') # Check old script if j.master is None and not os.path.exists(script_path): raise BackendError('Dirac', 'No "dirac-script.py" found in j.inputdir') if j.master is not None and not os.path.exists(script_path): script_path = os.path.join( j.master.getInputWorkspace().getPath(), 'dirac-script.py') if not os.path.exists(script_path): raise BackendError('Dirac', 'No "dirac-script.py" found in j.inputdir or j.master.inputdir') parametric = True # Read old script f = open(script_path, 'r') script = f.read() f.close() # Create new script - ##note instead of using get_parametric_dataset # could just use j.inputdata. 
if parametric is True: parametric_datasets = get_parametric_datasets(script.split('\n')) if j.master: if len(parametric_datasets) != len(j.master.subjobs): raise BackendError('Dirac', 'number of parametric datasets defined in API script doesn\'t match number of master.subjobs') if j.inputdata and len(j.inputdata) > 0: _input_files = [f for f in j.inputdata if not isType(f, DiracFile)] else: _input_files = [] if set(parametric_datasets[j.id]).symmetric_difference(set([f.namePattern for f in _input_files])): raise BackendError( 'Dirac', 'Mismatch between dirac-script and job attributes.') script = script.replace('.setParametricInputData(%s)' % str(parametric_datasets), '.setInputData(%s)' % str(parametric_datasets[j.id])) script = script.replace('%n', str(j.id)) # name start_user_settings = '# <-- user settings\n' new_script = script[ :script.find(start_user_settings) + len(start_user_settings)] job_ident = get_job_ident(script.split('\n')) for key, value in self.settings.iteritems(): if str(key).startswith('set'): _key = key[3:] else: _key = key if type(value) is str: template = '%s.set%s("%s")\n' else: template = '%s.set%s(%s)\n' new_script += template % (job_ident, str(_key), str(value)) new_script += script[script.find('# user settings -->'):] # Save new script new_script_filename = os.path.join(j.getInputWorkspace().getPath(), 'dirac-script.py') f = open(new_script_filename, 'w') f.write(new_script) f.flush() f.close() return self._common_submit(new_script_filename) def reset(self, doSubjobs=False): """Resets the state of a job back to 'submitted' so that the monitoring will run on it again. Args: doSubjobs (bool): Should we rest the subjobs associated with this job or not""" j = self.getJobObject() disallowed = ['submitting', 'killed'] if j.status in disallowed: logger.warning("Can not reset a job in status '%s'." % j.status) else: j.getOutputWorkspace().remove(preserve_top=True) self.extraInfo = None self.statusInfo = '' self.status = None self.actualCE = None j.been_queued = False j.updateStatus('submitted') if j.subjobs and not doSubjobs: logger.info('This job has subjobs, if you would like the backends ' 'of all the subjobs that are in status=\'completing\' or ' 'status=\'failed\' also reset then recall reset with the ' 'arg \'True\' i.e. job(3).backend.reset(True)') elif j.subjobs and doSubjobs: logger.info('resetting the backends of \'completing\' and \'failed\' subjobs.') for sj in j.subjobs: if sj.status == 'completing' or sj.status == 'failed': sj.backend.reset() if j.master: j.master.updateMasterJobStatus() def kill(self): """ Kill a Dirac jobs""" if not self.id: return None dirac_cmd = 'kill(%d)' % self.id result = execute(dirac_cmd) if not result_ok(result): raise BackendError('Dirac', 'Could not kill job: %s' % str(result)) return result['OK'] def peek(self, filename=None, command=None): """Peek at the output of a job (Note: filename/command are ignored). 
Args: filename (str): Ignored but is filename of a file in the sandbox command (str): Ignored but is a command which could be executed""" dirac_cmd = 'peek(%d)' % self.id result = execute(dirac_cmd) if result_ok(result): logger.info(result['Value']) else: logger.error("No peeking available for Dirac job '%i'.", self.id) def getOutputSandbox(self, outputDir=None): """Get the outputsandbox for the job object controlling this backend Args: outputDir (str): This string represents the output dir where the sandbox is to be placed """ j = self.getJobObject() if outputDir is None: outputDir = j.getOutputWorkspace().getPath() dirac_cmd = "getOutputSandbox(%d,'%s')" % (self.id, outputDir) result = execute(dirac_cmd) if not result_ok(result): msg = 'Problem retrieving output: %s' % str(result) logger.warning(msg) return False return True def removeOutputData(self): """ Remove all the LFNs associated with this job. """ # Note when the API can accept a list for removeFile I will change # this. j = self.getJobObject() if j.subjobs: for sj in j.subjobs: outputfiles_foreach(sj, DiracFile, lambda x: x.remove()) else: outputfiles_foreach(j, DiracFile, lambda x: x.remove()) def getOutputData(self, outputDir=None, names=None, force=False): """Retrieve data stored on SE to dir (default=job output workspace). If names=None, then all outputdata is downloaded otherwise names should be a list of files to download. If force=True then data will be redownloaded even if the file already exists. Note that if called on a master job then all subjobs outputwill be downloaded. If dir is None then the subjobs output goes into their individual outputworkspaces as expected. If however one specifies a dir then this is treated as a top dir and a subdir for each job will be created below it. This will avoid overwriting files with the same name from each subjob. Args: outputDir (str): This string represents the output dir where the sandbox is to be placed names (list): list of names which match namePatterns in the outputfiles force (bool): Force the download out data potentially overwriting existing objects """ j = self.getJobObject() if outputDir is not None and not os.path.isdir(outputDir): raise GangaException("Designated outupt path '%s' must exist and be a directory" % outputDir) def download(dirac_file, job, is_subjob=False): dirac_file.localDir = job.getOutputWorkspace().getPath() if outputDir is not None: output_dir = outputDir if is_subjob: output_dir = os.path.join(outputDir, job.fqid) if not os.path.isdir(output_dir): os.mkdir(output_dir) dirac_file.localDir = output_dir if os.path.exists(os.path.join(dirac_file.localDir, os.path.basename(dirac_file.lfn))) and not force: return try: if isType(dirac_file, DiracFile): dirac_file.get(localPath=dirac_file.localDir) else: dirac_file.get() return dirac_file.lfn # should really make the get method throw if doesn't suceed. 
todo except GangaException as e: logger.warning(e) suceeded = [] if j.subjobs: for sj in j.subjobs: suceeded.extend([download(f, sj, True) for f in outputfiles_iterator(sj, DiracFile) if f.lfn != '' and (names is None or f.namePattern in names)]) else: suceeded.extend([download(f, j, False) for f in outputfiles_iterator(j, DiracFile) if f.lfn != '' and (names is None or f.namePattern in names)]) return filter(lambda x: x is not None, suceeded) def getOutputDataLFNs(self): """Retrieve the list of LFNs assigned to outputdata""" j = self.getJobObject() lfns = [] if j.subjobs: for sj in j.subjobs: lfns.extend([f.lfn for f in outputfiles_iterator(sj, DiracFile) if f.lfn != '']) else: lfns.extend([f.lfn for f in outputfiles_iterator(j, DiracFile) if f.lfn != '']) return lfns def debug(self): '''Obtains some (possibly) useful DIRAC debug info. ''' # check services cmd = 'getServicePorts()' result = execute(cmd) if type(result) == str: try: result = eval(result) except Exception as err: logger.debug("Exception, err: %s" % str(err)) pass if not result_ok(result): logger.warning('Could not obtain services: %s' % str(result)) return services = result.get('Value', {}) for category in services: system, service = category.split('/') cmd = "ping('%s','%s')" % (system, service) result = execute(cmd) if type(result) == str: try: result = eval(result) except Exception as err: logger.debug("Exception: %s" % str(err)) pass msg = 'OK.' if not result_ok(result): msg = '%s' % result['Message'] logger.info('%s: %s' % (category, msg)) # get pilot info for this job if type(self.id) != int: return j = self.getJobObject() cwd = os.getcwd() debug_dir = j.getDebugWorkspace().getPath() cmd = "getJobPilotOutput(%d,'%s')" % \ (self.id, debug_dir) result = execute(cmd) if result_ok(result): logger.info('Pilot Info: %s/pilot_%d/std.out.' % (debug_dir, self.id)) else: logger.error(result.get('Message', '')) @staticmethod def _bulk_updateStateTime(jobStateDict, bulk_time_lookup={} ): """ This performs the same as the _getStateTime method but loops over a list of job ids within the DIRAC namespace (much faster) Args: jobStateDict (dict): This is a dict of {job.backend.id : job_status, } elements bulk_time_lookup (dict): Dict of result of multiple calls to getBulkStateTime, performed in advance """ for this_state, these_jobs in jobStateDict.iteritems(): if bulk_time_lookup == {} or this_state not in bulk_time_lookup: bulk_result = execute("getBulkStateTime(%s,\'%s\')" % (repr([j.backend.id for j in these_jobs]), this_state)) else: bulk_result = bulk_time_lookup[this_state] for this_job in jobStateDict[this_state]: backend_id = this_job.backend.id if backend_id in bulk_result and bulk_result[backend_id]: DiracBase._getStateTime(this_job, this_state, {this_state : bulk_result[backend_id]}) else: DiracBase._getStateTime(this_job, this_state) @staticmethod def _getStateTime(job, status, getStateTimeResult={}): """Returns the timestamps for 'running' or 'completed' by extracting their equivalent timestamps from the loggingInfo. Args: job (Job): This is the job object we want to update status (str): This is the Ganga status we're updating (running, completed... etc) getStateTimeResult (dict): This is the optional result of executing the approriate getStateTime against this job.backend.id, if not provided the command is called internally """ # Now private to stop server cross-talk from user thread. Since updateStatus calles # this method whether called itself by the user thread or monitoring thread. 
# Now don't use hook but define our own private version # used in monitoring loop... messy but works. if job.status != status: b_list = ['running', 'completing', 'completed', 'failed'] backend_final = ['failed', 'completed'] # backend stamps if not job.subjobs and status in b_list: for childstatus in b_list: if job.backend.id: logger.debug("Accessing getStateTime() in diracAPI") if childstatus in backend_final: if childstatus in getStateTimeResult: be_statetime = getStateTimeResult[childstatus] else: be_statetime = execute("getStateTime(%d,\'%s\')" % (job.backend.id, childstatus)) job.time.timestamps["backend_final"] = be_statetime logger.debug("Wrote 'backend_final' to timestamps.") break else: time_str = "backend_" + childstatus if time_str not in job.time.timestamps: if childstatus in getStateTimeResult: be_statetime = getStateTimeResult[childstatus] else: be_statetime = execute("getStateTime(%d,\'%s\')" % (job.backend.id, childstatus)) job.time.timestamps["backend_" + childstatus] = be_statetime logger.debug("Wrote 'backend_%s' to timestamps.", childstatus) if childstatus == status: break logger.debug("_getStateTime(job with id: %d, '%s') called.", job.id, job.status) else: logger.debug("Status changed from '%s' to '%s'. No new timestamp was written", job.status, status) def timedetails(self): """Prints contents of the loggingInfo from the Dirac API.""" if not self.id: return None logger.debug("Accessing timedetails() in diracAPI") dirac_cmd = 'timedetails(%d)' % self.id return execute(dirac_cmd) @staticmethod def job_finalisation_cleanup(job, updated_dirac_status): """ Method for reverting a job back to a clean state upon a failure in the job progression Args: job (Job) This is the job to change the status updated_dirac_status (str): Ganga status which is to be used somewhere """ # Revert job back to running state if we exit uncleanly if job.status == "completing": job.updateStatus('running') if job.master: job.master.updateMasterJobStatus() # FIXME should I add something here to cleanup on sandboxes pulled from # malformed job output? 
@staticmethod def _internal_job_finalisation(job, updated_dirac_status): """ This method performs the main job finalisation Args: job (Job): Thi is the job we want to finalise updated_dirac_status (str): String representing the Ganga finalisation state of the job failed/completed """ if updated_dirac_status == 'completed': start = time.time() # firstly update job to completing DiracBase._getStateTime(job, 'completing') if job.status in ['removed', 'killed']: return elif (job.master and job.master.status in ['removed', 'killed']): return # user changed it under us job.updateStatus('completing') if job.master: job.master.updateMasterJobStatus() output_path = job.getOutputWorkspace().getPath() logger.info('Contacting DIRAC for job: %s' % job.fqid) # Contact dirac which knows about the job job.backend.normCPUTime, getSandboxResult, file_info_dict, completeTimeResult = execute("finished_job(%d, '%s')" % (job.backend.id, output_path)) now = time.time() logger.info('%0.2fs taken to download output from DIRAC for Job %s' % ((now - start), job.fqid)) #logger.info('Job ' + job.fqid + ' OutputDataInfo: ' + str(file_info_dict)) #logger.info('Job ' + job.fqid + ' OutputSandbox: ' + str(getSandboxResult)) #logger.info('Job ' + job.fqid + ' normCPUTime: ' + str(job.backend.normCPUTime)) # Set DiracFile metadata wildcards = [f.namePattern for f in job.outputfiles.get(DiracFile) if regex.search(f.namePattern) is not None] lfn_store = os.path.join(output_path, getConfig('Output')['PostProcessLocationsFileName']) # Make the file on disk with a nullop... if not os.path.isfile(lfn_store): with open(lfn_store, 'w'): pass if job.outputfiles.get(DiracFile): # Now we can iterate over the contents of the file without touching it with open(lfn_store, 'ab') as postprocesslocationsfile: if not hasattr(file_info_dict, 'keys'): logger.error("Error understanding OutputDataInfo: %s" % str(file_info_dict)) from Ganga.Core.exceptions import GangaException raise GangaException("Error understanding OutputDataInfo: %s" % str(file_info_dict)) ## Caution is not clear atm whether this 'Value' is an LHCbism or bug list_of_files = file_info_dict.get('Value', file_info_dict.keys()) for file_name in list_of_files: file_name = os.path.basename(file_name) info = file_info_dict.get(file_name) #logger.debug("file_name: %s,\tinfo: %s" % (str(file_name), str(info))) if not hasattr(info, 'get'): logger.error("Error getting OutputDataInfo for: %s" % str(job.getFQID('.'))) logger.error("Please check the Dirac Job still exists or attempt a job.backend.reset() to try again!") logger.error("Err: %s" % str(info)) logger.error("file_info_dict: %s" % str(file_info_dict)) from Ganga.Core.exceptions import GangaException raise GangaException("Error getting OutputDataInfo") valid_wildcards = [wc for wc in wildcards if fnmatch.fnmatch(file_name, wc)] if not valid_wildcards: valid_wildcards.append('') for wc in valid_wildcards: #logger.debug("wildcard: %s" % str(wc)) DiracFileData = 'DiracFile:::%s&&%s->%s:::%s:::%s\n' % (wc, file_name, info.get('LFN', 'Error Getting LFN!'), str(info.get('LOCATIONS', ['NotAvailable'])), info.get('GUID', 'NotAvailable') ) #logger.debug("DiracFileData: %s" % str(DiracFileData)) postprocesslocationsfile.write(DiracFileData) postprocesslocationsfile.flush() logger.debug("Written: %s" % open(lfn_store, 'r').readlines()) # check outputsandbox downloaded correctly if not result_ok(getSandboxResult): logger.warning('Problem retrieving outputsandbox: %s' % str(getSandboxResult)) DiracBase._getStateTime(job, 'failed') if 
job.status in ['removed', 'killed']: return elif (job.master and job.master.status in ['removed', 'killed']): return # user changed it under us job.updateStatus('failed') if job.master: job.master.updateMasterJobStatus() raise BackendError('Problem retrieving outputsandbox: %s' % str(getSandboxResult)) # finally update job to completed DiracBase._getStateTime(job, 'completed', completeTimeResult) if job.status in ['removed', 'killed']: return elif (job.master and job.master.status in ['removed', 'killed']): return # user changed it under us job.updateStatus('completed') if job.master: job.master.updateMasterJobStatus() now = time.time() logger.debug('Job ' + job.fqid + ' Time for complete update : ' + str(now - start)) elif updated_dirac_status == 'failed': # firstly update status to failed DiracBase._getStateTime(job, 'failed') if job.status in ['removed', 'killed']: return if (job.master and job.master.status in ['removed', 'killed']): return # user changed it under us job.updateStatus('failed') if job.master: job.master.updateMasterJobStatus() # if requested try downloading outputsandbox anyway if configDirac['failed_sandbox_download']: execute("getOutputSandbox(%d,'%s')" % (job.backend.id, job.getOutputWorkspace().getPath())) else: logger.error("Job #%s Unexpected dirac status '%s' encountered" % (job.getFQID('.'), updated_dirac_status)) @staticmethod def job_finalisation(job, updated_dirac_status): """ Attempt to finalise the job given and auto-retry 5 times on error Args: job (Job): Job object to finalise updated_dirac_status (str): The Ganga status to update the job to, i.e. failed/completed """ count = 1 limit = 5 sleep_length = 2.5 while count != limit: try: count += 1 # Check status is sane before we start if job.status != "running" and (not job.status in ['completed', 'killed', 'removed']): job.updateStatus('submitted') job.updateStatus('running') if job.status in ['completed', 'killed', 'removed']: break # make sure proxy is valid if DiracBase.checkDiracProxy(): # perform finalisation DiracBase._internal_job_finalisation(job, updated_dirac_status) else: # exit gracefully logger.warning("Cannot process job: %s. DIRAC monitoring has been disabled. 
To activate your grid proxy type: \'gridProxy.renew()\'" % job.fqid) break except Exception as err: logger.warning("An error occured finalising job: %s" % job.getFQID('.')) logger.warning("Attemting again (%s of %s) after %s-sec delay" % (str(count), str(limit), str(sleep_length))) if count == limit: logger.error("Unable to finalise job after %s retries due to error:\n%s" % (job.getFQID('.'), str(err))) job.force_status('failed') raise time.sleep(sleep_length) job.been_queued = False @staticmethod def requeue_dirac_finished_jobs(requeue_jobs, finalised_statuses): """ Method used to requeue jobs whih are in the finalized state of some form, finished/failed/etc Args: requeue_jobs (list): This is a list of the jobs which are to be requeued to be finalised finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running """ from Ganga.Core import monitoring_component # requeue existing completed job for j in requeue_jobs: if j.been_queued: continue if monitoring_component: if monitoring_component.should_stop(): break if not configDirac['serializeBackend']: getQueues()._monitoring_threadpool.add_function(DiracBase.job_finalisation, args=(j, finalised_statuses[j.backend.status]), priority=5, name="Job %s Finalizing" % j.fqid) j.been_queued = True else: DiracBase.job_finalisation(j, finalised_statuses[j.backend.status]) @staticmethod def monitor_dirac_running_jobs(monitor_jobs, finalised_statuses): """ Method to update the configuration of jobs which are in a submitted/running state in Ganga&Dirac Args: monitor_jobs (list): Jobs which are to be monitored for their status change finalised_statuses (dict): Dict of the Dirac statuses vs the Ganga statuses after running """ # now that can submit in non_blocking mode, can see jobs in submitting # that have yet to be assigned an id so ignore them # NOT SURE THIS IS VALID NOW BULK SUBMISSION IS GONE # EVEN THOUGH COULD ADD queues.add(j.submit) WILL KEEP AN EYE ON IT # dirac_job_ids = [ j.backend.id for j in monitor_jobs if j.backend.id is not None ] # Correction this did become a problem for a crashed session during # submit, see #104454 dead_jobs = (j for j in monitor_jobs if j.backend.id is None) for d in dead_jobs: d.updateStatus('failed') if d.master is not None: d.master.updateMasterJobStatus() ganga_job_status = [j.status for j in monitor_jobs if j.backend.id is not None] dirac_job_ids = [j.backend.id for j in monitor_jobs if j.backend.id is not None] logger.debug("GangaStatus: %s" % str(ganga_job_status)) logger.debug("diracJobIDs: %s" % str(dirac_job_ids)) if not dirac_job_ids: ## Nothing to do here stop bugging DIRAC about it! ## Everything else beyond here in the function depends on some ids present here, no ids means we can stop. 
return statusmapping = configDirac['statusmapping'] result, bulk_state_result = execute('monitorJobs(%s, %s)' %( repr(dirac_job_ids), repr(statusmapping))) if not DiracBase.checkDiracProxy(): return #result = results[0] #bulk_state_result = results[1] if len(result) != len(ganga_job_status): logger.warning('Dirac monitoring failed for %s, result = %s' % (str(dirac_job_ids), str(result))) logger.warning("Results: %s" % str(results)) return from Ganga.Core import monitoring_component requeue_job_list = [] jobStateDict = {} jobs_to_update = {} master_jobs_to_update = [] thread_handled_states = ['completed', 'failed'] for job, state, old_state in zip(monitor_jobs, result, ganga_job_status): if monitoring_component: if monitoring_component.should_stop(): break if job.been_queued: continue job.backend.statusInfo = state[0] job.backend.status = state[1] job.backend.actualCE = state[2] updated_dirac_status = state[3] try: job.backend.extraInfo = state[4] except Exception as err: logger.debug("gexception: %s" % str(err)) pass logger.debug('Job status vector : ' + job.fqid + ' : ' + repr(state)) if updated_dirac_status not in jobStateDict: jobStateDict[updated_dirac_status] = [] jobStateDict[updated_dirac_status].append(job) if job.backend.status in finalised_statuses: if job.status != 'running': if job.status in ['removed', 'killed']: requeue_job_list.append(job) elif (job.master and job.master.status in ['removed', 'killed']): continue # user changed it under us else: if 'running' not in jobs_to_update: jobs_to_update['running'] = [] jobs_to_update['running'].append(job) if job.master: if job.master not in master_jobs_to_update: master_jobs_to_update.append(job.master) requeue_job_list.append(job) else: if job.status in ['removed', 'killed']: continue if (job.master and job.master.status in ['removed', 'killed']): continue # user changed it under us if job.status != updated_dirac_status: if updated_dirac_status not in jobs_to_update: jobs_to_update[updated_dirac_status] = [] jobs_to_update[updated_dirac_status].append(job) if job.master: if job.master not in master_jobs_to_update: master_jobs_to_update.append(job.master) DiracBase._bulk_updateStateTime(jobStateDict, bulk_state_result) for status in jobs_to_update: for job in jobs_to_update[status]: job.updateStatus(status, update_master=False) for j in master_jobs_to_update: j.updateMasterJobStatus() DiracBase.requeue_dirac_finished_jobs(requeue_job_list, finalised_statuses) @staticmethod def checkDiracProxy(): # make sure proxy is valid if not _proxyValid(shouldRenew = False, shouldRaise = False): if DiracBase.dirac_monitoring_is_active is True: logger.warning('DIRAC monitoring inactive (no valid proxy found).') logger.warning('Type: \'gridProxy.renew()\' to (re-)activate') DiracBase.dirac_monitoring_is_active = False else: DiracBase.dirac_monitoring_is_active = True return DiracBase.dirac_monitoring_is_active @staticmethod def updateMonitoringInformation(jobs_): """Check the status of jobs and retrieve output sandboxesi Args: jobs_ (list): List of the appropriate jobs to monitored """ # Only those jobs in 'submitted','running' are passed in here for checking # if however they have already completed in Dirac they may have been put on queue # for processing from last time. These should be put back on queue without # querying dirac again. Their signature is status = running and job.backend.status # already set to Done or Failed etc. 
        jobs = [stripProxy(j) for j in jobs_]

        # make sure proxy is valid
        if not DiracBase.checkDiracProxy():
            return

        # Remove from consideration any jobs already in the queue. Checking this non-persisted attribute
        # is better than querying the queue, as we can't tell whether a job has just been taken off the
        # queue and is being processed. Also, by not being persistent, this attribute automatically
        # allows queued jobs from the last session to be considered for requeueing.
        interesting_jobs = [j for j in jobs if not j.been_queued]

        # Statuses that correspond to a ganga 'completed' or 'failed' (see DiracCommands.status(id)).
        # If the backend status is one of these then the job should be on the queue.
        finalised_statuses = configDirac['finalised_statuses']

        monitor_jobs = [j for j in interesting_jobs if j.backend.status not in finalised_statuses]
        requeue_jobs = [j for j in interesting_jobs if j.backend.status in finalised_statuses]

        #logger.debug('Interesting jobs: ' + repr([j.fqid for j in interesting_jobs]))
        #logger.debug('Monitor jobs    : ' + repr([j.fqid for j in monitor_jobs]))
        #logger.debug('Requeue jobs    : ' + repr([j.fqid for j in requeue_jobs]))

        DiracBase.requeue_dirac_finished_jobs(requeue_jobs, finalised_statuses)
        DiracBase.monitor_dirac_running_jobs(monitor_jobs, finalised_statuses)
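

# --- Illustration only -------------------------------------------------------
# A standalone sketch of the partitioning performed in
# DiracBase.updateMonitoringInformation() above: jobs whose backend status is
# already one of the finalised DIRAC statuses get re-queued for finalisation,
# everything else is polled again via monitorJobs().  '_FakeJob' and the
# status strings are placeholders used purely to show the split; the real
# code reads the mapping from configDirac['finalised_statuses'].

class _FakeJob(object):

    def __init__(self, backend_status, been_queued=False):
        self.backend_status = backend_status
        self.been_queued = been_queued


def split_for_monitoring(jobs, finalised_statuses):
    interesting = [j for j in jobs if not j.been_queued]
    requeue = [j for j in interesting if j.backend_status in finalised_statuses]
    monitor = [j for j in interesting if j.backend_status not in finalised_statuses]
    return requeue, monitor

# e.g. with finalised_statuses = {'Done': 'completed', 'Failed': 'failed'}
# a job in 'Running' lands in the monitor list, a job in 'Done' in requeue.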
class GridftpSandboxCache(GridSandboxCache): ''' Helper class for upladong/downloading/deleting sandbox files using lcg-cp/lcg-del commands with gsiftp protocol. @author: Hurng-Chun Lee @contact: [email protected] ''' gridftp_sandbox_cache_schema_datadict.update({ 'baseURI': SimpleItem(defvalue='', copyable=1, doc='the base URI for storing cached files') }) _schema = Schema(Version(1, 0), gridftp_sandbox_cache_schema_datadict) _category = 'GridSandboxCache' _name = 'GridftpSandboxCache' logger = getLogger() def __init__(self): super(GridftpSandboxCache, self).__init__() self.protocol = 'gsiftp' def impl_upload(self, cred_req, files=[], opts=''): """ Uploads multiple files to a remote gridftp server. """ shell = getShell(cred_req) # making the directory on remove storage at destURI dirname = self.__get_unique_fname__() # creating subdirectory dir_ok = False destURI = '%s/%s' % (self.baseURI, dirname) uri_info = urisplit(destURI) cmd = 'uberftp %s "cd %s"' % (uri_info[1], uri_info[2]) rc, output, m = self.__cmd_retry_loop__(shell, cmd, 1) if rc != 0: for l in output.split('\n'): l.strip() if re.match(r'^550.*', l): # the directory is not found (error code 550), try to creat # the lowest level one cmd = 'uberftp %s "mkdir %s"' % (uri_info[1], uri_info[2]) rc, output, m = self.__cmd_retry_loop__(shell, cmd, 1) if rc != 0: self.logger.error(output) else: dir_ok = True break else: self.logger.debug('parent directory already available: %s' % destURI) dir_ok = True if not dir_ok: self.logger.error('parent directory not available: %s' % destURI) return [] # the algorithm of uploading one file class MyAlgorithm(Algorithm): def __init__(self, cacheObj): Algorithm.__init__(self) self.cacheObj = cacheObj def process(self, file): # decide number of parallel stream to be used fsize = os.path.getsize(urlparse(file)[2]) fname = os.path.basename(urlparse(file)[2]) fpath = os.path.abspath(urlparse(file)[2]) md5sum = get_md5sum(fpath, ignoreGzipTimestamp=True) nbstream = int((fsize * 1.0) / (10.0 * 1024 * 1024 * 1024)) if nbstream < 1: nbstream = 1 # min stream if nbstream > 8: nbstream = 8 # max stream myDestURI = '%s/%s' % (destURI, fname) # uploading the file cmd = 'uberftp' if nbstream > 1: cmd += ' -c %d' % nbstream cmd += ' file:%s %s' % (fpath, myDestURI) rc, output, m = self.cacheObj.__cmd_retry_loop__( shell, cmd, self.cacheObj.max_try) if rc != 0: self.cacheObj.logger.error(output) return False else: fidx = GridftpFileIndex() fidx.id = myDestURI fidx.name = fname fidx.md5sum = md5sum fidx.attributes['fpath'] = fpath self.__appendResult__(file, fidx) return True myAlg = MyAlgorithm(cacheObj=self) myData = Data(collection=files) runner = MTRunner(name='sandboxcache_gridftp', algorithm=myAlg, data=myData) runner.start() runner.join(-1) return runner.getResults().values() def impl_download(self, cred_req, files=[], dest_dir=None, opts=''): """ Downloads multiple files from gridftp server to a local directory. 
""" if not dest_dir: dest_dir = os.getcwd() self.logger.debug('download file to: %s', dest_dir) shell = getShell(cred_req) # the algorithm of downloading one file to a local directory class MyAlgorithm(Algorithm): def __init__(self, cacheObj): Algorithm.__init__(self) self.cacheObj = cacheObj def process(self, file): srcURI = file.id fname = os.path.basename(urisplit(srcURI)[2]) destURI = 'file:%s/%s' % (dest_dir, fname) #cmd = 'uberftp %s %s' % (srcURI, destURI) cmd = 'globus-url-copy %s %s' % (srcURI, destURI) rc, output, m = self.cacheObj.__cmd_retry_loop__( shell, cmd, self.cacheObj.max_try) if rc != 0: self.cacheObj.logger.error(output) return False else: self.__appendResult__(file.id, file) return True myAlg = MyAlgorithm(cacheObj=self) myData = Data(collection=files) runner = MTRunner(name='sandboxcache_gridftp', algorithm=myAlg, data=myData) runner.start() runner.join(-1) return runner.getResults().values() def impl_delete(self, cred_req, files=[], opts=''): """ Deletes multiple files from remote gridftp server """ shell = getShell(cred_req) # the algorithm of downloading one file to a local directory class MyAlgorithm(Algorithm): def __init__(self, cacheObj): Algorithm.__init__(self) self.cacheObj = cacheObj def process(self, file): destURI = file.id uri_info = urisplit(destURI) cmd = 'uberftp %s "rm %s"' % (uri_info[1], uri_info[2]) rc, output, m = self.cacheObj.__cmd_retry_loop__( shell, cmd, self.cacheObj.max_try) if rc != 0: self.cacheObj.logger.error(output) return False else: self.__appendResult__(file.id, file) return True myAlg = MyAlgorithm(cacheObj=self) myData = Data(collection=files) runner = MTRunner(name='sandboxcache_lcgdel', algorithm=myAlg, data=myData) runner.start() runner.join(-1) # update the local index file del_files = runner.getResults().values() all_files = self.get_cached_files() left_files = [] for f in all_files: if f not in del_files: left_files.append(f) self.impl_bookkeepUploadedFiles(left_files, append=False) return del_files
class LCGSandboxCache(GridSandboxCache): ''' Helper class for upladong/downloading/deleting sandbox files using lcg-cr/lcg-cp/lcg-del commands. @author: Hurng-Chun Lee @contact: [email protected] ''' lcg_sandbox_cache_schema_datadict.update({ 'se': SimpleItem(defvalue='', copyable=1, doc='the LCG SE hostname'), 'se_type': SimpleItem(defvalue='srmv2', copyable=1, doc='the LCG SE type'), 'se_rpath': SimpleItem(defvalue='generated', copyable=1, doc='the relative path to the VO directory on the SE'), 'lfc_host': SimpleItem(defvalue='', copyable=1, doc='the LCG LFC hostname'), 'srm_token': SimpleItem( defvalue='', copyable=1, doc= 'the SRM space token, meaningful only when se_type is set to srmv2' ) }) _schema = Schema(Version(1, 0), lcg_sandbox_cache_schema_datadict) _category = 'GridSandboxCache' _name = 'LCGSandboxCache' logger = getLogger() def __init__(self): super(LCGSandboxCache, self).__init__() self.protocol = 'lcg' def __setattr__(self, attr, value): if attr == 'se_type' and value not in ['', 'srmv1', 'srmv2', 'se']: raise AttributeError('invalid se_type: %s' % value) super(LCGSandboxCache, self).__setattr__(attr, value) def impl_upload(self, files=[], opts=''): """ Uploads multiple files to a remote grid storage. """ shell = getShell(self.middleware) if self.lfc_host: shell.env['LFC_HOST'] = self.lfc_host self.logger.debug('upload file with LFC_HOST: %s', shell.env['LFC_HOST']) # the algorithm of uploading one file class MyAlgorithm(Algorithm): def __init__(self, cacheObj): Algorithm.__init__(self) self.cacheObj = cacheObj self.dirname = self.cacheObj.__get_unique_fname__() def process(self, file): # decide number of parallel stream to be used fsize = os.path.getsize(urlparse(file)[2]) fname = os.path.basename(urlparse(file)[2]) fpath = os.path.abspath(urlparse(file)[2]) md5sum = get_md5sum(fpath, ignoreGzipTimestamp=True) nbstream = int((fsize * 1.0) / (10.0 * 1024 * 1024 * 1024)) if nbstream < 1: nbstream = 1 # min stream if nbstream > 8: nbstream = 8 # max stream cmd = 'lcg-cr -t 180 --vo %s -n %d' % (self.cacheObj.vo, nbstream) if self.cacheObj.se != None: cmd = cmd + ' -d %s' % self.cacheObj.se if self.cacheObj.se_type == 'srmv2' and self.cacheObj.srm_token: cmd = cmd + ' -D srmv2 -s %s' % self.cacheObj.srm_token # specify the physical location cmd = cmd + \ ' -P %s/ganga.%s/%s' % (self.cacheObj.se_rpath, self.dirname, fname) # specify the logical filename # NOTE: here we assume the root dir for VO is /grid/<voname> lfc_dir = '/grid/%s/ganga.%s' % (self.cacheObj.vo, self.dirname) if not self.cacheObj.__lfc_mkdir__(shell, lfc_dir): self.cacheObj.logger.warning( 'cannot create LFC directory: %s' % lfc_dir) return None cmd = cmd + ' -l %s/%s %s' % (lfc_dir, fname, file) rc, output, m = self.cacheObj.__cmd_retry_loop__( shell, cmd, self.cacheObj.max_try) if rc != 0: return False else: match = re.search('(guid:\S+)', output) if match: guid = match.group(1) fidx = LCGFileIndex() fidx.id = guid fidx.name = fname fidx.md5sum = md5sum fidx.lfc_host = self.cacheObj.lfc_host fidx.local_fpath = fpath self.__appendResult__(file, fidx) return True else: return False myAlg = MyAlgorithm(cacheObj=self) myData = Data(collection=files) runner = MTRunner(name='sandboxcache_lcgcr', algorithm=myAlg, data=myData) runner.start() runner.join(-1) return runner.getResults().values() def impl_download(self, files=[], dest_dir=None, opts=''): """ Downloads multiple files from remote grid storages to a local directory. 
""" if not dest_dir: dest_dir = os.getcwd() self.logger.debug('download file to: %s', dest_dir) # the algorithm of downloading one file to a local directory class MyAlgorithm(Algorithm): def __init__(self, cacheObj): Algorithm.__init__(self) self.cacheObj = cacheObj self.shell = getShell(self.cacheObj.middleware) def process(self, file): guid = file.id lfn = file.attributes['local_fpath'] lfc_host = file.attributes['lfc_host'] fname = os.path.basename(urlparse(lfn)[2]) self.shell.env['LFC_HOST'] = lfc_host self.cacheObj.logger.debug('download file with LFC_HOST: %s', self.shell.env['LFC_HOST']) cmd = 'lcg-cp -t %d --vo %s ' % (self.cacheObj.timeout, self.cacheObj.vo) if self.cacheObj.se_type: cmd += '-T %s ' % self.cacheObj.se_type cmd += '%s file://%s/%s' % (guid, dest_dir, fname) self.cacheObj.logger.debug('download file: %s', cmd) rc, output, m = self.cacheObj.__cmd_retry_loop__( self.shell, cmd, self.cacheObj.max_try) if rc != 0: return False else: self.__appendResult__(file.id, file) return True myAlg = MyAlgorithm(cacheObj=self) myData = Data(collection=files) runner = MTRunner(name='sandboxcache_lcgcp', algorithm=myAlg, data=myData) runner.start() runner.join(-1) return runner.getResults().values() def impl_delete(self, files=[], opts=''): """ Deletes multiple files from remote grid storages. """ # the algorithm of downloading one file to a local directory class MyAlgorithm(Algorithm): def __init__(self, cacheObj): Algorithm.__init__(self) self.cacheObj = cacheObj self.shell = getShell(self.cacheObj.middleware) def process(self, file): guid = file.id lfc_host = file.attributes['lfc_host'] self.shell.env['LFC_HOST'] = lfc_host self.cacheObj.logger.debug('delete file with LFC_HOST: %s' % self.shell.env['LFC_HOST']) cmd = 'lcg-del -a -t 60 --vo %s %s' % (self.cacheObj.vo, guid) rc, output, m = self.cacheObj.__cmd_retry_loop__( self.shell, cmd, self.cacheObj.max_try) if rc != 0: return False else: self.__appendResult__(file.id, file) return True myAlg = MyAlgorithm(cacheObj=self) myData = Data(collection=files) runner = MTRunner(name='sandboxcache_lcgdel', algorithm=myAlg, data=myData) runner.start() runner.join(-1) # update the local index file del_files = runner.getResults().values() all_files = self.get_cached_files() left_files = [] for f in all_files: if f not in del_files: left_files.append(f) self.impl_bookkeepUploadedFiles(left_files, append=False) return del_files # For GUID protocol def __lfc_mkdir__(self, shell, path, mode='775'): '''Creates a directory in LFC''' cmd = 'lfc-mkdir -p -m %s %s' % (mode, path) (rc, output, m) = self.__cmd_retry_loop__(shell, cmd, 1) if rc != 0: return False else: return True
class JobTime(GangaObject): """Job timestamp access. In development Changes in the status of a Job are timestamped - a datetime object is stored in the dictionary named 'timestamps', in Coordinated Universal Time(UTC). More information on datetime objects can be found at: http://docs.python.org/library/datetime.html Datetime objects can be subtracted to produce a 'timedelta' object. More information about these can be found at the above address. '+', '*', and '/' are not supported by datetime objects. Datetime objects can be formatted into strings using the .strftime(format_string) application, and the strftime codes. e.g. %Y -> year as integer %a -> abbreviated weekday name %M -> minutes as inetger The full list can be found at: http://docs.python.org/library/datetime.html#strftime-behavior Standard status types with built in access methods are: -'new' -'submitted' -'running' -'completed' -'killed' -'failed' These return a string with default format %Y/%m/%d @ %H:%M:%S. A custom format can be specified in the arguement. Any information stored within the timestamps dictionary can also be extracted in the way as in would be for a standard, non-application specific python dictionary. For a table display of the Job's timestamps use .time.display(). For timestamps details from the backend use .time.details() """ timestamps = {} sj_statlist = [] _schema = Schema( Version(0, 0), { 'timestamps': SimpleItem(defvalue={}, doc="Dictionary containing timestamps for job", summary_print='_timestamps_summary_print') }) _category = 'jobtime' _name = 'JobTime' _exportmethods = [ 'display', 'new', 'submitting', 'submitted', 'backend_running', 'backend_final', 'backend_completing', 'completing', 'final', 'running', 'runtime', 'waittime', 'submissiontime', 'details', 'printdetails' ] def __init__(self): super(JobTime, self).__init__() self.timestamps = {} # this makes sure the contents of the list don't get copied when the # Job does. self.sj_statlist = [] def __deepcopy__(self, memo): obj = super(JobTime, self).__deepcopy__(memo) # Lets not re-initialize the object as we lose history from previous submissions # obj.newjob() return obj def newjob(self): """Timestamps job upon creation. """ t = datetime.datetime.utcnow() self.timestamps['new'] = t # this makes sure the contents of the list don't get copied when the # Job does. self.sj_statlist = [] def timenow(self, status): """Updates timestamps as job status changes. 
""" j = self.getJobObject() t_now = datetime.datetime.utcnow() b_list = ['running', 'completing', 'completed', 'failed'] final = ['killed', 'failed', 'completed'] backend_final = ['failed', 'completed'] ganga_master = ['new', 'submitting', 'killed'] id = j.id if id is None: id = str("unknown") logger.debug("Job %s called timenow('%s')", str(id), status) # standard method: if not j.subjobs: # backend stamps if status in b_list: for childstatus in b_list: be_statetime = stripProxy( j.backend).getStateTime(childstatus) if be_statetime is not None: if childstatus in backend_final: self.timestamps["backend_final"] = be_statetime logger.debug( "Wrote 'backend_final' to timestamps.") else: self.timestamps["backend_" + childstatus] = be_statetime logger.debug("Wrote 'backend_%s' to timestamps.", childstatus) if childstatus == status: break # ganga stamps if status in final: self.timestamps["final"] = t_now logger.debug("Wrote 'final' to timestamps.") else: self.timestamps[status] = t_now logger.debug("Wrote '%s' to timestamps.", status) # subjobs method: if j.master: # identifies subjobs logger.debug( "j.time.timenow() caught subjob %d.%d in the '%s' status", j.master.id, j.id, status) for written_status in j.time.timestamps.keys(): if written_status not in j.master.time.sj_statlist: j.master.time.sj_statlist.append(written_status) logger.debug("written_status: '%s' written to sj_statlist", written_status) # master job method if j.subjobs: # identifies master job logger.debug( "j.time.timenow() caught master job %d in the '%s' status", j.id, status) if status in ganga_master: # don't use subjob stamp for these self.timestamps[status] = t_now logger.debug( "status: '%s' in ganga_master written to master timestamps.", status) else: for state in self.sj_statlist: if state not in ganga_master: j.time.timestamps[state] = self.sjStatList_return( state) logger.debug( "state: '%s' from sj_statlist to written to master timestamps.", state) else: pass def sjStatList_return(self, status): list = [] final = ['backend_final', 'final'] j = self.getJobObject() for sjs in j.subjobs: try: if isinstance(sjs.time.timestamps[status], datetime.datetime): list.append(sjs.time.timestamps[status]) else: logger.debug( 'Attempt to add a non datetime object in the timestamp, job=%d, subjob=%d', j.id, sjs.id) except KeyError: logger.debug( "Status '%s' not found in timestamps of job %d.%d.", status, sjs.master.id, sjs.id) list.sort() try: if status in final: return list[-1] return list[0] except IndexError: # change this to a more appropriate debug. logger.debug( "IndexError: ID: %d, Status: '%s', length of list: %d", j.id, status, len(list)) def display(self, format="%Y/%m/%d %H:%M:%S"): return self._display(format) # Justin 10.9.09: I think 'ljust' might be just as good if not better than # 'rjust' here: def _display(self, format="%Y/%m/%d %H:%M:%S", interactive=False): """Displays existing timestamps in a table. Format can be specified by typing a string of the appropriate strftime() behaviour codes as the arguement. e.g. 
'%H:%M:%S' ==> 13:55:01 For a full list of codes see http://docs.python.org/library/datetime.html?#strftime-behavior """ retstr = '' T = datetime.datetime.now() tstring = T.strftime(format) length = len(tstring) times = [0 for k in self.timestamps.keys()] for i in range(0, len(self.timestamps.keys())): try: times[i] = self.timestamps[self.timestamps.keys()[i]].strftime( format).rjust(length) + ' - ' + self.timestamps.keys()[i] except AttributeError: times[i] = str(self.timestamps[self.timestamps.keys( )[i]]).rjust(length) + ' - ' + self.timestamps.keys()[i] # try to make chronological - can fail when timestamps are the same to # nearest sec -> becomes alphabetical... times.sort() retstr = retstr + '\n' + \ 'Time (UTC)'.rjust(length) + ' Status' + '\n' for i in range(0, 21): retstr = retstr + '- ' retstr = retstr + '\n' for i in range(0, len(times)): retstr = retstr + times[i] + '\n' return retstr def _timestamps_summary_print(self, value, verbosity_level, interactive=False): """Used to display timestamps when JobTime object is displayed. """ return self._display(interactive=interactive) # This didn't work: # # def __str__(self): # """ string cast """ # return self._display() def details(self, subjob=None): """Obtains all timestamps available from the job's specific backend. Subjob arguement: None = default 'all' = gets details for ALL SUBJOBS. You have been warned. int = gets details for subjob number 'int' No argument is required for a job with no subjobs. """ j = self.getJobObject() idstr = '' detdict = {} # If job is SUBJOB do the normal procedure. Not sure this clause is # neccessary as subjobs will be caught normally if j.master: logger.debug("j.time.details(): subjob %d.%d caught.", j.master.id, j.id) detdict = j.backend.timedetails() return detdict # If job is MASTER iterate over subjobs and do normal method. This # isn't going to be ideal for a large number of subjobs if j.subjobs: logger.debug("j.time.details(): master job %d caught.", j.id) idstr = str(j.id) # User wants 'all' if subjob == 'all': keyin = None # NOTE: The interactive loop below was more an exercise for learning how 'keyin' is used than a useful addition. # ask whether user really wants to print timedetails for all # their jobs: while keyin is None: keyin = raw_input( "Are you sure you want details for ALL %d subjobs(y/n)?" % len(j.subjobs)) # if yes carry on at for loop if keyin == 'y': pass # if no return None. Doesn't execute rest of method elif keyin == 'n': return None # if something else - asks again else: logger.info("y/n please!") keyin = None for jobs in j.subjobs: subidstr = idstr + '.' + str(jobs.id) # String needs more info if it is going to stay in. logger.debug("Subjob: %d, Backend ID: %d", jobs.id, jobs.backend.id) detdict[subidstr] = jobs.backend.timedetails() return detdict # no arguement specified elif subjob is None: logger.debug( "j.time.details(): no subjobs specified for this master job." ) return None # Subjob id or string passed else: # string = error if not isinstance(subjob, int): raise TypeError("Subjob id requires type 'int'") # subjob id supplied for sj in j.subjobs: if sj.id == subjob: logger.debug("Subjob: %d, Backend ID: %d", sj.id, sj.backend.id) detdict = sj.backend.timedetails() return detdict else: pass if subjob >= len(j.subjobs): logger.warning( "Index '%s' is out of range. 
Corresponding subjob does not exist.", str(subjob)) return None logger.debug( "subjob arguement '%s' has failed to be caught and dealt with.", subjob) return None detdict = j.backend.timedetails() # called if no subjobs return detdict def printdetails(self, subjob=None): """Prints backend details to screen by calling details() and printing the returned dictionary. """ j = self.getJobObject() if subjob == 'all': # the warning and action taken below are pretty annoying, but I was # unsure how to deal with the request to print the details for all # n subjobs, which seems unlikely to be made. logger.warning( "It might be unwise to print all subjobs details. Use details() and extract relevant info from dictionary." ) return None pd = self.details(subjob) for key in pd.keys(): logger.info(key, '\t', pd[key]) def runtime(self): """Method which returns the 'runtime' of the specified job. The runtime is calculated as the duration between the job entering the 'running' state and the job entering the 'completed' state. """ end_list = ['killed', 'completed', 'failed'] end_stamps = {} # if master job, sum: j = self.getJobObject() if j.subjobs: masterrun = datetime.timedelta(0, 0, 0) for jobs in j.subjobs: masterrun = masterrun + jobs.time.runtime() return masterrun # all other jobs: return self.duration('backend_running', 'backend_final') def waittime(self): """Method which returns the waiting time of the specified job. The waiting time is calculated as the duration between the job entering the 'submitted' state and entering the 'running' state. """ # master job: j = self.getJobObject() if j.subjobs: start_list = [] end_list = [] for jobs in j.subjobs: start_list.append(jobs.time.timestamps['submitted']) end_list.append(jobs.time.timestamps['backend_running']) start_list.sort() end_list.sort() start = start_list[0] end = end_list[len(end_list) - 1] masterwait = end - start return masterwait # all other jobs: return self.duration('submitted', 'backend_running') def submissiontime(self): """Method which returns submission time of specified job. Calculation: sub_time = submitted - submitting. """ j = self.getJobObject() if j.subjobs: start_list = [] end_list = [] for jobs in j.subjobs: end_list.append(jobs.time.timestamps['submitted']) end_list.sort() start = j.time.timestamps['submitting'] end = end_list[len(end_list) - 1] mastersub = end - start return mastersub return self.duration('submitting', 'submitted') def duration(self, start, end): """Returns duration between two specified timestamps as timedelta object. """ if start in self.timestamps.keys(): if end in self.timestamps.keys(): s, e = self.timestamps[start], self.timestamps[end] s_micro, e_micro = datetime.timedelta( 0, 0, s.microsecond), datetime.timedelta(0, 0, e.microsecond) e, s = e - e_micro, s - s_micro td = e - s # method for rounding removed because timestamps aren't always recorded with microsecond precision, and stamping accuracy isn't high enough to justify doing so # ds = td.days # secs = td.seconds # micros = td.microseconds # if micros >= 500000: # secs +=1 dur = td # datetime.timedelta(days=ds, seconds=secs) return dur else: logger.warning("Could not calculate duration: '%s' not found.", end) else: logger.warning("Could not calculate duration: '%s' not found.", start) return None def statetime(self, status, format=None): """General method for obtaining the specified timestamp in specified format. 
""" if status not in self.timestamps: logger.debug("Timestamp '%s' not available.", status) return None if format is not None: return self.timestamps[status].strftime(format) return self.timestamps[status] def new(self, format=None): """Method for obtaining 'new' timestamp. """ return self.statetime('new', format) def submitting(self, format=None): """Method for obtaining 'submitting' timestamp. """ return self.statetime('submitting', format) def submitted(self, format=None): """Method for obtaining 'submitted' timestamp. """ return self.statetime('submitted', format) def backend_running(self, format=None): """Method for obtaining 'backend_running' timestamp. """ return self.statetime('backend_running', format) def backend_final(self, format=None): """Method for obtaining 'backend_final' timestamp. """ return self.statetime('backend_final', format) def backend_completing(self, format=None): """Method for obtaining 'backend_completing' timestamp. """ return self.statetime('backend_completing', format) def completing(self, format=None): """Method for obtaining 'completing' timestamp. """ return self.statetime('completing', format) def final(self, format=None): """Method for obtaining 'final' timestamp. """ return self.statetime('final', format) def running(self, format=None): """Method for obtaining 'running' timestamp. """ return self.statetime('running', format)
class LogicalFile(DiracFile): # Logical File schema # Observing the 'old' 1.0 schema whilst preserving backwards compatability # with the fact that we're translating the object into a DiracFile in this # case _schema = Schema( Version(1, 0), { 'name': SimpleItem( defvalue="", doc='the LFN filename a LogicalFile is constructed with'), 'namePattern': SimpleItem( defvalue="", doc='pattern of the file name', transient=1), 'localDir': SimpleItem( defvalue=None, copyable=1, typelist=['str', 'type(None)'], doc= 'local dir where the file is stored, used from get and put methods', transient=1), 'remoteDir': SimpleItem( defvalue="", doc= 'remote directory where the LFN is to be placed in the dirac base directory by the put method.', transient=1), 'locations': SimpleItem( defvalue=[], copyable=1, typelist=['str'], sequence=1, doc="list of SE locations where the outputfiles are uploaded", transient=1), 'compressed': SimpleItem( defvalue=False, typelist=['bool'], protected=0, doc= 'wheather the output file should be compressed before sending somewhere', transient=1), 'lfn': SimpleItem( defvalue='', copyable=1, typelist=['str'], doc= 'return the logical file name/set the logical file name to use if not ' 'using wildcards in namePattern', transient=1), 'guid': SimpleItem( defvalue='', copyable=1, typelist=['str'], doc= 'return the GUID/set the GUID to use if not using wildcards in the namePattern.', transient=1), 'subfiles': ComponentItem(category='gangafiles', defvalue=[], hidden=1, sequence=1, copyable=0, typelist=['GangaDirac.Lib.Files.DiracFile'], doc="collected files from the wildcard namePattern", transient=1), 'failureReason': SimpleItem(defvalue="", protected=1, copyable=0, doc='reason for the upload failure', transient=1) }) _name = "LogicalFile" # TODO: Add warning to User NOT to create these objects themselves and that they should # only be used for backwards compatability to load old jobs def __init__(self, name=""): super(LogicalFile, self).__init__(lfn=name) self.name = name logger.warning( "!!! LogicalFile has been deprecated, this is now just a wrapper to the DiracFile object" ) logger.warning( "!!! Please update your scripts before LogicalFile is removed") self._setLFNnamePattern(lfn=self.name, namePattern='') def __setattr__(self, name, value): if name == "name": #elf.name = value self.lfn = value import os.path self.namePattern = os.path.basename(value) self.remoteDir = os.path.dirname(value) super(LogicalFile, self).__setattr__(name, value) def _attribute_filter__set__(self, attrib_name, value): if attrib_name == "name": self._setLFNnamePattern(lfn=value, namePattern='') return super(LogicalFile, self)._attribute_filter__set__(attrib_name, value)
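
# --- illustrative sketch (not part of LogicalFile) --------------------------
# LogicalFile only survives as a thin deprecation wrapper that forwards its
# 'name' onto the DiracFile attributes.  The mapping applied in __setattr__ is
# plain basename/dirname splitting, shown here on a made-up LFN without any
# Ganga imports.
import os.path


def split_lfn(name):
    """Return the (lfn, namePattern, remoteDir) triple derived from 'name'."""
    return name, os.path.basename(name), os.path.dirname(name)

print(split_lfn('/lhcb/user/s/someone/2023_01/12345/file.dst'))
# ('/lhcb/user/s/someone/2023_01/12345/file.dst', 'file.dst',
#  '/lhcb/user/s/someone/2023_01/12345')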
class ITransform(GangaObject): _schema = Schema(Version(1, 0), { 'status': SimpleItem(defvalue='new', protected=1, copyable=0, doc='Status - running, pause or completed', typelist=["str"]), 'name': SimpleItem(defvalue='Simple Transform', doc='Name of the transform (cosmetic)', typelist=["str"]), 'application': ComponentItem('applications', defvalue=None, optional=1, load_default=False, doc='Application of the Transform.'), 'inputsandbox': FileItem(defvalue=[], typelist=['str', 'Ganga.GPIDev.Lib.File.File.File'], sequence=1, doc="list of File objects shipped to the worker node "), 'outputsandbox': SimpleItem(defvalue=[], typelist=['str'], sequence=1, doc="list of filenames or patterns shipped from the worker node"), 'backend': ComponentItem('backends', defvalue=None, optional=1, load_default=False, doc='Backend of the Transform.'), 'splitter': ComponentItem('splitters', defvalue=None, optional=1, load_default=False, doc='Splitter used on each unit of the Transform.'), 'postprocessors': ComponentItem('postprocessor', defvalue=None, doc='list of postprocessors to run after job has finished'), 'merger': ComponentItem('mergers', defvalue=None, hidden=1, copyable=0, load_default=0, optional=1, doc='Merger to be done over all units when complete.'), 'unit_merger': ComponentItem('mergers', defvalue=None, load_default=0, optional=1, doc='Merger to be copied and run on each unit separately.'), 'copy_output': ComponentItem('datasets', defvalue=None, load_default=0, optional=1, doc='The dataset to copy all units output to, e.g. Grid dataset -> Local Dataset'), 'unit_copy_output': ComponentItem('datasets', defvalue=None, load_default=0, optional=1, doc='The dataset to copy each individual unit output to, e.g. Grid dataset -> Local Dataset'), 'run_limit': SimpleItem(defvalue=8, doc='Number of times a partition is tried to be processed.', protected=1, typelist=["int"]), 'minor_run_limit': SimpleItem(defvalue=3, doc='Number of times a unit can be resubmitted', protected=1, typelist=["int"]), 'major_run_limit': SimpleItem(defvalue=3, doc='Number of times a junit can be rebrokered', protected=1, typelist=["int"]), 'units': ComponentItem('units', defvalue=[], sequence=1, copyable=1, doc='list of units'), 'inputdata': ComponentItem('datasets', defvalue=[], sequence=1, protected=1, optional=1, load_default=False, doc='Input datasets to run over'), 'outputdata': ComponentItem('datasets', defvalue=None, optional=1, load_default=False, doc='Output dataset template'), 'inputfiles': GangaFileItem(defvalue=[], typelist=['str', 'Ganga.GPIDev.Adapters.IGangaFile.IGangaFile'], sequence=1, doc="list of file objects that will act as input files for a job"), 'outputfiles' : GangaFileItem(defvalue=[], typelist=['str', 'Ganga.GPIDev.Adapters.IGangaFile.IGangaFile'], sequence=1, doc="list of \ OutputFile objects to be copied to all jobs"), 'metadata': ComponentItem('metadata', defvalue=MetadataDict(), doc='the metadata', protected=1), 'rebroker_on_job_fail': SimpleItem(defvalue=True, doc='Rebroker if too many minor resubs'), 'abort_loop_on_submit': SimpleItem(defvalue=True, doc='Break out of the Task Loop after submissions'), 'required_trfs': SimpleItem(defvalue=[], typelist=['int'], sequence=1, doc="IDs of transforms that must complete before this unit will start. NOTE DOESN'T COPY OUTPUT DATA TO INPUT DATA. 
Use TaskChainInput Dataset for that."), 'chain_delay': SimpleItem(defvalue=0, doc='Minutes delay between a required/chained unit completing and starting this one', protected=0, typelist=["int"]), 'submit_with_threads': SimpleItem(defvalue=False, doc='Use Ganga Threads for submission'), 'max_active_threads': SimpleItem(defvalue=10, doc='Maximum number of Ganga Threads to use. Note that the number of simultaneous threads is controlled by the queue system (default is 5)'), 'info' : SimpleItem(defvalue=[],typelist=['str'],protected=1,sequence=1,doc="Info showing status transitions and unit info"), 'id': SimpleItem(defvalue=-1, protected=1, doc='ID of the Transform', typelist=["int"]), #'force_single_unit' : SimpleItem(defvalue=False, doc='Force all input data into one Unit'), }) _category = 'transforms' _name = 'ITransform' _exportmethods = ['addInputData', 'resetUnit', 'setRunLimit', 'getJobs', 'setMinorRunLimit', 'setMajorRunLimit', 'getID', 'overview', 'resetUnitsByStatus', 'removeUnusedJobs', 'showInfo', 'showUnitInfo', 'pause', 'n_all', 'n_status' ] _hidden = 0 def showInfo(self): """Print out the info in a nice way""" print("\n".join( self.info )) def showUnitInfo(self, uid): """Print out the given unit info in a nice way""" self.units[uid].showInfo() def getJobs(self): """Return a list of the currently active job ids""" joblist = [] for u in self.units: joblist += u.active_job_ids return joblist def setMinorRunLimit(self, newRL): """Set the number of times a job will be resubmitted before a major resubmit is attempted""" self.minor_run_limit = newRL def setMajorRunLimit(self, newRL): """Set the number of times a job will be rebrokered before the transform is paused""" self.major_run_limit = newRL def setRunLimit(self, newRL): """Set the total (minor+major) number of times a job should be resubmitted before the transform is paused""" self.run_limit = newRL def overview(self, status=''): """Show the status of the units in this transform""" for unit in self.units: # display colour given state o = "" o += ("%d: " % self.units.index(unit)) + unit.name # is unit active? if unit.active: o += " " * (40 - len(o) + 3) + "*" else: o += " " * (40 - len(o) + 3) + "-" # sub job status o += "\t %i" % unit.n_status("submitted") o += "\t %i" % unit.n_status("running") o += "\t %i" % unit.n_status("completed") o += "\t %i" % unit.n_status("failed") o += "\t %i" % unit.minor_resub_count o += "\t %i" % unit.major_resub_count # change colour on state if unit.status == 'completed': o = markup(o, overview_colours["completed"]) elif not unit.active: o = markup(o, overview_colours["bad"]) elif unit.status == "recreating": o = markup(o, overview_colours["attempted"]) elif len(unit.active_job_ids) == 0: o = markup(o, overview_colours["hold"]) else: o = markup(o, overview_colours["running"]) print(o) # Special methods: def __init__(self): super(ITransform, self).__init__() self.initialize() def _readonly(self): """A transform is read-only if the status is not new.""" if self.status == "new": return 0 return 1 def initialize(self): from Ganga import GPI self.backend = stripProxy(GPI.Local()) self.updateStatus("new") def check(self): """Check this transform has valid data, etc. 
and has the correct units""" # ignore anything but new transforms if self.status != "new": return # first, validate the transform if not self.validate(): raise ApplicationConfigurationError( None, "Validate failed for Transform %s" % self.name) self.updateStatus("running") def startup(self): """This function is used to set the status after restarting Ganga""" pass # Public methods def resetUnit(self, uid): """Reset the given unit""" addInfoString( self, "Reseting Unit %i" % ( uid ) ) for u in self.units: if u.getID() == uid: u.reset() break # find any chained units and mark for recreation for trf in self._getParent().transforms: for u2 in trf.units: for req in u2.req_units: if req == "%d:%d" % (self.getID(), u.getID()) or req == "%d:ALL" % (self.getID()): trf.resetUnit(u2.getID()) self.updateStatus("running") def getID(self): """Return the index of this trf in the parent task""" # if the id isn't already set, use the index from the parent Task if self.id < 0: task = self._getParent() if not task: raise ApplicationConfigurationError( None, "This transform has not been associated with a task and so there is no ID available") self.id = task.transforms.index(self) return self.id def run(self, check=True): """Sets this transform to running status""" if self.status == "new" and check: self.check() if self.status != "completed": self.updateStatus("running") task = self._getParent() if task: task.updateStatus() else: logger.warning("Transform is already completed!") def update(self): """Called by the parent task to check for status updates, submit jobs, etc.""" #logger.warning("Entered Transform %d update function..." % self.getID()) if self.status == "pause" or self.status == "new": return 0 # check for complete required units task = self._getParent() for trf_id in self.required_trfs: if task.transforms[trf_id].status != "completed": return 0 # set the start time if not already set if len(self.required_trfs) > 0 and self.units[0].start_time == 0: for unit in self.units: unit.start_time = time.time() + self.chain_delay * 60 - 1 # report the info for this transform unit_status = { "new":0, "hold":0, "running":0, "completed":0, "bad":0, "recreating":0 } for unit in self.units: unit_status[unit.status] += 1 info_str = "Unit overview: %i units, %i new, %i hold, %i running, %i completed, %i bad. 
to_sub %i" % (len(self.units), unit_status["new"], unit_status["hold"], unit_status["running"], unit_status["completed"], unit_status["bad"], self._getParent().n_tosub()) addInfoString(self, info_str) # ask the unit splitter if we should create any more units given the # current data self.createUnits() # loop over units and update them ((re)submits will be called here) old_status = self.status unit_status_list = [] # find submissions first unit_update_list = [] for unit in self.units: if not unit.checkForSubmission() and not unit.checkForResubmission(): unit_update_list.append(unit) continue if unit.update() and self.abort_loop_on_submit: logger.info("Unit %d of transform %d, Task %d has aborted the loop" % ( unit.getID(), self.getID(), task.id)) return 1 unit_status_list.append(unit.status) # now check for download for unit in unit_update_list: if unit.update() and self.abort_loop_on_submit: logger.info("Unit %d of transform %d, Task %d has aborted the loop" % ( unit.getID(), self.getID(), task.id)) return 1 unit_status_list.append(unit.status) from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput # check for any TaskChainInput completions for ds in self.inputdata: if isType(ds, TaskChainInput) and ds.input_trf_id != -1: if task.transforms[ds.input_trf_id].status != "completed": return 0 # update status and check old_status = self.status for state in ['running', 'hold', 'bad', 'completed']: if state in unit_status_list: if state == 'hold': state = "running" if state != self.status: self.updateStatus(state) break def createUnits(self): """Create new units if required given the inputdata""" from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput # check for chaining for ds in self.inputdata: if isType(ds, TaskChainInput) and ds.input_trf_id != -1: # check for single unit if ds.single_unit: # is there a unit already linked? done = False rec_unit = None for out_unit in self.units: if '%d:ALL' % (ds.input_trf_id) in out_unit.req_units: done = True # check if the unit is being recreated if out_unit.status == "recreating": rec_unit = out_unit break if not done or rec_unit: new_unit = self.createChainUnit( self._getParent().transforms[ds.input_trf_id].units, ds.use_copy_output) if new_unit: self.addChainUnitToTRF( new_unit, ds, -1, prev_unit=rec_unit) else: # loop over units in parent trf and create units as # required for in_unit in self._getParent().transforms[ds.input_trf_id].units: # is there a unit already linked? done = False rec_unit = None for out_unit in self.units: if '%d:%d' % (ds.input_trf_id, in_unit.getID()) in out_unit.req_units: done = True # check if the unit is being recreated if out_unit.status == "recreating": rec_unit = out_unit break if not done or rec_unit: new_unit = self.createChainUnit( [in_unit], ds.use_copy_output) if new_unit: self.addChainUnitToTRF( new_unit, ds, in_unit.getID(), prev_unit=rec_unit) def createChainUnit(self, parent_units, use_copy_output=True): """Create a chained unit given the parent outputdata""" return IUnit() def addChainUnitToTRF(self, unit, inDS, unit_id=-1, prev_unit=None): """Add a chained unit to this TRF. 
Override for more control""" if unit_id == -1: unit.req_units.append('%d:ALL' % (inDS.input_trf_id)) unit.name = "Parent: TRF %d, All Units" % (inDS.input_trf_id) else: unit.req_units.append('%d:%d' % (inDS.input_trf_id, unit_id)) unit.name = "Parent: TRF %d, Unit %d" % ( inDS.input_trf_id, unit_id) self.addUnitToTRF(unit, prev_unit) def addInputData(self, inDS): """Add the given input dataset to the list""" self.inputdata.append(inDS) def pause(self): """Pause the task - the background thread will not submit new jobs from this task""" if self.status != "completed": self.updateStatus("pause") #self.status = "pause" task = self._getParent() if task: task.updateStatus() else: logger.debug("Transform is already completed!") def setRunlimit(self, newRL): """Set the number of times a job should be resubmitted before the transform is paused""" self.run_limit = newRL logger.debug("Runlimit set to %i", newRL) # Methods that can/should be overridden by derived classes def validate(self): """Override this to validate that the transform is OK""" from Ganga.GPIDev.Lib.Tasks.TaskLocalCopy import TaskLocalCopy # make sure a path has been selected for any local downloads if self.unit_copy_output is not None and isType(self.unit_copy_output, TaskLocalCopy): if self.unit_copy_output.local_location == '': logger.error("No path selected for Local Output Copy") return False if self.copy_output is not None and isType(self.copy_output, TaskLocalCopy): if self.copy_output.local_location == '': logger.error("No path selected for Local Output Copy") return False # this is a generic trf so assume the application and splitter will do # all the work return True def addUnitToTRF(self, unit, prev_unit=None): """Add a unit to this Transform given the input and output data""" if not unit: raise ApplicationConfigurationError(None, "addUnitTOTRF failed for Transform %d (%s): No unit specified" % (self.getID(), self.name)) addInfoString( self, "Adding Unit to TRF...") unit.updateStatus("hold") unit.active = True if prev_unit: unit.prev_job_ids += prev_unit.prev_job_ids self.units[prev_unit.getID()] = unit else: self.units.append(unit) stripProxy(unit).id = len(self.units) - 1 # Information methods def fqn(self): task = self._getParent() if task: return "Task %i Transform %i" % (task.id, task.transforms.index(self)) else: return "Unassigned Transform '%s'" % (self.name) def n_active(self): return sum([u.n_active() for u in self.units]) def n_all(self): return sum([u.n_all() for u in self.units]) def n_status(self, status): return sum([u.n_status(status) for u in self.units]) def info(self): logger.info(markup("%s '%s'" % (getName(self), self.name), status_colours[self.status])) logger.info("* backend: %s" % getName(self.backend)) logger.info("Application:") self.application.printTree() def updateStatus(self, status): """Update the transform status""" self.status = status def createUnitCopyOutputDS(self, unit_id): """Create a the Copy Output dataset to use with this unit. 
Overload to handle more than the basics""" from Ganga.GPIDev.Lib.Tasks.TaskLocalCopy import TaskLocalCopy if isType(self.unit_copy_output, TaskLocalCopy): logger.warning("Default implementation of createUnitCopyOutputDS can't handle datasets of type '%s'" % getName(self.unit_copy_output)) return # create copies of the Copy Output DS and add Unit name to path self.units[unit_id].copy_output = self.unit_copy_output.clone() self.units[unit_id].copy_output.local_location = os.path.join( self.unit_copy_output.local_location, self.units[unit_id].name.replace(":", "_").replace(" ", "").replace(",", "_")) def __setattr__(self, attr, value): if attr == 'outputfiles': if value != []: if self.outputdata is not None: logger.error( 'ITransform.outputdata is set, you can\'t set ITransform.outputfiles') return elif self.outputsandbox != []: logger.error( 'ITransform.outputsandbox is set, you can\'t set ITransform.outputfiles') return # reduce duplicate values here, leave only duplicates for LCG, # where we can have replicas uniqueValuesDict = [] uniqueValues = [] for val in value: key = '%s%s' % (getName(val), val.namePattern) if key not in uniqueValuesDict: uniqueValuesDict.append(key) uniqueValues.append(val) elif getName(val) == 'LCGSEFile': uniqueValues.append(val) super(ITransform, self).__setattr__(attr, uniqueValues) elif attr == 'inputfiles': if value != []: if self.inputsandbox != []: logger.error( 'ITransform.inputsandbox is set, you can\'t set ITransform.inputfiles') return super(ITransform, self).__setattr__(attr, value) elif attr == 'outputsandbox': if value != []: if getConfig('Output')['ForbidLegacyOutput']: logger.error( 'Use of ITransform.outputsandbox is forbidden, please use ITransform.outputfiles') return if self.outputfiles != []: logger.error( 'ITransform.outputfiles is set, you can\'t set ITransform.outputsandbox') return super(ITransform, self).__setattr__(attr, value) elif attr == 'inputsandbox': if value != []: if getConfig('Output')['ForbidLegacyInput']: logger.error( 'Use of ITransform.inputsandbox is forbidden, please use ITransform.inputfiles') return if self.inputfiles != []: logger.error( 'ITransform.inputfiles is set, you can\'t set ITransform.inputsandbox') return super(ITransform, self).__setattr__(attr, value) elif attr == 'outputdata': if value is not None: if getConfig('Output')['ForbidLegacyOutput']: logger.error( 'Use of ITransform.outputdata is forbidden, please use ITransform.outputfiles') return if self.outputfiles != []: logger.error( 'ITransform.outputfiles is set, you can\'t set ITransform.outputdata') return super(ITransform, self).__setattr__(attr, value) else: super(ITransform, self).__setattr__(attr, value) def resetUnitsByStatus(self, status='bad'): """Reset all units of a given status""" for unit in self.units: if unit.status == status: logger.info("Resetting Unit %d, Transform %d..." 
% (unit.getID(), self.getID())) self.resetUnit(unit.getID()) def checkUnitsAreCompleted(self, parent_units): """Check the given parent units are complete""" for parent in parent_units: if len(parent.active_job_ids) == 0 or parent.status != "completed": return False return True def getChainInclExclMasks(self, parent_units): """return the include/exclude masks from the TaskChainInput""" incl_pat_list = [] excl_pat_list = [] from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput for parent in parent_units: for inds in self.inputdata: if isType(inds, TaskChainInput) and inds.input_trf_id == parent._getParent().getID(): incl_pat_list += inds.include_file_mask excl_pat_list += inds.exclude_file_mask return incl_pat_list, excl_pat_list def getParentUnitJobs(self, parent_units, include_subjobs=True): """Return the list of parent jobs""" job_list = [] for parent in parent_units: job = GPI.jobs(parent.active_job_ids[0]) if job.subjobs: job_list += job.subjobs else: job_list += [job] return job_list def removeUnusedJobs(self): """Remove all jobs that aren't being used, e.g. failed jobs""" for unit in self.units: for jid in unit.prev_job_ids: try: logger.warning("Removing job '%d'..." % jid) job = GPI.jobs(jid) job.remove() except Exception as err: logger.debug("removeUnused: %s" % str(err)) logger.error("Problem removing job '%d'" % jid)
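
# --- illustrative sketch (not part of ITransform) ---------------------------
# ITransform.update() derives the transform status from its units by checking
# the unit states in a fixed priority order ('running', 'hold', 'bad',
# 'completed') and mapping 'hold' back to 'running'.  The helper below
# reproduces that roll-up on a plain list of status strings, leaving out the
# "only update when the status actually changes" bookkeeping.
def rollup_status(unit_status_list, current='new'):
    for state in ('running', 'hold', 'bad', 'completed'):
        if state in unit_status_list:
            return 'running' if state == 'hold' else state
    return current

print(rollup_status(['completed', 'hold']))   # 'running' ('hold' maps to running)
print(rollup_status(['completed', 'bad']))    # 'bad'
print(rollup_status(['completed']))           # 'completed'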
class ISplitter(GangaObject):

    """Base class for all splitters."""

    _schema = Schema(Version(0, 0), {})
    _category = 'splitters'
    _hidden = 1

    def createSubjob(self, job, additional_skip_args=None):
        """Create a new subjob by copying the master job and setting all fields correctly."""
        from Ganga.GPIDev.Lib.Job.Job import Job
        if additional_skip_args is None:
            additional_skip_args = []

        j = Job()
        skipping_args = ['splitter', 'inputsandbox', 'inputfiles', 'inputdata', 'subjobs']
        for arg in additional_skip_args:
            skipping_args.append(arg)
        j.copyFrom(job, skipping_args)
        j.splitter = None
        j.inputsandbox = []
        j.inputfiles = []
        j.inputdata = None
        return j

    def split(self, job):
        """Return a list of subjobs generated from a master job.

        The original master job should not be modified. This method should be
        implemented in the derived classes.

        A splitter mutates certain properties of the subjobs (otherwise all
        subjobs would be identical). Only properties declared 'splitable' in the
        schema may be mutated. This restriction also applies to application
        objects, to avoid inconsistencies if the application handler is not able
        to deal with modified arguments. In the current implementation the type
        of the backend cannot be changed either.
        """
        raise NotImplementedError

    def validatedSplit(self, job):
        """Perform splitting using the split() method and validate the mutability
        invariants. If the invariants are broken (or an exception occurs in the
        split() method) then a SplittingError exception is raised. This method is
        called directly by the framework and should not be modified in the
        derived classes.
        """
        # try:
        subjobs = self.split(stripProxy(job))
        # except Exception,x:
        #raise SplittingError(x)
        #raise x

        # if not len(subjobs):
        #raise SplittingError('splitter did not create any subjobs')

        cnt = 0
        for s in subjobs:
            if not isType(s.backend, type(stripProxy(job.backend))):
                raise SplittingError(
                    'masterjob backend %s is not the same as the subjob (probable subjob id=%d) backend %s'
                    % (job.backend._name, cnt, getName(s.backend)))
            cnt += 1

        return subjobs
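
# --- illustrative sketch (not part of ISplitter) ----------------------------
# A concrete splitter only has to implement split() and return one subjob per
# chunk of work, changing nothing but the 'splitable' properties.  The toy
# below mimics that contract with plain dictionaries as stand-ins for Job
# objects; a real splitter would call self.createSubjob(job) and set e.g. the
# subjob's inputdata instead.
import copy


class ToySplitter(object):
    def __init__(self, chunks):
        self.chunks = chunks

    def split(self, job):
        subjobs = []
        for chunk in self.chunks:
            sj = copy.deepcopy(job)   # stands in for createSubjob(job)
            sj['inputdata'] = chunk   # the only property that differs per subjob
            subjobs.append(sj)
        return subjobs


toy_jobs = ToySplitter([['a.root'], ['b.root']]).split({'backend': 'Local', 'inputdata': None})
print(len(toy_jobs))   # 2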
class File(GangaObject):

    """Represent the files, both local and remote, and provide an interface to
    transparently get access to them.

    Typically in the context of job submission, the files are copied to the
    directory where the application runs on the worker node. The 'subdir'
    attribute influences the destination directory. The 'subdir' feature is not
    universally supported however and needs a review.
    """

    _schema = Schema(Version(1, 1), {
        'name': SimpleItem(defvalue="", doc='path to the file source'),
        'subdir': SimpleItem(defvalue=os.curdir, doc='destination subdirectory (a relative path)'),
        'executable': SimpleItem(defvalue=False, hidden=True, transient=True,
                                 doc='specify if executable bit should be set when the file is created (internal framework use)')
    })
    _category = 'files'
    _name = "File"
    _exportmethods = ["getPathInSandbox", "exists", "create", "isExecutable"]

    # Added a subdirectory to the File object. The default is os.curdir, that is "." in Unix.
    # The subdir is a relative path and will be appended to the pathname when writing out files.
    # Therefore changing subdir to anything starting with "/" will still end up relative
    # to the pathname when the file is copied.
    #
    # There is no protection against using the parent directory, so ".." is legal and will make
    # the file end up in the parent directory. - AM

    def __init__(self, name=None, subdir=os.curdir):
        super(File, self).__init__()
        if name is not None:
            assert isinstance(name, str)
            self.name = name
        if subdir is not None:
            self.subdir = subdir

    def __construct__(self, args):
        if len(args) == 1 and isinstance(args[0], str):
            v = args[0]
            import os.path
            expanded = expandfilename(v)
            # if it is not already an absolute filename
            if not urlprefix.match(expanded):
                if os.path.exists(os.path.abspath(expanded)):
                    self.name = os.path.abspath(expanded)
                else:
                    self.name = v
            else:  # bugfix #20545
                self.name = expanded
        else:
            super(File, self).__construct__(args)

    def _attribute_filter__set__(self, attribName, attribValue):
        if attribName == 'name':
            return expandfilename(attribValue)
        return attribValue

    def getPathInSandbox(self):
        """return a relative location of a file in a sandbox: subdir/name"""
        from Ganga.Utility.files import real_basename
        return os.path.join(self.subdir, real_basename(self.name))

    def exists(self):
        """check if the file exists (as specified by 'name')"""
        import os.path
        return os.path.isfile(expandfilename(self.name))

    def create(self, outname):
        """create a file in a local filesystem as 'outname', maintain the original permissions"""
        import shutil
        shutil.copy(expandfilename(self.name), outname)
        if self.executable:
            chmod_executable(outname)

    def __repr__(self):
        """Get the representation of the file. Since the SimpleStreamer uses
        __repr__ for persistency it is important to return a valid python
        expression which fully reconstructs the object.
        """
        return "File(name='%s',subdir='%s')" % (self.name, self.subdir)

    def isExecutable(self):
        """ return true if a file is create()'ed with executable permissions,
        i.e. the permissions of the existing 'source' file are checked"""
        return self.executable or is_executable(expandfilename(self.name))
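
# --- illustrative sketch (not part of File) ---------------------------------
# File.getPathInSandbox() amounts to joining 'subdir' with the basename of
# 'name'.  The snippet below reproduces that with the standard library for a
# hypothetical input file, assuming real_basename behaves like
# os.path.basename for plain local paths.
import os

example_name = '/home/user/analysis/options.py'   # hypothetical source file
example_subdir = 'config'
path_in_sandbox = os.path.join(example_subdir, os.path.basename(example_name))
print(path_in_sandbox)                            # config/options.py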
class ShareDir(GangaObject): """Represents the directory used to store resources that are shared amongst multiple Ganga objects. Currently this is only used in the context of the prepare() method for certain applications, such as the Executable() application. A single ("prepared") application can be associated to multiple jobs. """ _schema = Schema( Version(1, 0), { 'name': SimpleItem(defvalue='', doc='path to the file source'), 'subdir': SimpleItem(defvalue=os.curdir, doc='destination subdirectory (a relative path)') }) _category = 'shareddirs' _exportmethods = ['add', 'ls'] _name = "ShareDir" _data = None # def _readonly(self): # return True def __init__(self, name=None, subdir=os.curdir): super(ShareDir, self).__init__() self._setRegistry(None) if not name is None: self.name = name else: # continue generating directory names until we create a unique one # (which will likely be on the first attempt). while True: name = 'conf-{0}'.format(uuid.uuid4()) if not os.path.isdir(os.path.join(getSharedPath(), name)): os.makedirs(os.path.join(getSharedPath(), name)) if not os.path.isdir(os.path.join(getSharedPath(), name)): logger.error("ERROR creating path: %s" % os.path.join(getSharedPath(), name)) raise GangaException("ShareDir ERROR") else: break self.name = str(name) # incrementing then decrementing the shareref counter has the effect of putting the newly # created ShareDir into the shareref table. This is desirable if a ShareDir is created in isolation, # filled with files, then assigned to an application. #a=Job(); s=ShareDir(); a.application.is_prepared=s #shareref = GPIProxyObjectFactory(getRegistry("prep").getShareRef()) # shareref.increase(self.name) # shareref.decrease(self.name) def __deepcopy__(self, memo): return super(ShareDir, self).__deepcopy__(memo) def add(self, input): from Ganga.Core.GangaRepository import getRegistry if not isType(input, list): input = [input] for item in input: if isType(item, str): if os.path.isfile(expandfilename(item)): logger.info('Copying file %s to shared directory %s' % (item, self.name)) shutil.copy2(expandfilename(item), os.path.join(getSharedPath(), self.name)) shareref = GPIProxyObjectFactory( getRegistry("prep").getShareRef()) shareref.increase(self.name) shareref.decrease(self.name) else: logger.error('File %s not found' % expandfilename(item)) elif isType(item, File) and item.name is not '' and os.path.isfile( expandfilename(item.name)): logger.info('Copying file object %s to shared directory %s' % (item.name, self.name)) shutil.copy2(expandfilename(item.name), os.path.join(getSharedPath(), self.name)) shareref = GPIProxyObjectFactory( getRegistry("prep").getShareRef()) shareref.increase(self.name) shareref.decrease(self.name) else: logger.error('File %s not found' % expandfilename(item.name)) def ls(self): """ Print the contents of the ShareDir """ full_shareddir_path = os.path.join(getSharedPath(), self.name) try: os.path.isdir(full_shareddir_path) cmd = "find '%s'" % (full_shareddir_path) files = os.popen(cmd).read().strip().split('\n') padding = '| ' for file in files: level = file.count(os.sep) level = level - 6 pieces = file.split(os.sep) symbol = {0: '', 1: '/'}[os.path.isdir(file)] logger.info(padding * level + pieces[-1] + symbol) except IOError: logger.warn('ShareDir %s not found on storage' % full_shareddir_path) def exists(self): """check if the file exists (as specified by 'name')""" import os.path return os.path.isdir(expandfilename(self.name)) def create(self, outname): """create a file in a local filesystem as 'outname', 
        maintain the original permissions """
        import shutil
        shutil.copy(expandfilename(self.name), outname)
        if self.executable:
            chmod_executable(outname)

    def __repr__(self):
        """Get the representation of the ShareDir. Since the SimpleStreamer uses
        __repr__ for persistency it is important to return a valid python
        expression which fully reconstructs the object.
        """
        return "ShareDir(name='%s',subdir='%s')" % (self.name, self.subdir)

    def isExecutable(self):
        """ return true if a file is create()'ed with executable permissions,
        i.e. the permissions of the existing 'source' file are checked"""
        return self.executable or is_executable(expandfilename(self.name))
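
# --- illustrative sketch (not part of ShareDir) -----------------------------
# ShareDir.__init__ generates a unique 'conf-<uuid>' directory under the
# shared path when no name is given.  The loop below shows the same idea
# against a temporary directory used as a stand-in for Ganga's
# getSharedPath().
import os
import uuid
import tempfile

shared_path = tempfile.mkdtemp()          # stand-in for getSharedPath()
while True:
    candidate = 'conf-{0}'.format(uuid.uuid4())
    full_path = os.path.join(shared_path, candidate)
    if not os.path.isdir(full_path):
        os.makedirs(full_path)
        break
print(candidate)                          # e.g. conf-0b1e2c3d-...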
class IMerger(IPostProcessor): """ Contains the interface for all mergers, all mergers should inherit from this object. """ # set outputdir for auto merge policy flag # the default behaviour (True) is that outputdir is set by runAutoMerge() function in Merger.py module # however if this flag is set to False then merge() will be called for auto merge with sum_outputdir set to None # thus it is up to the subclass to decide where the output goes in case of # auto merge set_outputdir_for_automerge = True _category = 'postprocessor' _exportmethods = ['merge'] _name = 'IMerger' _hidden = 1 _schema = Schema( Version(1, 0), { 'files': SimpleItem(defvalue=[], typelist=[str], sequence=1, doc='A list of files to merge.'), 'ignorefailed': SimpleItem( defvalue=False, doc= 'Jobs that are in the failed or killed states will be excluded from the merge when this flag is set to True.' ), 'overwrite': SimpleItem( defvalue=False, doc= 'The default behaviour for this Merger object. Will overwrite output files.' ), }) order = 1 def execute(self, job, newstatus): """ Execute """ if (len(job.subjobs) != 0): try: return self.merge(job.subjobs, job.outputdir) except PostProcessException as e: logger.error("%s" % e) return self.failure else: return True def merge(self, jobs, outputdir=None, ignorefailed=None, overwrite=None): if ignorefailed == None: ignorefailed = self.ignorefailed if overwrite == None: overwrite = self.overwrite from Ganga.GPIDev.Lib.Job import Job if not outputdir: outputdir = getDefaultMergeDir() else: if isType(outputdir, Job): # use info from job outputdir = outputdir.outputdir else: outputdir = os.path.expanduser(outputdir) files = {} if isType(jobs, Job): if outputdir is None: outputdir = jobs.outputdir return self.merge(jobs.subjobs, outputdir=outputdir, ignorefailed=ignorefailed, overwrite=overwrite) if not len(jobs): logger.warning( 'The jobslice given was empty. The merge will not continue.') return self.success for j in jobs: # first check that the job is ok if j.status != 'completed': # check if we can keep going if j.status == 'failed' or j.status == 'killed': if ignorefailed: logger.warning( 'Job %s has status %s and is being ignored.', j.fqid, j.status) continue else: raise PostProcessException( 'Job %s has status %s and so the merge can not continue. ' 'This can be overridden with the ignorefailed flag.' % (j.fqid, j.status)) else: raise PostProcessException( "Job %s is in an unsupported status %s and so the merge can not continue. '\ 'Supported statuses are 'completed', 'failed' or 'killed' (if the ignorefailed flag is set)." % (j.fqid, j.status)) if len(j.subjobs): sub_result = self.merge(j.subjobs, outputdir=j.outputdir, ignorefailed=ignorefailed, overwrite=overwrite) if (sub_result == self.failure) and not ignorefailed: raise PostProcessException( 'The merge of Job %s failed and so the merge can not continue. ' 'This can be overridden with the ignorefailed flag.' % j.fqid) import glob for f in self.files: for matchedFile in glob.glob(os.path.join(j.outputdir, f)): relMatchedFile = '' try: relMatchedFile = os.path.relpath( matchedFile, j.outputdir) except Exception as err: logger.debug("Err: %s" % err) Ganga.Utility.logging.log_unknown_exception() relMatchedFile = relpath(matchedFile, j.outputdir) if relMatchedFile in files: files[relMatchedFile].append(matchedFile) else: files[relMatchedFile] = [matchedFile] if not len(glob.glob(os.path.join(j.outputdir, f))): if ignorefailed: logger.warning( 'The file pattern %s in Job %s was not found. 
The file will be ignored.', f, j.fqid) continue else: raise PostProcessException( 'The file pattern %s in Job %s was not found and so the merge can not continue. ' 'This can be overridden with the ignorefailed flag.' % (f, j.fqid)) # files[f].extend(matchedFiles) for k in files.keys(): # make sure we are not going to over write anything outputfile = os.path.join(outputdir, k) if os.path.exists(outputfile) and not overwrite: raise PostProcessException( 'The merge process can not continue as it will result in over writing. ' 'Either move the file %s or set the overwrite flag to True.' % outputfile) # make the directory if it does not exist if not os.path.exists(outputdir): os.makedirs(outputdir) # recreate structure from output sandbox outputfile_dirname = os.path.dirname(outputfile) if outputfile_dirname != outputdir: if not os.path.exists(outputfile_dirname): os.mkdir(outputfile_dirname) # check that we are merging some files if not files[k]: logger.warning( 'Attempting to merge with no files. Request will be ignored.' ) continue # check outputfile != inputfile for f in files[k]: if f == outputfile: raise PostProcessException( 'Output file %s equals input file %s. The merge will fail.' % (outputfile, f)) # merge the lists of files with a merge tool into outputfile msg = None try: self.mergefiles(files[k], outputfile) # create a log file of the merge # we only get to here if the merge_tool ran ok log_file = '%s.merge_summary' % outputfile with open(log_file, 'w') as log: log.write('# -- List of files merged -- #\n') for f in files[k]: log.write('%s\n' % f) log.write('# -- End of list -- #\n') except PostProcessException as e: msg = str(e) # store the error msg log_file = '%s.merge_summary' % outputfile with open(log_file, 'w') as log: log.write('# -- Error in Merge -- #\n') log.write('\t%s\n' % msg) raise e return self.success
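
# --- illustrative sketch (not part of IMerger) ------------------------------
# The core of IMerger.merge() is grouping matching output files by their path
# relative to each job's outputdir, so that e.g. 'hist.root' from every subjob
# ends up in one list that a concrete merger then combines into a single file.
# This standalone helper reproduces only that grouping step; the directories
# in the usage comment are hypothetical.
import glob
import os


def group_output_files(outputdirs, patterns):
    """Map each relative output path to the list of matching files across jobs."""
    grouped = {}
    for outputdir in outputdirs:
        for pattern in patterns:
            for matched in glob.glob(os.path.join(outputdir, pattern)):
                rel = os.path.relpath(matched, outputdir)
                grouped.setdefault(rel, []).append(matched)
    return grouped

# group_output_files(['/tmp/job0/output', '/tmp/job1/output'], ['*.root'])
# -> {'hist.root': ['/tmp/job0/output/hist.root', '/tmp/job1/output/hist.root']}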