def initialize(self):
    """ Create the service clients used by the agent and read the ``MailTo`` option.

    :returns: S_OK()
    """
    self.transClient = TransformationClient()
    self.bkClient = BookkeepingClient()
    self.notifyClient = NotificationClient()
    self.operations = Operations()

    # '' is handed to am_getOption as second argument (presumably the fallback
    # value when "MailTo" is not configured -- confirm against AgentModule)
    mailTo = self.am_getOption("MailTo", '')
    self.email = mailTo
    return S_OK()
def getBKProductions(self, visible=None):
    """ Return the sorted list of productions (or runs) selected by the BK query.

    When the query dictionary already contains a 'Production' entry, that entry
    is returned (as a sorted list) and the Bookkeeping is not contacted.

    :param visible: visibility flag set on each per-event-type query;
                    when None, the value of self.isVisible() is used
    :returns: sorted list; an empty list when no processing pass is defined
              or when the Bookkeeping call fails
    """
    if visible is None:
        visible = self.isVisible()

    # Explicit productions in the query take precedence over any BK lookup
    requestedProds = self.__bkQueryDict.get('Production')
    if requestedProds:
        if isinstance(requestedProds, list):
            return sorted(requestedProds)
        return sorted([requestedProds])

    if not self.getProcessingPass():
        gLogger.fatal('Impossible to get a list of productions without the Processing Pass')
        return []

    eventTypes = self.__bkQueryDict.get('EventType')
    if not isinstance(eventTypes, list):
        eventTypes = [eventTypes]

    collected = set()
    for eventType in eventTypes:
        # One BK query per event type, built from a copy of the current query
        query = BKQuery(self.__bkQueryDict)
        query.setVisible(visible)
        queryDict = query.setEventType(eventType)
        res = self.__bkClient.getProductions(queryDict)
        if not res['OK']:
            gLogger.error('Error getting productions from BK', res['Message'])
            return []

        if self.getProcessingPass().replace('/', '') == 'Real Data':
            # The records are negated before being compared with the
            # StartRun/EndRun bounds of the query
            runList = sorted([-run for record in res['Value']['Records'] for run in record])
            startRun = int(self.__bkQueryDict.get('StartRun', 0))
            endRun = int(self.__bkQueryDict.get('EndRun', sys.maxsize))
            selected = set(run for run in runList if startRun <= run <= endRun)
        else:
            fileTypes = self.getFileTypeList()
            # Ignore productions whose transformation status is 'Deleted'
            candidates = set(prod
                             for record in res['Value']['Records']
                             for prod in record
                             if self.__getProdStatus(prod) != 'Deleted')
            selected = set()
            if fileTypes:
                transClient = TransformationClient()
                for prod in candidates:
                    bkQueryRes = transClient.getBookkeepingQuery(prod)
                    if bkQueryRes['OK'] and bkQueryRes['Value']['FileType'] in fileTypes:
                        selected.add(prod)
            # Nothing matched the file types (or none were requested): keep all candidates
            if not selected:
                selected = candidates

        collected.update(selected)

    return sorted(collected)
def __init__(self, transID=0, transClientIn=None):
    """ Store the transformation client and delegate to the parent constructor.

    :param transID: transformation ID forwarded to the parent class
    :param transClientIn: client to use as self.transClient; when it is falsy
                          a new TransformationClient is instantiated instead
    """
    self.transClient = transClientIn if transClientIn else TransformationClient()
    super(Transformation, self).__init__(transID=transID,
                                         transClient=self.transClient)
def __init__(self, cacheFile=None):
    """ Set the location of the cache file, load the cache and create the clients.

    :param cacheFile: path of the pickle cache file; when falsy, the file
                      ".dirac/work/dirac-production-stats.pkl" under the user's
                      home directory is used
    """
    if not cacheFile:
        # $HOME may be unset (e.g. in a stripped-down batch environment):
        # fall back on the platform lookup instead of raising KeyError
        homeDir = os.environ.get('HOME')
        if homeDir is None:
            homeDir = os.path.expanduser('~')
        self.prodStatFile = os.path.join(homeDir, ".dirac/work", "dirac-production-stats.pkl")
    else:
        self.prodStatFile = cacheFile
    self.cacheVersion = '0.0'
    self.clearCache = []
    self.cachedInfo = {}

    # Recuperate the previous cached information
    self.readCache()

    self.bk = BookkeepingClient()
    self.transClient = TransformationClient()
class ProductionClient(object):
    """ Simple helper emulating old Production Client
    """

    def __init__(self):
        """ Instantiate the transformation client used for all the queries.
        """
        self.transClient = TransformationClient()

    def getParameters(self, prodID, pname=''):
        """ Get a production parameter or all of them if no parameter name specified.

        :param prodID: production ID (anything accepted by int())
        :param pname: name of the parameter to return; when empty, the whole
                      result of getTransformation is returned
        :returns: S_OK(value) for a single parameter, the getTransformation result
                  when no name is given, S_ERROR if the lookup fails, returns
                  nothing, or does not contain the requested parameter
        """
        result = self.transClient.getTransformation(int(prodID), True)
        if not result['OK']:
            gLogger.error(result)
            return S_ERROR('Could not retrieve parameters for production %s' % prodID)

        if not result['Value']:
            gLogger.info(result)
            return S_ERROR('No additional parameters available for production %s' % prodID)

        if not pname:
            return result

        # "in" replaces dict.has_key(), which no longer exists in Python 3
        if pname in result['Value']:
            return S_OK(result['Value'][pname])

        gLogger.verbose(result)
        return S_ERROR('Production %s does not have parameter %s' % (prodID, pname))
def initialize(self):
    """ Read the agent option, create the clients and log the current settings.

    :returns: S_OK()
    """
    boost = self.am_getOption('extensionFactorBoost', self.extensionFactorBoost)
    self.extensionFactorBoost = boost

    self.rpcProductionRequest = RPCClient('ProductionManagement/ProductionRequest')
    self.transClient = TransformationClient()

    typesMessage = 'Will consider the following transformation types: %s' % str(self.transformationTypes)
    self.log.info(typesMessage)
    tasksMessage = 'Will create a maximum of %s tasks per iteration' % self.maxIterationTasks
    self.log.info(tasksMessage)

    return S_OK()
def main():
    """ Print the requested pieces of information for each transformation ID given
    as positional argument.

    The ``Information`` switch takes a comma-separated, case-insensitive list of
    field names; without it all the known fields are printed. 'Body' is only
    printed when explicitly requested.
    """
    informations = ['AuthorDN', 'AuthorGroup', 'Body', 'CreationDate',
                    'Description', 'EventsPerTask', 'FileMask', 'GroupSize', 'Hot',
                    'InheritedFrom', 'LastUpdate', 'LongDescription', 'MaxNumberOfTasks',
                    'Plugin', 'Status', 'TransformationGroup',
                    'TransformationName', 'Type', 'Request']
    Script.registerSwitch('', 'Information=', ' Specify which information is required')
    Script.setUsageMessage('\n'.join([__doc__,
                                      'Usage:',
                                      ' %s [options] transID1 [transID2 ...]' % Script.scriptName,
                                      'Arguments:',
                                      '\ttransID1,... : transformantion IDs',
                                      'Possible informations:',
                                      '\t%s' % ', '.join(sorted(informations))
                                      ])
                           )
    Script.parseCommandLine(ignoreErrors=True)

    from LHCbDIRAC.TransformationSystem.Client.TransformationClient import TransformationClient
    from DIRAC import gLogger

    tr = TransformationClient()

    # Work on a copy so that dropping 'Body' below never alters the reference list
    requestedInfo = list(informations)
    infoList = []
    for switch, val in Script.getUnprocessedSwitches():
        if switch == 'Information':
            infoList = [info.lower() for info in val.split(',')]
            requestedInfo = [info for info in informations if info.lower() in infoList]
    if 'body' not in infoList and 'Body' in requestedInfo:
        requestedInfo.remove('Body')

    for transID in Script.getPositionalArgs():
        # Only the conversion can make the ID invalid: keep the try block minimal
        # so that other problems are not misreported as a bad ID
        try:
            transIDInt = int(transID)
        except (TypeError, ValueError):
            gLogger.error("Invalid transformation ID: '%s'" % transID)
            continue

        res = tr.getTransformation(transIDInt)
        if not res['OK']:
            # Report the service error instead of printing 'Unknown' for every field
            gLogger.error("Failed to get transformation %s" % transID, res['Message'])
            continue

        gLogger.notice("==== Transformation %s ====" % transID)
        for info in requestedInfo:
            # 'Request' is looked up under the 'TransformationFamily' key
            getInfo = info if info != 'Request' else 'TransformationFamily'
            gLogger.notice("\t%s: %s" %
                           (info, res.get('Value', {}).get(getInfo, 'Unknown')))
def initialize(self):
    """ Set the shifter proxy, create the clients and compute the list of
    transformation types handled by the agent.

    The handled types are the 'Transformations/DataProcessing' types minus the
    'Transformations/ExtendableTransfTypes' ones.

    :returns: S_OK()
    """
    self.am_setOption('shifterProxy', 'ProductionManager')

    self.transClient = TransformationClient()
    self.reqClient = ReqClient()
    self.consChecks = ConsistencyChecks(interactive=False,
                                        transClient=self.transClient)

    dataProcessingTypes = Operations().getValue('Transformations/DataProcessing', [])
    extendableTypes = Operations().getValue('Transformations/ExtendableTransfTypes',
                                            ['MCSimulation'])
    remainingTypes = set(dataProcessingTypes) - set(extendableTypes)
    self.transformationTypes = list(remainingTypes)

    return S_OK()
def __getProdStatus(prod):
    """ Returns the status of a given transformation

    :param prod: transformation (production) ID
    :returns: the 'Status' field of the transformation, or None when the
              transformation could not be retrieved
    """
    res = TransformationClient().getTransformation(prod, extraParams=False)
    if not res['OK']:
        # %s (not %d): the error path must not raise if the ID is not a number;
        # the service message is passed along so that the cause is visible
        gLogger.error("Couldn't get information on production %s" % str(prod), res['Message'])
        return None
    return res['Value']['Status']
def initialize(self):
    """ Make the necessary initializations.
        The ThreadPool is created here, the _execute() method is what each thread will execute.

    Reads the agent options, creates the clients, restores the three cached
    dictionaries (timeLog, fullTimeLog, bkQueries) from the pickle file in the
    agent work directory, and queues one _execute job per thread.

    :returns: S_OK()
    """
    self.fullUpdatePeriod = self.am_getOption('FullUpdatePeriod', self.fullUpdatePeriod)
    self.bkUpdateLatency = self.am_getOption('BKUpdateLatency', self.bkUpdateLatency)
    self.debug = self.am_getOption('verbose', self.debug)

    self.pickleFile = os.path.join(self.am_getWorkDirectory(), self.pickleFile)
    self.chunkSize = self.am_getOption('maxFilesPerChunk', self.chunkSize)

    self.pluginsWithNoRunInfo = Operations().getValue('TransformationPlugins/PluginsWithNoRunInfo',
                                                      self.pluginsWithNoRunInfo)

    self._logInfo('Full Update Period: %d seconds' % self.fullUpdatePeriod)
    self._logInfo('BK update latency : %d seconds' % self.bkUpdateLatency)
    self._logInfo('Plugins with no run info: %s' % ', '.join(self.pluginsWithNoRunInfo))

    self.transClient = TransformationClient()
    self.bkClient = BookkeepingClient()

    # NOTE(review): pickle.load executes whatever the file describes; this is only
    # acceptable as long as the work directory is not writable by untrusted users.
    try:
        # Pickle streams are binary data, hence 'rb'
        with open(self.pickleFile, 'rb') as pf:
            self.timeLog = pickle.load(pf)
            self.fullTimeLog = pickle.load(pf)
            self.bkQueries = pickle.load(pf)
    except (EOFError, IOError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError, ValueError):
        # Missing, truncated or corrupted cache: start from scratch rather than
        # preventing the agent from initializing
        self._logInfo("failed loading Log from", self.pickleFile, "initialize")
        self.timeLog = {}
        self.fullTimeLog = {}
        self.bkQueries = {}
    else:
        self._logInfo("successfully loaded Log from", self.pickleFile, "initialize")

    maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
    threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)

    # range() rather than xrange(): available in both Python 2 and 3
    for i in range(maxNumberOfThreads):
        threadPool.generateJobAndQueueIt(self._execute, [i])

    gMonitor.registerActivity("Iteration", "Agent Loops", AGENT_NAME, "Loops/min", gMonitor.OP_SUM)
    return S_OK()
def initialize(self):
    """ Run the parent class initialization, then create the additional clients.

    :returns: the parent's result if it is not OK, S_OK() otherwise
    """
    parentResult = DIRACValidateOutputDataAgent.initialize(self)
    if parentResult['OK']:
        self.integrityClient = DataIntegrityClient()
        self.fileCatalog = FileCatalog()
        self.transClient = TransformationClient()
        self.storageUsageClient = StorageUsageClient()
        return S_OK()

    # The clients are not created when the parent initialization failed
    return parentResult
def __init__(self, tsClientIn=None):
    """ Set up the transformation client and the static lookup tables.

    :param tsClientIn: transformation client to use; a new TransformationClient
                       is created when None
    """
    super(DiracProduction, self).__init__()

    self.transformationClient = TransformationClient() if tsClientIn is None else tsClientIn

    # Transformation field -> column title
    self.prodHeaders = {'AgentType': 'SubmissionMode',
                        'Status': 'Status',
                        'CreationDate': 'Created',
                        'TransformationName': 'Name',
                        'Type': 'Type'}
    # Width used to left-justify the columns
    self.prodAdj = 22
    # Command name -> two-element list; from the values, presumably
    # [transformation status, agent type] -- confirm against the code using it
    self.commands = {'start': ['Active', 'Manual'],
                     'stop': ['Stopped', 'Manual'],
                     'automatic': ['Active', 'Automatic'],
                     'manual': ['Active', 'Manual'],
                     'mctestmode': ['Testing', 'Automatic'],
                     'completed': ['Completed', 'Manual'],
                     'completing': ['Completing', 'Automatic'],
                     'cleaning': ['Cleaning', 'Manual'],
                     'flush': ['Flush', 'Automatic'],
                     'deleted': ['Deleted', 'Manual'],
                     'cleaned': ['Cleaned', 'Manual'],
                     'archived': ['Archived', 'Manual'],
                     'valinput': ['ValidatingInput', 'Manual'],
                     'valoutput': ['ValidatingOutput', 'Manual'],
                     'remove': ['RemovingFiles', 'Manual'],
                     'validated': ['ValidatedOutput', 'Manual'],
                     'removed': ['RemovedFiles', 'Manual']}
def execute():
    """ Print the productions matching the BK query given on the command line,
    together with their parent productions, each with its transformation type.

    Exits with status 1 when no BK query was given.
    """
    # Local import: sys.exit() does not depend on the 'site' module, unlike exit()
    import sys

    tr = TransformationClient()

    def getType(transID):
        """ Type of a transformation, 'Unknown' if it cannot be retrieved """
        return tr.getTransformation(transID).get('Value', {}).get('Type', 'Unknown')

    bkQuery = dmScript.getBKQuery()
    if not bkQuery:
        gLogger.notice("No BKQuery given...")
        sys.exit(1)

    startTime = time.time()
    prods = bkQuery.getBKProductions()  # visible = 'All' )

    parents = {}
    productions = {}
    for prod in prods:
        productions[prod] = getType(prod)
        # The 'ProductionID' of the production's own BK query is its parent
        parent = tr.getBookkeepingQuery(prod).get('Value', {}).get('ProductionID', '')
        # Several productions may share a parent: query each parent only once
        if parent and parent not in parents:
            parents[parent] = getType(parent)

    gLogger.notice("For BK path %s:" % bkQuery.getPath())
    if not prods:
        gLogger.notice('No productions found!')
    else:
        printProds('Productions found', productions)
        if parents:
            printProds('Parent productions', parents)

    gLogger.notice('Completed in %.1f seconds' % (time.time() - startTime))
def _getClients(self):
    """ Return the clients used in the threads: those of the parent class plus
    newly created transformation, resource management and bookkeeping clients.

    :returns: the parent's result, updated with the three additional clients
    """
    clients = DIRACTransformationAgent._getClients(self)
    clients.update(TransformationClient=TransformationClient(),
                   ResourceManagementClient=ResourceManagementClient(),
                   BookkeepingClient=BookkeepingClient())
    return clients
def _getClients(self):
    """ Return the parent's clients, with the transformation client and the task
    manager replaced by their LHCb versions.

    The output data module handed to the task manager is read from
    "Transformations/OutputDataModule".

    :returns: the parent's result, updated with the two LHCb objects
    """
    clients = DIRACWorkflowTaskAgent._getClients(self)

    outputDataModule = Operations().getValue("Transformations/OutputDataModule",
                                             "LHCbDIRAC.Core.Utilities.OutputDataPolicy")

    lhcbClients = {'TransformationClient': TransformationClient(),
                   'TaskManager': LHCbWorkflowTasks(outputDataModule=outputDataModule,
                                                    jobClass=LHCbJob)}
    clients.update(lhcbClients)
    return clients
def initialize(self):
    """ Standard initialize method for agents

    Runs the parent class initialization, reads the agent options and creates
    the clients.

    :returns: the parent's result if it is not OK, S_OK() otherwise
    """
    # Do not hide a failure of the parent initialization
    res = DiracTCAgent.initialize(self)
    if not res['OK']:
        return res

    self.directoryLocations = sorted(self.am_getOption('DirectoryLocations',
                                                       self.directoryLocations))
    self.archiveAfter = self.am_getOption('ArchiveAfter', self.archiveAfter)  # days

    self.fileTypesToKeep = Operations().getValue('Transformations/FileTypesToKeep',
                                                 self.fileTypesToKeep)

    self.bkClient = BookkeepingClient()
    self.transClient = TransformationClient()
    self.storageUsageClient = StorageUsageClient()

    return S_OK()
def setUp(self):
    """ Build the objects under test and the mocks shared by the test cases.
    """
    transClientMock = MagicMock()
    submissionMock = MagicMock()
    jobMonitoringMock = MagicMock()
    self.l_wft = LHCbWorkflowTasks(transClientMock,
                                   submissionClient=submissionMock,
                                   jobMonitoringClient=jobMonitoringMock)

    self.tc = TransformationClient()
    self.tc.dataProcessingTypes = ['MCSimulation', 'DataReconstruction']

    self.tsMock = MagicMock()

    # File catalog mock whose getFileSize() returns the cached LFN sizes
    self.fcMock = MagicMock()
    fileSizes = {'Failed': [], 'Successful': cachedLFNSize}
    self.fcMock.getFileSize.return_value = S_OK(fileSizes)

    gLogger.setLevel('DEBUG')

    self.maxDiff = None
class MCSimulationTestingAgent(AgentModule):
    """ An agent to check for MCSimulation productions that have undergone the testing phase.
        Productions that have the status Idle and are also in the table StoredJobDescription have undergone testing.
        A report is created by the agent from the results of the test phase and emailed to the Production Manager
    """

    def __init__(self, *args, **kwargs):
        """ c'tor
        """
        AgentModule.__init__(self, *args, **kwargs)
        self.transClient = None
        self.bkClient = None
        self.notifyClient = None
        self.operations = None
        # Set for real in initialize(); defined here so that the attribute always exists
        self.email = ''

        # IDs of the transformations for which all the test tasks failed:
        # they are skipped in the following cycles
        self.failedTransIDs = []

    def initialize(self):
        """ Create the clients and read the "MailTo" option.

        :returns: S_OK()
        """
        self.transClient = TransformationClient()
        self.bkClient = BookkeepingClient()
        self.notifyClient = NotificationClient()
        self.operations = Operations()

        self.email = self.am_getOption("MailTo", '')

        return S_OK()

    def execute(self):
        """ Look for Idle transformations that have a stored job description,
        activate those whose test tasks are done and report on the failures.

        :returns: S_OK(), or the failed result of one of the two initial
                  TransformationClient calls
        """
        # get all the idle transformations
        extendableTTypes = Operations().getValue('Transformations/ExtendableTransfTypes', ['MCSimulation'])
        res = self.transClient.getTransformations(condDict={"Status": "Idle", "Type": extendableTTypes})
        if not res['OK']:
            self.log.error("Call to Transformation Client service failed", res['Message'])
            return res
        idleTransformations = [d.get("TransformationID") for d in res['Value']]
        self.log.verbose("Found %d Idle MC transformations" % len(idleTransformations))
        self.log.debug("Idle transformations found: %s" % ','.join([str(it) for it in idleTransformations]))

        # get all the IDs of transformations undergoing a testing phase
        res = self.transClient.getStoredJobDescriptionIDs()
        if not res['OK']:
            self.log.error("Call to Transformation Client service failed", res['Message'])
            return res
        testingSimulations = [pair[0] for pair in res['Value']]
        self.log.verbose("Found %d MC transformations undergoing a testing phase" % len(testingSimulations))
        self.log.debug("MC transformations found undergoing a testing phase: %s" %
                       ','.join([str(ts) for ts in testingSimulations]))

        # get the IDs that occur in both idle transformations and testing phase
        idleSimulations = list(set(testingSimulations).intersection(idleTransformations))
        # remove those that we know failed
        idleSimulations = list(set(idleSimulations).difference(self.failedTransIDs))
        self.log.info("MC transformations under considerations: %s (will loop on them)" %
                      ','.join([str(idS) for idS in idleSimulations]))

        for transID in idleSimulations:
            self.log.info("Looking into %d" % transID)
            tasks = self.transClient.getTransformationTasks(condDict={"TransformationID": transID})
            if not tasks['OK']:
                self.log.error("Call to Transformation Client service failed", tasks['Message'])
                continue
            tasks = tasks['Value']

            numberOfTasks = len(tasks)
            numberOfDoneTasks = sum(1 for d in tasks if d.get("ExternalStatus") == "Done")
            self.log.verbose("TransID = %d, numberOfTasks = %d, numberOfDoneTasks = %d" %
                             (transID, numberOfTasks, numberOfDoneTasks))

            if numberOfTasks == numberOfDoneTasks:
                self.log.info("All tasks have passed so the request can be accepted and the transformation updated")
                res = self._activateTransformation(transID, tasks)
                if not res['OK']:
                    self.log.error("Error Activating Production", res['Message'])
                continue

            self.log.warn("There are failed tasks")
            report = self.__createReport(tasks)
            numberOfFailedTasks = sum(1 for d in tasks if d.get('ExternalStatus') == 'Failed')

            if numberOfFailedTasks == numberOfTasks:
                # all tasks have failed so the request can be rejected and an email report sent
                self._sendReport(report)
                self.log.warn("Transformation " + str(transID) + " failed the testing phase")
                self.failedTransIDs.append(transID)
                continue

            # only some tasks have failed so continue but send a warn email
            self.log.warn("Transformation " + str(transID) + " failed partially the testing phase, continuing anyway")
            doneTasks = [d for d in tasks if d.get("ExternalStatus") == "Done"]
            if not doneTasks:
                self.log.info("No tasks done for Transformation %d" % transID)
                continue
            res = self._activateTransformation(transID, doneTasks)
            if not res['OK']:
                self.log.error("Error Activating Production", res['Message'])
                continue

            subject = "MCSimulation Test Failure Report. TransformationID: " + str(transID) + " - some tasks failed"
            report['subject'] = subject
            self._sendReport(report)

        return S_OK()

    def _activateTransformation(self, transID, tasks):
        """ Calculate parameters, update the workflow, then move the production to Active

        :param transID: transformation ID
        :param tasks: list of task dictionaries used for computing the CPU parameters
        :returns: S_OK() when every step succeeded, otherwise the failed
                  result of the step that went wrong
        """
        parameters = self._calculateParameters(tasks)
        if not parameters['OK']:
            self.log.error("Error calculating parameters", parameters['Message'])
            return parameters
        parameters = parameters['Value']
        self.log.verbose("TransID = %d, Calculated Parameters: %s" % (transID, str(parameters)))

        workflow = self._updateWorkflow(transID, int(round(float(parameters['CPUe']))), parameters['MCCpu'])
        if not workflow['OK']:
            # Propagate the failure: the transformation has not been activated
            self.log.error("Error updating workflow", workflow['Message'])
            return workflow

        res = self._updateTransformationsTable(transID, workflow['Value'])
        if not res['OK']:
            self.log.error("Error updating transformations table", res['Message'])
            return res

        self.log.info("Transformation " + str(transID) + " passed the testing phase and is now set to active")
        return S_OK()

    def __createReport(self, tasks):
        """ Creates a report from a failed task to email to the production manager

        :param tasks: non-empty list of task dictionaries of one transformation
        :returns: dictionary with the 'subject' (string) and the 'body'
                  (list of lines) of the report
        """
        dateformat = '%d/%m/%Y %H:%M'
        transformationID = tasks[0]["TransformationID"]
        subject = "MCSimulation Test Failure Report. TransformationID: " + str(transformationID)
        body = [subject]
        body.append("")

        res = self.transClient.getTransformations(condDict={"TransformationID": transformationID})
        if not res['OK'] or not res['Value']:
            # The tasks section is still worth sending without the transformation details
            reason = res['Message'] if not res['OK'] else "transformation not found"
            self.log.error("Could not get the transformation details for the report", str(reason))
            body.append("Transformation details could not be retrieved: " + str(reason))
        else:
            transformation = res['Value'][0]
            body.append("Transformation:")
            body.append("----------------------------------------------------------------------")
            body.append("TransformationID: " + str(transformation["TransformationID"]))
            body.append("TransformationName: " + transformation["TransformationName"])
            body.append("LastUpdate: " + transformation["LastUpdate"].strftime(dateformat))
            body.append("Status: " + transformation["Status"])
            body.append("Description: " + transformation["Description"])
            body.append("TransformationFamily: " + str(transformation["TransformationFamily"]))
            body.append("Plugin: " + transformation["Plugin"])
            body.append("Type: " + transformation["Type"])
            body.append("AgentType: " + transformation["AgentType"])
            body.append("GroupSize: " + str(transformation["GroupSize"]))
            body.append("MaxNumberOfTasks: " + str(transformation["MaxNumberOfTasks"]))
            body.append("AuthorDN: " + transformation["AuthorDN"])
            body.append("TransformationGroup: " + transformation["TransformationGroup"])
            body.append("InheritedFrom: " + str(transformation["InheritedFrom"]))
            body.append("CreationDate: " + transformation["CreationDate"].strftime(dateformat))
            body.append("FileMask: " + transformation["FileMask"])
            body.append("EventsPerTask: " + str(transformation["EventsPerTask"]))
            body.append("AuthorGroup: " + transformation["AuthorGroup"])

        body.append("")
        body.append("Number of Tasks: " + str(len(tasks)))
        body.append("Tasks:")
        body.append("----------------------------------------------------------------------")
        for task in tasks:
            body.append("TaskID: " + str(task['TaskID']))
            body.append("TargetSE: " + task['TargetSE'])
            body.append("LastUpdateTime: " + task['LastUpdateTime'].strftime(dateformat))
            body.append("RunNumber: " + str(task['RunNumber']))
            body.append("CreationTime: " + task['CreationTime'].strftime(dateformat))
            body.append("ExternalID: " + str(task['ExternalID']))
            body.append("ExternalStatus: " + task['ExternalStatus'])
            body.append("")

        return {'subject': subject, 'body': body}

    def _sendReport(self, report):
        """ Sends a given report to the production manager

        When no address was configured, the 'Email' option of the
        "Shifter/ProductionManager/User" user is used. A failure to send the
        mail is logged, not returned.

        :param report: dictionary with 'subject' (string) and 'body' (list of lines)
        """
        if not self.email:
            self.email = getUserOption(self.operations.getValue("Shifter/ProductionManager/User"), 'Email')
        body = '\n'.join(report['body'])
        res = self.notifyClient.sendMail(
            self.email,
            report['subject'],
            body,
            self.email,
            localAttempt=False,
            avoidSpam=True)
        if not res['OK']:
            self.log.error("sendMail failed", res['Message'])
        else:
            self.log.info('Mail summary sent to production manager')

    def _calculateParameters(self, tasks):
        """ Calculates the CPU time per event for the production

        :param tasks: list of task dictionaries; their 'ExternalID' is used as job ID
        :returns: S_OK({'CPUe': ..., 'MCCpu': ...}), or S_ERROR when the
                  Bookkeeping call fails, reports no successful job, or gives
                  no number of events
        """
        jobIds = [int(x['ExternalID']) for x in tasks]
        res = self.bkClient.bulkJobInfo({'jobId': jobIds})
        if not res['OK']:
            self.log.error("Error calling bkClient", res['Message'])
            return S_ERROR(res['Message'])
        successful = res['Value']['Successful']
        self.log.debug("Successful tasks: %s" % str(successful))
        if not successful:
            self.log.error("There are no successful tasks")
            return S_ERROR("There are no successful tasks")

        # The number of events is taken from the first recognised application step found
        events = 0
        CPUeJobTotal = 0.0
        # values() rather than itervalues(): available in both Python 2 and 3
        for job in successful.values():
            cpuJob = 0
            for bkJob in job:
                if bkJob['ApplicationName'] in ['Gauss', 'Boole', 'Moore', 'Brunel', 'DaVinci']:
                    if not events:
                        events = bkJob['NumberOfEvents']
                    timeInSeconds = bkJob['CPUTIME']
                    cpuJob += timeInSeconds * bkJob['WNCPUHS06']
            if not events:
                # Without a number of events the division below would raise ZeroDivisionError
                self.log.error("Could not determine the number of events of the test jobs")
                return S_ERROR("Could not determine the number of events of the test jobs")
            CPUeJob = cpuJob / events
            self.log.debug("CPUeJob = %d" % CPUeJob)
            CPUeJobTotal += CPUeJob

        CPUe = CPUeJobTotal / len(successful)
        # We want to produce at least 25 events per job...
        MCCpu = str(25 * int(round(float(CPUe))))
        self.log.verbose("CPUe = %d, MCCpu = %s" % (CPUe, MCCpu))
        return S_OK({'CPUe': CPUe, 'MCCpu': MCCpu})

    def _updateWorkflow(self, transID, CPUe, MCCpu):
        """ Updates the workflow of a savedProductionDescription to reflect the calculated CPUe

        :param transID: transformation ID
        :param CPUe: CPU time per event, stored as the 'CPUe' workflow parameter
        :param MCCpu: value given to setCPUTime() of the job
        :returns: S_OK(workflow XML), or a failed result when the stored job
                  description cannot be retrieved
        """
        res = self.transClient.getStoredJobDescription(transID)
        if not res['OK']:
            self.log.error("Call to Transformation Client service failed", res['Message'])
            return res
        if not res['Value']:
            # Nothing stored: avoid an IndexError on the lookup below
            self.log.error("No stored job description found", "for transformation %s" % transID)
            return S_ERROR("No stored job description found for transformation %s" % transID)

        workflow = fromXMLString(res['Value'][0][1])
        prod = Production()
        prod.LHCbJob.workflow = workflow
        prod.setParameter('CPUe', 'string', str(CPUe), 'CPU time per event')
        prod.LHCbJob.setCPUTime(MCCpu)
        self.log.info("Transformation ", str(transID))
        self.log.info("Calculated CPUTime: ", str(CPUe))
        self.log.info("CpuTime: ", str(MCCpu))

        # maximum number of events to produce
        # try to get the CPU parameters from the configuration if possible
        cpuTimeAvg = Operations().getValue('Transformations/CPUTimeAvg')
        if cpuTimeAvg is None:
            self.log.info('Could not get CPUTimeAvg from config, defaulting to %d' % 200000)
            cpuTimeAvg = 200000

        try:
            CPUNormalizationFactorAvg = getCPUNormalizationFactorAvg()
        except RuntimeError:
            self.log.info('Could not get CPUNormalizationFactorAvg, defaulting to %f' % 1.0)
            CPUNormalizationFactorAvg = 1.0

        max_e = getEventsToProduce(CPUe, cpuTimeAvg, CPUNormalizationFactorAvg)
        prod.setParameter('maxNumberOfEvents', 'string', str(max_e), 'Maximum number of events to produce (Gauss)')
        return S_OK(prod.LHCbJob.workflow.toXML())

    def _updateTransformationsTable(self, transID, workflow):
        """ Puts the modified workflow from the savedProductionDescription table into the transformations table
            and removes it from the savedProductionDescription table.

        If setting the body or the status fails, both are set back to their
        previous values and an error is returned.

        :param transID: transformation ID
        :param workflow: workflow XML to store as transformation "Body"
        :returns: S_OK() on success, a failed result otherwise
        """
        transformation = self.transClient.getTransformations(condDict={"TransformationID": transID})
        if not transformation['OK']:
            self.log.error("Call to getTransformations failed", transformation['Message'])
            return transformation

        body = self.transClient.setTransformationParameter(transID, "Body", workflow)
        status = self.transClient.setTransformationParameter(transID, "Status", "Active")
        if body['OK'] and status['OK']:
            res = self.transClient.removeStoredJobDescription(transID)
            if not res['OK']:
                self.log.error("Call to removeStoredJobDescription failed", res['Message'])
                return res
            self.log.info("Transformation %s has an updated body and Status set to active" % transID)
            return S_OK()

        self.log.error("One of the updates has failed so set them both back to the previous value to ensure atomicity")
        self.log.debug(str(transformation['Value'][0]['Body']))
        res = self.transClient.setTransformationParameter(transID, "Body", transformation['Value'][0]['Body'])
        if not res['OK']:
            self.log.error("Failure calling setTransformationParameter", res['Message'])
            return res
        res = self.transClient.setTransformationParameter(transID, "Status", transformation['Value'][0]['Status'])
        if not res['OK']:
            self.log.error("Failure calling setTransformationParameter", res['Message'])
            return res

        # The roll-back worked but the activation did not: the caller must get an
        # error structure (returning nothing here made it fail on res['OK'])
        failedUpdate = status if body['OK'] else body
        return S_ERROR("Could not update transformation %s: %s" % (transID, failedUpdate['Message']))
class DiracProduction(DiracLHCb):
    """API for inspecting and managing productions.

    Productions are looked up and modified through ``self.transformationClient``;
    job-level information is obtained with ``self.selectJobs``, ``self.status``
    and ``self.attributes``.  Public methods return ``S_OK`` / ``S_ERROR``
    style dictionaries.
    """

    # Error message used whenever a production ID has an unsupported type.
    _prodIDError = 'Expected string, long or int for production ID'

    def __init__(self, tsClientIn=None):
        """Set the transformation client and the default parameters.

        :param tsClientIn: transformation client to use; a new
                           TransformationClient is created when None
        """
        super(DiracProduction, self).__init__()

        if tsClientIn is None:
            self.transformationClient = TransformationClient()
        else:
            self.transformationClient = tsClientIn

        # transformation attribute -> column title used by getProduction()
        self.prodHeaders = {
            'AgentType': 'SubmissionMode',
            'Status': 'Status',
            'CreationDate': 'Created',
            'TransformationName': 'Name',
            'Type': 'Type'
        }
        # base column width for the printed tables
        self.prodAdj = 22
        # command keyword -> [transformation Status, AgentType (submission mode)]
        self.commands = {
            'start': ['Active', 'Manual'],
            'stop': ['Stopped', 'Manual'],
            'automatic': ['Active', 'Automatic'],
            'manual': ['Active', 'Manual'],
            'mctestmode': ['Testing', 'Automatic'],
            'completed': ['Completed', 'Manual'],
            'completing': ['Completing', 'Automatic'],
            'cleaning': ['Cleaning', 'Manual'],
            'flush': ['Flush', 'Automatic'],
            'deleted': ['Deleted', 'Manual'],
            'cleaned': ['Cleaned', 'Manual'],
            'archived': ['Archived', 'Manual'],
            'valinput': ['ValidatingInput', 'Manual'],
            'valoutput': ['ValidatingOutput', 'Manual'],
            'remove': ['RemovingFiles', 'Manual'],
            'validated': ['ValidatedOutput', 'Manual'],
            'removed': ['RemovedFiles', 'Manual']
        }

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _isValidProductionID(productionID):
        """Tell whether productionID has one of the accepted types."""
        return isinstance(productionID, (int, long, str))

    @staticmethod
    def _percentage(part, total):
        """Integer percentage of part over total; 0 when total is 0."""
        if not total:
            return 0
        return int(100 * part / total)

    @staticmethod
    def _addJob(bucket, job):
        """Count job in bucket, maintaining its 'Total' and 'JobList' entries."""
        bucket['Total'] = bucket.get('Total', 0) + 1
        bucket.setdefault('JobList', []).append(job)

    def _progressTotals(self, productionID, summary, submittedJobs, doneJobs, printProgress=True):
        """Combine a job summary with the production progress.

        The number of created jobs is the 'Created' count reported by
        getProductionProgress() (if any) plus submittedJobs.

        :return: S_OK(summary) carrying a 'Totals' dictionary with the
                 'Submitted', 'Created' and 'Done' counts, or the failed
                 progress result
        """
        result = self.getProductionProgress(productionID)
        if not result['OK']:
            self.log.warn('Could not get production progress information')
            return result

        createdJobs = submittedJobs
        if 'Created' in result['Value']:
            createdJobs += int(result['Value']['Created'])

        if printProgress:
            percSub = self._percentage(submittedJobs, createdJobs)
            percDone = self._percentage(doneJobs, createdJobs)
            print('\nCurrent status of production %s:\n' % productionID)
            print('Submitted'.ljust(12) + str(percSub).ljust(3) + '% ( ' + str(submittedJobs).ljust(7) +
                  'Submitted / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )')
            print('Done'.ljust(12) + str(percDone).ljust(3) + '% ( ' + str(doneJobs).ljust(7) +
                  'Done / '.ljust(15) + str(createdJobs).ljust(7) + ' Created jobs )')

        result = S_OK(summary)
        result['Totals'] = {
            'Submitted': int(submittedJobs),
            'Created': int(createdJobs),
            'Done': int(doneJobs)
        }
        return result

    # --------------------------------------------------------------- public API

    def getProduction(self, productionID, printOutput=False):
        """Return the metadata associated with a given production ID.

        :param productionID: production ID (int, long or str)
        :param printOutput: print a one-line table with the main attributes
        :return: S_OK(transformation metadata) or an error result
        """
        if not self._isValidProductionID(productionID):
            return self._errorReport(self._prodIDError)

        result = self.transformationClient.getTransformation(int(productionID))
        if not result['OK']:
            return result

        if printOutput:
            adj = self.prodAdj
            prodInfo = result['Value']
            # One explicit column order for both the titles and the values, so
            # the header cannot drift from the data (dict order is arbitrary).
            columns = ('Status', 'Type', 'AgentType', 'CreationDate', 'TransformationName')
            header = 'ProductionID'.ljust(adj)
            info = str(productionID).ljust(adj)
            for column in columns:
                header += self.prodHeaders[column].ljust(adj)
                value = prodInfo[column]
                if column == 'CreationDate':
                    value = toString(value)
                info += value.ljust(adj)
            print('\n'.join([header + '\n', info]))
        return S_OK(result['Value'])

    def getProductionLoggingInfo(self, productionID, printOutput=False):
        """Return the logging information of the given production.

        Each record holds the message, its date and the DN of the author.

        :param productionID: production ID (int, long or str)
        :param printOutput: print the records as a table
        :return: result of getTransformationLogging, or an error result
        """
        if not self._isValidProductionID(productionID):
            return self._errorReport(self._prodIDError)

        result = self.transformationClient.getTransformationLogging(int(productionID))
        if not result['OK']:
            self.log.warn('Could not get transformation logging information for productionID %s' % (productionID))
            return result
        if not result['Value']:
            self.log.warn('No logging information found for productionID %s' % (productionID))
            return result
        if not printOutput:
            return result

        adj = self.prodAdj
        idAdj = int(0.5 * adj)
        message = ['ProdID'.ljust(idAdj) + 'Message'.ljust(3 * adj) +
                   'DateTime [UTC]'.ljust(adj) + 'AuthorCN'.ljust(2 * adj)]
        for line in result['Value']:
            # only the last component of the DN (the CN part) is shown
            message.append(str(line['TransformationID']).ljust(idAdj) +
                           line['Message'].ljust(3 * adj) +
                           toString(line['MessageDate']).ljust(adj) +
                           line['AuthorDN'].split('/')[-1].ljust(2 * adj))

        print('\nLogging summary for productionID ' + str(productionID) + '\n\n' + '\n'.join(message))
        return result

    def getProductionSummary(self, productionID=None, printOutput=False):
        """Return a detailed summary of the productions in the system.

        :param productionID: restrict the result to this production; all
                             productions are returned when None (or empty)
        :param printOutput: pretty-print the result
        :return: S_OK({productionID: summary}) or an error result
        """
        # None is the documented default and must not be rejected.
        if productionID is not None and not self._isValidProductionID(productionID):
            return self._errorReport(self._prodIDError)

        result = self.transformationClient.getTransformationSummary()
        if not result['OK']:
            return result

        if productionID:
            prodID = long(productionID)
            if prodID not in result['Value']:
                self.log.info('Specified productionID was not found, '
                              'the list of active productions is:\n%s' %
                              ', '.join(str(pID) for pID in result['Value']))
                return S_ERROR('Production ID %s was not found' % (productionID))
            result = S_OK({prodID: result['Value'][prodID]})

        if printOutput:
            self._prettyPrint(result['Value'])
        return result

    def getProductionApplicationSummary(self, productionID, status=None, minorStatus=None, printOutput=False):
        """Return an application status summary for the jobs of a production.

        The summary is nested as Status -> MinorStatus -> ApplicationStatus ->
        {'Total': count, 'JobList': [job IDs]}.

        :param productionID: production ID (int, long or str)
        :param status: only consider jobs with this status
        :param minorStatus: only consider jobs with this minor status
        :param printOutput: print the summary table (and, without filters, the
                            production progress)
        :return: S_OK(summary); a 'Totals' entry is added when no filter is given
        """
        if not self._isValidProductionID(productionID):
            return self._errorReport(self._prodIDError)

        statusDict = self.getProdJobMetadata(productionID, status, minorStatus)
        if not statusDict['OK']:
            self.log.warn('Could not get production metadata information')
            return statusDict

        jobIDs = list(statusDict['Value'])
        if not jobIDs:
            return S_ERROR('No JobIDs with matching conditions found')
        self.log.verbose('Considering %s jobs with selected conditions' % (len(jobIDs)))

        # now need to get the application status information
        result = JobMonitoringClient().getJobsApplicationStatus(jobIDs)
        if not result['OK']:
            self.log.warn('Could not get application status for jobs list')
            return result
        appStatus = result['Value']

        summary = {}
        submittedJobs = 0
        doneJobs = 0
        for job, atts in statusDict['Value'].iteritems():
            if 'Status' not in atts:
                continue
            uniqueStatus = atts['Status'].capitalize()
            # setdefault keeps the counts already stored under the same minor
            # status when a new application status shows up.
            bucket = summary.setdefault(uniqueStatus, {}) \
                            .setdefault(atts['MinorStatus'], {}) \
                            .setdefault(appStatus[job]['ApplicationStatus'], {})
            self._addJob(bucket, job)
            submittedJobs += 1
            if uniqueStatus == 'Done':
                doneJobs += 1

        unfiltered = not status and not minorStatus
        if not printOutput:
            result = S_OK(summary)
            if unfiltered:
                result['Totals'] = {'Submitted': int(submittedJobs), 'Done': int(doneJobs)}
            return result

        # If a printed summary is requested
        statAdj = int(0.5 * self.prodAdj)
        mStatAdj = int(2.0 * self.prodAdj)
        totalAdj = int(0.5 * self.prodAdj)
        exAdj = int(0.5 * self.prodAdj)
        message = '\nJob Summary for ProductionID %s considering status %s' % (productionID, status)
        if minorStatus:
            message += ' and MinorStatus = %s' % (minorStatus)
        message += ':\n\n'
        message += 'Status'.ljust(statAdj) + 'MinorStatus'.ljust(mStatAdj) + \
            'ApplicationStatus'.ljust(mStatAdj) + 'Total'.ljust(totalAdj) + 'Example'.ljust(exAdj) + '\n'
        for stat, metadata in summary.iteritems():
            message += '\n'
            for minor, appInfo in metadata.iteritems():
                message += '\n'
                for appStat, jobInfo in appInfo.iteritems():
                    message += stat.ljust(statAdj) + minor.ljust(mStatAdj) + appStat.ljust(mStatAdj) + \
                        str(jobInfo['Total']).ljust(totalAdj) + str(jobInfo['JobList'][0]).ljust(exAdj) + '\n'
        print(message)

        if not unfiltered:
            return S_OK(summary)
        return self._progressTotals(productionID, summary, submittedJobs, doneJobs)

    def getProductionJobSummary(self, productionID, status=None, minorStatus=None, printOutput=False):
        """Return a job status summary for the jobs of a production.

        The summary is nested as Status -> MinorStatus ->
        {'Total': count, 'JobList': [job IDs]}.

        :param productionID: production ID (int, long or str)
        :param status: only consider jobs with this status
        :param minorStatus: only consider jobs with this minor status
        :param printOutput: print the summary table (and, without filters, the
                            production progress)
        :return: S_OK(summary); a 'Totals' entry is added when no filter is given
        """
        if not self._isValidProductionID(productionID):
            return self._errorReport(self._prodIDError)

        statusDict = self.getProdJobMetadata(productionID, status, minorStatus)
        if not statusDict['OK']:
            self.log.warn('Could not get production metadata information')
            return statusDict

        summary = {}
        submittedJobs = 0
        doneJobs = 0
        for job, atts in statusDict['Value'].iteritems():
            if 'Status' not in atts:
                continue
            uniqueStatus = atts['Status'].capitalize()
            bucket = summary.setdefault(uniqueStatus, {}).setdefault(atts['MinorStatus'], {})
            self._addJob(bucket, job)
            submittedJobs += 1
            if uniqueStatus == 'Done':
                doneJobs += 1

        unfiltered = not status and not minorStatus
        if not printOutput:
            result = S_OK(summary)
            if unfiltered:
                result['Totals'] = {'Submitted': int(submittedJobs), 'Done': int(doneJobs)}
            return result

        # If a printed summary is requested
        statAdj = int(0.5 * self.prodAdj)
        mStatAdj = int(2.0 * self.prodAdj)
        totalAdj = int(0.5 * self.prodAdj)
        exAdj = int(0.5 * self.prodAdj)
        message = '\nJob Summary for ProductionID %s considering' % (productionID)
        if status:
            message += ' Status = %s' % (status)
        if minorStatus:
            message += ' MinorStatus = %s' % (minorStatus)
        if unfiltered:
            message += ' all status combinations'
        message += ':\n\n'
        message += 'Status'.ljust(statAdj) + 'MinorStatus'.ljust(mStatAdj) + 'Total'.ljust(totalAdj) + \
            'Example'.ljust(exAdj) + '\n'
        for stat, metadata in summary.iteritems():
            message += '\n'
            for minor, jobInfo in metadata.iteritems():
                message += stat.ljust(statAdj) + minor.ljust(mStatAdj) + str(jobInfo['Total']).ljust(totalAdj) + \
                    str(jobInfo['JobList'][0]).ljust(exAdj) + '\n'
        print(message)

        if not unfiltered:
            return S_OK(summary)
        return self._progressTotals(productionID, summary, submittedJobs, doneJobs)

    def getProductionSiteSummary(self, productionID, site=None, printOutput=False):
        """Return a per-site job summary for a production.

        The summary is nested as Site -> Status ->
        {'Total': count, 'JobList': [job IDs]}.

        :param productionID: production ID (int, long or str)
        :param site: only consider jobs at this site
        :param printOutput: print the summary table (and, without a site
                            filter, the production progress)
        :return: S_OK(summary) with a 'Totals' entry
        """
        if not self._isValidProductionID(productionID):
            return self._errorReport(self._prodIDError)

        statusDict = self.getProdJobMetadata(productionID, None, None, site)
        if not statusDict['OK']:
            self.log.warn('Could not get production metadata information')
            return statusDict

        summary = {}
        submittedJobs = 0
        doneJobs = 0
        for job, atts in statusDict['Value'].iteritems():
            if 'Site' not in atts:
                continue
            currentStatus = atts['Status'].capitalize()
            bucket = summary.setdefault(atts['Site'], {}).setdefault(currentStatus, {})
            self._addJob(bucket, job)
            submittedJobs += 1
            if currentStatus == 'Done':
                doneJobs += 1

        if not printOutput:
            if not site:
                # Without a site filter the progress is consulted as well, so
                # the totals also carry the 'Created' count.
                return self._progressTotals(productionID, summary, submittedJobs, doneJobs,
                                            printProgress=False)
            result = S_OK(summary)
            result['Totals'] = {'Submitted': int(submittedJobs), 'Done': int(doneJobs)}
            return result

        # If a printed summary is requested
        siteAdj = int(1.0 * self.prodAdj)
        statAdj = int(0.5 * self.prodAdj)
        totalAdj = int(0.5 * self.prodAdj)
        exAdj = int(0.5 * self.prodAdj)
        message = '\nSummary for ProductionID %s' % (productionID)
        if site:
            message += ' at Site %s' % (site)
        else:
            message += ' at all Sites'
        message += ':\n\n'
        message += 'Site'.ljust(siteAdj) + 'Status'.ljust(statAdj) + 'Total'.ljust(totalAdj) + \
            'Example'.ljust(exAdj) + '\n'
        for siteStr, metadata in summary.iteritems():
            message += '\n'
            for stat, jobInfo in metadata.iteritems():
                message += siteStr.ljust(siteAdj) + stat.ljust(statAdj) + str(jobInfo['Total']).ljust(totalAdj) + \
                    str(jobInfo['JobList'][0]).ljust(exAdj) + '\n'
        print(message)

        # The overall progress is only printed when all sites are considered.
        return self._progressTotals(productionID, summary, submittedJobs, doneJobs,
                                    printProgress=not site)

    def getProductionProgress(self, productionID=None, printOutput=False):
        """Return the task statistics of one production, or of all known ones.

        :param productionID: production ID (int, long or str); when None or 0
                             the productions returned by _getActiveProductions()
                             are used
        :param printOutput: print a ProductionID / Status / Count table
        :return: the result of the last getTransformationTaskStats call (for a
                 single production: its status -> count dictionary), or the
                 first failed / empty result
        """
        # None is the documented default and must not be rejected.
        if productionID is not None and not self._isValidProductionID(productionID):
            return self._errorReport(self._prodIDError)

        prodID = long(productionID) if productionID is not None else 0
        if not prodID:
            result = self._getActiveProductions()
            if not result['OK']:
                return result
            prodIDs = result['Value']
        else:
            prodIDs = [prodID]

        prodIDs = [str(x) for x in prodIDs]
        self.log.verbose('Will check progress for production(s):\n%s' % (', '.join(prodIDs)))
        progress = {}
        for prod in prodIDs:
            result = self.transformationClient.getTransformationTaskStats(int(prod))
            # a failed call carries no 'Value', so 'OK' has to be checked first
            if not result['OK'] or not result['Value']:
                self.log.error(result)
                return result
            progress[int(prod)] = result['Value']

        if not printOutput:
            return result

        idAdj = int(self.prodAdj)
        statAdj = int(self.prodAdj)
        countAdj = int(self.prodAdj)
        message = 'ProductionID'.ljust(idAdj) + 'Status'.ljust(statAdj) + 'Count'.ljust(countAdj) + '\n\n'
        for prod, info in progress.iteritems():
            for status, count in info.iteritems():
                message += str(prod).ljust(idAdj) + status.ljust(statAdj) + str(count).ljust(countAdj) + '\n'
            message += '\n'

        print(message)
        return result

    def _getActiveProductions(self, printOutput=False):
        """Return a dictionary {production ID: submission mode (AgentType)}.

        All transformations returned by getTransformations() that carry both
        'AgentType' and 'TransformationID' are included.
        """
        result = self.transformationClient.getTransformations()
        if not result['OK']:
            return result

        currentProductions = {}
        for prodDict in result['Value']:
            self.log.debug(prodDict)
            if 'AgentType' in prodDict and 'TransformationID' in prodDict:
                prodID = prodDict['TransformationID']
                status = prodDict['AgentType']
                currentProductions[prodID] = status
                if status.lower() == 'automatic':
                    self.log.verbose('Found active production %s eligible to submit jobs' % prodID)

        if printOutput:
            self._prettyPrint(currentProductions)
        return S_OK(currentProductions)

    def getProductionCommands(self):
        """Return the possible commands with the status and submission mode they set."""
        prodCommands = {}
        for keyword, statusSubMode in self.commands.iteritems():
            prodCommands[keyword] = {
                'Status': statusSubMode[0],
                'SubmissionMode': statusSubMode[1]
            }
        return S_OK(prodCommands)

    def production(self, productionID, command, disableCheck=True):
        """Apply a management command to a production.

        The command (case-insensitive) must be one of the keys of
        ``self.commands``, e.g.:

        - start : set production status to Active, job submission possible
        - stop : set production status to Stopped, no job submissions
        - automatic: set production submission mode to Automatic
        - manual: set production submission mode to Manual

        :param productionID: production ID (int, long or str)
        :param command: command keyword
        :param disableCheck: when False the user is prompted for confirmation
        :return: S_OK(message) or an error result
        """
        commands = self.commands
        if not self._isValidProductionID(productionID):
            return self._errorReport(self._prodIDError)
        productionID = long(productionID)

        if not isinstance(command, str):
            return self._errorReport('Expected string, for command')
        # Normalise once: validation and lookup must use the same spelling.
        command = command.lower()
        if command not in commands:
            return self._errorReport('Expected one of: %s for command string' % (', '.join(commands)))

        self.log.verbose('Requested to change production %s with command "%s"' %
                         (productionID, command.capitalize()))
        if not disableCheck:
            result = promptUser('Do you wish to change production %s with command "%s"? ' %
                                (productionID, command.capitalize()))
            if not result['OK']:
                self.log.info('Action cancelled')
                return S_OK('Action cancelled')
            if result['Value'] != 'y':
                self.log.info('Doing nothing')
                return S_OK('Doing nothing')

        actions = commands[command]
        self.log.info('Setting production status to %s and submission mode to %s for productionID %s' %
                      (actions[0], actions[1], productionID))
        result = self.transformationClient.setTransformationParameter(long(productionID), "Status", actions[0])
        if not result['OK']:
            self.log.warn('Problem updating transformation status with result:\n%s' % result)
            return result
        self.log.verbose('Setting transformation status to %s successful' % (actions[0]))
        result = self.transformationClient.setTransformationParameter(long(productionID), 'AgentType', actions[1])
        if not result['OK']:
            self.log.warn('Problem updating transformation agent type with result:\n%s' % result)
            return result
        self.log.verbose('Setting transformation agent type to %s successful' % (actions[1]))
        return S_OK('Production %s status updated' % productionID)

    def productionFileSummary(self, productionID, selectStatus=None, outputFile=None,
                              orderOutput=True, printSummary=False, printOutput=False):
        """Summarise the input files of a production.

        :param productionID: production whose files are listed
        :param selectStatus: only write/print (and count as selected) files in
                             this status
        :param outputFile: write the records to this file; it must not exist yet
        :param orderOutput: order by 'TaskID' when True, by 'LFN' otherwise
        :param printSummary: print the number of files per status
        :param printOutput: print the individual records
        :return: the getTransformationFiles result, or S_ERROR when
                 selectStatus matched no file
        """
        adj = 18
        ordering = 'TaskID' if orderOutput else 'LFN'
        fileSummary = self.transformationClient.getTransformationFiles(
            condDict={'TransformationID': int(productionID)}, orderAttribute=ordering)
        if not fileSummary['OK']:
            return fileSummary

        linesToWrite = []
        totalRecords = 0
        summary = {}
        selected = 0
        for lfnDict in fileSummary['Value']:
            totalRecords += 1
            record = ''
            recordStatus = ''
            for n, v in lfnDict.iteritems():
                record += str(n) + ' = ' + str(v).ljust(adj) + ' '
                if n == 'Status':
                    recordStatus = v
                    if selectStatus == recordStatus:
                        selected += 1
                    summary[v] = summary.get(v, 0) + 1

            # When writing a selection to file only matching records are
            # reported; in every other case all records are.
            if outputFile and selectStatus and selectStatus != recordStatus:
                continue
            if outputFile:
                linesToWrite.append(record + '\n')
            if printOutput:
                print(record)

        if printSummary:
            print('\nSummary for %s files in production %s\n' % (totalRecords, productionID))
            print('Status'.ljust(adj) + ' ' + 'Total'.ljust(adj) + 'Percentage'.ljust(adj) + '\n')
            for n, v in summary.iteritems():
                percentage = self._percentage(int(v), totalRecords)
                print(str(n).ljust(adj) + ' ' + str(v).ljust(adj) + ' ' + str(percentage).ljust(2) + ' % ')
            print('\n')

        if selectStatus and not selected:
            return S_ERROR('No files were selected for production %s and status "%s"' %
                           (productionID, selectStatus))
        elif selectStatus and selected:
            print('%s / %s files (%s percent) were found for production %s in status "%s"' %
                  (selected, totalRecords, self._percentage(int(selected), totalRecords),
                   productionID, selectStatus))

        if outputFile:
            if os.path.exists(outputFile):
                print('Requested output file %s already exists, please remove this file to continue' % outputFile)
                return fileSummary

            # context manager: the file is closed even if the write fails
            with open(outputFile, 'w') as fopen:
                fopen.write(''.join(linesToWrite))
            if not selectStatus:
                print('Wrote %s lines to file %s' % (totalRecords, outputFile))
            else:
                print('Wrote %s lines to file %s for status "%s"' % (selected, outputFile, selectStatus))

        return fileSummary

    def checkFilesStatus(self, lfns, productionID='', printOutput=False):
        """Check the status of the given LFN(s) in the production DB.

        :param lfns: single LFN or list of LFNs; a leading 'LFN:' is removed
        :param productionID: production ID (int, long or str)
        :param printOutput: pretty-print the file summary
        :return: result of getFileSummary, or an error result
        """
        if not self._isValidProductionID(productionID):
            return self._errorReport(self._prodIDError)

        if isinstance(lfns, str):
            lfns = lfns.replace('LFN:', '')
        elif isinstance(lfns, list):
            try:
                lfns = [str(lfnName.replace('LFN:', '')) for lfnName in lfns]
            except Exception as x:
                return self._errorReport(str(x), 'Expected strings for LFN(s)')
        else:
            return self._errorReport('Expected single string or list of strings for LFN(s)')

        # NOTE(review): with the default productionID='' the long() conversion
        # raises ValueError; what should be passed to consider all productions
        # cannot be told from here - confirm against getFileSummary.
        fileStatus = self.transformationClient.getFileSummary(lfns, long(productionID))
        # a failed call carries no 'Value' to print
        if printOutput and fileStatus['OK']:
            self._prettyPrint(fileStatus['Value'])
        return fileStatus

    def getWMSProdJobID(self, jobID, printOutput=False):
        """Derive the production ID and production job ID from a WMS job.

        The WMS job name is expected to look like '<ProductionID>_<JobID>'.

        :param jobID: WMS job ID
        :param printOutput: pretty-print the resulting dictionary
        :return: S_OK({'WMSJobID', 'JobName', 'ProductionID', 'JobID'}) or an
                 error result
        """
        result = self.attributes(jobID)
        if not result['OK']:
            return result
        if 'JobName' not in result['Value']:
            return S_ERROR('Could not establish ProductionID / ProductionJobID, missing JobName')

        wmsJobName = result['Value']['JobName']
        nameParts = wmsJobName.split('_')
        if len(nameParts) < 2:
            # not a production job name: report it instead of raising IndexError
            return S_ERROR('Could not establish ProductionID / ProductionJobID from JobName %s' % wmsJobName)
        info = {
            'WMSJobID': jobID,
            'JobName': wmsJobName,
            'ProductionID': nameParts[0],
            'JobID': nameParts[1]
        }
        if printOutput:
            self._prettyPrint(info)
        return S_OK(info)

    def getProdJobInfo(self, productionID, jobID, printOutput=False):
        """Retrieve the information of one task (production job) of a production.

        :return: S_OK(first matching task record) or an error result
        """
        res = self.transformationClient.getTransformationTasks(
            condDict={'TransformationID': productionID, 'TaskID': jobID},
            inputVector=True)
        if not res['OK']:
            return res
        if not res['Value']:
            return S_ERROR("Job %s not found for production %s" % (jobID, productionID))

        jobInfo = res['Value'][0]
        if printOutput:
            self._prettyPrint(jobInfo)
        return S_OK(jobInfo)

    def selectProductionJobs(self, productionID, status=None, minorStatus=None,
                             applicationStatus=None, site=None, owner=None, date=None):
        """Wrap selectJobs() for the jobs of one production.

        The production ID is passed zero-padded to 8 characters.  When no date
        is supplied '2001-01-01' is used.
        """
        if not date:
            self.log.verbose('No Date supplied, setting old date for production %s' % productionID)
            date = '2001-01-01'
        return self.selectJobs(status, minorStatus, applicationStatus, site, owner,
                               str(productionID).zfill(8), date)

    def extendProduction(self, productionID, numberOfJobs, printOutput=False):
        """Extend a production by a number of jobs.

        :param productionID: production ID (int, long or str)
        :param numberOfJobs: number of jobs to add (int or numeric string)
        :param printOutput: print a confirmation
        :return: result of extendTransformation, or an error result
        """
        if not self._isValidProductionID(productionID):
            return self._errorReport(self._prodIDError)

        if isinstance(numberOfJobs, str):
            try:
                numberOfJobs = int(numberOfJobs)
            except ValueError as x:
                return self._errorReport(str(x), 'Expected integer or string for number of jobs to submit')

        result = self.transformationClient.extendTransformation(long(productionID), numberOfJobs)
        if not result['OK']:
            return self._errorReport(result, 'Could not extend production %s by %s jobs' %
                                     (productionID, numberOfJobs))

        if printOutput:
            print('Extended production %s by %s jobs' % (productionID, numberOfJobs))
        return result

    def getProdJobMetadata(self, productionID, status=None, minorStatus=None, site=None):
        """Return the WMS status information of the jobs of a production.

        Jobs are selected starting from the creation date of the production.

        :return: result of self.status() on the selected jobs, or an error result
        """
        result = self.transformationClient.getTransformationParameters(long(productionID), ['CreationDate'])
        if not result['OK']:
            self.log.warn('Problem getting production metadata for ID %s:\n%s' % (productionID, result))
            return result

        # only the date part (before the first whitespace) is used
        creationDate = toString(result['Value']).split()[0]
        result = self.selectProductionJobs(productionID, status=status, minorStatus=minorStatus,
                                           site=site, date=creationDate)
        if not result['OK']:
            self.log.warn('Problem selecting production jobs for ID %s:\n%s' % (productionID, result))
            return result

        return self.status(result['Value'])

    def launchProduction(self, prod, publishFlag, testFlag, requestID, extend=0, tracking=0, MCsimflag=False):
        """Launch the given production object.

        With publishFlag False and testFlag set, the production is only run
        locally.  Otherwise it is created and, if published, optionally extended
        and started in 'mctestmode' (MCsimflag), 'manual' (testFlag) or
        'automatic' mode.

        :return: S_OK(production ID) - 1 when not published; S_OK() after a
                 successful local test; an error result otherwise
        """
        if publishFlag is False and testFlag:
            gLogger.info('Test prod will be launched locally')
            result = prod.runLocal()
            if result['OK']:
                gLogger.info('Template finished successfully')
                return S_OK()
            gLogger.error('Launching production: something wrong with execution!')
            return S_ERROR('Something wrong with execution!')

        result = prod.create(publish=publishFlag, requestID=requestID, reqUsed=tracking)
        if not result['OK']:
            gLogger.error('Error during prod creation:\n%s\ncheck that the wkf name is unique.' %
                          (result['Message']))
            return result

        if publishFlag:
            prodID = result['Value']
            msg = 'Production %s successfully created ' % (prodID)

            if extend:
                self.extendProduction(prodID, extend, printOutput=True)
                msg += ', extended by %s jobs' % extend
            if MCsimflag:
                self.production(prodID, 'mctestmode')
                msg = msg + ' and started in mctestmode.'
            elif testFlag:
                self.production(prodID, 'manual')
                msg = msg + ' and started in manual mode.'
            else:
                self.production(prodID, 'automatic')
                msg = msg + ' and started in automatic mode.'
            gLogger.notice(msg)
        else:
            prodID = 1
            gLogger.notice('Production creation completed but not published (publishFlag was %s). '
                           'Setting ID = %s (useless, just for the test)' % (publishFlag, prodID))

        return S_OK(prodID)
class BookkeepingWatchAgent(AgentModule, TransformationAgentsUtilities):
    """ LHCbDIRAC only agent. A threaded agent.

    execute() puts the IDs of the Active/Idle transformations that have a
    Bookkeeping query in a queue; the threads created in initialize() run
    _execute(), which queries the Bookkeeping for each queued transformation
    and adds the returned files (and, where relevant, run information) to it.
    """

    def __init__(self, *args, **kwargs):
        """ c'tor
        """
        AgentModule.__init__(self, *args, **kwargs)
        TransformationAgentsUtilities.__init__(self)

        # Queue consumed by the worker threads, and list of the IDs currently queued/processed
        self.bkQueriesToBeChecked = Queue.Queue()
        self.bkQueriesInCheck = []

        # Defaults (seconds), overridden by agent options in initialize()
        self.fullUpdatePeriod = 86400
        self.bkUpdateLatency = 7200
        self.debug = False

        self.transInThread = {}

        self.pickleFile = 'BookkeepingWatchAgent.pkl'
        self.chunkSize = 1000

        self.pluginsWithNoRunInfo = ['LHCbStandard', 'ReplicateDataset', 'ArchiveDataset',
                                     'LHCbMCDSTBroadcastRandom', 'ReplicateToLocalSE',
                                     'RemoveReplicas', 'RemoveReplicasWhenProcessed',
                                     'RemoveReplicasWithAncestors', 'ReplicateWithAncestors',
                                     'ReduceReplicas', 'RemoveDatasetFromDisk',
                                     'DestroyDataset', 'DestroyDatasetWhenProcessed',
                                     'BySize', 'Standard']

        # Per-transformation state, persisted in the pickle file
        self.timeLog = {}
        self.fullTimeLog = {}
        self.bkQueries = {}

        self.transClient = None
        self.bkClient = None

    def initialize(self):
        """ Make the necessary initializations.
            The ThreadPool is created here, the _execute() method is what each thread will execute.
        """
        self.fullUpdatePeriod = self.am_getOption('FullUpdatePeriod', self.fullUpdatePeriod)
        self.bkUpdateLatency = self.am_getOption('BKUpdateLatency', self.bkUpdateLatency)
        self.debug = self.am_getOption('verbose', self.debug)

        self.pickleFile = os.path.join(self.am_getWorkDirectory(), self.pickleFile)
        self.chunkSize = self.am_getOption('maxFilesPerChunk', self.chunkSize)

        self.pluginsWithNoRunInfo = Operations().getValue('TransformationPlugins/PluginsWithNoRunInfo',
                                                          self.pluginsWithNoRunInfo)

        self._logInfo('Full Update Period: %d seconds' % self.fullUpdatePeriod)
        self._logInfo('BK update latency : %d seconds' % self.bkUpdateLatency)
        self._logInfo('Plugins with no run info: %s' % ', '.join(self.pluginsWithNoRunInfo))

        self.transClient = TransformationClient()
        self.bkClient = BookkeepingClient()

        try:
            # NOTE: pickle must only be used on trusted data; this file lives in the agent work directory.
            # Pickle streams are binary data, hence the 'rb' mode.
            with open(self.pickleFile, 'rb') as pf:
                self.timeLog = pickle.load(pf)
                self.fullTimeLog = pickle.load(pf)
                self.bkQueries = pickle.load(pf)
            self._logInfo("successfully loaded Log from", self.pickleFile, "initialize")
        except (EOFError, IOError, pickle.UnpicklingError):
            # Missing, truncated or corrupted file: restart from an empty state
            self._logInfo("failed loading Log from", self.pickleFile, "initialize")
            self.timeLog = {}
            self.fullTimeLog = {}
            self.bkQueries = {}

        maxNumberOfThreads = self.am_getOption('maxThreadsInPool', 1)
        threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)

        for i in xrange(maxNumberOfThreads):
            threadPool.generateJobAndQueueIt(self._execute, [i])

        gMonitor.registerActivity("Iteration", "Agent Loops", AGENT_NAME, "Loops/min", gMonitor.OP_SUM)

        return S_OK()

    @gSynchro
    def __dumpLog(self):
        """ dump the log in the pickle file
        """
        if self.pickleFile:
            try:
                with open(self.pickleFile, 'wb') as pf:
                    pickle.dump(self.timeLog, pf)
                    pickle.dump(self.fullTimeLog, pf)
                    pickle.dump(self.bkQueries, pf)
                self._logVerbose("successfully dumped Log into %s" % self.pickleFile)
            except IOError as e:
                self._logError("fail to open %s: %s" % (self.pickleFile, e))
            except pickle.PickleError as e:
                self._logError("fail to dump %s: %s" % (self.pickleFile, e))
            except ValueError as e:
                self._logError("fail to close %s: %s" % (self.pickleFile, e))

    ################################################################################

    def execute(self):
        """ Main execution method. Just fills a list, and a queue, with BKKQueries ID.

        Always returns S_OK: failures to contact the transformation service are only logged.
        """
        gMonitor.addMark('Iteration', 1)

        # Get all the transformations
        result = self.transClient.getTransformations(condDict={'Status': ['Active', 'Idle']})
        if not result['OK']:
            self._logError("Failed to get transformations.", result['Message'])
            return S_OK()
        transIDsList = [long(transDict['TransformationID']) for transDict in result['Value']]
        res = self.transClient.getTransformationsWithBkQueries(transIDsList)
        if not res['OK']:
            self._logError("Failed to get transformations with Bk Queries.", res['Message'])
            return S_OK()
        transIDsWithBkQueriesList = res['Value']

        _count = 0
        # Process each transformation
        for transID in transIDsWithBkQueriesList:
            # Already queued or being processed by a thread
            if transID in self.bkQueriesInCheck:
                continue
            self.bkQueriesInCheck.append(transID)
            self.bkQueriesToBeChecked.put(transID)
            _count += 1

        self._logInfo("Out of %d transformations, %d put in thread queue" % (len(result['Value']), _count))

        self.__dumpLog()
        return S_OK()

    def _execute(self, threadID):
        """ Real executor. This is what is executed by the single threads - so do not return here! Just continue

        Endless loop: takes a transformation ID from the queue, processes it, and
        cleans up the bookkeeping of what is in progress whatever the outcome.

        :param int threadID: index of the thread, only used in the log prefix
        """
        while True:
            transID = None
            # Defined before the try so that the finally clause can always use it
            startTime = time.time()
            try:
                transID = self.bkQueriesToBeChecked.get()
                self.transInThread[transID] = ' [Thread%d] [%s] ' % (threadID, str(transID))

                startTime = time.time()
                self._logInfo("Processing transformation %s." % transID, transID=transID)

                # Any failure makes the helper return early: the thread itself keeps running
                self.__processTransformation(transID)

            except Exception as x:  # pylint: disable=broad-except
                self._logException('Exception while adding files to transformation',
                                   lException=x,
                                   method='_execute',
                                   transID=transID)
            finally:
                self._logInfo("Processed transformation", transID=transID, reftime=startTime)
                if transID in self.bkQueriesInCheck:
                    self.bkQueriesInCheck.remove(transID)
                self.transInThread.pop(transID, None)

    def __processTransformation(self, transID):
        """ Query the Bookkeeping for one transformation and add the files found to it.

        Errors are logged and end the processing of this transformation only.

        :param transID: ID of the transformation to process
        """
        res = self.transClient.getTransformation(transID, extraParams=False)
        if not res['OK']:
            self._logError("Failed to get transformation", res['Message'], transID=transID)
            return
        transPlugin = res['Value']['Plugin']

        res = self.transClient.getBookkeepingQuery(transID)
        if not res['OK']:
            self._logError("Failed to get BkQuery", res['Message'], transID=transID)
            return
        bkQuery = res['Value']

        # Determine the correct time stamp to use for this transformation
        now = datetime.datetime.utcnow()
        self.__timeStampForTransformation(transID, bkQuery, now)

        try:
            files = self.__getFiles(transID, bkQuery, now)
        except RuntimeError as e:
            # In case we failed a full query, we should retry full query until successful
            if 'StartDate' not in bkQuery:
                self.bkQueries.pop(transID, None)
            self._logError("Failed to get response from the Bookkeeping: %s" % e, "", "__getFiles", transID)
            return

        runDict = {}
        filesMetadata = {}
        # get the files metadata
        for lfnChunk in breakListIntoChunks(files, self.chunkSize):
            start = time.time()
            res = self.bkClient.getFileMetadata(lfnChunk)
            self._logVerbose("Got metadata from BK for %d files" % len(lfnChunk), transID=transID, reftime=start)
            if not res['OK']:
                self._logError("Failed to get BK metadata for %d files" % len(lfnChunk),
                               res['Message'], transID=transID)
                # No need to return as we only consider files that are successful...
            else:
                filesMetadata.update(res['Value']['Successful'])

        # There is no need to add the run information for a transformation that doesn't need it
        if transPlugin not in self.pluginsWithNoRunInfo:
            for lfn, metadata in filesMetadata.iteritems():
                runID = metadata.get('RunNumber', None)
                if isinstance(runID, (basestring, int, long)):
                    runDict.setdefault(int(runID), []).append(lfn)
            try:
                self.__addRunsMetadata(transID, runDict.keys())
            except RuntimeError as e:
                self._logException("Failure adding runs metadata",
                                   method="__addRunsMetadata",
                                   lException=e,
                                   transID=transID)
        else:
            # All files go under the "no run" key
            runDict[None] = filesMetadata.keys()

        # Add all new files to the transformation
        for runID in sorted(runDict):
            lfnList = runDict[runID]
            # We enter all files of a run at once, otherwise do it by chunks
            lfnChunks = [lfnList] if runID else breakListIntoChunks(lfnList, self.chunkSize)
            for lfnChunk in lfnChunks:
                # Add the files to the transformation
                self._logVerbose('Adding %d lfns for transformation' % len(lfnChunk), transID=transID)
                result = self.transClient.addFilesToTransformation(transID, lfnChunk)
                if not result['OK']:
                    self._logError("Failed to add %d lfns to transformation" % len(lfnChunk),
                                   result['Message'], transID=transID)
                    return

                # Handle errors
                errors = {}
                for lfn, error in result['Value']['Failed'].iteritems():
                    errors.setdefault(error, []).append(lfn)
                for error, lfns in errors.iteritems():
                    self._logWarn("Failed to add files to transformation", error, transID=transID)
                    self._logVerbose("\n\t".join([''] + lfns))

                # Add the metadata and RunNumber to the newly inserted files
                addedLfns = [lfn for (lfn, status) in result['Value']['Successful'].iteritems()
                             if status == 'Added']
                if not addedLfns:
                    continue

                # Add files metadata: size and file type
                lfnDict = dict((lfn, {'Size': filesMetadata[lfn]['FileSize'],
                                      'FileType': filesMetadata[lfn]['FileType']})
                               for lfn in addedLfns)
                res = self.transClient.setParameterToTransformationFiles(transID, lfnDict)
                if not res['OK']:
                    self._logError("Failed to set transformation files metadata", res['Message'],
                                   transID=transID)
                    return

                # Add run information if it exists
                if runID:
                    self._logInfo("Added %d files to transformation for run %d, now including run information"
                                  % (len(addedLfns), runID), transID=transID)
                    self._logVerbose("Associating %d files to run %d" % (len(addedLfns), runID),
                                     transID=transID)
                    res = self.transClient.addTransformationRunFiles(transID, runID, addedLfns)
                    if not res['OK']:
                        self._logError("Failed to associate %d files to run %d" % (len(addedLfns), runID),
                                       res['Message'], transID=transID)
                        return
                else:
                    self._logInfo("Added %d files to transformation" % len(addedLfns), transID=transID)

    @gSynchro
    def __timeStampForTransformation(self, transID, bkQuery, now):
        """ Determine the correct time stamp to use for this transformation

        Adds a 'StartDate' to bkQuery (in place) when a reduced query is enough, i.e. when
        the query did not change, a previous query time is known and the last full
        query is more recent than fullUpdatePeriod. Otherwise a full query is made
        and its time recorded.
        """
        fullTimeLog = self.fullTimeLog.setdefault(transID, now)
        bkQueryLog = self.bkQueries.setdefault(transID, {})

        # Compare the queries without the StartDate that a previous cycle may have added
        bkQueryLog.pop('StartDate', None)
        self.bkQueries[transID] = bkQuery.copy()
        if transID in self.timeLog \
                and bkQueryLog == bkQuery \
                and (now - fullTimeLog) < datetime.timedelta(seconds=self.fullUpdatePeriod):
            # Reduced query: start from the last query time, minus the BK latency
            timeStamp = self.timeLog[transID]
            delta = datetime.timedelta(seconds=self.bkUpdateLatency)
            bkQuery['StartDate'] = (timeStamp - delta).strftime('%Y-%m-%d %H:%M:%S')
        if 'StartDate' not in bkQuery:
            self.fullTimeLog[transID] = now

    def __getFiles(self, transID, bkQuery, now):
        """ Perform the query to the Bookkeeping

        :returns: the 'Value' of the BK getFiles call
        :raises RuntimeError: if the BK call fails
        """
        self._logInfo("Using BK query for transformation: %s" % str(bkQuery), transID=transID)
        start = time.time()
        result = self.bkClient.getFiles(bkQuery)
        self._logVerbose("BK query time: %.2f seconds." % (time.time() - start), transID=transID)
        if not result['OK']:
            raise RuntimeError(result['Message'])
        self.__updateTimeStamp(transID, now)
        if result['Value']:
            self._logInfo("Obtained %d files from BK" % len(result['Value']), transID=transID)
        return result['Value']

    @gSynchro
    def __updateTimeStamp(self, transID, now):
        """ Update time stamp for current transformation to now
        """
        self.timeLog[transID] = now

    def __addRunsMetadata(self, transID, runsList):
        """ Add the run metadata

        For the runs of runsList that are not yet in the cache, gets TCK/CondDb/DDDB and
        then the run duration from the BK, and stores them with the transformation client.

        :raises RuntimeError: if any of the service calls fails
        """
        runsInCache = self.transClient.getRunsInCache({'Name': ['TCK', 'CondDb', 'DDDB']})
        if not runsInCache['OK']:
            raise RuntimeError(runsInCache['Message'])
        newRuns = list(set(runsList) - set(runsInCache['Value']))
        if newRuns:
            self._logVerbose("Associating run metadata to %d runs" % len(newRuns), transID=transID)
            res = self.bkClient.getRunInformation({'RunNumber': newRuns, 'Fields': ['TCK', 'CondDb', 'DDDB']})
            if not res['OK']:
                raise RuntimeError(res['Message'])
            for run, runMeta in res['Value'].iteritems():
                res = self.transClient.addRunsMetadata(run, runMeta)
                if not res['OK']:
                    raise RuntimeError(res['Message'])

        # Add run duration to the metadata
        runsInCache = self.transClient.getRunsInCache({'Name': ['Duration']})
        if not runsInCache['OK']:
            raise RuntimeError(runsInCache['Message'])
        newRuns = list(set(runsList) - set(runsInCache['Value']))
        if newRuns:
            self._logVerbose("Associating run duration to %d runs" % len(newRuns), transID=transID)
            res = self.bkClient.getRunInformation({'RunNumber': newRuns, 'Fields': ['JobStart', 'JobEnd']})
            if not res['OK']:
                raise RuntimeError(res['Message'])
            for run, runMeta in res['Value'].iteritems():
                # total_seconds() and not .seconds, which would drop the days part of the difference
                duration = int((runMeta['JobEnd'] - runMeta['JobStart']).total_seconds())
                res = self.transClient.addRunsMetadata(run, {'Duration': duration})
                if not res['OK']:
                    raise RuntimeError(res['Message'])

    def finalize(self):
        """ Gracious finalization
        """
        if self.bkQueriesInCheck:
            self._logInfo("Wait for queue to get empty before terminating the agent (%d tasks)" %
                          len(self.transInThread))
            self.bkQueriesInCheck = []
            # The threads remove their entry from transInThread when done
            while self.transInThread:
                time.sleep(2)
            self.log.info("Threads are empty, terminating the agent...")
        return S_OK()
"TaskID": taskID }) if not res['OK'] or not res['Value']: return None return res['Value'][0] #==================================== if __name__ == "__main__": Script.parseCommandLine(ignoreErrors=True) transList = __getTransformations(Script.getPositionalArgs()) from LHCbDIRAC.TransformationSystem.Client.TransformationClient import TransformationClient from DIRAC import gLogger, exit transClient = TransformationClient() for transID in transList: res = transClient.getTransformationFiles({ 'TransformationID': transID, 'Status': 'Assigned' }) if not res['OK']: gLogger.fatal("Error getting transformation files for %d" % transID) continue targetStats = {} taskDict = {} for fileDict in res['Value']: taskID = fileDict['TaskID'] taskDict[taskID] = taskDict.setdefault(taskID, 0) + 1
def setUp(self):
    """ Create a TransformationClient and keep it as self.transClient
    """
    client = TransformationClient()
    self.transClient = client
print 'Usage: dirac-production-remove-output transID [transID] [transID]' DIRACExit(1) else: try: transIDs = [int(arg) for arg in sys.argv[1:]] except: print 'Invalid list of productions' DIRACExit(1) agent = TransformationCleaningAgent( 'Transformation/TransformationCleaningAgent', 'Transformation/TransformationCleaningAgent', 'dirac-production-remove-output') agent.initialize() client = TransformationClient() for transID in transIDs: res = client.getTransformationParameters(transID, ['Status']) if not res['OK']: gLogger.error("Failed to determine transformation status") gLogger.error(res['Message']) continue status = res['Value'] if not status in [ 'RemovingFiles', 'RemovingOutput', 'ValidatingInput', 'Active' ]: gLogger.error( "The transformation is in %s status and the outputs cannot be removed" % status) continue agent.removeTransformationOutput(transID)
class Transformation(DIRACTransformation):
    """ Class for dealing with Transformation objects

    Extends the DIRAC Transformation with the handling of the Bookkeeping (BK) query.
    """

    #############################################################################

    def __init__(self, transID=0, transClientIn=None):
        """ Just params setting.
            transClient is passed here as LHCbDIRAC TransformationsClient, it will be self.transClient

        :param transID: ID of the transformation, passed to the base class
        :param transClientIn: client to use; a TransformationClient is created when not given
        """
        if not transClientIn:
            self.transClient = TransformationClient()
        else:
            self.transClient = transClientIn

        super(Transformation, self).__init__(transID=transID, transClient=self.transClient)

    #############################################################################

    def testBkQuery(self, bkQuery, printOutput=False, bkClient=None):
        """ just pretty print of the result of a BK Query

        :param dict bkQuery: the query passed to BookkeepingClient.getFiles
        :param bool printOutput: if True, the result is pretty-printed
        :param bkClient: client to use; a BookkeepingClient is created when not given
        :returns: S_OK(value returned by the query), or the error report
        """
        if bkClient is None:
            bkClient = BookkeepingClient()

        res = bkClient.getFiles(bkQuery)
        if not res['OK']:
            return self._errorReport(res, 'Failed to perform BK query')
        gLogger.info('The supplied query returned %d files' % len(res['Value']))
        if printOutput:
            self._prettyPrint(res)
        return S_OK(res['Value'])

    #############################################################################

    def setBkQuery(self, queryDict, test=False):
        """ set a BKK Query

        :param dict queryDict: the BK query
        :param bool test: if True, the query is first tried with testBkQuery
        :returns: S_OK(), or the failing result
        """
        if test:
            res = self.testBkQuery(queryDict)
            if not res['OK']:
                return res

        # The query is registered with the service only for an existing transformation
        transID = self.paramValues['TransformationID']
        if self.exists and transID:
            res = self.transClient.addBookkeepingQuery(transID, queryDict)
            if not res['OK']:
                return res
        self.item_called = 'BkQuery'
        self.paramValues[self.item_called] = queryDict
        return S_OK()

    #############################################################################

    def getBkQuery(self, printOutput=False):
        """ get a BKK Query

        :returns: S_OK(query): the locally stored one if any, otherwise the result of
                  the 'getBookkeepingQuery' operation, which is then stored locally
        """
        # 'BkQuery' may never have been set: .get() avoids a KeyError,
        # consistently with addTransformation()
        bkQuery = self.paramValues.get('BkQuery')
        if bkQuery:
            return S_OK(bkQuery)
        res = self.__executeOperation('getBookkeepingQuery', printOutput=printOutput)
        if not res['OK']:
            return res
        self.item_called = 'BkQuery'
        self.paramValues[self.item_called] = res['Value']
        return S_OK(res['Value'])

    #############################################################################

    def deleteTransformationBkQuery(self):
        """ delete a BKK Query
        """
        transID = self.paramValues['TransformationID']
        if self.exists and transID:
            res = self.transClient.deleteTransformationBookkeepingQuery(transID)
            if not res['OK']:
                return res
        self.item_called = 'BkQuery'
        self.paramValues[self.item_called] = {}
        return S_OK()

    #############################################################################

    def addTransformation(self, addFiles=True, printOutput=False):
        """ Add a transformation, using TransformationClient()

        :returns: S_OK(transID), or the failing result
        """
        res = super(Transformation, self).addTransformation(addFiles, printOutput)
        if not res['OK']:
            return res
        transID = res['Value']

        bkQuery = self.paramValues.get('BkQuery')
        if bkQuery:
            res = self.setBkQuery(bkQuery)
            if not res['OK']:
                return self._errorReport(res, "Failed to set BK query")
            # The query was registered with setBkQuery, drop the 'BkQuery' transformation parameter
            self.transClient.deleteTransformationParameter(transID, 'BkQuery')

        return S_OK(transID)

    def setSEParam(self, key, seList):
        """ Set the parameter `key` to the list of SEs `seList`
        """
        return self.__setSE(key, seList)

    def setAdditionalParam(self, key, val):
        """ Set the parameter `key` to `val`
        """
        self.item_called = key
        return self.__setParam(val)

    # This is a trick to overwrite the __checkSEs method of the base class
    def _Transformation__checkSEs(self, seList):
        """ Check that seList can be resolved with resolveSEGroup

        :returns: S_OK() for an empty list or a resolvable one, S_ERROR otherwise
        """
        # This test allows to set some parameters empty
        if seList == []:
            return S_OK()
        if resolveSEGroup(seList):
            return S_OK()
        gLogger.error("Some SEs are unknown in %s" % ','.join(seList))
        return S_ERROR("Some StorageElements not known")
class BookkeepingDB:
    """ Helper around the Bookkeeping and Transformation clients.

    All the methods are best-effort: any exception is swallowed and None is returned.
    """

    def __init__(self):
        self.bkClient = BookkeepingClient()
        self.tfClient = TransformationClient()
        # self.err = err

    def errorMessage(self, message):
        """ Report an error message (printed on stdout, as the other diagnostics of this class)
        """
        print(message)

    def MakeRunLFN(self, runNmbr, cfgVersion, prodId):
        """ Build an LFN from LFN_FORMAT_STRING

        :returns: the LFN, or None if it cannot be built
        """
        try:
            padding = "%08d" % int(prodId)
            lfn = LFN_FORMAT_STRING % (cfgVersion, runNmbr, runNmbr, padding)
            return lfn
        except Exception:
            # self.err.rethrowException(inst)
            return None

    def getTCK(self, runNmbr):
        """ Get the TCK of a run from the runs metadata

        :returns: the "TCK" entry of the run metadata, or None if not available
        """
        try:
            print(runNmbr)
            res = self.getRunsMetadata(runNmbr)
            pprint.pprint(res)
            # res is a dictionary: its content must be tested with key lookups, not with hasattr()
            value = res.get('Value') if isinstance(res, dict) else None
            if not isinstance(value, dict):
                return None
            # The type of the keys is not known here: try the run as given, as int and as str
            for key in (runNmbr, int(runNmbr), str(runNmbr)):
                runMeta = value.get(key)
                if isinstance(runMeta, dict) and "TCK" in runMeta:
                    return runMeta["TCK"]
            return None
        except Exception:
            # self.err.rethrowException(inst)
            return None

    def getRunsMetadata(self, runNmbr):
        """ Get the runs metadata from the transformation client

        :returns: the result of tfClient.getRunsMetadata if OK, None otherwise
        """
        try:
            res = self.tfClient.getRunsMetadata(int(runNmbr))
            if res['OK']:
                return res
            return None
        except Exception:
            # self.err.rethrowException(inst)
            return None

    def getInformation(self, run):
        """ Get the run information from the Bookkeeping

        :returns: a dictionary with a fixed set of keys ('Unknown' for the missing
                  information), or None in case of failure
        """
        try:
            res = self.bkClient.getRunInformations(run)
            if not res['OK']:
                self.errorMessage("error in bkClient Connection")
                return None
            val = res['Value']
            return {"runstart": val.get('RunStart', 'Unknown'),
                    "runend": val.get('RunEnd', 'Unknown'),
                    "configname": val.get('Configuration Name', 'Unknown'),
                    "configversion": val.get('Configuration Version', 'Unknown'),
                    "fillnb": val.get('FillNumber', 'Unknown'),
                    "datataking": val.get('DataTakingDescription', 'Unknown'),
                    "processing": val.get('ProcessingPass', 'Unknown'),
                    "stream": val.get('Stream', 'Unknown'),
                    "fullstat": val.get('FullStat', 'Unknown'),
                    "nbofe": val.get('Number of events', 'Unknown'),
                    "nboff": val.get('Number of file', 'Unknown'),
                    "fsize": val.get('File size', 'Unknown'),
                    }
        except Exception:
            # self.err.rethrowException(inst)
            return None

    def getListOfRecos(self, runNmbr):
        """ Get the "Reco" processing passes of a run

        :returns: list of the paths, for that run, that contain "Reco" and exactly
                  two "/", or None in case of failure
        """
        try:
            res = self.bkClient.getRunAndProcessingPass({'RunNumber': runNmbr})
            if not res['OK']:
                pprint.pprint(res)
                self.errorMessage("error in bkClient Connection")
                return None
            results = []
            for recoEntry in res["Value"]:
                recoPath = recoEntry[1]
                if recoEntry[0] == runNmbr \
                        and recoPath.count("/") == 2 \
                        and "Reco" in recoPath:
                    results.append(recoPath)
            return results
        except Exception:
            # self.err.rethrowException(inst)
            return None

    def getProcessId(self, runNmbr, recoVersion):
        """ Get the first item of the first record returned by getProductionsFromView

        :returns: that value, or None in case of failure (including no record)
        """
        try:
            query = {'RunNumber': runNmbr,
                     'ProcessingPass': recoVersion}
            res = self.bkClient.getProductionsFromView(query)
            if res["OK"]:
                return res["Value"][0][0]
            self.errorMessage("error in bkClient Connection")
            return None
        except Exception:
            # self.err.rethrowException(inst)
            return None

    # recoVersion is just Reco13 and not the full path!!
    def makeReferenceROOTFileName(self, recoVersion, runNmbr):
        """ Find the reference ROOT file for a run in REFERENCE_BASE_PATH/<recoVersion>/

        The file for the TCK of the run with the highest sub-index is chosen; if there
        is none (or no usable TCK), the "default_" file with the highest sub-index is used.

        :returns: the path of the file, or None if nothing is found or in case of failure
        """
        def subIndexKey(fileName):
            """ Sort key: numerical value of what is between the last "_" and ".root" """
            subIndex = fileName[:-len(".root")].rsplit("_", 1)[-1]
            try:
                return (int(subIndex), fileName)
            except ValueError:
                return (-1, fileName)

        try:
            basePath = REFERENCE_BASE_PATH + recoVersion + "/"

            # the tck retrieved from the db looks like 0x790038
            # but in the file name it looks like TCK_0x00760037_1.root
            # so do padding here
            tck = self.getTCK(runNmbr)
            tckPrefix = None
            # sometimes no tck set, then take default file
            if tck is not None:
                try:
                    tckPrefix = "TCK_0x%08x_" % int(str(tck), 0)
                except ValueError:
                    # TCK that cannot be interpreted: take default file as well
                    tckPrefix = None

            # if we have multiple files like
            # TCK_0x00790038_1.root
            # TCK_0x00790038_2.root
            # we want the file with the highest subindex, so in this example _2
            fileNames = os.listdir(basePath)
            candidates = []
            if tckPrefix is not None:
                candidates = [fileName for fileName in fileNames
                              if fileName.endswith(".root")
                              and fileName != "default_1.root"
                              and tckPrefix in fileName]

            # if we haven't found anything, look for the default files
            if not candidates:
                candidates = [fileName for fileName in fileNames
                              if fileName.endswith(".root") and "default_" in fileName]

            # numerical comparison of the subindex: _10 is higher than _2
            candidates.sort(key=subIndexKey)
            return basePath + candidates.pop()
        except Exception:
            # self.err.rethrowException(inst)
            return None
def getDestinationSEList(outputSE, site, outputmode='Any', run=None):
    """ Evaluate the output SE list from a workflow and return the concrete list
        of SEs to upload output data.

    :param str outputSE: name of an SE, of an SE alias or of an SE group
    :param str site: site name, ignored in 'run' mode
    :param str outputmode: 'Any', 'Local' or 'Run' (case insensitive)
    :param run: run number, mandatory in 'run' mode
    :returns: list of SE names
    :raises RuntimeError: for any inconsistency or failure in the resolution
    """
    if outputmode.lower() not in ('any', 'local', 'run'):
        raise RuntimeError("Unexpected outputmode")

    if outputmode.lower() == 'run':
        gLogger.verbose("Output mode set to 'run', thus ignoring site parameter")
        if not run:
            raise RuntimeError("Expected runNumber")
        try:
            run = long(run)
        except (ValueError, TypeError) as ve:
            # TypeError: run is of a type that cannot be converted at all
            raise RuntimeError("Expected runNumber as a number: %s" % ve)

        gLogger.debug("RunNumber = %d" % run)
        from LHCbDIRAC.TransformationSystem.Client.TransformationClient import TransformationClient
        runDestination = TransformationClient().getDestinationForRun(run)
        if not runDestination['OK'] or run not in runDestination['Value']:
            raise RuntimeError("Issue getting destinationForRun (%d): " % run +
                               runDestination.get('Message', 'unknown run'))
        # From here on, behave as in 'local' mode for the destination site of the run
        site = runDestination['Value'][run]
        gLogger.verbose("Site set to %s for run %d" % (site, run))
        outputmode = 'Local'

    # Add output SE defined in the job description
    gLogger.info('Resolving workflow output SE description: %s' % outputSE)

    # Check if the SE is defined explicitly for the site
    prefix = site.split('.')[0]
    country = site.split('.')[-1]

    # Concrete SE name
    result = gConfig.getOptions('/Resources/StorageElements/' + outputSE)
    if result['OK']:
        gLogger.info('Found concrete SE %s' % outputSE)
        return [outputSE]

    # Get local SEs
    localSEs = getSEsForSite(site)
    if not localSEs['OK']:
        raise RuntimeError(localSEs['Message'])
    localSEs = localSEs['Value']
    gLogger.verbose("Local SE list is: %s" % (localSEs))

    # There is an alias defined for this Site
    associatedSEs = gConfig.getValue('/Resources/Sites/%s/%s/AssociatedSEs/%s' % (prefix, site, outputSE), [])
    if associatedSEs:
        associatedSEs = _setLocalFirst(associatedSEs, localSEs)
        gLogger.info("Found associated SE %s for site %s" % (associatedSEs, site))
        return associatedSEs

    groupSEs = resolveSEGroup(outputSE)
    if not groupSEs:
        raise RuntimeError("Failed to resolve SE " + outputSE)
    gLogger.verbose("Group SE list is: %s" % (groupSEs))

    # Find a local SE or an SE considered as local because the country is associated to it
    if outputmode.lower() == "local":
        # First, check if one SE in the group is local
        for se in localSEs:
            if se in groupSEs:
                gLogger.info("Found eligible local SE: %s" % (se))
                return [se]

        # Final check for country associated SE
        assignedCountry = country
        # Countries already looked at, to detect a circular AssignedTo chain
        visitedCountries = set()
        while True:
            if assignedCountry in visitedCountries:
                raise RuntimeError("Circular AssignedTo definition for country %s" % assignedCountry)
            visitedCountries.add(assignedCountry)

            # check if country is already one with associated SEs
            section = '/Resources/Countries/%s/AssociatedSEs/%s' % (assignedCountry, outputSE)
            associatedSEs = gConfig.getValue(section, [])
            if associatedSEs:
                associatedSEs = _setLocalFirst(associatedSEs, localSEs)
                gLogger.info('Found associated SEs %s in %s' % (associatedSEs, section))
                return associatedSEs

            gLogger.verbose("/Resources/Countries/%s/AssignedTo" % assignedCountry)
            opt = gConfig.getOption("/Resources/Countries/%s/AssignedTo" % assignedCountry)
            if opt['OK'] and opt['Value']:
                assignedCountry = opt['Value']
            else:
                # No associated SE and no assigned country, give up
                raise RuntimeError("Could not establish associated SE nor assigned country for country %s" %
                                   assignedCountry)

    # For collective Any and All modes return the whole group
    # Make sure that local SEs are passing first
    orderedSEs = _setLocalFirst(groupSEs, localSEs)
    gLogger.info('Found SEs, local first: %s' % orderedSEs)
    return orderedSEs
gLogger.exception('Bad production ID', lException=e) DIRAC.exit(1) elif switch[0] == 'Information': item = switch[1] elif switch[0] == 'ByValue': byValue = True elif switch[0] == 'Active': active = True from LHCbDIRAC.DataManagementSystem.Client.DMScript import printDMResult, ProgressBar from LHCbDIRAC.BookkeepingSystem.Client.BookkeepingClient import BookkeepingClient bk = BookkeepingClient() if production: from LHCbDIRAC.TransformationSystem.Client.TransformationClient import TransformationClient trClient = TransformationClient() condDict = {'TransformationID': production} if active: condDict['Status'] = 'Active' res = trClient.getTransformationRuns(condDict) if res['OK']: runSet.update(run['RunNumber'] for run in res['Value']) else: gLogger.fatal("Error getting production runs", res['Message']) DIRAC.exit(2) gLogger.notice("Found %d runs" % len(runSet)) # Use this call to get information but also the actual list of existing runs res = bk.getRunStatus(list(runSet)) if not res['OK']: gLogger.fatal("Error getting the run info", res['Message'])
def __init__(self):
    """ Set up the Workflow object defaults: here, the TransformationClient
        kept as self.transClient.
    """
    client = TransformationClient()
    self.transClient = client
class MCExtensionAgent(DIRACMCExtensionAgent):
    """ MCExtensionAgent

    Looks at the Active Simulation production requests and extends the simulation
    transformation of those that are still missing events.
    """

    def __init__(self, *args, **kwargs):
        """ c'tor
        """
        DIRACMCExtensionAgent.__init__(self, *args, **kwargs)

        self.rpcProductionRequest = None
        self.transClient = None

        self.enableFlag = True

        # default values
        self.cpuE = 1
        self.cpuTimeAvg = 200000
        self.cpuNormalizationFactorAvg = 1.0

        # Artificial boost of the number of events requested to be created
        self.extensionFactorBoost = 20  # Meaning 20% more than what is calculated

    #############################################################################

    def initialize(self):
        """ Logs some parameters and initializes the clients
        """
        self.extensionFactorBoost = self.am_getOption('extensionFactorBoost', self.extensionFactorBoost)

        self.rpcProductionRequest = RPCClient('ProductionManagement/ProductionRequest')
        self.transClient = TransformationClient()

        self.log.info('Will consider the following transformation types: %s' % str(self.transformationTypes))
        self.log.info('Will create a maximum of %s tasks per iteration' % self.maxIterationTasks)

        return S_OK()

    #############################################################################

    def execute(self):
        """ The MCExtensionAgent execution method.

        :returns: S_OK, or the first error met (the remaining requests are then not checked)
        """
        self.enableFlag = self.am_getOption('EnableFlag', 'True')
        if self.enableFlag != 'True':
            self.log.info("MCExtensionAgent is disabled by configuration option EnableFlag")
            return S_OK('Disabled via CS flag')

        # done every cycle, as they may have changed
        self._getCPUParameters()

        # get the production requests in which we are interested
        productionRequests = self.rpcProductionRequest.getProductionRequestSummary('Active', 'Simulation')
        if not productionRequests['OK']:
            message = "RPC call to ProductionRequest service failed : %s" % productionRequests['Message']
            self.log.error(message)
            return S_ERROR(message)
        productionRequests = productionRequests['Value']
        self.log.info("Requests considered: %s" % ', '.join([str(prod) for prod in productionRequests.keys()]))

        for productionRequestID, productionRequestSummary in productionRequests.items():
            ret = self._checkProductionRequest(productionRequestID, productionRequestSummary)
            if not ret['OK']:
                return ret

        return S_OK()

    #############################################################################

    def _getCPUParameters(self):
        """ Get the CPUTimeAvg and CPUNormalizationFactorAvg from config,
            or as a fail-over, there are some defaults
        """
        op = Operations()
        self.cpuTimeAvg = op.getValue('Transformations/cpuTimeAvg', self.cpuTimeAvg)
        self.log.verbose("cpuTimeAvg = %d" % self.cpuTimeAvg)

        try:
            self.cpuNormalizationFactorAvg = getCPUNormalizationFactorAvg()
            # %f and not %d: the factor is a float (the default is 1.0)
            self.log.verbose("cpuNormalizationFactorAvg = %f" % self.cpuNormalizationFactorAvg)
        except RuntimeError:
            self.log.info("Could not get CPUNormalizationFactorAvg from config, defaulting to %f" %
                          self.cpuNormalizationFactorAvg)

    #############################################################################

    def _checkProductionRequest(self, productionRequestID, productionRequestSummary):
        """ Check if a production request need to be extended and do it if needed

        :param productionRequestID: ID of the production request
        :param dict productionRequestSummary: must contain 'reqTotal' and 'bkTotal'
        :returns: S_OK when nothing has to be done (or could not be decided yet),
                  the result of _extendProduction otherwise, S_ERROR for service failures
        """
        # check if enough events have been produced
        missingEvents = productionRequestSummary['reqTotal'] - productionRequestSummary['bkTotal']
        self.log.info("Missing events for production request %d: %d" % (productionRequestID, missingEvents))
        if productionRequestSummary['bkTotal'] > 0 and missingEvents <= 0:
            message = "Enough events produced for production request %d" % productionRequestID
            self.log.verbose(message)
            return S_OK(message)

        # get the associated productions/transformations progress
        productionsProgress = self.rpcProductionRequest.getProductionProgressList(long(productionRequestID))
        if not productionsProgress['OK']:
            message = 'Failed to get productions progress : %s' % productionsProgress['Message']
            self.log.error(message)
            return S_ERROR(message)
        productionsProgress = productionsProgress['Value']['Rows']
        self.log.verbose("Progress for production request %d: %s" % (productionRequestID,
                                                                      str(productionsProgress)))

        # get the informations for the productions/transformations
        productions = []
        simulation = None
        simulationID = None
        simulationProgress = None
        for productionProgress in productionsProgress:
            productionID = productionProgress['ProductionID']
            production = self.transClient.getTransformation(productionID)
            if not production['OK']:
                message = 'Failed to get informations on production %d : %s' % (productionID,
                                                                                 production['Message'])
                self.log.error(message)
                return S_ERROR(message)
            production = production['Value']
            productions.append(production)

            # determine which one is the simulation production
            if production['Type'] in self.transformationTypes:
                simulation = production
                simulationID = productionID
                for prodProgress in productionsProgress:
                    if prodProgress['ProductionID'] == simulationID:
                        simulationProgress = prodProgress
                        self.log.info("Progress for the simulation production %d of request %d: %s" %
                                      (simulationID, productionRequestID, str(simulationProgress)))

        if simulation is None:
            message = 'Failed to get simulation production for request %d' % productionRequestID
            self.log.error(message)
            return S_ERROR(message)

        if simulation['Status'].lower() != 'idle':
            # the simulation is still producing events
            message = "Simulation for production request %d is not Idle (%s)" % (productionRequestID,
                                                                                 simulation['Status'])
            self.log.verbose(message)
            return S_OK(message)

        # Checking how long ago this production became 'Idle'
        res = self.transClient.getTransformationLogging(simulationID)
        if not res['OK']:
            return res
        if res['Value']:
            lastLoggingEntry = res['Value'][-1]
            # total_seconds() and not .seconds, which only holds the part of the
            # difference below one day and would wrap around every 24 hours
            idleAge = (datetime.datetime.utcnow() - lastLoggingEntry['MessageDate']).total_seconds()
            if 'idle' in lastLoggingEntry['Message'].lower() and idleAge < 900:
                message = "Prod %d is in 'Idle' for less than 15 minutes, waiting a bit" % simulationID
                self.log.verbose(message)
                return S_OK(message)
        else:
            # Nothing to base the check on: rely on the 'Idle' status only
            self.log.warn("No logging info found for production %d" % simulationID)

        if simulationProgress['BkEvents'] < productionRequestSummary['reqTotal']:
            # the number of events produced by the simulation is of the order of the number of events requested
            # -> there is probably no stripping production, no extension factor necessary
            return self._extendProduction(simulation, 1.0, missingEvents)

        # the number of events produced by the simulation is more than the number of events requested,
        # yet events are missing
        # -> there is probably a stripping production, an extension factor is needed to account for
        #    stripped events
        # some events may still be processed (eg. merged), so wait that all the productions are idle
        if not all(production['Status'].lower() == 'idle' for production in productions):
            return S_OK()
        try:
            extensionFactor = float(simulationProgress['BkEvents']) / float(productionRequestSummary['bkTotal'])
        except ZeroDivisionError:
            # nothing registered in the BK yet, no factor can be computed
            return S_OK()
        return self._extendProduction(simulation, extensionFactor, missingEvents)

    #############################################################################

    def _extendProduction(self, production, extensionFactor, eventsNeeded):
        """ Extends a production to produce eventsNeeded*extensionFactor more events.

        :param dict production: the transformation, 'TransformationID' and 'Body' are used
        :param float extensionFactor: multiplicative factor applied to eventsNeeded
        :param eventsNeeded: number of events still missing
        :returns: S_OK (also when CPUe is not set: the production is skipped), or S_ERROR
        """
        productionID = production['TransformationID']

        cpuEProd = getProductionParameterValue(production['Body'], 'CPUe')
        if cpuEProd is None:
            self.log.warn("CPUe for transformation %d is not set, skipping for now" % productionID)
            return S_OK()
        cpuE = int(round(float(cpuEProd)))

        # %.2f and not %d for the extension factor, which is generally not an integer
        self.log.info("Extending production %d, that is still missing %d events. "
                      "Extension factor = %.2f, boost = %d" % (productionID,
                                                                eventsNeeded,
                                                                extensionFactor,
                                                                self.extensionFactorBoost))

        # extensionFactorBoost is a percentage
        eventsToProduce = eventsNeeded * extensionFactor * (float(100 + self.extensionFactorBoost) / 100)
        max_e = getEventsToProduce(cpuE, self.cpuTimeAvg, self.cpuNormalizationFactorAvg)
        numberOfTasks = int(math.ceil(float(eventsToProduce) / float(max_e)))
        self.log.info("Extending production %d by %d tasks" % (productionID, numberOfTasks))

        # extend the transformation by the determined number of tasks
        res = self.transClient.extendTransformation(productionID, numberOfTasks)
        if not res['OK']:
            message = 'Failed to extend transformation %d : %s' % (productionID, res['Message'])
            self.log.error(message)
            return S_ERROR(message)

        message = "Successfully extended transformation %d by %d tasks" % (productionID, numberOfTasks)
        self.log.info(message)

        # The production was Idle: activate it so that the new tasks are picked up
        res = self.transClient.setTransformationParameter(productionID, 'Status', 'Active')
        if not res['OK']:
            message = 'Failed to set transformation %d to Active' % productionID
            self.log.error(message)
            return S_ERROR(message)

        return S_OK(message)
DIRAC.exit(1) elif opt == 'AddRuns': try: settings[opt] = [int(runID) for runID in val.split(',')] except TypeError: gLogger.error("Invalid run list", str(val)) DIRAC.exit(1) elif opt == 'List': settings[opt] = True if 'AddRuns' in settings and ('StartRun' in settings or 'EndRun' in settings): gLogger.error( 'Incompatible requests, cannot set run list and start/end run') DIRAC.exit(1) from LHCbDIRAC.TransformationSystem.Client.TransformationClient import TransformationClient client = TransformationClient() res = client.getBookkeepingQuery(prodId) if not res['OK']: gLogger.error("Error retrieving BKQuery for transformation %s" % prodId, res['Message']) DIRAC.exit(2) bkDict = res['Value'] startRun = bkDict.get('StartRun', 0) endRun = bkDict.get('EndRun', 0) runNumbers = bkDict.get('RunNumbers', 'All') if ('StartRun' in settings or 'EndRun' in settings) and runNumbers and runNumbers != 'All': gLogger.notice("Transformation %d has RunNumbers key" % prodId) settings = {'List': True}
def __init__(self):
    """ Instantiate the Bookkeeping and Transformation clients,
        kept as self.bkClient and self.tfClient.
    """
    bookkeeping = BookkeepingClient()
    self.bkClient = bookkeeping
    transformations = TransformationClient()
    self.tfClient = transformations
if isinstance(steps, str): continue files = prodInfo["Number of files"] events = prodInfo["Number of events"] path = prodInfo["Path"] dddb = None conddb = None for step in reversed(steps): if step[4] and step[4].lower() != 'frompreviousstep': dddb = step[4] if step[5] and step[5].lower() != 'frompreviousstep': conddb = step[5] result = TransformationClient().getTransformation(prodID, True) if not result['OK']: gLogger.error( 'Could not retrieve parameters for production %d:' % prodID, result['Message']) continue parameters = result['Value'] if not dddb: dddb = parameters.get('DDDBTag') if not conddb: conddb = parameters.get('CondDBTag') if not (dddb and conddb): # probably the production above was not a MCSimulation reqID = int(parameters.get('RequestID'))