def __call__(self):  # pylint: disable=arguments-differ
    if self.options.short:
        taskname = self.cachedinfo['RequestName']
        inputlist = {'subresource': 'search', 'workflow': taskname}
        server = self.crabserver
        webdir = getProxiedWebDir(crabserver=self.crabserver, task=taskname, logFunction=self.logger.debug)
        dictresult, status, reason = server.get(api='task', data=inputlist)
        if not webdir:
            webdir = dictresult['result'][0]
            self.logger.info('Server result: %s' % webdir)
        if status != 200:
            msg = "Problem retrieving information from the server:\ninput:%s\noutput:%s\nreason:%s" % (str(inputlist), str(dictresult), str(reason))
            raise RESTCommunicationException(msg)
        splitting = getColumn(dictresult, 'tm_split_algo')
        if getattr(self.options, 'jobids', None):
            self.options.jobids = validateJobids(self.options.jobids, splitting != 'Automatic')
        self.setDestination()
        self.logger.info("Setting the destination to %s " % self.dest)
        failed, success = self.retrieveShortLogs(webdir, self.proxyfilename)
        if failed:
            msg = "%sError%s: Failed to retrieve the following files: %s" % (colors.RED, colors.NORMAL, failed)
            self.logger.info(msg)
        else:
            self.logger.info("%sSuccess%s: All files successfully retrieved." % (colors.GREEN, colors.NORMAL))
        returndict = {'success': success, 'failed': failed}
    else:
        # Different from the old getlog code: set 'logs2' as subresource so that 'getcommand' uses the new logic.
        returndict = getcommand.__call__(self, subresource='logs2')
        if ('success' in returndict and not returndict['success']) or \
           ('failed' in returndict and returndict['failed']):
            msg = "You can use the --short option to retrieve a short version of the log files from the Grid scheduler."
            self.logger.info(msg)

    return returndict
def __call__(self):
    server = self.crabserver

    msg = "Continuing submission of task %s" % (self.cachedinfo['RequestName'])
    self.logger.debug(msg)
    request = {'workflow': self.cachedinfo['RequestName'], 'subresource': 'proceed'}
    self.logger.info("Sending the request to the server")
    self.logger.debug("Submitting %s " % str(request))
    result, status, reason = server.post(api=self.defaultApi, data=urlencode(request))
    self.logger.debug("Result: %s" % (result))
    if status != 200:
        msg = "Problem continuing task submission:\ninput:%s\noutput:%s\nreason:%s" \
              % (str(request), str(result), str(reason))
        raise RESTCommunicationException(msg)

    msg = "Task continuation request successfully sent to the CRAB3 server"
    if result['result'][0]['result'] != 'ok':
        msg += "\nServer responded with: '%s'" % (result['result'][0]['result'])
        status = {'status': 'FAILED'}
    else:
        status = {'status': 'SUCCESS'}
        self.logger.info("To check task progress, use 'crab status'")
    self.logger.info(msg)

    return status
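# A note on the success check above: throughout these commands the server wraps its
# payload in a 'result' list, so the per-request outcome is read as
# dictresult['result'][0]. A minimal sketch of the shape being tested, with an
# illustrative value rather than an actual server reply:
#
#   result = {'result': [{'result': 'ok'}]}
#   assert result['result'][0]['result'] == 'ok'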
def __call__(self):
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)

    self.logger.debug('Killing task %s' % self.cachedinfo['RequestName'])
    inputs = {'workflow': self.cachedinfo['RequestName']}
    if self.options.killwarning:
        inputs.update({'killwarning': b64encode(self.options.killwarning)})
    dictresult, status, reason = server.delete(self.uri, data=urlencode(inputs) + '&' + urlencode(self.jobids))
    self.logger.debug("Result: %s" % dictresult)

    if status != 200:
        msg = "Problem killing task %s:\ninput:%s\noutput:%s\nreason:%s" % \
              (self.cachedinfo['RequestName'], str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)

    self.logger.info("Kill request successfully sent")
    if dictresult['result'][0]['result'] != 'ok':
        resultdict = {'status': 'FAILED'}
        self.logger.info(dictresult['result'][0]['result'])
    else:
        resultdict = {'status': 'SUCCESS'}

    return resultdict
def __call__(self):
    server = self.crabserver

    self.logger.debug("Killing task %s" % self.cachedinfo['RequestName'])
    inputs = {'workflow': self.cachedinfo['RequestName']}
    if self.options.killwarning:
        inputs.update({'killwarning': b64encode(self.options.killwarning)})
    dictresult, status, reason = server.delete(api=self.defaultApi, data=urlencode(inputs))
    self.logger.debug("Result: %s" % dictresult)

    if status != 200:
        msg = "Problem killing task %s:\ninput:%s\noutput:%s\nreason:%s" % \
              (self.cachedinfo['RequestName'], str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)

    self.logger.info("Kill request successfully sent")
    if dictresult['result'][0]['result'] != 'ok':
        resultdict = {'status': 'FAILED'}
        self.logger.info(dictresult['result'][0]['result'])
    else:
        resultdict = {'status': 'SUCCESS'}

    return resultdict
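# The optional kill warning is base64-encoded before being sent, as in both kill
# variants above. Illustrative only (Python 2 string handling, matching the rest
# of this code):
#
#   from base64 import b64encode
#   b64encode('killed by operator')  # -> 'a2lsbGVkIGJ5IG9wZXJhdG9y'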
def __call__(self):
    server = HTTPRequests(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    dictresult, status, reason = server.get(self.uri, data={'timestamp': self.date})
    dictresult = dictresult['result']  # take just the significant part

    if status != 200:
        msg = "Problem retrieving tasks:\ninput:%s\noutput:%s\nreason:%s" % (str(self.date), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)

    dictresult.sort()
    dictresult.reverse()

    if self.options.status:
        dictresult = [item for item in dictresult if item[1] == self.options.status]

    result = [item[0:2] for item in dictresult]

    today = date.today()

    if not dictresult:
        msg = "No tasks found from %s until %s" % (self.date, today)
        if self.options.status:
            msg += " with status %s" % (self.options.status)
        self.logger.info(msg)
        return result

    msg = "\nList of tasks from %s until %s" % (self.date, today)
    if self.options.status:
        msg += " with status %s" % (self.options.status)
    self.logger.info(msg)
    msg = "Beware that STATUS here does not include information from grid jobs"
    self.logger.info(msg)
    self.logger.info('=' * 80)
    self.logger.info('NAME\t\t\t\t\t\t\t\tSTATUS')
    self.logger.info('=' * 80)
    for item in dictresult:
        name, status = item[0:2]
        self.logger.info('%s\n\t\t\t\t\t\t\t\t%s' % (name, status))
        self.logger.info('-' * 80)
    self.logger.info('\n')

    return result
def __call__(self):
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)

    self.logger.debug('Looking up detailed status of task %s' % self.cachedinfo['RequestName'])
    user = self.cachedinfo['RequestName'].split("_")[2].split(":")[-1]
    verbose = int(self.options.summary or self.options.long or self.options.json)
    if self.options.idle:
        verbose = 2
    dictresult, status, reason = server.get(self.uri, data={'workflow': self.cachedinfo['RequestName'], 'verbose': verbose})
    dictresult = dictresult['result'][0]  # take just the significant part

    if status != 200:
        msg = "Problem retrieving status:\ninput:%s\noutput:%s\nreason:%s" % (str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)

    self.printShort(dictresult, user)
    if 'jobs' in dictresult:
        self.printPublication(dictresult)
        self.printErrors(dictresult)
        # Note several options could be combined
        if self.options.summary:
            self.printSummary(dictresult)
        if self.options.long or self.options.sort:
            sortdict = self.printLong(dictresult, quiet=(not self.options.long))
            if self.options.sort:
                self.printSort(sortdict, self.options.sort)
        if self.options.idle:
            self.printIdle(dictresult, user)
        if self.options.json:
            self.logger.info(json.dumps(dictresult['jobs']))

    return dictresult
def __call__(self):
    if self.options.short:
        # Check if splitting is automatic
        try:
            splitting = self.cachedinfo['OriginalConfig'].Data.splitting
        except AttributeError:  # Default setting is 'Automatic'
            splitting = 'Automatic'
        except KeyError:  # crab remade task does not have 'OriginalConfig' key, need to fetch from DB
            splitting = 'Unknown'
        taskname = self.cachedinfo['RequestName']
        inputlist = {'subresource': 'webdir', 'workflow': taskname}
        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        uri = self.getUrl(self.instance, resource='task')
        webdir = None
        if splitting != 'Unknown':
            webdir = getProxiedWebDir(taskname, self.serverurl, uri, self.proxyfilename, self.logger.debug)
        if not webdir:
            dictresult, status, reason = server.get(uri, data=inputlist)
            if status != 200:
                msg = "Problem retrieving information from the server:\ninput:%s\noutput:%s\nreason:%s" % (str(inputlist), str(dictresult), str(reason))
                raise RESTCommunicationException(msg)
            if splitting == 'Unknown':
                splitting = getColumn(dictresult, 'tm_split_algo')
            webdir = dictresult['result'][0]
            self.logger.info('Server result: %s' % webdir)
        self.setDestination()
        self.logger.info("Setting the destination to %s " % self.dest)
        # Check the format of jobids
        self.options.jobids = validateJobids(self.options.jobids, splitting != 'Automatic')
        failed, success = self.retrieveShortLogs(webdir, self.proxyfilename)
        if failed:
            msg = "%sError%s: Failed to retrieve the following files: %s" % (colors.RED, colors.NORMAL, failed)
            self.logger.info(msg)
        else:
            self.logger.info("%sSuccess%s: All files successfully retrieved." % (colors.GREEN, colors.NORMAL))
        returndict = {'success': success, 'failed': failed}
    else:
        # Different from the old getlog code: set 'logs2' as subresource so that 'getcommand' uses the new logic.
        returndict = getcommand.__call__(self, subresource='logs2')
        if ('success' in returndict and not returndict['success']) or \
           ('failed' in returndict and returndict['failed']):
            msg = "You can use the --short option to retrieve a short version of the log files from the Grid scheduler."
            self.logger.info(msg)

    return returndict
def __call__(self):
    if self.options.short:
        taskname = self.cachedinfo['RequestName']
        inputlist = {'subresource': 'webdir', 'workflow': taskname}
        serverFactory = CRABClient.Emulator.getEmulator('rest')
        server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
        uri = self.getUrl(self.instance, resource='task')
        webdir = getProxiedWebDir(taskname, self.serverurl, uri, self.proxyfilename, self.logger.debug)
        if not webdir:
            dictresult, status, reason = server.get(uri, data=inputlist)
            webdir = dictresult['result'][0]
            self.logger.info('Server result: %s' % webdir)
            if status != 200:
                msg = "Problem retrieving information from the server:\ninput:%s\noutput:%s\nreason:%s" % (str(inputlist), str(dictresult), str(reason))
                raise RESTCommunicationException(msg)
        self.setDestination()
        self.logger.info("Setting the destination to %s " % self.dest)
        failed, success = self.retrieveShortLogs(webdir, self.proxyfilename)
        if failed:
            msg = "%sError%s: Failed to retrieve the following files: %s" % (colors.RED, colors.NORMAL, failed)
            self.logger.info(msg)
        else:
            self.logger.info("%sSuccess%s: All files successfully retrieved." % (colors.GREEN, colors.NORMAL))
        returndict = {'success': success, 'failed': failed}
    else:
        # Different from the old getlog code: set 'logs2' as subresource so that 'getcommand' uses the new logic.
        returndict = getcommand.__call__(self, subresource='logs2')
        if ('success' in returndict and not returndict['success']) or \
           ('failed' in returndict and returndict['failed']):
            msg = "You can use the --short option to retrieve a short version of the log files from the Grid scheduler."
            self.logger.info(msg)

    return returndict
def __call__(self):
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)

    msg = "Requesting resubmission for failed jobs in task %s" % (self.cachedinfo['RequestName'])
    self.logger.debug(msg)
    configreq = {'workflow': self.cachedinfo['RequestName']}
    for attr_name in ['jobids', 'sitewhitelist', 'siteblacklist', 'maxjobruntime', 'maxmemory', 'numcores', 'priority']:
        attr_value = getattr(self, attr_name)
        if attr_value:
            configreq[attr_name] = attr_value

    self.logger.info("Sending the request to the server")
    self.logger.debug("Submitting %s " % str(configreq))
    ## TODO: this shouldn't be hard-coded.
    listParams = ['jobids', 'sitewhitelist', 'siteblacklist']
    configreq_encoded = self._encodeRequest(configreq, listParams)
    self.logger.debug("Encoded resubmit request: %s" % (configreq_encoded))

    dictresult, status, reason = server.post(self.uri, data=configreq_encoded)
    self.logger.debug("Result: %s" % (dictresult))
    if status != 200:
        msg = "Problem resubmitting the task to the server:\ninput:%s\noutput:%s\nreason:%s" \
              % (str(configreq_encoded), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)

    msg = "Resubmit request successfully sent to the CRAB3 server."
    if dictresult['result'][0]['result'] != 'ok':
        msg += "\nServer responded with: '%s'" % (dictresult['result'][0]['result'])
        returndict = {'status': 'FAILED'}
    else:
        returndict = {'status': 'SUCCESS'}
    self.logger.info(msg)

    return returndict
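# The resubmit parameters include list-valued fields ('jobids', 'sitewhitelist',
# 'siteblacklist'), which need special handling before being POSTed; that is what
# self._encodeRequest does. As a generic illustration only (this is not the actual
# _encodeRequest implementation), repeating the key once per element is a common
# way to encode such lists:
#
#   from urllib import urlencode  # Python 2, as used elsewhere in this code
#   urlencode([('siteblacklist', s) for s in ['T2_XX_Abc', 'T2_YY_Def']])
#   # -> 'siteblacklist=T2_XX_Abc&siteblacklist=T2_YY_Def'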
def __call__(self, **argv):  # pylint: disable=arguments-differ
    ## Retrieve the transferLogs parameter from the task database.
    taskdbparam, configparam = '', ''
    if argv.get('subresource') in ['logs', 'logs2']:
        taskdbparam = 'tm_save_logs'
        configparam = "General.transferLogs"
    elif argv.get('subresource') in ['data', 'data2']:
        taskdbparam = 'tm_transfer_outputs'
        configparam = "General.transferOutputs"

    transferFlag = 'unknown'
    inputlist = {'subresource': 'search', 'workflow': self.cachedinfo['RequestName']}
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    uri = getUrl(self.instance, resource='task')
    dictresult, status, _ = server.get(uri, data=inputlist)
    self.logger.debug('Server result: %s' % dictresult)
    splitting = None
    if status == 200:
        if 'desc' in dictresult and 'columns' in dictresult['desc']:
            position = dictresult['desc']['columns'].index(taskdbparam)
            transferFlag = dictresult['result'][position]  # = 'T' or 'F'
            position = dictresult['desc']['columns'].index('tm_split_algo')
            splitting = dictresult['result'][position]
        else:
            self.logger.debug("Unable to locate %s in server result." % (taskdbparam))
    ## If transferFlag = False, there is nothing to retrieve.
    if transferFlag == 'F':
        msg = "No files to retrieve. Files not transferred to storage since task configuration parameter %s is False." % (configparam)
        self.logger.info(msg)
        return {'success': {}, 'failed': {}}

    ## Retrieve tm_edm_outfiles, tm_tfile_outfiles and tm_outfiles from the task database and check if they are empty.
    if argv.get('subresource') in ['data', 'data2'] and status == 200:
        if 'desc' in dictresult and 'columns' in dictresult['desc']:
            position = dictresult['desc']['columns'].index('tm_edm_outfiles')
            tm_edm_outfiles = dictresult['result'][position]
            position = dictresult['desc']['columns'].index('tm_tfile_outfiles')
            tm_tfile_outfiles = dictresult['result'][position]
            position = dictresult['desc']['columns'].index('tm_outfiles')
            tm_outfiles = dictresult['result'][position]
            if tm_edm_outfiles == '[]' and tm_tfile_outfiles == '[]' and tm_outfiles == '[]':
                msg = "%sWarning%s:" % (colors.RED, colors.NORMAL)
                msg += " There are no output files to retrieve, because CRAB could not detect any in the CMSSW configuration"
                msg += " nor was any explicitly specified in the CRAB configuration."
                self.logger.warning(msg)

    # Check the format of jobids.
    if getattr(self.options, 'jobids', None):
        self.options.jobids = validateJobids(self.options.jobids, splitting != 'Automatic')

    self.processAndStoreJobIds()

    # Retrieve the output file locations from the server.
    self.logger.debug('Retrieving locations for task %s' % self.cachedinfo['RequestName'])
    inputlist = [('workflow', self.cachedinfo['RequestName'])]
    inputlist.extend(list(argv.iteritems()))
    if getattr(self.options, 'quantity', None):
        self.logger.debug('Retrieving %s file locations' % self.options.quantity)
        inputlist.append(('limit', self.options.quantity))
    else:
        self.logger.debug('Retrieving all file locations')
        inputlist.append(('limit', -1))
    if getattr(self.options, 'jobids', None):
        self.logger.debug('Retrieving jobs %s' % self.options.jobids)
        inputlist.extend(self.options.jobids)
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    dictresult, status, reason = server.get(self.uri, data=urllib.urlencode(inputlist))
    self.logger.debug('Server result: %s' % dictresult)

    if status != 200:
        msg = "Problem retrieving information from the server:\ninput:%s\noutput:%s\nreason:%s" % (str(inputlist), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)

    totalfiles = len(dictresult['result'])
    fileInfoList = dictresult['result']
    self.insertXrootPfns(fileInfoList)

    if len(fileInfoList) > 0:
        if self.options.dump or self.options.xroot:
            self.logger.debug("Getting url info")
        else:
            self.setDestination()
            self.logger.info("Setting the destination to %s " % self.dest)
        if self.options.xroot:
            self.logger.debug("XRootD urls are requested")
            xrootlfn = ["root://cms-xrd-global.cern.ch/%s" % link['lfn'] for link in fileInfoList]
            self.logger.info("\n".join(xrootlfn))
            returndict = {'xrootd': xrootlfn}
        elif self.options.dump:
            jobid_pfn_lfn_list = sorted(map(lambda x: (x['jobid'], x['pfn'], x['lfn']), fileInfoList))  # pylint: disable=deprecated-lambda
            lastjobid = -1
            filecounter = 1
            msg = ""
            for jobid, pfn, lfn in jobid_pfn_lfn_list:
                if jobid != lastjobid:
                    msg += "%s=== Files from job %s:" % ('\n' if lastjobid != -1 else '', jobid)
                    lastjobid = jobid
                    filecounter = 1
                msg += "\n%d) PFN: %s" % (filecounter, pfn)
                msg += "\n%s LFN: %s" % (' ' * (len(str(filecounter))), lfn)
                filecounter += 1
            self.logger.info(msg)
            returndict = {'pfn': [pfn for _, pfn, _ in jobid_pfn_lfn_list], 'lfn': [lfn for _, _, lfn in jobid_pfn_lfn_list]}
        else:
            self.logger.info("Retrieving %s files" % (totalfiles))
            arglist = ['--destination', self.dest, '--input', fileInfoList, '--dir', self.options.projdir,
                       '--proxy', self.proxyfilename, '--parallel', self.options.nparallel, '--wait', self.options.waittime,
                       '--checksum', self.checksum, '--command', self.command]
            copyoutput = remote_copy(self.logger, arglist)
            successdict, faileddict = copyoutput()
            # Need to use deepcopy because successdict and faileddict live under a manager dict and are accessed by multiple threads.
            returndict = {'success': copy.deepcopy(successdict), 'failed': copy.deepcopy(faileddict)}

    if totalfiles == 0:
        self.logger.info("No files to retrieve.")
        returndict = {'success': {}, 'failed': {}}

    if transferFlag == 'unknown':
        if ('success' in returndict and not returndict['success']) and \
           ('failed' in returndict and not returndict['failed']):
            msg = "This is normal behavior if %s = False in the task configuration." % (configparam)
            self.logger.info(msg)

    return returndict
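# For --xroot the command simply prefixes each returned LFN with the global
# redirector, exactly as done above. Illustrative example with a hypothetical LFN:
#
#   lfn = '/store/user/someuser/outputs/output_1.root'
#   "root://cms-xrd-global.cern.ch/%s" % lfn
#   # -> 'root://cms-xrd-global.cern.ch//store/user/someuser/outputs/output_1.root'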
def __call__(self, **argv):
    # Retrieve the output file locations from the server.
    self.logger.debug('Retrieving locations for task %s' % self.cachedinfo['RequestName'])
    inputlist = [('workflow', self.cachedinfo['RequestName'])]
    inputlist.extend(list(argv.iteritems()))
    if getattr(self.options, 'quantity', None):
        self.logger.debug('Retrieving %s file locations' % self.options.quantity)
        inputlist.append(('limit', self.options.quantity))
    else:
        self.logger.debug('Retrieving all file locations')
        inputlist.append(('limit', -1))
    if getattr(self.options, 'jobids', None):
        self.logger.debug('Retrieving jobs %s' % self.options.jobids)
        inputlist.extend(self.options.jobids)
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    dictresult, status, reason = server.get(self.uri, data=inputlist)
    self.logger.debug('Server result: %s' % dictresult)
    dictresult = self.processServerResult(dictresult)

    if status != 200:
        msg = "Problem retrieving information from the server:\ninput:%s\noutput:%s\nreason:%s" % (str(inputlist), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)

    totalfiles = len(dictresult['result'])
    cpresults = []
    # for workflow in dictresult['result']: TODO re-enable this when we will have resubmissions
    workflow = dictresult['result']  # TODO assigning workflow to dictresult; for the moment we have only one workflow
    if len(workflow) > 0:
        if self.options.dump or self.options.xroot:
            self.logger.debug("Getting url info")
        else:
            self.setDestination()
            self.logger.info("Setting the destination to %s " % self.dest)
        if self.options.xroot:
            self.logger.debug("XRootD urls are requested")
            xrootlfn = ["root://cms-xrd-global.cern.ch/%s" % link['lfn'] for link in workflow]
            self.logger.info("\n".join(xrootlfn))
            returndict = {'xrootd': xrootlfn}
        elif self.dump:
            jobid_pfn_lfn_list = map(lambda x: (x['jobid'], x['pfn'], x['lfn']), workflow)
            jobid_pfn_lfn_list.sort()
            lastjobid = -1
            filecounter = 1
            msg = ""
            for jobid, pfn, lfn in jobid_pfn_lfn_list:
                if jobid != lastjobid:
                    msg += "%s=== Files from job %s:" % ('\n' if lastjobid != -1 else '', jobid)
                    lastjobid = jobid
                    filecounter = 1
                msg += "\n%d) PFN: %s" % (filecounter, pfn)
                msg += "\n%s LFN: %s" % (' ' * (len(str(filecounter))), lfn)
                filecounter += 1
            self.logger.info(msg)
            returndict = {'pfn': [pfn for _, pfn, _ in jobid_pfn_lfn_list], 'lfn': [lfn for _, _, lfn in jobid_pfn_lfn_list]}
        else:
            self.logger.info("Retrieving %s files" % (totalfiles))
            arglist = ['--destination', self.dest, '--input', workflow, '--dir', self.options.task,
                       '--proxy', self.proxyfilename, '--parallel', self.options.nparallel, '--wait', self.options.waittime]
            copyoutput = remote_copy(self.logger, arglist)
            successdict, faileddict = copyoutput()
            # Need to use deepcopy because successdict and faileddict live under a manager dict and are accessed by multiple threads.
            returndict = {'success': copy.deepcopy(successdict), 'failed': copy.deepcopy(faileddict)}

    if totalfiles == 0:
        ## TODO: we should use an API to retrieve from the TaskDB what are the transfer flag values for the task.
        ## If the corresponding transfer flag is False, the user should not expect to be able to retrieve the files.
        self.logger.info("No files to retrieve.")
        returndict = {'success': {}, 'failed': {}}

    return returndict
def __call__(self): self.logger.debug("Started submission") serverFactory = CRABClient.Emulator.getEmulator('rest') uniquerequestname = None self.logger.debug("Working on %s" % str(self.requestarea)) self.configreq = {'dryrun': 1 if self.options.dryrun else 0} for param in parametersMapping['on-server']: mustbetype = getattr(types, parametersMapping['on-server'][param]['type']) default = parametersMapping['on-server'][param]['default'] config_params = parametersMapping['on-server'][param]['config'] for config_param in config_params: attrs = config_param.split('.') temp = self.configuration for attr in attrs: temp = getattr(temp, attr, None) if temp is None: break if temp is not None: self.configreq[param] = temp break elif default is not None: self.configreq[param] = default temp = default else: ## Parameter not strictly required. pass ## Check that the requestname is of the right type. ## This is not checked in SubCommand.validateConfig(). if param == 'workflow': if isinstance(self.requestname, mustbetype): self.configreq['workflow'] = self.requestname ## Translate boolean flags into integers. elif param in ['savelogsflag', 'publication', 'publishgroupname', 'nonprodsw', 'useparent',\ 'ignorelocality', 'saveoutput', 'oneEventMode', 'nonvaliddata', 'ignoreglobalblacklist']: self.configreq[param] = 1 if temp else 0 ## Translate DBS URL aliases into DBS URLs. elif param in ['dbsurl', 'publishdbsurl']: if param == 'dbsurl': dbstype = 'reader' elif param == 'publishdbsurl': dbstype = 'writer' allowed_dbsurls = DBSURLS[dbstype].values() allowed_dbsurls_aliases = DBSURLS[dbstype].keys() if self.configreq[param] in allowed_dbsurls_aliases: self.configreq[param] = DBSURLS[dbstype][ self.configreq[param]] elif self.configreq[param].rstrip('/') in allowed_dbsurls: self.configreq[param] = self.configreq[param].rstrip('/') elif param == 'scriptexe' and 'scriptexe' in self.configreq: self.configreq[param] = os.path.basename(self.configreq[param]) jobconfig = {} #get the backend URLs from the server external configuration serverBackendURLs = server_info('backendurls', self.serverurl, self.proxyfilename, getUrl(self.instance, resource='info')) #if cacheSSL is specified in the server external configuration we will use it to upload the sandbox filecacheurl = serverBackendURLs[ 'cacheSSL'] if 'cacheSSL' in serverBackendURLs else None pluginParams = [ self.configuration, self.proxyfilename, self.logger, os.path.join(self.requestarea, 'inputs') ] crab_job_types = getJobTypes() if upper(self.configreq['jobtype']) in crab_job_types: plugjobtype = crab_job_types[upper( self.configreq['jobtype'])](*pluginParams) dummy_inputfiles, jobconfig = plugjobtype.run(filecacheurl) else: fullname = self.configreq['jobtype'] basename = os.path.basename(fullname).split('.')[0] plugin = addPlugin(fullname)[basename] pluginInst = plugin(*pluginParams) dummy_inputfiles, jobconfig = pluginInst.run() if self.configreq['publication']: non_edm_files = jobconfig['tfileoutfiles'] + jobconfig[ 'addoutputfiles'] if non_edm_files: msg = "%sWarning%s: The following output files will not be published, as they are not EDM files: %s" % ( colors.RED, colors.NORMAL, non_edm_files) self.logger.warning(msg) self.configreq.update(jobconfig) server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__) self.logger.info("Sending the request to the server at %s" % self.serverurl) self.logger.debug("Submitting %s " % str(self.configreq)) ## TODO: this shouldn't be hard-coded. 
listParams = ['addoutputfiles', 'sitewhitelist', 'siteblacklist', 'blockwhitelist', 'blockblacklist', \ 'tfileoutfiles', 'edmoutfiles', 'runs', 'lumis', 'userfiles', 'scriptargs', 'extrajdl'] self.configreq_encoded = self._encodeRequest(self.configreq, listParams) self.logger.debug('Encoded submit request: %s' % (self.configreq_encoded)) dictresult, status, reason = server.put(self.uri, data=self.configreq_encoded) self.logger.debug("Result: %s" % dictresult) if status != 200: msg = "Problem sending the request:\ninput:%s\noutput:%s\nreason:%s" % ( str(self.configreq), str(dictresult), str(reason)) raise RESTCommunicationException(msg) elif 'result' in dictresult: uniquerequestname = dictresult["result"][0]["RequestName"] else: msg = "Problem during submission, no request ID returned:\ninput:%s\noutput:%s\nreason:%s" \ % (str(self.configreq), str(dictresult), str(reason)) raise RESTCommunicationException(msg) tmpsplit = self.serverurl.split(':') createCache(self.requestarea, tmpsplit[0], tmpsplit[1] if len(tmpsplit) > 1 else '', uniquerequestname, voRole=self.voRole, voGroup=self.voGroup, instance=self.instance, originalConfig=self.configuration) self.logger.info( "%sSuccess%s: Your task has been delivered to the %s CRAB3 server." % (colors.GREEN, colors.NORMAL, self.instance)) if not (self.options.wait or self.options.dryrun): self.logger.info("Task name: %s" % uniquerequestname) projDir = os.path.join( getattr(self.configuration.General, 'workArea', '.'), self.requestname) self.logger.info("Project dir: %s" % projDir) self.logger.info( "Please use 'crab status -d %s' to check how the submission process proceeds.", projDir) else: targetTaskStatus = 'UPLOADED' if self.options.dryrun else 'SUBMITTED' checkStatusLoop(self.logger, server, self.uri, uniquerequestname, targetTaskStatus, self.name) if self.options.dryrun: self.printDryRunResults(*self.executeTestRun(filecacheurl)) self.logger.debug("About to return") return { 'requestname': self.requestname, 'uniquerequestname': uniquerequestname }
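# The parameter loop above resolves dotted configuration paths (the
# 'General.transferLogs'-style names coming from parametersMapping) with a getattr
# walk. A minimal sketch of that lookup, assuming a hypothetical 'config' object:
#
#   temp = config
#   for attr in 'General.workArea'.split('.'):
#       temp = getattr(temp, attr, None)
#       if temp is None:
#           break
#   # temp now holds the configured value, or None if the parameter is not set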
def __call__(self):
    self.logger.info('Getting the tarball hash key')

    inputlist = {'subresource': 'search', 'workflow': self.cachedinfo['RequestName']}
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    uri = self.getUrl(self.instance, resource='task')
    dictresult, status, reason = server.get(uri, data=inputlist)
    if status == 200:
        if 'desc' in dictresult and 'columns' in dictresult['desc']:
            position = dictresult['desc']['columns'].index('tm_user_sandbox')
            tm_user_sandbox = dictresult['result'][position]
            hashkey = tm_user_sandbox.replace(".tar.gz", "")
        else:
            self.logger.info('%sError%s: Could not find tarball or there is more than one tarball' % (colors.RED, colors.NORMAL))
            raise ConfigurationException

    # Checking task status
    self.logger.info('Checking task status')
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    dictresult, status, reason = server.get(self.uri, data={'workflow': self.cachedinfo['RequestName'], 'verbose': 0})

    dictresult = dictresult['result'][0]  # take just the significant part
    if status != 200:
        msg = "Problem retrieving task status:\ninput: %s\noutput: %s\nreason: %s" % (str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)
    self.logger.info('Task status: %s' % dictresult['status'])
    accepstate = ['KILLED', 'FINISHED', 'FAILED', 'KILLFAILED', 'COMPLETED']
    if dictresult['status'] not in accepstate:
        msg = ('%sError%s: Only tasks with these status can be purged: {0}'.format(accepstate) % (colors.RED, colors.NORMAL))
        raise ConfigurationException(msg)

    # Getting the cache url
    cacheresult = {}
    scheddresult = {}
    gsisshdict = {}
    if not self.options.scheddonly:
        baseurl = getUrl(self.instance, resource='info')
        cacheurl = server_info('backendurls', self.serverurl, self.proxyfilename, baseurl)
        cacheurl = cacheurl['cacheSSL']
        cacheurldict = {'endpoint': cacheurl, 'pycurl': True}

        ufc = UserFileCache(cacheurldict)
        self.logger.info('Tarball hashkey: %s' % hashkey)
        self.logger.info('Attempting to remove task file from crab server cache')

        try:
            ufcresult = ufc.removeFile(hashkey)
        except HTTPException as re:
            if 'X-Error-Info' in re.headers and 'Not such file' in re.headers['X-Error-Info']:
                self.logger.info('%sError%s: Failed to find task file in crab server cache; the file might have been already purged' % (colors.RED, colors.NORMAL))
            raise

        if ufcresult == '':
            self.logger.info('%sSuccess%s: Successfully removed task files from crab server cache' % (colors.GREEN, colors.NORMAL))
            cacheresult = 'SUCCESS'
        else:
            self.logger.info('%sError%s: Failed to remove task files from crab server cache' % (colors.RED, colors.NORMAL))
            cacheresult = 'FAILED'

    if not self.options.cacheonly:
        self.logger.info('Getting schedd address')
        baseurl = self.getUrl(self.instance, resource='info')
        try:
            scheddaddress = server_info('scheddaddress', self.serverurl, self.proxyfilename, baseurl,
                                        workflow=self.cachedinfo['RequestName'])
        except HTTPException as he:
            self.logger.info('%sError%s: Failed to get schedd address' % (colors.RED, colors.NORMAL))
            raise HTTPException(he)
        self.logger.debug('%sSuccess%s: Successfully got schedd address' % (colors.GREEN, colors.NORMAL))
        self.logger.debug('Schedd address: %s' % scheddaddress)
        self.logger.info('Attempting to remove task from schedd')

        gssishrm = 'gsissh -o ConnectTimeout=60 -o PasswordAuthentication=no ' + scheddaddress + ' rm -rf ' + self.cachedinfo['RequestName']
        self.logger.debug('gsissh command: %s' % gssishrm)

        delprocess = subprocess.Popen(gssishrm, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        stdout, stderr = delprocess.communicate()
        exitcode = delprocess.returncode

        if exitcode == 0:
            self.logger.info('%sSuccess%s: Successfully removed task from schedd' % (colors.GREEN, colors.NORMAL))
            scheddresult = 'SUCCESS'
            gsisshdict = {}
        else:
            self.logger.info('%sError%s: Failed to remove task from schedd' % (colors.RED, colors.NORMAL))
            scheddresult = 'FAILED'
            self.logger.debug('gsissh stdout: %s\ngsissh stderr: %s\ngsissh exitcode: %s' % (stdout, stderr, exitcode))
            gsisshdict = {'stdout': stdout, 'stderr': stderr, 'exitcode': exitcode}

    return {'cacheresult': cacheresult, 'scheddresult': scheddresult, 'gsiresult': gsisshdict}
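# The gsissh command assembled above ends up as a single shell string of the form
# below (placeholders shown only to illustrate the concatenation; the actual host
# and task name come from server_info and cachedinfo):
#
#   gsissh -o ConnectTimeout=60 -o PasswordAuthentication=no <schedd address> rm -rf <RequestName>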
def checkStatusLoop(self, server, uniquerequestname):
    self.logger.info("Waiting for task to be processed")

    maxwaittime = 900  # in seconds; changed to a 15 minute max wait time, the original 1 hour is too long
    starttime = currenttime = time.time()
    endtime = currenttime + maxwaittime

    startimestring = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(starttime))
    endtimestring = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(endtime))

    self.logger.debug("Start time:%s" % startimestring)
    self.logger.debug("Max wait time: %s s until : %s" % (maxwaittime, endtimestring))

    #self.logger.debug('Looking up detailed status of task %s' % uniquerequestname)

    continuecheck = True
    tmpresult = None
    self.logger.info("Checking task status")

    while continuecheck:
        currenttime = time.time()
        querytimestring = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(currenttime))

        self.logger.debug('Looking up detailed status of task %s' % uniquerequestname)

        dictresult, status, reason = server.get(self.uri, data={'workflow': uniquerequestname})
        dictresult = dictresult['result'][0]

        if status != 200:
            self.logger.info("The task has been submitted, \nImpossible to check task status now. \nPlease check again later by using: crab status -d <crab project directory>")
            msg = "Problem retrieving status:\ninput:%s\noutput:%s\nreason:%s" % (str(uniquerequestname), str(dictresult), str(reason))
            raise RESTCommunicationException(msg)

        self.logger.debug("Query Time:%s Task status:%s" % (querytimestring, dictresult['status']))

        if dictresult['status'] != tmpresult:
            self.logger.info("Task status:%s" % dictresult['status'])
            tmpresult = dictresult['status']
            if dictresult['status'] == 'FAILED':
                continuecheck = False
                self.logger.info("%sError%s: The submission of your task failed. Please use 'crab status -d <crab project directory>' to get the error message" % (colors.RED, colors.NORMAL))
            elif dictresult['status'] == 'SUBMITTED' or dictresult['status'] == 'UNKNOWN':  # until the node_state file is available the status is unknown
                continuecheck = False
                self.logger.info("%sSuccess%s: Your task has been processed and your jobs have been submitted successfully" % (colors.GREEN, colors.NORMAL))
        elif dictresult['status'] in ['NEW', 'HOLDING', 'QUEUED']:
            self.logger.info("Please wait...")
            time.sleep(30)  # the original 60 second query time is too long
        else:
            continuecheck = False
            self.logger.info("Please check crab.log")
            self.logger.debug("CRABS Status other than FAILED,SUBMITTED,NEW,HOLDING,QUEUED")

        if currenttime > endtime:
            continuecheck = False
            self.logger.info("Exceeded maximum query time \n Please check again later by using: crab status -d <crab project directory>")
            waittime = currenttime - starttime
            self.logger.debug("Wait time:%s" % waittime)
            break

    print('\a')  # Generate audio bell
    self.logger.debug("Ended submission process")
def __call__(self):
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)

    if self.jobids:
        msg = "Requesting resubmission of jobs %s in task %s" % (self.jobids, self.cachedinfo['RequestName'])
    else:
        msg = "Requesting resubmission of failed jobs in task %s" % (self.cachedinfo['RequestName'])
    self.logger.debug(msg)

    configreq = {'workflow': self.cachedinfo['RequestName'], 'subresource': 'resubmit'}
    for attr_name in ['jobids', 'sitewhitelist', 'siteblacklist']:
        attr_value = getattr(self, attr_name)
        ## For 'jobids', 'sitewhitelist' and 'siteblacklist', attr_value is either a list of strings or None.
        if attr_value is not None:
            configreq[attr_name] = attr_value
    for attr_name in ['maxjobruntime', 'maxmemory', 'numcores', 'priority']:
        attr_value = getattr(self.options, attr_name)
        ## For 'maxjobruntime', 'maxmemory', 'numcores', and 'priority', attr_value is either an integer or None.
        if attr_value is not None:
            configreq[attr_name] = attr_value
    configreq['force'] = 1 if self.options.force else 0
    configreq['publication'] = 1 if self.options.publication else 0

    self.logger.info("Sending resubmit request to the server.")
    self.logger.debug("Submitting %s " % str(configreq))
    configreq_encoded = self._encodeRequest(configreq)
    self.logger.debug("Encoded resubmit request: %s" % (configreq_encoded))

    dictresult, status, reason = server.post(self.uri, data=configreq_encoded)
    self.logger.debug("Result: %s" % (dictresult))
    if status != 200:
        msg = "Problem resubmitting the task to the server:\ninput:%s\noutput:%s\nreason:%s" \
              % (str(configreq_encoded), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)
    self.logger.info("Resubmit request sent to the server.")
    if dictresult['result'][0]['result'] != 'ok':
        msg = "Server responded with: '%s'" % (dictresult['result'][0]['result'])
        self.logger.info(msg)
        returndict = {'status': 'FAILED'}
    else:
        if not self.options.wait:
            msg = "Please use 'crab status' to check how the resubmission process proceeds."
            msg += "\nNotice it may take a couple of minutes for the resubmission to get fully processed."
            self.logger.info(msg)
        else:
            targetTaskStatus = 'SUBMITTED'
            checkStatusLoop(self.logger, server, self.uri, self.cachedinfo['RequestName'], targetTaskStatus, self.name)
        returndict = {'status': 'SUCCESS'}

    return returndict
def __call__(self):
    self.logger.info('Getting the tarball hash key')

    tarballdir = glob.glob(self.requestarea + '/inputs/*.tgz')
    if len(tarballdir) != 1:
        self.logger.info('%sError%s: Could not find tarball or there is more than one tarball' % (colors.RED, colors.NORMAL))
        raise ConfigurationException
    tarballdir = tarballdir[0]

    # Checking task status
    self.logger.info('Checking task status')
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)
    dictresult, status, reason = server.get(self.uri, data={'workflow': self.cachedinfo['RequestName'], 'verbose': 0})

    dictresult = dictresult['result'][0]  # take just the significant part
    if status != 200:
        msg = "Problem retrieving task status:\ninput: %s\noutput: %s\nreason: %s" % (str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)
    self.logger.info('Task status: %s' % dictresult['status'])
    accepstate = ['KILLED', 'FINISHED', 'FAILED', 'KILLFAILED', 'COMPLETED']
    if dictresult['status'] not in accepstate:
        msg = ('%sError%s: Only tasks with these status can be purged: {0}'.format(accepstate) % (colors.RED, colors.NORMAL))
        raise ConfigurationException(msg)

    # Getting the cache url
    cacheresult = {}
    scheddresult = {}
    gsisshdict = {}
    if not self.options.scheddonly:
        baseurl = getUrl(self.instance, resource='info')
        cacheurl = server_info('backendurls', self.serverurl, self.proxyfilename, baseurl)
        cacheurl = cacheurl['cacheSSL']
        cacheurldict = {'endpoint': cacheurl, 'pycurl': True}

        ufc = UserFileCache(cacheurldict)
        hashkey = ufc.checksum(tarballdir)
        self.logger.info('Tarball hashkey: %s' % hashkey)
        self.logger.info('Attempting to remove task file from crab server cache')

        try:
            ufcresult = ufc.removeFile(hashkey)
        except HTTPException as re:
            if 'X-Error-Info' in re.headers and 'Not such file' in re.headers['X-Error-Info']:
                self.logger.info('%sError%s: Failed to find task file in crab server cache; the file might have been already purged' % (colors.RED, colors.NORMAL))
            raise HTTPException(re)

        if ufcresult == '':
            self.logger.info('%sSuccess%s: Successfully removed task files from crab server cache' % (colors.GREEN, colors.NORMAL))
            cacheresult = 'SUCCESS'
        else:
            self.logger.info('%sError%s: Failed to remove task files from crab server cache' % (colors.RED, colors.NORMAL))
            cacheresult = 'FAILED'
def __call__(self):
    serverFactory = CRABClient.Emulator.getEmulator('rest')
    server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__)

    self.logger.debug('Looking up report for task %s' % self.cachedinfo['RequestName'])
    dictresult, status, reason = server.get(self.uri, data={'workflow': self.cachedinfo['RequestName'], 'subresource': 'report'})

    self.logger.debug("Result: %s" % dictresult)

    if status != 200:
        msg = "Problem retrieving report:\ninput:%s\noutput:%s\nreason:%s" % (str(self.cachedinfo['RequestName']), str(dictresult), str(reason))
        raise RESTCommunicationException(msg)

    returndict = {}

    publication = dictresult['result'][0]['publication']

    if self.options.recovery == 'notPublished' and not publication:
        msg = "%sError%s:" % (colors.RED, colors.NORMAL)
        msg += " The option --recovery=%s has been specified" % (self.options.recovery)
        msg += " (which instructs to determine the not processed lumis based on published datasets),"
        msg += " but publication has been disabled in the CRAB configuration."
        raise ConfigurationException(msg)

    onlyDBSSummary = False
    if not dictresult['result'][0]['lumisToProcess'] or not dictresult['result'][0]['runsAndLumis']:
        msg = "%sError%s:" % (colors.RED, colors.NORMAL)
        msg += " Cannot get all the needed information for the report."
        msg += " Notice, if your task has been submitted more than 30 days ago, then everything has been cleaned."
        self.logger.info(msg)
        if not publication:
            return returndict
        onlyDBSSummary = True

    def _getNumFiles(jobs, fileType):
        files = set()
        for dummy_jobid, reports in jobs.iteritems():
            for rep in reports:
                if rep['type'] == fileType:
                    # The split is done to remove the job number at the end of the input file LFN.
                    files.add('_'.join(rep['lfn'].split('_')[:-1]))
        return len(files)

    def _getNumEvents(jobs, fileType):
        numEvents = 0
        for dummy_jobid, reports in jobs.iteritems():
            for rep in reports:
                if rep['type'] == fileType:
                    numEvents += rep['events']
        return numEvents

    ## Extract the reports of the input files.
    poolInOnlyRes = {}
    for jobid, reports in dictresult['result'][0]['runsAndLumis'].iteritems():
        poolInOnlyRes[jobid] = [rep for rep in reports if rep['type'] == 'POOLIN']

    ## Calculate how many input files have been processed.
    numFilesProcessed = _getNumFiles(dictresult['result'][0]['runsAndLumis'], 'POOLIN')
    returndict['numFilesProcessed'] = numFilesProcessed

    ## Calculate how many events have been read.
    numEventsRead = _getNumEvents(dictresult['result'][0]['runsAndLumis'], 'POOLIN')
    returndict['numEventsRead'] = numEventsRead

    ## Calculate how many events have been written.
    numEventsWritten = {}
    for filetype in ['EDM', 'TFile', 'FAKE']:
        numEventsWritten[filetype] = _getNumEvents(dictresult['result'][0]['runsAndLumis'], filetype)
    returndict['numEventsWritten'] = numEventsWritten

    ## Get the lumis in the input dataset.
    inputDatasetLumis = dictresult['result'][0]['inputDataset']['lumis']
    returndict['inputDatasetLumis'] = inputDatasetLumis

    ## Get the lumis split across files in the input dataset.
    inputDatasetDuplicateLumis = dictresult['result'][0]['inputDataset']['duplicateLumis']
    returndict['inputDatasetDuplicateLumis'] = inputDatasetDuplicateLumis

    ## Get the lumis that the jobs had to process. This must be a subset of input
    ## dataset lumis & lumi-mask.
    lumisToProcessPerJob = dictresult['result'][0]['lumisToProcess']
    lumisToProcess = {}
    for jobid in lumisToProcessPerJob.keys():
        for run, lumiRanges in lumisToProcessPerJob[jobid].iteritems():
            if run not in lumisToProcess:
                lumisToProcess[run] = []
            for lumiRange in lumiRanges:
                lumisToProcess[run].extend(range(lumiRange[0], lumiRange[1] + 1))
    lumisToProcess = LumiList(runsAndLumis=lumisToProcess).getCompactList()
    returndict['lumisToProcess'] = lumisToProcess

    ## Get the lumis that have been processed.
    processedLumis = BasicJobType.mergeLumis(poolInOnlyRes)
    returndict['processedLumis'] = processedLumis

    ## Get the run-lumi and number of events information about the output datasets.
    outputDatasetsInfo = dictresult['result'][0]['outputDatasets']
    outputDatasetsLumis = {}
    outputDatasetsNumEvents = {}
    if publication:
        for dataset, info in outputDatasetsInfo.iteritems():
            if info['lumis']:
                outputDatasetsLumis[dataset] = info['lumis']
            outputDatasetsNumEvents[dataset] = info['numEvents']
    returndict['outputDatasetsLumis'] = outputDatasetsLumis
    returndict['outputDatasetsNumEvents'] = outputDatasetsNumEvents
    numOutputDatasets = len(outputDatasetsInfo)

    ## Get the duplicate runs-lumis in the output files. Use for this the run-lumi
    ## information of the input files. Why not use the output files directly?
    ## Because not all types of output files have run-lumi information in their
    ## filemetadata (note: the run-lumi information in the filemetadata is a copy
    ## of the corresponding information in the FJR). For example, output files
    ## produced by TFileService do not have run-lumi information in the FJR. On the
    ## other hand, input files always have run-lumi information in the FJR, which
    ## lists the runs-lumis in the input file that have been processed by the
    ## corresponding job. And of course, the run-lumi information of an output file
    ## produced by job X should be the (set made out of the) union of the run-lumi
    ## information of the input files to job X.
    outputFilesLumis = {}
    for jobid, reports in poolInOnlyRes.iteritems():
        lumiDict = {}
        for rep in reports:
            for run, lumis in literal_eval(rep['runlumi']).iteritems():
                lumiDict.setdefault(str(run), []).extend(map(int, lumis))
        for run, lumis in lumiDict.iteritems():
            outputFilesLumis.setdefault(run, []).extend(list(set(lumis)))
    outputFilesDuplicateLumis = BasicJobType.getDuplicateLumis(outputFilesLumis)
    returndict['outputFilesDuplicateLumis'] = outputFilesDuplicateLumis

    ## Calculate the not processed runs-lumis in one of three ways:
    ## 1) The lumis that were supposed to be processed by all jobs minus the lumis
    ##    that were processed by finished (but not necessarily published) jobs.
    ## 2) The lumis that were supposed to be processed by all jobs minus the lumis
    ##    published in all the output datasets.
    ## 3) The lumis that were supposed to be processed by jobs whose status is
    ##    'failed'.
    notProcessedLumis = {}
    notProcLumisCalcMethMsg = "The '%s' lumis were calculated as:" % (self.options.recovery)
    if self.options.recovery == 'notFinished':
        notProcessedLumis = BasicJobType.subtractLumis(lumisToProcess, processedLumis)
        notProcLumisCalcMethMsg += " the lumis to process minus the processed lumis."
    elif self.options.recovery == 'notPublished':
        publishedLumis = {}
        firstdataset = True
        for dataset in outputDatasetsLumis.keys():
            if firstdataset:
                publishedLumis = outputDatasetsLumis[dataset]
                firstdataset = False
            else:
                publishedLumis = BasicJobType.intersectLumis(publishedLumis, outputDatasetsLumis[dataset])
        notProcessedLumis = BasicJobType.subtractLumis(lumisToProcess, publishedLumis)
        notProcLumisCalcMethMsg += " the lumis to process"
        if numOutputDatasets > 1:
            notProcLumisCalcMethMsg += " minus the lumis published in all the output datasets."
        else:
            notProcLumisCalcMethMsg += " minus the lumis published in the output dataset."
    elif self.options.recovery == 'failed':
        for jobid, status in dictresult['result'][0]['statusPerJob'].iteritems():
            if status in ['failed']:
                for run, lumiRanges in lumisToProcessPerJob[jobid].iteritems():
                    if run not in notProcessedLumis:
                        notProcessedLumis[run] = []
                    for lumiRange in lumiRanges:
                        notProcessedLumis[run].extend(range(lumiRange[0], lumiRange[1] + 1))
        notProcessedLumis = LumiList(runsAndLumis=notProcessedLumis).getCompactList()
        notProcLumisCalcMethMsg += " the lumis to process by jobs in status 'failed'."
    returndict['notProcessedLumis'] = notProcessedLumis

    ## Create the output directory if it doesn't exist.
    if self.options.outdir:
        jsonFileDir = self.options.outdir
    else:
        jsonFileDir = os.path.join(self.requestarea, 'results')
    self.logger.info("Will save lumi files into output directory %s" % (jsonFileDir))
    if not os.path.exists(jsonFileDir):
        self.logger.debug("Creating directory %s" % (jsonFileDir))
        os.makedirs(jsonFileDir)

    ## Create the report JSON files and print a report summary:
    ## 1) First the summary that depends solely on successfully finished jobs (and
    ##    other general information about the task, but not on failed/running jobs).
    if not onlyDBSSummary:
        self.logger.info("Summary from jobs in status 'finished':")
        msg = "  Number of files processed: %d" % (numFilesProcessed)
        msg += "\n  Number of events read: %d" % (numEventsRead)
        msg += "\n  Number of events written in EDM files: %d" % (numEventsWritten.get('EDM', 0))
        msg += "\n  Number of events written in TFileService files: %d" % (numEventsWritten.get('TFile', 0))
        msg += "\n  Number of events written in other type of files: %d" % (numEventsWritten.get('FAKE', 0))
        self.logger.info(msg)
        if processedLumis:
            with open(os.path.join(jsonFileDir, 'processedLumis.json'), 'w') as jsonFile:
                json.dump(processedLumis, jsonFile)
                jsonFile.write("\n")
                self.logger.info("  Processed lumis written to processedLumis.json")
        if notProcessedLumis:
            filename = self.options.recovery + "Lumis.json"
            with open(os.path.join(jsonFileDir, filename), 'w') as jsonFile:
                json.dump(notProcessedLumis, jsonFile)
                jsonFile.write("\n")
                self.logger.info("  %sWarning%s: '%s' lumis written to %s" % (colors.RED, colors.NORMAL, self.options.recovery, filename))
            self.logger.info("  %s" % (notProcLumisCalcMethMsg))
        if outputFilesDuplicateLumis:
            with open(os.path.join(jsonFileDir, 'outputFilesDuplicateLumis.json'), 'w') as jsonFile:
                json.dump(outputFilesDuplicateLumis, jsonFile)
                jsonFile.write("\n")
                self.logger.info("  %sWarning%s: Duplicate lumis in output files written to outputFilesDuplicateLumis.json" % (colors.RED, colors.NORMAL))

    ## 2) Then the summary about output datasets in DBS. For this, publication must
    ##    be True and the output files must be publishable.
    if publication and outputDatasetsInfo:
        if onlyDBSSummary:
            self.logger.info("Will provide a short report with information found in DBS.")
        self.logger.info("Summary from output datasets in DBS:")
        if outputDatasetsNumEvents:
            msg = "  Number of events:"
            for dataset, numEvents in outputDatasetsNumEvents.iteritems():
                msg += "\n    %s: %d" % (dataset, numEvents)
            self.logger.info(msg)
        if outputDatasetsLumis:
            with open(os.path.join(jsonFileDir, 'outputDatasetsLumis.json'), 'w') as jsonFile:
                json.dump(outputDatasetsLumis, jsonFile)
                jsonFile.write("\n")
                self.logger.info("  Output datasets lumis written to outputDatasetsLumis.json")

    ## 3) Finally additional files that can be useful for debugging.
    if inputDatasetLumis or inputDatasetDuplicateLumis or lumisToProcess:
        self.logger.info("Additional report lumi files:")
        if inputDatasetLumis:
            with open(os.path.join(jsonFileDir, 'inputDatasetLumis.json'), 'w') as jsonFile:
                json.dump(inputDatasetLumis, jsonFile)
                jsonFile.write("\n")
                self.logger.info("  Input dataset lumis (from DBS, at task submission time) written to inputDatasetLumis.json")
        if inputDatasetDuplicateLumis:
            with open(os.path.join(jsonFileDir, 'inputDatasetDuplicateLumis.json'), 'w') as jsonFile:
                json.dump(inputDatasetDuplicateLumis, jsonFile)
                jsonFile.write("\n")
                self.logger.info("  Input dataset duplicate lumis (from DBS, at task submission time) written to inputDatasetDuplicateLumis.json")
        if lumisToProcess:
            with open(os.path.join(jsonFileDir, 'lumisToProcess.json'), 'w') as jsonFile:
                json.dump(lumisToProcess, jsonFile)
                jsonFile.write("\n")
                self.logger.info("  Lumis to process written to lumisToProcess.json")

    return returndict
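# The lumi JSON files written above (processedLumis.json, lumisToProcess.json,
# the recovery file, ...) use the compact run/lumi-range format produced by
# LumiList.getCompactList(). Illustrative content with made-up run and lumi numbers:
#
#   {"274199": [[1, 10], [15, 20]], "274200": [[5, 8]]}
#
# i.e. a dict keyed by run number (as a string), with a list of inclusive
# [first_lumi, last_lumi] ranges per run.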
def checkStatusLoop(logger, server, uri, uniquerequestname, targetstatus, cmdname):
    logger.info("Waiting for task to be processed")

    maxwaittime = 900  # in seconds; changed to a 15 minute max wait time, the original 1 hour is too long
    starttime = currenttime = time.time()
    endtime = currenttime + maxwaittime

    startimestring = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(starttime))
    endtimestring = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(endtime))

    logger.debug("Start time:%s" % (startimestring))
    logger.debug("Max wait time: %s s until : %s" % (maxwaittime, endtimestring))

    #logger.debug('Looking up detailed status of task %s' % uniquerequestname)

    continuecheck = True
    tmpresult = None
    logger.info("Checking task status")

    while continuecheck:
        currenttime = time.time()
        querytimestring = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(currenttime))

        logger.debug("Looking up detailed status of task %s" % (uniquerequestname))

        dictresult, status, reason = server.get(uri, data={'workflow': uniquerequestname})
        dictresult = dictresult['result'][0]

        if status != 200:
            msg = "Error when trying to check the task status."
            msg += " Please check the task status later using 'crab status'."
            logger.error(msg)
            msg = "Problem retrieving status:\ninput:%s\noutput:%s\nreason:%s" % (str(uniquerequestname), str(dictresult), str(reason))
            raise RESTCommunicationException(msg)

        logger.debug("Query Time: %s Task status: %s" % (querytimestring, dictresult['status']))

        logger.info("Task status: %s" % (dictresult['status']))
        if dictresult['status'] != tmpresult:
            tmpresult = dictresult['status']
            if dictresult['status'] in ['SUBMITFAILED', 'RESUBMITFAILED']:
                continuecheck = False
                msg = "%sError%s:" % (colors.RED, colors.NORMAL)
                msg += " The %s of your task has failed." % ("resubmission" if cmdname == "resubmit" else "submission")
                logger.error(msg)
                if dictresult['taskFailureMsg']:
                    msg = "%sFailure message%s:" % (colors.RED, colors.NORMAL)
                    msg += "\t%s" % (dictresult['taskFailureMsg'].replace('\n', '\n\t\t\t'))
                    logger.error(msg)
            elif dictresult['status'] in ['SUBMITTED', 'UPLOADED', 'UNKNOWN']:  # until the node_state file is available status is unknown
                continuecheck = False
            else:
                logger.info("Please wait...")
                time.sleep(30)
        elif dictresult['status'] in ['NEW', 'HOLDING', 'QUEUED', 'RESUBMIT']:
            logger.info("Please wait...")
            time.sleep(30)
        else:
            continuecheck = False
            logger.info("Please check crab.log")
            logger.debug("Task status other than SUBMITFAILED, RESUBMITFAILED, SUBMITTED, UPLOADED, NEW, HOLDING, QUEUED, RESUBMIT")

        ## Break the loop if we were waiting already too much.
        if currenttime > endtime:
            continuecheck = False
            msg = "Maximum query time exceeded."
            msg += " Please check the status of the %s later using 'crab status'." % ("resubmission" if cmdname == "resubmit" else "submission")
            logger.info(msg)
            waittime = currenttime - starttime
            logger.debug("Wait time: %s" % (waittime))

    if targetstatus == 'SUBMITTED':
        if tmpresult == 'SUBMITTED':
            msg = "%sSuccess%s:" % (colors.GREEN, colors.NORMAL)
            msg += " Your task has been processed and your jobs have been %s successfully." % ("resubmitted" if cmdname == "resubmit" else "submitted")
            logger.info(msg)
        elif currenttime < endtime and tmpresult not in ['SUBMITFAILED', 'RESUBMITFAILED']:
            msg = "The CRAB3 server finished processing your task."
            msg += " Use 'crab status' to see if your jobs have been %s successfully." % ("resubmitted" if cmdname == "resubmit" else "submitted")
            logger.info(msg)

    print('\a')  # Generate audio bell
    logger.debug("Ended %s process." % ("resubmission" if cmdname == "resubmit" else "submission"))