def fixupTask(task):
    """
    Fixup some values obtained by the query.
    """
    result = task._asdict()

    # fixup timestamps
    for field in ['tm_start_time', 'tm_start_injection', 'tm_end_injection']:
        current = result[field]
        result[field] = str(getEpochFromDBTime(current)) if current else ''

    # fixup CLOB values by calling read() (only needed for Oracle)
    for field in ['tm_task_failure', 'tm_split_args', 'tm_outfiles', 'tm_tfile_outfiles',
                  'tm_edm_outfiles', 'panda_resubmitted_jobs', 'tm_arguments',
                  'tm_scriptargs', 'tm_user_files']:
        current = result[field]
        fixedCurr = current if (current is None or isinstance(current, str)) else current.read()
        result[field] = fixedCurr

    # literal_eval values
    for field in ['tm_site_whitelist', 'tm_site_blacklist', 'tm_split_args', 'tm_outfiles',
                  'tm_tfile_outfiles', 'tm_edm_outfiles', 'panda_resubmitted_jobs',
                  'tm_user_infiles', 'tm_arguments', 'tm_scriptargs', 'tm_user_files']:
        current = result[field]
        result[field] = literal_eval(current)

    # convert tm_arguments to the desired values
    extraargs = result['tm_arguments']
    result['resubmit_publication'] = extraargs['resubmit_publication'] if 'resubmit_publication' in extraargs else None
    result['resubmit_jobids'] = extraargs['resubmit_jobids'] if 'resubmit_jobids' in extraargs else None
    if result['resubmit_jobids'] is None and 'resubmitList' in extraargs:
        ## For backward compatibility only.
        result['resubmit_jobids'] = extraargs['resubmitList']
    result['resubmit_site_whitelist'] = extraargs['site_whitelist'] if 'site_whitelist' in extraargs else None
    if result['resubmit_site_whitelist'] is None and 'siteWhiteList' in extraargs:
        ## For backward compatibility only.
        result['resubmit_site_whitelist'] = extraargs['siteWhiteList']
    result['resubmit_site_blacklist'] = extraargs['site_blacklist'] if 'site_blacklist' in extraargs else None
    if result['resubmit_site_blacklist'] is None and 'siteBlackList' in extraargs:
        ## For backward compatibility only.
        result['resubmit_site_blacklist'] = extraargs['siteBlackList']
    result['resubmit_maxjobruntime'] = extraargs['maxjobruntime'] if 'maxjobruntime' in extraargs else None
    result['resubmit_maxmemory'] = extraargs['maxmemory'] if 'maxmemory' in extraargs else None
    result['resubmit_numcores'] = extraargs['numcores'] if 'numcores' in extraargs else None
    result['resubmit_priority'] = extraargs['priority'] if 'priority' in extraargs else None

    return result
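# A minimal, standalone sketch (not part of the service code) of the CLOB handling above:
# Oracle drivers may hand back a LOB handle instead of a str, so the value is only
# materialized via .read() when it is neither None nor already a string. The FakeClob
# class below is purely hypothetical and stands in for such a driver handle.
class FakeClob:
    def __init__(self, payload):
        self.payload = payload

    def read(self):
        return self.payload


def normalizeClob(current):
    # Same rule as in fixupTask: pass through None and str, read() anything else.
    return current if (current is None or isinstance(current, str)) else current.read()


assert normalizeClob(None) is None
assert normalizeClob("already a string") == "already a string"
assert normalizeClob(FakeClob("['T2_CH_CERN']")) == "['T2_CH_CERN']"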
def makeStatusReturnDict(self, crabDBInfo, combinedStatus, dagStatus='',
                         statusFailureMsg='', shortResult={},
                         statusCacheInfo={}, pubStatus={}, proxiedWebDir=''):
    """ Create a dictionary which is mostly identical to the dictionary
        that was being returned by the old status (plus a few other keys
        needed by the other client commands). This is to ensure backward
        compatibility after the status2 transition for users relying on
        this dictionary in their scripts.
    """
    statusDict = {}
    statusDict['status'] = combinedStatus
    statusDict['dbStatus'] = getColumn(crabDBInfo, 'tm_task_status')
    statusDict['dagStatus'] = dagStatus
    statusDict['username'] = getColumn(crabDBInfo, 'tm_username')
    statusDict['taskFailureMsg'] = getColumn(crabDBInfo, 'tm_task_failure')
    statusDict['taskWarningMsg'] = getColumn(crabDBInfo, 'tm_task_warnings')
    statusDict['outdatasets'] = getColumn(crabDBInfo, 'tm_output_dataset')
    statusDict['schedd'] = getColumn(crabDBInfo, 'tm_schedd')
    statusDict['collector'] = getColumn(crabDBInfo, 'tm_collector')
    statusDict['ASOURL'] = getColumn(crabDBInfo, 'tm_asourl')
    statusDict['command'] = getColumn(crabDBInfo, 'tm_task_command')
    statusDict['publicationEnabled'] = getColumn(crabDBInfo, 'tm_publication') == 'T'
    statusDict['userWebDirURL'] = getColumn(crabDBInfo, 'tm_user_webdir')
    statusDict['inputDataset'] = getColumn(crabDBInfo, 'tm_input_dataset')
    dbStartTime = getColumn(crabDBInfo, 'tm_start_time')
    statusDict['submissionTime'] = getEpochFromDBTime(datetime.strptime(dbStartTime, '%Y-%m-%d %H:%M:%S.%f'))
    statusDict['statusFailureMsg'] = statusFailureMsg
    statusDict['proxiedWebDir'] = proxiedWebDir
    statusDict['jobsPerStatus'] = shortResult.get('jobsPerStatus', {})
    statusDict['jobList'] = shortResult.get('jobList', {})
    statusDict['publication'] = pubStatus.get('status', {})
    statusDict['publicationFailures'] = pubStatus.get('failure_reasons', {})
    statusDict['jobs'] = statusCacheInfo
    return statusDict
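# Hedged sketch only: makeStatusReturnDict assumes a getColumn(crabDBInfo, name) helper.
# The crabDBInfo layout used here (a dict with a column-name list under 'desc' and a flat
# value row under 'result') is an assumption for illustration; the structure actually
# returned by the CRAB REST query, and the real helper, may differ.
def getColumnSketch(crabDBInfo, columnName):
    columnIndex = crabDBInfo['desc']['columns'].index(columnName)
    value = crabDBInfo['result'][columnIndex]
    return None if value == 'None' else value


crabDBInfoExample = {'desc': {'columns': ['tm_task_status', 'tm_username']},
                     'result': ['SUBMITTED', 'jdoe']}
assert getColumnSketch(crabDBInfoExample, 'tm_username') == 'jdoe'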
def fixupTask(task):
    """
    Fixup some values obtained by the query.
    """
    result = task._asdict()

    # fixup timestamps
    for field in ['tm_start_time', 'tm_start_injection', 'tm_end_injection']:
        current = result[field]
        result[field] = str(getEpochFromDBTime(current)) if current else ''

    # fixup CLOB values by calling read() (only needed for Oracle)
    for field in ['tm_task_failure', 'tm_split_args', 'tm_outfiles', 'tm_tfile_outfiles',
                  'tm_edm_outfiles', 'panda_resubmitted_jobs', 'tm_arguments',
                  'tm_scriptargs', 'tm_user_files']:
        current = result[field]
        fixedCurr = current if (current is None or isinstance(current, str)) else current.read()
        result[field] = fixedCurr

    # literal_eval values
    for field in ['tm_site_whitelist', 'tm_site_blacklist', 'tm_split_args', 'tm_outfiles',
                  'tm_tfile_outfiles', 'tm_edm_outfiles', 'panda_resubmitted_jobs',
                  'tm_user_infiles', 'tm_arguments', 'tm_scriptargs', 'tm_user_files']:
        current = result[field]
        result[field] = literal_eval(current)

    # convert tm_arguments to the desired values
    extraargs = result['tm_arguments']
    result['resubmit_publication'] = extraargs['resubmit_publication'] if 'resubmit_publication' in extraargs else None
    result['resubmit_jobids'] = extraargs['resubmit_jobids'] if 'resubmit_jobids' in extraargs else None
    if result['resubmit_jobids'] is None and 'resubmitList' in extraargs:
        ## For backward compatibility only.
        result['resubmit_jobids'] = extraargs['resubmitList']
    result['resubmit_site_whitelist'] = extraargs['site_whitelist'] if 'site_whitelist' in extraargs else None
    if result['resubmit_site_whitelist'] is None and 'siteWhiteList' in extraargs:
        ## For backward compatibility only.
        result['resubmit_site_whitelist'] = extraargs['siteWhiteList']
    result['resubmit_site_blacklist'] = extraargs['site_blacklist'] if 'site_blacklist' in extraargs else None
    if result['resubmit_site_blacklist'] is None and 'siteBlackList' in extraargs:
        ## For backward compatibility only.
        result['resubmit_site_blacklist'] = extraargs['siteBlackList']
    result['resubmit_maxjobruntime'] = extraargs['maxjobruntime'] if 'maxjobruntime' in extraargs else None
    result['resubmit_maxmemory'] = extraargs['maxmemory'] if 'maxmemory' in extraargs else None
    result['resubmit_numcores'] = extraargs['numcores'] if 'numcores' in extraargs else None
    result['resubmit_priority'] = extraargs['priority'] if 'priority' in extraargs else None
    result['kill_ids'] = extraargs['killList'] if 'killList' in extraargs else []
    result['kill_all'] = extraargs['killAll'] if 'killAll' in extraargs else False

    return result
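# Standalone illustration (hedged) of the tm_arguments handling in fixupTask: the column
# stores the repr() of a Python dict, literal_eval turns it back into a dict, and legacy
# keys such as 'resubmitList'/'siteWhiteList' are honoured only when the new keys are
# absent. The stored values below are invented for illustration.
from ast import literal_eval

storedArguments = str({'resubmitList': [3, 5], 'siteWhiteList': ['T2_IT_Bari'], 'maxmemory': 2500})
extraargs = literal_eval(storedArguments)

resubmitJobids = extraargs.get('resubmit_jobids')
if resubmitJobids is None and 'resubmitList' in extraargs:
    # For backward compatibility only, as in fixupTask.
    resubmitJobids = extraargs['resubmitList']

assert resubmitJobids == [3, 5]
assert extraargs.get('maxmemory') == 2500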
def resubmit2(self, workflow, publication, jobids, siteblacklist, sitewhitelist,
              maxjobruntime, maxmemory, numcores, priority, userproxy):
    """Request to reprocess what the workflow has not finished yet.

       This needs to create a new workflow in the same campaign.
    """
    retmsg = "ok"

    self.logger.info("Getting task ID tuple from DB for task %s", workflow)
    row = self.api.query(None, None, self.Task.ID_sql, taskname=workflow)
    try:
        # just one row is picked up by the previous query
        row = self.Task.ID_tuple(*next(row))
    except StopIteration:
        raise ExecutionError("Impossible to find task %s in the database." % workflow)

    submissionTime = getEpochFromDBTime(row.start_time)
    self.logger.info("Checking if resubmission is possible: we don't allow resubmission in the last %s days before the task expiration date",
                     NUM_DAYS_FOR_RESUBMITDRAIN)
    retmsg = checkTaskLifetime(submissionTime)
    if retmsg != "ok":
        return [{'result': retmsg}]

    task_status = row.task_status
    task_splitting = row.split_algo
    resubmitWhat = "publications" if publication else "jobs"
    self.logger.info("About to resubmit %s for workflow: %s.", resubmitWhat, workflow)

    ## Ignore the following options if this is a publication resubmission or if the
    ## task was never submitted.
    if publication or task_status == 'SUBMITFAILED':
        jobids = None
        siteblacklist, sitewhitelist, maxjobruntime, maxmemory, numcores, priority = None, None, None, None, None, None

    # We only allow resubmission of tasks that are in a final state, listed here:
    allowedTaskStates = ['SUBMITTED', 'KILLED', 'KILLFAILED', 'RESUBMITFAILED', 'FAILED']
    # Do not resubmit publication for tasks that were not submitted since they don't have any output.
    if not publication:
        allowedTaskStates += ['SUBMITFAILED']  # NB: SUBMITFAILED goes to NEW, not RESUBMIT
    ## If the task status is not an allowed one, fail the resubmission.
    if task_status not in allowedTaskStates:
        msg = "You cannot resubmit %s if the task is in status %s." % (resubmitWhat, task_status)
        raise ExecutionError(msg)
    if task_status == 'KILLED' and task_splitting == 'Automatic':
        msg = "You cannot resubmit {0} if the task is in status {1} and uses automatic splitting.".format(resubmitWhat, task_status)
        raise ExecutionError(msg)

    if task_status != 'SUBMITFAILED':
        if publication:
            ## Retrieve publication information.
            publicationEnabled = row.publication
            asourl = row.asourl
            asodb = row.asodb
            username = row.username
            publicationInfo = self.publicationStatusWrapper(workflow, asourl, asodb, username, publicationEnabled)
            if 'status' not in publicationInfo:
                msg = "Cannot resubmit publication."
                msg += " Unable to retrieve the publication status."
                raise ExecutionError(msg)
            if 'disabled' in publicationInfo:
                msg = "Cannot resubmit publication."
                msg += " Publication was disabled in the CRAB configuration."
                raise ExecutionError(msg)
            if 'error' in publicationInfo:
                msg = "Cannot resubmit publication."
                msg += " Error in publication status: %s" % (publicationInfo['error'])
                raise ExecutionError(msg)
            if isCouchDBURL(asourl) and publicationInfo['status'].get('publication_failed', 0) == 0:
                msg = "There are no failed publications to resubmit."
                raise ExecutionError(msg)
            ## Here we could add a check on the publication status of the documents
            ## corresponding to the job ids in resubmitjobids and jobids. So far the
            ## publication resubmission resubmits all the failed publications.
            self.resubmitPublication(asourl, asodb, userproxy, workflow)
            return [{'result': retmsg}]
        else:
            self.logger.info("Jobs to resubmit: %s", jobids)
            ## If these parameters are not set, give them the same values they had in the
            ## original task submission.
            if (siteblacklist is None) or (sitewhitelist is None) or (maxjobruntime is None) or \
               (maxmemory is None) or (numcores is None) or (priority is None):
                ## origValues = [orig_siteblacklist, orig_sitewhitelist, orig_maxjobruntime, orig_maxmemory, orig_numcores, orig_priority]
                origValues = next(self.api.query(None, None, self.Task.GetResubmitParams_sql, taskname=workflow))
                if siteblacklist is None:
                    siteblacklist = literal_eval(origValues[0])
                if sitewhitelist is None:
                    sitewhitelist = literal_eval(origValues[1])
                if maxjobruntime is None:
                    maxjobruntime = origValues[2]
                if maxmemory is None:
                    maxmemory = origValues[3]
                if numcores is None:
                    numcores = origValues[4]
                if priority is None:
                    priority = origValues[5]

    ## These are the parameters that we want to write down in the 'tm_arguments'
    ## column of the Tasks DB each time a resubmission is done.
    ## DagmanResubmitter will read these parameters and write them into the task ad.
    arguments = {'resubmit_jobids': jobids,
                 'site_blacklist': siteblacklist,
                 'site_whitelist': sitewhitelist,
                 'maxjobruntime': maxjobruntime,
                 'maxmemory': maxmemory,
                 'numcores': numcores,
                 'priority': priority,
                 'resubmit_publication': publication}
    ## Change the 'tm_arguments' column of the Tasks DB for this task to contain the
    ## above parameters.
    self.api.modify(self.Task.SetArgumentsTask_sql, taskname=[workflow], arguments=[str(arguments)])

    ## Change the status of the task in the Tasks DB to NEW, with command SUBMIT or RESUBMIT.
    if task_status == 'SUBMITFAILED':
        newstate = ["NEW"]
        newcommand = ["SUBMIT"]
    else:
        newstate = ["NEW"]
        newcommand = ["RESUBMIT"]
    self.api.modify(self.Task.SetStatusTask_sql, status=newstate, command=newcommand, taskname=[workflow])

    return [{'result': retmsg}]
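# Hedged sketch of the kind of lifetime guard checkTaskLifetime() is expected to apply:
# refuse resubmission once the task is within NUM_DAYS_FOR_RESUBMITDRAIN days of the end
# of its lifetime. The 30-day TASK_LIFETIME_DAYS and 3-day drain window used here are
# illustrative assumptions, not the service's real constants or implementation.
import time

TASK_LIFETIME_DAYS = 30         # assumption for illustration
NUM_DAYS_FOR_RESUBMITDRAIN = 3  # assumption for illustration


def checkTaskLifetimeSketch(submissionTime, now=None):
    now = now or time.time()
    secondsLeft = submissionTime + TASK_LIFETIME_DAYS * 86400 - now
    if secondsLeft < NUM_DAYS_FOR_RESUBMITDRAIN * 86400:
        return "Resubmission is not allowed: the task is too close to its expiration date."
    return "ok"


# A task submitted 10 days ago is still resubmittable under these assumptions.
assert checkTaskLifetimeSketch(time.time() - 10 * 86400) == "ok"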
def resubmit2(self, workflow, publication, jobids, siteblacklist, sitewhitelist,
              maxjobruntime, maxmemory, numcores, priority, userproxy):
    """Request to reprocess what the workflow has not finished yet.

       This needs to create a new workflow in the same campaign.
    """
    retmsg = "ok"

    self.logger.info("Getting task ID tuple from DB for task %s", workflow)
    row = self.api.query(None, None, self.Task.ID_sql, taskname=workflow)
    try:
        # just one row is picked up by the previous query
        row = self.Task.ID_tuple(*next(row))
    except StopIteration:
        raise ExecutionError("Impossible to find task %s in the database." % workflow)

    submissionTime = getEpochFromDBTime(row.start_time)
    self.logger.info("Checking if resubmission is possible: we don't allow resubmission in the last %s days before the task expiration date",
                     NUM_DAYS_FOR_RESUBMITDRAIN)
    retmsg = self.checkTaskLifetime(submissionTime)
    if retmsg != "ok":
        return [{'result': retmsg}]

    task_status = row.task_status
    resubmitWhat = "publications" if publication else "jobs"
    self.logger.info("About to resubmit %s for workflow: %s.", resubmitWhat, workflow)

    ## Ignore the following options if this is a publication resubmission or if the
    ## task was never submitted.
    if publication or task_status == 'SUBMITFAILED':
        jobids = None
        siteblacklist, sitewhitelist, maxjobruntime, maxmemory, numcores, priority = None, None, None, None, None, None

    # We only allow resubmission of tasks that are in a final state, listed here:
    allowedTaskStates = ['SUBMITTED', 'KILLED', 'KILLFAILED', 'RESUBMITFAILED', 'FAILED']
    # Do not resubmit publication for tasks that were not submitted since they don't have any output.
    if not publication:
        allowedTaskStates += ['SUBMITFAILED']  # NB: SUBMITFAILED goes to NEW, not RESUBMIT
    ## If the task status is not an allowed one, fail the resubmission.
    if task_status not in allowedTaskStates:
        msg = "You cannot resubmit %s if the task is in status %s." % (resubmitWhat, task_status)
        raise ExecutionError(msg)

    if task_status != 'SUBMITFAILED':
        if publication:
            ## Retrieve publication information.
            publicationEnabled = row.publication
            asourl = row.asourl
            asodb = row.asodb
            username = row.username
            publicationInfo = self.publicationStatusWrapper(workflow, asourl, asodb, username, publicationEnabled)
            if 'status' not in publicationInfo:
                msg = "Cannot resubmit publication."
                msg += " Unable to retrieve the publication status."
                raise ExecutionError(msg)
            if 'disabled' in publicationInfo:
                msg = "Cannot resubmit publication."
                msg += " Publication was disabled in the CRAB configuration."
                raise ExecutionError(msg)
            if 'error' in publicationInfo:
                msg = "Cannot resubmit publication."
                msg += " Error in publication status: %s" % (publicationInfo['error'])
                raise ExecutionError(msg)
            if publicationInfo['status'].get('publication_failed', 0) == 0:
                msg = "There are no failed publications to resubmit."
                raise ExecutionError(msg)
            ## Here we could add a check on the publication status of the documents
            ## corresponding to the job ids in resubmitjobids and jobids. So far the
            ## publication resubmission resubmits all the failed publications.
            self.resubmitPublication(asourl, asodb, userproxy, workflow)
            return [{'result': retmsg}]
        else:
            self.logger.info("Jobs to resubmit: %s", jobids)
            ## If these parameters are not set, give them the same values they had in the
            ## original task submission.
            if (siteblacklist is None) or (sitewhitelist is None) or (maxjobruntime is None) or \
               (maxmemory is None) or (numcores is None) or (priority is None):
                ## origValues = [orig_siteblacklist, orig_sitewhitelist, orig_maxjobruntime, orig_maxmemory, orig_numcores, orig_priority]
                origValues = next(self.api.query(None, None, self.Task.GetResubmitParams_sql, taskname=workflow))
                if siteblacklist is None:
                    siteblacklist = literal_eval(origValues[0])
                if sitewhitelist is None:
                    sitewhitelist = literal_eval(origValues[1])
                if maxjobruntime is None:
                    maxjobruntime = origValues[2]
                if maxmemory is None:
                    maxmemory = origValues[3]
                if numcores is None:
                    numcores = origValues[4]
                if priority is None:
                    priority = origValues[5]

    ## These are the parameters that we want to write down in the 'tm_arguments'
    ## column of the Tasks DB each time a resubmission is done.
    ## DagmanResubmitter will read these parameters and write them into the task ad.
    arguments = {'resubmit_jobids': jobids,
                 'site_blacklist': siteblacklist,
                 'site_whitelist': sitewhitelist,
                 'maxjobruntime': maxjobruntime,
                 'maxmemory': maxmemory,
                 'numcores': numcores,
                 'priority': priority,
                 'resubmit_publication': publication}
    ## Change the 'tm_arguments' column of the Tasks DB for this task to contain the
    ## above parameters.
    self.api.modify(self.Task.SetArgumentsTask_sql, taskname=[workflow], arguments=[str(arguments)])

    ## Change the status of the task in the Tasks DB to NEW, with command SUBMIT or RESUBMIT.
    if task_status == 'SUBMITFAILED':
        newstate = ["NEW"]
        newcommand = ["SUBMIT"]
    else:
        newstate = ["NEW"]
        newcommand = ["RESUBMIT"]
    self.api.modify(self.Task.SetStatusTask_sql, status=newstate, command=newcommand, taskname=[workflow])

    return [{'result': retmsg}]
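# Hedged sketch connecting resubmit2 and fixupTask: the arguments dict is stored in the
# 'tm_arguments' column as its str() representation and recovered with literal_eval on
# the way back out. The values below are illustrative only, not real task parameters.
from ast import literal_eval

arguments = {'resubmit_jobids': [1, 4, 7],
             'site_blacklist': [],
             'site_whitelist': ['T2_DE_DESY'],
             'maxjobruntime': 1315,
             'maxmemory': 2000,
             'numcores': 1,
             'priority': 10,
             'resubmit_publication': False}

stored = str(arguments)              # what SetArgumentsTask_sql would persist
roundTripped = literal_eval(stored)  # what fixupTask does when reading it back
assert roundTripped == arguments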
def fixupTask(task):
    """
    Fixup some values obtained by the query.
    """
    result = task._asdict()

    # fixup timestamps
    for field in ["tm_start_time", "tm_start_injection", "tm_end_injection"]:
        current = result[field]
        result[field] = str(getEpochFromDBTime(current)) if current else ""

    # fixup CLOB values by calling read() (only needed for Oracle)
    for field in ["tm_task_failure", "tm_split_args", "tm_outfiles", "tm_tfile_outfiles",
                  "tm_edm_outfiles", "panda_resubmitted_jobs", "tm_arguments",
                  "tm_scriptargs", "tm_user_files"]:
        current = result[field]
        fixedCurr = current if (current is None or isinstance(current, str)) else current.read()
        result[field] = fixedCurr

    # literal_eval values
    for field in ["tm_site_whitelist", "tm_site_blacklist", "tm_split_args", "tm_outfiles",
                  "tm_tfile_outfiles", "tm_edm_outfiles", "panda_resubmitted_jobs",
                  "tm_user_infiles", "tm_arguments", "tm_scriptargs", "tm_user_files"]:
        current = result[field]
        result[field] = literal_eval(current)

    # convert tm_arguments to the desired values
    extraargs = result["tm_arguments"]
    result["resubmit_publication"] = extraargs["resubmit_publication"] if "resubmit_publication" in extraargs else None
    result["resubmit_jobids"] = extraargs["resubmit_jobids"] if "resubmit_jobids" in extraargs else None
    if result["resubmit_jobids"] is None and "resubmitList" in extraargs:
        ## For backward compatibility only.
        result["resubmit_jobids"] = extraargs["resubmitList"]
    result["resubmit_site_whitelist"] = extraargs["site_whitelist"] if "site_whitelist" in extraargs else None
    if result["resubmit_site_whitelist"] is None and "siteWhiteList" in extraargs:
        ## For backward compatibility only.
        result["resubmit_site_whitelist"] = extraargs["siteWhiteList"]
    result["resubmit_site_blacklist"] = extraargs["site_blacklist"] if "site_blacklist" in extraargs else None
    if result["resubmit_site_blacklist"] is None and "siteBlackList" in extraargs:
        ## For backward compatibility only.
        result["resubmit_site_blacklist"] = extraargs["siteBlackList"]
    result["resubmit_maxjobruntime"] = extraargs["maxjobruntime"] if "maxjobruntime" in extraargs else None
    result["resubmit_maxmemory"] = extraargs["maxmemory"] if "maxmemory" in extraargs else None
    result["resubmit_numcores"] = extraargs["numcores"] if "numcores" in extraargs else None
    result["resubmit_priority"] = extraargs["priority"] if "priority" in extraargs else None
    result["kill_ids"] = extraargs["killList"] if "killList" in extraargs else []

    return result
def status(self, workflow, userdn, userproxy=None):
    """Retrieve the status of the workflow.

       :arg str workflow: a valid workflow name
       :return: a workflow status summary document"""

    # Empty results
    result = {"status": '',            # from the db
              "command": '',           # from the db
              "taskFailureMsg": '',    # from the db
              "taskWarningMsg": [],    # from the db
              "submissionTime": 0,     # from the db
              "statusFailureMsg": '',  # errors of the status itself
              "jobList": [],
              "schedd": '',            # from the db
              "splitting": '',         # from the db
              "taskWorker": '',        # from the db
              "webdirPath": '',        # from the db
              "username": ''}          # from the db

    # First, verify the task has been submitted by the backend.
    self.logger.info("Got status request for workflow %s", workflow)
    row = self.api.query(None, None, self.Task.ID_sql, taskname=workflow)
    try:
        # just one row is picked up by the previous query
        row = self.Task.ID_tuple(*next(row))
    except StopIteration:
        raise ExecutionError("Impossible to find task %s in the database." % workflow)

    result['submissionTime'] = getEpochFromDBTime(row.start_time)
    if row.task_command:
        result['command'] = row.task_command

    ## Add scheduler and collector to the result dictionary.
    if row.username:
        result['username'] = row.username
    if row.user_webdir:
        result['webdirPath'] = '/'.join(['/home/grid'] + row.user_webdir.split('/')[-2:])
    if row.schedd:
        result['schedd'] = row.schedd
    if row.twname:
        result['taskWorker'] = row.twname
    if row.split_algo:
        result['splitting'] = row.split_algo
    self.asoDBURL = row.asourl

    # verbosity levels: 0 - simple crab status, 1 - crab status -long, 2 - crab status -idle
    self.logger.info("Status result for workflow %s: %s", workflow, row.task_status)

    ## Apply taskWarning flag to output.
    taskWarnings = literal_eval(row.task_warnings if isinstance(row.task_warnings, str) else row.task_warnings.read())
    result["taskWarningMsg"] = taskWarnings

    ## Helper function to add the task status and the failure message (both as taken
    ## from the Task DB) to the result dictionary.
    def addStatusAndFailureFromDB(result, row):
        result['status'] = row.task_status
        if row.task_failure is not None:
            if isinstance(row.task_failure, str):
                result['taskFailureMsg'] = row.task_failure
            else:
                result['taskFailureMsg'] = row.task_failure.read()

    ## Helper function to add a failure message in retrieving the task/jobs status
    ## (and eventually a task status if there was none) to the result dictionary.
    def addStatusAndFailure(result, status, failure=None):
        if not result['status']:
            result['status'] = status
        if failure:
            result['statusFailureMsg'] = failure

    # If there is a clusterid we go ahead and get the jobs info, otherwise we return the result.
    self.logger.debug("Cluster id: %s", row.clusterid)
    if row.task_status in ['NEW', 'HOLDING', 'UPLOADED', 'SUBMITFAILED', 'KILLFAILED', 'RESUBMITFAILED', 'FAILED']:
        addStatusAndFailureFromDB(result, row)
        if row.task_status in ['NEW', 'UPLOADED', 'SUBMITFAILED'] and row.task_command not in ['KILL', 'RESUBMIT']:
            self.logger.debug("Detailed result for workflow %s: %s\n", workflow, result)
            return [result]
            # Even if the dictionary is trimmed, "taskFailureMsg" and "taskWarningMsg"
            # (both from the db) should always be filled.

    # At this point we know there is a clusterid; if the webdir is missing we return
    # after setting a proper statusFailureMsg. The remaining steps are:
    #   - get node_state/job_log from the schedd
    #   - get error_report
    #   - get aso_status (it is going to change once the Oracle implementation is done)
    #   - combine everything

    ## Here we start to retrieve the jobs statuses.
    jobsPerStatus = {}
    taskJobCount = 0
    taskStatus = {}
    jobList = []
    results = []
    # task_codes are used if a condor_q command is done to retrieve the task status
    task_codes = {1: 'SUBMITTED', 2: 'SUBMITTED', 4: 'COMPLETED', 5: 'KILLED'}
    # dagman_codes are used if the task status is retrieved from the node_state file
    # 1 = STATUS_READY     (task was not yet started)
    # 2 = STATUS_PRERUN    (task is doing the PRE run)
    # 3 = STATUS_SUBMITTED (task is submitted)
    # 4 = STATUS_POSTRUN   (task is in PostRun)
    # 5 = STATUS_DONE      (task is Done)
    # 6 = STATUS_ERROR     (task is Failed/Killed)
    dagman_codes = {1: 'SUBMITTED', 2: 'SUBMITTED', 3: 'SUBMITTED', 4: 'SUBMITTED', 5: 'COMPLETED', 6: 'FAILED'}

    # The user web directory is needed for getting files from the scheduler.
    if not row.user_webdir:
        self.logger.error("webdir not found in DB. Impossible to retrieve task status")
        addStatusAndFailure(result, status='UNKNOWN', failure='missing webdir info')
        return [result]
    else:
        self.logger.info("Getting status for workflow %s using node state file.", workflow)
        try:
            taskStatus = self.taskWebStatus({'CRAB_UserWebDir': row.user_webdir}, result)
            # Check the timestamp; if older than 2 minutes warn about stale info.
            nodeStateUpd = int(taskStatus.get('DagStatus', {}).get("Timestamp", 0))
            DAGStatus = int(taskStatus.get('DagStatus', {}).get('DagStatus', -1))
            epochTime = int(time.time())
            # If DAGStatus is 5 or 6 it is a final state and the node_state file will not be
            # updated anymore, so there is no need to query the schedd for task information.
            # Otherwise we check when the file was last updated. It should update every 30s,
            # which is set in the job classad:
            # https://github.com/dmwm/CRABServer/blob/5caac0d379f5e4522f026eeaf3621f7eb5ced98e/src/python/TaskWorker/Actions/DagmanCreator.py#L39
            if (nodeStateUpd > 0 and (int(epochTime - nodeStateUpd) < 120)) or DAGStatus in [5, 6]:
                self.logger.info("Node state is up to date, using it")
                taskJobCount = int(taskStatus.get('DagStatus', {}).get('NodesTotal'))
                self.logger.info(taskStatus)
                if row.task_status in ['QUEUED', 'KILLED', 'KILLFAILED', 'RESUBMITFAILED', 'FAILED']:
                    result['status'] = row.task_status
                else:
                    result['status'] = dagman_codes.get(DAGStatus, row.task_status)
                # make sure taskStatusCode is defined
                if result['status'] in ['KILLED', 'KILLFAILED']:
                    taskStatusCode = 5
                else:
                    taskStatusCode = 1
            else:
                self.logger.info("Node state file is too old or does not have an update time. Stale info is shown")
        except Exception as ee:
            addStatusAndFailure(result, status='UNKNOWN', failure=str(ee))
            return [result]

    if 'DagStatus' in taskStatus:
        del taskStatus['DagStatus']

    for i in range(1, taskJobCount + 1):
        i = str(i)
        if i not in taskStatus:
            if taskStatusCode == 5:
                taskStatus[i] = {'State': 'killed'}
            else:
                taskStatus[i] = {'State': 'unsubmitted'}

    for job, info in taskStatus.items():
        status = info['State']
        jobsPerStatus.setdefault(status, 0)
        jobsPerStatus[status] += 1
        jobList.append((status, job))
    result['jobList'] = jobList
    #result['jobs'] = taskStatus

    if len(taskStatus) == 0 and results and results['JobStatus'] == 2:
        result['status'] = 'Running (jobs not submitted)'

    # Always return the ASOURL as well; it is required for kill and resubmit.
    self.logger.info("ASO: %s", row.asourl)
    result['ASOURL'] = row.asourl

    ## Retrieve publication information.
    publicationInfo = {}
    if row.publication == 'T' and 'finished' in jobsPerStatus:
        # default asodb to 'asynctransfer': for old tasks this column is empty!
        asodb = row.asodb or 'asynctransfer'
        publicationInfo = self.publicationStatus(workflow, row.asourl, asodb, row.username)
        self.logger.info("Publication status for workflow %s done", workflow)
    elif row.publication == 'F':
        publicationInfo['status'] = {'disabled': []}
    else:
        self.logger.info("No files to publish: Publish flag %s, files transferred: %s",
                         row.publication, jobsPerStatus.get('finished', 0))
    result['publication'] = publicationInfo.get('status', {})
    result['publicationFailures'] = publicationInfo.get('failure_reasons', {})

    ## The output datasets are written into the Task DB by the post-job
    ## when uploading the output files metadata.
    outdatasets = literal_eval(row.output_dataset.read() if row.output_dataset else 'None')
    result['outdatasets'] = outdatasets

    return [result]
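# Standalone illustration (hedged) of the per-job aggregation done in status(): given the
# node_state-derived taskStatus mapping (job id -> {'State': ...}), build the jobsPerStatus
# counts and the (status, jobid) jobList. The sample data below is invented for illustration.
taskStatusSample = {'1': {'State': 'finished'},
                    '2': {'State': 'running'},
                    '3': {'State': 'finished'}}

jobsPerStatus = {}
jobList = []
for job, info in taskStatusSample.items():
    status = info['State']
    jobsPerStatus.setdefault(status, 0)
    jobsPerStatus[status] += 1
    jobList.append((status, job))

assert jobsPerStatus == {'finished': 2, 'running': 1}
assert ('running', '2') in jobList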