def getInputFiles(self): """ Get the InputFiles.tar.gz and extract the necessary files """ taskname = self.cachedinfo['RequestName'] #Get task status from the task DB self.logger.debug("Getting status from he DB") uri = self.getUrl(self.instance, resource = 'task') serverFactory = CRABClient.Emulator.getEmulator('rest') server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version = __version__) crabDBInfo, _, _ = server.get(uri, data = {'subresource': 'search', 'workflow': taskname}) status = getColumn(crabDBInfo, 'tm_task_status') self.destination = getColumn(crabDBInfo, 'tm_asyncdest') inputsFilename = os.path.join(os.getcwd(), 'InputFiles.tar.gz') if status == 'UPLOADED': filecacheurl = getColumn(crabDBInfo, 'tm_cache_url') ufc = CRABClient.Emulator.getEmulator('ufc')({'endpoint' : filecacheurl, "pycurl": True}) self.logger.debug("Downloading and extracting 'dry-run-sandbox.tar.gz' from %s" % filecacheurl) ufc.downloadLog('dry-run-sandbox.tar.gz', output=os.path.join(os.getcwd(), 'dry-run-sandbox.tar.gz')) with tarfile.open('dry-run-sandbox.tar.gz') as tf: tf.extractall() elif status == 'SUBMITTED': webdir = getProxiedWebDir(taskname, self.serverurl, uri, self.proxyfilename, self.logger.debug) if not webdir: webdir = getColumn(crabDBInfo, 'tm_user_webdir') self.logger.debug("Downloading 'InputFiles.tar.gz' from %s" % webdir) getFileFromURL(webdir + '/InputFiles.tar.gz', inputsFilename, self.proxyfilename) else: raise ClientException('Can only execute jobs from tasks in status SUBMITTED or UPLOADED. Current status is %s' % status) for name in [inputsFilename, 'CMSRunAnalysis.tar.gz', 'sandbox.tar.gz']: with tarfile.open(name) as tf: tf.extractall()
def collectReportData(self): """ Gather information from the server, status2, DBS and files in the webdir that is needed for the report. """ reportData = {} serverFactory = CRABClient.Emulator.getEmulator('rest') server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__) self.logger.debug('Looking up report for task %s' % self.cachedinfo['RequestName']) # Query server for information from the taskdb, intput/output file metadata from metadatadb dictresult, status, reason = server.get(self.uri, data = {'workflow': self.cachedinfo['RequestName'], 'subresource': 'report2'}) self.logger.debug("Result: %s" % dictresult) self.logger.info("Running crab status2 first to fetch necessary information.") # Get job statuses crabDBInfo, shortResult = getMutedStatusInfo(self.logger) if not shortResult: # No point in continuing if the job list is empty. # Can happen when the task is very new / old and the files necessary for status2 # are unavailable. return None reportData['jobList'] = shortResult['jobList'] reportData['runsAndLumis'] = {} # Transform status joblist (tuples of job status and job id) into a dictionary jobStatusDict = {} for status, jobId in reportData['jobList']: jobStatusDict[jobId] = status # Filter output/input file metadata by finished job state for jobId in jobStatusDict: if jobStatusDict.get(jobId) in ['finished']: reportData['runsAndLumis'][jobId] = dictresult['result'][0]['runsAndLumis'][jobId] reportData['publication'] = True if getColumn(crabDBInfo, 'tm_publication') == "T" else False userWebDirURL = getColumn(crabDBInfo, 'tm_user_webdir') numJobs = len(shortResult['jobList']) reportData['lumisToProcess'] = self.getLumisToProcess(userWebDirURL, numJobs, self.cachedinfo['RequestName']) reportData['inputDataset'] = getColumn(crabDBInfo, 'tm_input_dataset') inputDatasetInfo = self.getInputDatasetLumis(reportData['inputDataset'], userWebDirURL)['inputDataset'] reportData['inputDatasetLumis'] = inputDatasetInfo['lumis'] reportData['inputDatasetDuplicateLumis'] = inputDatasetInfo['duplicateLumis'] reportData['outputDatasets'] = dictresult['result'][0]['taskDBInfo']['outputDatasets'] if reportData['publication']: reportData['outputDatasetsInfo'] = self.getDBSPublicationInfo(reportData['outputDatasets']) return reportData
def checkTaskInfo(crabserver, ad): """ Function checks that given task is registered in the database with status SUBMITTED and with the same clusterId and schedd name in the database as in the condor ads where it is currently running. In case above condition is not met, script immediately terminates """ task = ad['CRAB_ReqName'] clusterIdOnSchedd = ad['ClusterId'] data = {'subresource': 'search', 'workflow': task} try: dictresult, _, _ = crabserver.get(api='task', data=data) except HTTPException as hte: printLog(traceback.format_exc()) printLog(hte.headers) printLog(hte.result) sys.exit(2) taskStatusOnDB = getColumn(dictresult, 'tm_task_status') clusteridOnDB = getColumn(dictresult, 'clusterid') scheddOnDB = getColumn(dictresult, 'tm_schedd') scheddName = os.environ['schedd_name'] printLog( 'Task status on DB: %s, clusterID on DB: %s, schedd name on DB: %s; \nclusterID on condor ads: %s, schedd name on condor ads: %s ' % (taskStatusOnDB, clusteridOnDB, scheddOnDB, clusterIdOnSchedd, scheddName)) if not (taskStatusOnDB == 'SUBMITTED' and scheddOnDB == scheddName and clusteridOnDB == str(clusterIdOnSchedd)): printLog( 'Exiting AdjustSites because this dagman does not match task information in TASKS DB' ) sys.exit(3)
def searchTask(self, workflow): """ :param workflow: :return: """ data = dict() data['workflow'] = workflow data['subresource'] = 'search' try: result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'), data=encodeRequest(data)) self.logger.debug("task: %s " % str(result[0])) self.logger.debug("task: %s " % getColumn(result[0], 'tm_last_publication')) except Exception as ex: self.logger.error("Error during task doc retrieving: %s" % ex) return {} return oracleOutputMapping(result)
def startSlave(self, task): # TODO: lock task! # - process logger logger = setProcessLogger(str(task[0][3])) logger.info("Process %s is starting. PID %s", task[0][3], os.getpid()) self.force_publication = False workflow = str(task[0][3]) wfnamemsg = "%s: " % (workflow) if int(workflow[0:2]) < 18: msg = "Skipped. Ignore tasks created before 2018." logger.info(wfnamemsg+msg) return 0 if len(task[1]) > self.max_files_per_block: self.force_publication = True msg = "All datasets have more than %s ready files." % (self.max_files_per_block) msg += " No need to retrieve task status nor last publication time." logger.info(wfnamemsg+msg) else: msg = "At least one dataset has less than %s ready files." % (self.max_files_per_block) logger.info(wfnamemsg+msg) # Retrieve the workflow status. If the status can not be retrieved, continue # with the next workflow. workflow_status = '' url = '/'.join(self.cache_area.split('/')[:-1]) #+ '/workflow' msg = "Retrieving status from %s" % (url) logger.info(wfnamemsg+msg) header = {"Content-Type":"application/json"} data = {'workflow': workflow, 'subresource': 'taskads'} url = 'cmsweb-testbed.cern.ch' connection = HTTPRequests(url, self.config.opsCert, self.config.opsKey) try: res = connection.get('/crabserver/preprod/workflow', data=encodeRequest(data)) except Exception as ex: #logger.info(wfnamemsg+encodeRequest(data)) logger.warn('Error retrieving status from cache for %s.' % workflow) return 0 try: workflow_status = res[0]['result'][0]['status'] msg = "Task status is %s." % workflow_status logger.info(wfnamemsg+msg) except ValueError: msg = "Workflow removed from WM." logger.error(wfnamemsg+msg) workflow_status = 'REMOVED' except Exception as ex: msg = "Error loading task status!" msg += str(ex) msg += str(traceback.format_exc()) logger.error(wfnamemsg+msg) # If the workflow status is terminal, go ahead and publish all the ready files # in the workflow. if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']: self.force_publication = True if workflow_status in ['KILLED', 'REMOVED']: self.force_failure = True msg = "Considering task status as terminal. Will force publication." logger.info(wfnamemsg+msg) # Otherwise... else: msg = "Task status is not considered terminal." logger.info(wfnamemsg+msg) msg = "Getting last publication time." logger.info(wfnamemsg+msg) # Get when was the last time a publication was done for this workflow (this # should be more or less independent of the output dataset in case there are # more than one). last_publication_time = None data = {} data['workflow'] = workflow data['subresource'] = 'search' try: result = self.oracleDB.get(self.config.oracleFileTrans.replace('filetransfers', 'task'), data=encodeRequest(data)) logger.debug("task: %s " % str(result[0])) logger.debug("task: %s " % getColumn(result[0], 'tm_last_publication')) except Exception as ex: logger.error("Error during task doc retrieving: %s" %ex) if last_publication_time: date = oracleOutputMapping(result)['last_publication'] seconds = datetime.strptime(date, "%Y-%m-%d %H:%M:%S.%f").timetuple() last_publication_time = time.mktime(seconds) msg = "Last publication time: %s." % str(last_publication_time) logger.debug(wfnamemsg+msg) # If this is the first time a publication would be done for this workflow, go # ahead and publish. if not last_publication_time: self.force_publication = True msg = "There was no previous publication. Will force publication." logger.info(wfnamemsg+msg) # Otherwise... else: last = last_publication_time msg = "Last published block: %s" % (last) logger.debug(wfnamemsg+msg) # If the last publication was long time ago (> our block publication timeout), # go ahead and publish. now = int(time.time()) - time.timezone time_since_last_publication = now - last hours = int(time_since_last_publication/60/60) minutes = int((time_since_last_publication - hours*60*60)/60) timeout_hours = int(self.block_publication_timeout/60/60) timeout_minutes = int((self.block_publication_timeout - timeout_hours*60*60)/60) msg = "Last publication was %sh:%sm ago" % (hours, minutes) if time_since_last_publication > self.block_publication_timeout: self.force_publication = True msg += " (more than the timeout of %sh:%sm)." % (timeout_hours, timeout_minutes) msg += " Will force publication." else: msg += " (less than the timeout of %sh:%sm)." % (timeout_hours, timeout_minutes) msg += " Not enough to force publication." logger.info(wfnamemsg+msg) #logger.info(task[1]) try: if self.force_publication: # - get info active_ = [{'key': [x['username'], x['user_group'], x['user_role'], x['taskname']], 'value': [x['destination'], x['source_lfn'], x['destination_lfn'], x['input_dataset'], x['dbs_url'], x['last_update'] ]} for x in task[1] if x['transfer_state'] == 3 and x['publication_state'] not in [2, 3, 5]] lfn_ready = [] wf_jobs_endtime = [] pnn, input_dataset, input_dbs_url = "", "", "" for active_file in active_: job_end_time = active_file['value'][5] if job_end_time and self.config.isOracle: wf_jobs_endtime.append(int(job_end_time) - time.timezone) elif job_end_time: wf_jobs_endtime.append(int(time.mktime(time.strptime(str(job_end_time), '%Y-%m-%d %H:%M:%S'))) - time.timezone) source_lfn = active_file['value'][1] dest_lfn = active_file['value'][2] self.lfn_map[dest_lfn] = source_lfn if not pnn or not input_dataset or not input_dbs_url: pnn = str(active_file['value'][0]) input_dataset = str(active_file['value'][3]) input_dbs_url = str(active_file['value'][4]) lfn_ready.append(dest_lfn) userDN = '' username = task[0][0] user_group = "" if task[0][1]: user_group = task[0][1] user_role = "" if task[0][2]: user_role = task[0][2] logger.debug("Trying to get DN %s %s %s" % (username, user_group, user_role)) try: userDN = getDNFromUserName(username, logger) except Exception as ex: msg = "Error retrieving the user DN" msg += str(ex) msg += str(traceback.format_exc()) logger.error(msg) return 1 # Get metadata toPublish = [] publDescFiles_list = self.getPublDescFiles(workflow, lfn_ready) for file_ in active_: for _, doc in enumerate(publDescFiles_list): #logger.info(type(doc)) #logger.info(doc) if doc["lfn"] == file_["value"][2]: doc["User"] = username doc["Group"] = file_["key"][1] doc["Role"] = file_["key"][2] doc["UserDN"] = userDN doc["Destination"] = file_["value"][0] doc["SourceLFN"] = file_["value"][1] toPublish.append(doc) with open("/tmp/publisher_files/"+workflow+'.json', 'w') as outfile: json.dump(toPublish, outfile) logger.info(". publisher.py %s" % (workflow)) # find the location in the current environment of the script we want to run import Publisher.TaskPublish as tp taskPublishScript = tp.__file__ subprocess.call(["python", taskPublishScript, workflow]) except: logger.exception("Exception!") return 0
def __call__(self): self.logger.info('Getting the tarball hash key') inputlist = {'subresource': 'search', 'workflow': self.cachedinfo['RequestName']} serverFactory = CRABClient.Emulator.getEmulator('rest') server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__) uri = getUrl(self.instance, resource = 'task') dictresult, _, _ = server.get(uri, data = inputlist) tm_user_sandbox = getColumn(dictresult, 'tm_user_sandbox') hashkey = tm_user_sandbox.replace(".tar.gz","") # Get the schedd address from the DB info and strip off the 'crab3@' prefix if it exists scheddaddress = getColumn(dictresult, 'tm_schedd') scheddaddress = scheddaddress.split('@')[1] if '@' in scheddaddress else scheddaddress self.logger.info('Checking task status') serverFactory = CRABClient.Emulator.getEmulator('rest') server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__) dictresult, _, _ = server.get(self.uri, data = {'workflow': self.cachedinfo['RequestName'], 'verbose': 0}) dictresult = dictresult['result'][0] #take just the significant part self.logger.info('Task status: %s' % dictresult['status']) accepstate = ['SUBMITFAILED','KILLED','FINISHED','FAILED','KILLFAILED', 'COMPLETED'] if dictresult['status'] not in accepstate: msg = ('%sError%s: Only tasks with these status can be purged: {0}'.format(accepstate) % (colors.RED, colors.NORMAL)) raise ConfigurationException(msg) #getting the cache url cacheresult = {} scheddresult = {} gsisshdict = {} if not self.options.scheddonly: baseurl = getUrl(self.instance, resource='info') cacheurl = server_info('backendurls', self.serverurl, self.proxyfilename, baseurl) cacheurl = cacheurl['cacheSSL'] cacheurldict = {'endpoint': cacheurl, 'pycurl': True} ufc = UserFileCache(cacheurldict) self.logger.info('Tarball hashkey: %s' %hashkey) self.logger.info('Attempting to remove task file from crab server cache') try: ufcresult = ufc.removeFile(hashkey) except HTTPException as re: if 'X-Error-Info' in re.headers and 'Not such file' in re.headers['X-Error-Info']: self.logger.info('%sError%s: Failed to find task file in crab server cache; the file might have been already purged' % (colors.RED,colors.NORMAL)) raise if ufcresult == '': self.logger.info('%sSuccess%s: Successfully removed task files from crab server cache' % (colors.GREEN, colors.NORMAL)) cacheresult = 'SUCCESS' else: self.logger.info('%sError%s: Failed to remove task files from crab server cache' % (colors.RED, colors.NORMAL)) cacheresult = 'FAILED' if not self.options.cacheonly: self.logger.debug('%sSuccess%s: Successfully got schedd address' % (colors.GREEN, colors.NORMAL)) self.logger.debug('Schedd address: %s' % scheddaddress) self.logger.info('Attempting to remove task from schedd') gssishrm = 'gsissh -o ConnectTimeout=60 -o PasswordAuthentication=no ' + scheddaddress + ' rm -rf ' + self.cachedinfo['RequestName'] self.logger.debug('gsissh command: %s' % gssishrm) delprocess=subprocess.Popen(gssishrm, stdout= subprocess.PIPE, stderr= subprocess.PIPE, shell=True) stdout, stderr = delprocess.communicate() exitcode = delprocess.returncode if exitcode == 0 : self.logger.info('%sSuccess%s: Successfully removed task from schedd' % (colors.GREEN, colors.NORMAL)) scheddresult = 'SUCCESS' gsisshdict = {} else : self.logger.info('%sError%s: Failed to remove task from schedd' % (colors.RED, colors.NORMAL)) scheddaddress = 'FAILED' self.logger.debug('gsissh stdout: %s\ngsissh stderr: %s\ngsissh exitcode: %s' % (stdout,stderr,exitcode)) gsisshdict = {'stdout' : stdout, 'stderr' : stderr , 'exitcode' : exitcode} return {'cacheresult' : cacheresult , 'scheddresult' : scheddresult , 'gsiresult' : gsisshdict}
def __call__(self): self.logger.info('Getting the tarball hash key') inputlist = {'subresource': 'search', 'workflow': self.cachedinfo['RequestName']} serverFactory = CRABClient.Emulator.getEmulator('rest') server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__) uri = self.getUrl(self.instance, resource = 'task') dictresult, _, _ = server.get(uri, data = inputlist) tm_user_sandbox = getColumn(dictresult, 'tm_user_sandbox') hashkey = tm_user_sandbox.replace(".tar.gz","") # Get the schedd address from the DB info and strip off the 'crab3@' prefix if it exists scheddaddress = getColumn(dictresult, 'tm_schedd') scheddaddress = scheddaddress.split('@')[1] if '@' in scheddaddress else scheddaddress self.logger.info('Checking task status') serverFactory = CRABClient.Emulator.getEmulator('rest') server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__) dictresult, _, _ = server.get(self.uri, data = {'workflow': self.cachedinfo['RequestName'], 'verbose': 0}) dictresult = dictresult['result'][0] #take just the significant part self.logger.info('Task status: %s' % dictresult['status']) accepstate = ['SUBMITFAILED','KILLED','FINISHED','FAILED','KILLFAILED', 'COMPLETED'] if dictresult['status'] not in accepstate: msg = ('%sError%s: Only tasks with these status can be purged: {0}'.format(accepstate) % (colors.RED, colors.NORMAL)) raise ConfigurationException(msg) #getting the cache url cacheresult = {} scheddresult = {} gsisshdict = {} if not self.options.scheddonly: baseurl = getUrl(self.instance, resource='info') cacheurl = server_info('backendurls', self.serverurl, self.proxyfilename, baseurl) cacheurl = cacheurl['cacheSSL'] cacheurldict = {'endpoint': cacheurl, 'pycurl': True} ufc = UserFileCache(cacheurldict) self.logger.info('Tarball hashkey: %s' %hashkey) self.logger.info('Attempting to remove task file from crab server cache') try: ufcresult = ufc.removeFile(hashkey) except HTTPException as re: if 'X-Error-Info' in re.headers and 'Not such file' in re.headers['X-Error-Info']: self.logger.info('%sError%s: Failed to find task file in crab server cache; the file might have been already purged' % (colors.RED,colors.NORMAL)) raise if ufcresult == '': self.logger.info('%sSuccess%s: Successfully removed task files from crab server cache' % (colors.GREEN, colors.NORMAL)) cacheresult = 'SUCCESS' else: self.logger.info('%sError%s: Failed to remove task files from crab server cache' % (colors.RED, colors.NORMAL)) cacheresult = 'FAILED' if not self.options.cacheonly: self.logger.debug('%sSuccess%s: Successfully got schedd address' % (colors.GREEN, colors.NORMAL)) self.logger.debug('Schedd address: %s' % scheddaddress) self.logger.info('Attempting to remove task from schedd') gssishrm = 'gsissh -o ConnectTimeout=60 -o PasswordAuthentication=no ' + scheddaddress + ' rm -rf ' + self.cachedinfo['RequestName'] self.logger.debug('gsissh command: %s' % gssishrm) delprocess=subprocess.Popen(gssishrm, stdout= subprocess.PIPE, stderr= subprocess.PIPE, shell=True) stdout, stderr = delprocess.communicate() exitcode = delprocess.returncode if exitcode == 0 : self.logger.info('%sSuccess%s: Successfully removed task from schedd' % (colors.GREEN, colors.NORMAL)) scheddresult = 'SUCCESS' gsisshdict = {} else : self.logger.info('%sError%s: Failed to remove task from schedd' % (colors.RED, colors.NORMAL)) scheddaddress = 'FAILED' self.logger.debug('gsissh stdout: %s\ngsissh stderr: %s\ngsissh exitcode: %s' % (stdout,stderr,exitcode)) gsisshdict = {'stdout' : stdout, 'stderr' : stderr , 'exitcode' : exitcode} return {'cacheresult' : cacheresult , 'scheddresult' : scheddresult , 'gsiresult' : gsisshdict}
def startSlave(self, task): """ start a slave process to deal with publication for a single task :param task: one tupla describing a task as returned by active_tasks() :return: 0 It will always terminate normally, if publication fails it will mark it in the DB """ # TODO: lock task! # - process logger logger = setSlaveLogger(str(task[0][3])) logger.info("Process %s is starting. PID %s", task[0][3], os.getpid()) self.force_publication = False workflow = str(task[0][3]) if len(task[1]) > self.max_files_per_block: self.force_publication = True msg = "All datasets have more than %s ready files." % ( self.max_files_per_block) msg += " No need to retrieve task status nor last publication time." logger.info(msg) else: msg = "At least one dataset has less than %s ready files." % ( self.max_files_per_block) logger.info(msg) # Retrieve the workflow status. If the status can not be retrieved, continue # with the next workflow. workflow_status = '' msg = "Retrieving status" logger.info(msg) data = encodeRequest({'workflow': workflow}) try: res = self.crabServer.get(api='workflow', data=data) except Exception as ex: logger.warn( 'Error retrieving status from crabserver for %s:\n%s', workflow, str(ex)) return 0 try: workflow_status = res[0]['result'][0]['status'] msg = "Task status is %s." % workflow_status logger.info(msg) except ValueError: msg = "Workflow removed from WM." logger.error(msg) workflow_status = 'REMOVED' except Exception as ex: msg = "Error loading task status!" msg += str(ex) msg += str(traceback.format_exc()) logger.error(msg) # If the workflow status is terminal, go ahead and publish all the ready files # in the workflow. if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']: self.force_publication = True if workflow_status in ['KILLED', 'REMOVED']: self.force_failure = True msg = "Considering task status as terminal. Will force publication." logger.info(msg) # Otherwise... else: ## TODO put this else in a function like def checkForPublication() msg = "Task status is not considered terminal." logger.info(msg) msg = "Getting last publication time." logger.info(msg) # Get when was the last time a publication was done for this workflow (this # should be more or less independent of the output dataset in case there are # more than one). last_publication_time = None data = encodeRequest({ 'workflow': workflow, 'subresource': 'search' }) try: result = self.crabServer.get(api='task', data=data) logger.debug("task: %s ", str(result[0])) last_publication_time = getColumn(result[0], 'tm_last_publication') except Exception as ex: logger.error("Error during task doc retrieving:\n%s", ex) if last_publication_time: date = last_publication_time # datetime in Oracle format timetuple = datetime.strptime( date, "%Y-%m-%d %H:%M:%S.%f").timetuple( ) # convert to time tuple last_publication_time = time.mktime( timetuple) # convert to seconds since Epoch (float) msg = "Last publication time: %s." % str(last_publication_time) logger.debug(msg) # If this is the first time a publication would be done for this workflow, go # ahead and publish. if not last_publication_time: self.force_publication = True msg = "There was no previous publication. Will force publication." logger.info(msg) # Otherwise... else: last = last_publication_time msg = "Last published block time: %s" % last logger.debug(msg) # If the last publication was long time ago (> our block publication timeout), # go ahead and publish. now = int(time.time()) - time.timezone time_since_last_publication = now - last hours = int(time_since_last_publication / 60 / 60) minutes = int( (time_since_last_publication - hours * 60 * 60) / 60) timeout_hours = int(self.block_publication_timeout / 60 / 60) timeout_minutes = int((self.block_publication_timeout - timeout_hours * 60 * 60) / 60) msg = "Last publication was %sh:%sm ago" % (hours, minutes) if time_since_last_publication > self.block_publication_timeout: self.force_publication = True msg += " (more than the timeout of %sh:%sm)." % ( timeout_hours, timeout_minutes) msg += " Will force publication." else: msg += " (less than the timeout of %sh:%sm)." % ( timeout_hours, timeout_minutes) msg += " Not enough to force publication." logger.info(msg) # logger.info(task[1]) try: if self.force_publication: # - get info active_ = [{ 'key': [ x['username'], x['user_group'], x['user_role'], x['taskname'] ], 'value': [ x['destination'], x['source_lfn'], x['destination_lfn'], x['input_dataset'], x['dbs_url'], x['last_update'] ] } for x in task[1] if x['transfer_state'] == 3 and x['publication_state'] not in [2, 3, 5]] lfn_ready = [] wf_jobs_endtime = [] pnn, input_dataset, input_dbs_url = "", "", "" for active_file in active_: job_end_time = active_file['value'][5] if job_end_time: wf_jobs_endtime.append( int(job_end_time) - time.timezone) source_lfn = active_file['value'][1] dest_lfn = active_file['value'][2] self.lfn_map[dest_lfn] = source_lfn if not pnn or not input_dataset or not input_dbs_url: pnn = str(active_file['value'][0]) input_dataset = str(active_file['value'][3]) input_dbs_url = str(active_file['value'][4]) lfn_ready.append(dest_lfn) username = task[0][0] # Get metadata toPublish = [] toFail = [] publDescFiles_list = self.getPublDescFiles( workflow, lfn_ready, logger) for file_ in active_: metadataFound = False for doc in publDescFiles_list: # logger.info(type(doc)) # logger.info(doc) if doc["lfn"] == file_["value"][2]: doc["User"] = username doc["Group"] = file_["key"][1] doc["Role"] = file_["key"][2] doc["UserDN"] = self.myDN doc["Destination"] = file_["value"][0] doc["SourceLFN"] = file_["value"][1] toPublish.append(doc) metadataFound = True break # if we failed to find metadata mark publication as failed to avoid to keep looking # at same files over and over if not metadataFound: toFail.append(file_["value"][1]) with open(self.taskFilesDir + workflow + '.json', 'w') as outfile: json.dump(toPublish, outfile) logger.debug( 'Unitarity check: active_:%d toPublish:%d toFail:%d', len(active_), len(toPublish), len(toFail)) if len(toPublish) + len(toFail) != len(active_): logger.error("SOMETHING WRONG IN toPublish vs toFail !!") if toFail: logger.info( 'Did not find useful metadata for %d files. Mark as failed', len(toFail)) from ServerUtilities import getHashLfn nMarked = 0 for lfn in toFail: source_lfn = lfn docId = getHashLfn(source_lfn) data = dict() data['asoworker'] = self.config.asoworker data['subresource'] = 'updatePublication' data['list_of_ids'] = docId data['list_of_publication_state'] = 'FAILED' data['list_of_retry_value'] = 1 data[ 'list_of_failure_reason'] = 'File type not EDM or metadata not found' try: result = self.crabServer.post( api='filetransfers', data=encodeRequest(data)) #logger.debug("updated DocumentId: %s lfn: %s Result %s", docId, source_lfn, result) except Exception as ex: logger.error( "Error updating status for DocumentId: %s lfn: %s", docId, source_lfn) logger.error("Error reason: %s", ex) nMarked += 1 #if nMarked % 10 == 0: logger.info('marked %d files as Failed', nMarked) # find the location in the current environment of the script we want to run import Publisher.TaskPublish as tp taskPublishScript = tp.__file__ cmd = "python %s " % taskPublishScript cmd += " --configFile=%s" % self.configurationFile cmd += " --taskname=%s" % workflow if self.TPconfig.dryRun: cmd += " --dry" logger.info("Now execute: %s", cmd) stdout, stderr, exitcode = executeCommand(cmd) if exitcode != 0: errorMsg = 'Failed to execute command: %s.\n StdErr: %s.' % ( cmd, stderr) raise Exception(errorMsg) else: logger.info('TaskPublishScript done : %s', stdout) jsonSummary = stdout.split()[-1] with open(jsonSummary, 'r') as fd: summary = json.load(fd) result = summary['result'] reason = summary['reason'] taskname = summary['taskname'] if result == 'OK': if reason == 'NOTHING TO DO': logger.info('Taskname %s is OK. Nothing to do', taskname) else: msg = 'Taskname %s is OK. Published %d files in %d blocks.' % \ (taskname, summary['publishedFiles'], summary['publishedBlocks']) if summary['nextIterFiles']: msg += ' %d files left for next iteration.' % summary[ 'nextIterFiles'] logger.info(msg) if result == 'FAIL': logger.error('Taskname %s : TaskPublish failed with: %s', taskname, reason) if reason == 'DBS Publication Failure': logger.error( 'Taskname %s : %d blocks failed for a total of %d files', taskname, summary['failedBlocks'], summary['failedFiles']) logger.error( 'Taskname %s : Failed block(s) details have been saved in %s', taskname, summary['failedBlockDumps']) except Exception as ex: logger.exception("Exception when calling TaskPublish!\n%s", str(ex)) return 0
def collectReportData(self): """ Gather information from the server, status2, DBS and files in the webdir that is needed for the report. """ reportData = {} serverFactory = CRABClient.Emulator.getEmulator('rest') server = serverFactory(self.serverurl, self.proxyfilename, self.proxyfilename, version=__version__) self.logger.debug('Looking up report for task %s' % self.cachedinfo['RequestName']) # Query server for information from the taskdb, intput/output file metadata from metadatadb dictresult, status, reason = server.get( self.uri, data={ 'workflow': self.cachedinfo['RequestName'], 'subresource': 'report2' }) self.logger.debug("Result: %s" % dictresult) self.logger.info( "Running crab status2 first to fetch necessary information.") # Get job statuses crabDBInfo, shortResult = getMutedStatusInfo(self.logger) if not shortResult: # No point in continuing if the job list is empty. # Can happen when the task is very new / old and the files necessary for status2 # are unavailable. return None reportData['jobList'] = shortResult['jobList'] reportData['runsAndLumis'] = {} # Transform status joblist (tuples of job status and job id) into a dictionary jobStatusDict = {} for status, jobId in reportData['jobList']: jobStatusDict[jobId] = status # Filter output/input file metadata by finished job state for jobId in jobStatusDict: if jobStatusDict.get(jobId) in ['finished']: reportData['runsAndLumis'][jobId] = dictresult['result'][0][ 'runsAndLumis'][jobId] reportData['publication'] = True if getColumn( crabDBInfo, 'tm_publication') == "T" else False userWebDirURL = getColumn(crabDBInfo, 'tm_user_webdir') numJobs = len(shortResult['jobList']) reportData['lumisToProcess'] = self.getLumisToProcess( userWebDirURL, numJobs, self.cachedinfo['RequestName']) reportData['inputDataset'] = getColumn(crabDBInfo, 'tm_input_dataset') inputDatasetInfo = self.getInputDatasetLumis( reportData['inputDataset'], userWebDirURL)['inputDataset'] reportData['inputDatasetLumis'] = inputDatasetInfo['lumis'] reportData['inputDatasetDuplicateLumis'] = inputDatasetInfo[ 'duplicateLumis'] reportData['outputDatasets'] = dictresult['result'][0]['taskDBInfo'][ 'outputDatasets'] if reportData['publication']: reportData['outputDatasetsInfo'] = self.getDBSPublicationInfo( reportData['outputDatasets']) return reportData
def startSlave(self, task): """ start a slave process to deal with publication for a single task :param task: one tupla describing a task as returned by active_tasks() :return: 0 It will always terminate normally, if publication fails it will mark it in the DB """ # TODO: lock task! # - process logger logger = setSlaveLogger(str(task[0][3])) logger.info("Process %s is starting. PID %s", task[0][3], os.getpid()) self.force_publication = False workflow = str(task[0][3]) if int(workflow[0:2]) < 20: msg = "Skipped. Ignore tasks created before 2020." logger.info(msg) return 0 if len(task[1]) > self.max_files_per_block: self.force_publication = True msg = "All datasets have more than %s ready files." % ( self.max_files_per_block) msg += " No need to retrieve task status nor last publication time." logger.info(msg) else: msg = "At least one dataset has less than %s ready files." % ( self.max_files_per_block) logger.info(msg) # Retrieve the workflow status. If the status can not be retrieved, continue # with the next workflow. workflow_status = '' msg = "Retrieving status" logger.info(msg) uri = self.REST_workflow data = encodeRequest({'workflow': workflow}) try: res = self.crabServer.get(uri=uri, data=data) except Exception as ex: logger.warn('Error retrieving status from %s for %s.', uri, workflow) return 0 try: workflow_status = res[0]['result'][0]['status'] msg = "Task status is %s." % workflow_status logger.info(msg) except ValueError: msg = "Workflow removed from WM." logger.error(msg) workflow_status = 'REMOVED' except Exception as ex: msg = "Error loading task status!" msg += str(ex) msg += str(traceback.format_exc()) logger.error(msg) # If the workflow status is terminal, go ahead and publish all the ready files # in the workflow. if workflow_status in ['COMPLETED', 'FAILED', 'KILLED', 'REMOVED']: self.force_publication = True if workflow_status in ['KILLED', 'REMOVED']: self.force_failure = True msg = "Considering task status as terminal. Will force publication." logger.info(msg) # Otherwise... else: msg = "Task status is not considered terminal." logger.info(msg) msg = "Getting last publication time." logger.info(msg) # Get when was the last time a publication was done for this workflow (this # should be more or less independent of the output dataset in case there are # more than one). last_publication_time = None data = encodeRequest({ 'workflow': workflow, 'subresource': 'search' }) try: result = self.crabServer.get(self.REST_task, data) logger.debug("task: %s ", str(result[0])) last_publication_time = getColumn(result[0], 'tm_last_publication') except Exception as ex: logger.error("Error during task doc retrieving: %s", ex) if last_publication_time: date = last_publication_time # datetime in Oracle format timetuple = datetime.strptime( date, "%Y-%m-%d %H:%M:%S.%f").timetuple( ) # convert to time tuple last_publication_time = time.mktime( timetuple) # convert to seconds since Epoch (float) msg = "Last publication time: %s." % str(last_publication_time) logger.debug(msg) # If this is the first time a publication would be done for this workflow, go # ahead and publish. if not last_publication_time: self.force_publication = True msg = "There was no previous publication. Will force publication." logger.info(msg) # Otherwise... else: last = last_publication_time msg = "Last published block time: %s" % last logger.debug(msg) # If the last publication was long time ago (> our block publication timeout), # go ahead and publish. now = int(time.time()) - time.timezone time_since_last_publication = now - last hours = int(time_since_last_publication / 60 / 60) minutes = int( (time_since_last_publication - hours * 60 * 60) / 60) timeout_hours = int(self.block_publication_timeout / 60 / 60) timeout_minutes = int((self.block_publication_timeout - timeout_hours * 60 * 60) / 60) msg = "Last publication was %sh:%sm ago" % (hours, minutes) if time_since_last_publication > self.block_publication_timeout: self.force_publication = True msg += " (more than the timeout of %sh:%sm)." % ( timeout_hours, timeout_minutes) msg += " Will force publication." else: msg += " (less than the timeout of %sh:%sm)." % ( timeout_hours, timeout_minutes) msg += " Not enough to force publication." logger.info(msg) # logger.info(task[1]) try: if self.force_publication: # - get info active_ = [{ 'key': [ x['username'], x['user_group'], x['user_role'], x['taskname'] ], 'value': [ x['destination'], x['source_lfn'], x['destination_lfn'], x['input_dataset'], x['dbs_url'], x['last_update'] ] } for x in task[1] if x['transfer_state'] == 3 and x['publication_state'] not in [2, 3, 5]] lfn_ready = [] wf_jobs_endtime = [] pnn, input_dataset, input_dbs_url = "", "", "" for active_file in active_: job_end_time = active_file['value'][5] if job_end_time: wf_jobs_endtime.append( int(job_end_time) - time.timezone) source_lfn = active_file['value'][1] dest_lfn = active_file['value'][2] self.lfn_map[dest_lfn] = source_lfn if not pnn or not input_dataset or not input_dbs_url: pnn = str(active_file['value'][0]) input_dataset = str(active_file['value'][3]) input_dbs_url = str(active_file['value'][4]) lfn_ready.append(dest_lfn) username = task[0][0] # Get metadata toPublish = [] publDescFiles_list = self.getPublDescFiles( workflow, lfn_ready, logger) for file_ in active_: for _, doc in enumerate(publDescFiles_list): # logger.info(type(doc)) # logger.info(doc) if doc["lfn"] == file_["value"][2]: doc["User"] = username doc["Group"] = file_["key"][1] doc["Role"] = file_["key"][2] doc["UserDN"] = self.myDN doc["Destination"] = file_["value"][0] doc["SourceLFN"] = file_["value"][1] toPublish.append(doc) with open(self.taskFilesDir + workflow + '.json', 'w') as outfile: json.dump(toPublish, outfile) # find the location in the current environment of the script we want to run import Publisher.TaskPublish as tp taskPublishScript = tp.__file__ cmd = "python %s " % taskPublishScript cmd += " --configFile=%s" % self.configurationFile cmd += " --taskname=%s" % workflow if self.TPconfig.dryRun: cmd += " --dry" logger.info("Now execute: %s", cmd) subprocess.call(cmd, shell=True) logger.info('ALL DONE') except Exception: logger.exception("Exception when calling TaskPublish!") return 0