def handleNewTask(resthost, resturi, config, task, procnum, *args, **kwargs):
    """Queue every action needed to inject a new task and run them.

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters, handed over as-is to ``handler.actionWork``
    :return: the value returned by ``handler.actionWork(args, kwargs)``."""
    cert = config.TaskWorker.cmscert
    key = config.TaskWorker.cmskey
    server = HTTPRequests(resthost, cert, key, retry=20,
                          logger=logging.getLogger(str(procnum)))
    handler = TaskHandler(task, procnum, server, config, 'handleNewTask',
                          createTempDir=True)

    def enqueue(actionClass, **extra):
        """Build one action with the keyword arguments shared by all of them and queue it."""
        handler.addWork(actionClass(config=config, server=server,
                                    resturi=resturi, procnum=procnum, **extra))

    # Credentials (24 hours) and stageout check come first for every job type.
    enqueue(MyProxyLogon, myproxylen=24 * 60 * 60)
    enqueue(StageoutCheck)

    # Input discovery depends on the job type; other job types queue nothing here.
    jobType = task['tm_job_type']
    if jobType == 'PrivateMC':
        enqueue(MakeFakeFileSet)
    elif jobType == 'Analysis':
        enqueue(UserDataDiscovery if task.get('tm_user_files') else DBSDataDiscovery)

    enqueue(Splitter)
    enqueue(DagmanCreator)

    # A dry run ('T') uploads instead of submitting.
    enqueue(DryRunUploader if task['tm_dry_run'] == 'T' else DagmanSubmitter)

    return handler.actionWork(args, kwargs)
def handleKill(resthost, dbInstance, config, task, procnum, *args, **kwargs):
    """Queue the actions that kill the jobs of a task and run them.

    :arg str resthost: the hostname where the rest interface is running
    :arg str dbInstance: value passed to ``crabserver.setDbInstance``
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters, handed over as-is to ``handler.actionWork``
    :return: the result of the handler operation."""
    taskWorkerConf = config.TaskWorker
    crabserver = CRABRest(resthost,
                          taskWorkerConf.cmscert,
                          taskWorkerConf.cmskey,
                          retry=20,
                          logger=logging.getLogger(str(procnum)),
                          userAgent='CRABTaskWorker',
                          version=__version__)
    crabserver.setDbInstance(dbInstance)

    handler = TaskHandler(task, procnum, crabserver, config, 'handleKill')

    # Keyword arguments common to both actions.
    shared = {'config': config, 'crabserver': crabserver, 'procnum': procnum}

    # A five minute proxy is requested for the kill.
    proxyLogon = MyProxyLogon(myproxylen=5 * 60, **shared)
    handler.addWork(proxyLogon)

    killer = DagmanKiller(**shared)
    handler.addWork(killer)

    return handler.actionWork(args, kwargs)
def handleKill(resthost, resturi, config, task, procnum, *args, **kwargs):
    """Queue the actions that kill the jobs of a task and run them.

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters, handed over as-is to ``handler.actionWork``
    :return: the result of the handler operation."""
    cert = config.TaskWorker.cmscert
    key = config.TaskWorker.cmskey
    server = HTTPRequests(resthost, cert, key, retry=20,
                          logger=logging.getLogger(str(procnum)))
    handler = TaskHandler(task, procnum, server, config, 'handleKill')

    def enqueue(actionClass, **extra):
        """Build one action with the keyword arguments shared by all of them and queue it."""
        handler.addWork(actionClass(config=config, server=server,
                                    resturi=resturi, procnum=procnum, **extra))

    # A five minute proxy is requested for the kill.
    enqueue(MyProxyLogon, myproxylen=5 * 60)
    enqueue(DagmanKiller)

    return handler.actionWork(args, kwargs)
def handleKill(instance, resturl, config, task, *args, **kwargs):
    """Asks to kill jobs

    :arg str instance: the hostname where the rest interface is running
    :arg str resturl: the rest base url to contact
    :arg WMCore.Configuration config: input configuration; ``config.TaskWorker.backend``
                                      (default ``DEFAULT_BACKEND``) selects the backend
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :*args and *kwargs: extra parameters, handed over as-is to ``handler.actionWork``
    :return: the result of the handler operation.
    :raises KeyError: when the configured backend is neither 'glidein' nor 'panda'
    :raises NotImplementedError: when the 'glidein' backend is selected."""
    server = HTTPRequests(instance, config.TaskWorker.cmscert, config.TaskWorker.cmskey,
                          version=__version__)
    handler = TaskHandler(task)
    handler.addWork(MyProxyLogon(config=config, server=server, resturl=resturl,
                                 myproxylen=60 * 5))

    def glidein(config):
        """Performs kill of jobs sent through Glidein

        :arg WMCore.Configuration config: input configuration"""
        # Not supported here; the original carried a disabled
        # DagmanKiller(glideinconfig=config, server=server, resturl=resturl) call.
        raise NotImplementedError

    def panda(config):
        """Performs kill of jobs sent through PanDA

        :arg WMCore.Configuration config: input configuration"""
        handler.addWork(PanDAKill(pandaconfig=config, server=server, resturl=resturl))

    # Explicit dispatch table instead of locals(): only real backends can be
    # selected from the configuration, never another local such as `server`.
    backends = {'glidein': glidein, 'panda': panda}
    backendName = getattr(config.TaskWorker, 'backend', DEFAULT_BACKEND).lower()
    if backendName not in backends:
        # KeyError is what the locals() lookup raised for an unknown name.
        raise KeyError("Unsupported TaskWorker backend '%s'; expected one of: %s"
                       % (backendName, ', '.join(sorted(backends))))
    backends[backendName](config)

    return handler.actionWork(args, kwargs)
def handleNewTask(resthost, resturi, config, task, *args, **kwargs):
    """Performs the injection of a new task

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration; ``config.TaskWorker.backend``
                                      (default ``DEFAULT_BACKEND``) selects the backend
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :*args and *kwargs: extra parameters; only ``args`` is handed to ``handler.actionWork``
    :return: the value returned by ``handler.actionWork(args)``.
    :raises KeyError: when the configured backend is neither 'glidein' nor 'panda'."""
    server = HTTPRequests(resthost, config.TaskWorker.cmscert, config.TaskWorker.cmskey)
    handler = TaskHandler(task)
    handler.addWork(MyProxyLogon(config=config, server=server, resturi=resturi,
                                 myproxylen=60 * 60 * 24))

    if task['tm_job_type'] == 'Analysis':
        # `or {}` also covers a 'tm_arguments' key that is present but None.
        if (task.get('tm_arguments') or {}).get('userfiles'):
            handler.addWork(UserDataDiscovery(config=config, server=server, resturi=resturi))
        else:
            handler.addWork(DBSDataDiscovery(config=config, server=server, resturi=resturi))
    elif task['tm_job_type'] == 'PrivateMC':
        handler.addWork(MakeFakeFileSet(config=config, server=server, resturi=resturi))
    handler.addWork(Splitter(config=config, server=server, resturi=resturi))

    def glidein(config):
        """Performs the injection of a new task into Glidein

        :arg WMCore.Configuration config: input configuration"""
        handler.addWork(DagmanCreator(config=config, server=server, resturi=resturi))
        handler.addWork(DagmanSubmitter(config=config, server=server, resturi=resturi))

    def panda(config):
        """Performs the injection into PanDA of a new task

        :arg WMCore.Configuration config: input configuration"""
        handler.addWork(PanDABrokerage(pandaconfig=config, server=server, resturi=resturi))
        handler.addWork(PanDAInjection(pandaconfig=config, server=server, resturi=resturi))

    # Explicit dispatch table instead of locals(): only real backends can be
    # selected from the configuration, never another local such as `server`.
    backends = {'glidein': glidein, 'panda': panda}
    backendName = getattr(config.TaskWorker, 'backend', DEFAULT_BACKEND).lower()
    if backendName not in backends:
        # KeyError is what the locals() lookup raised for an unknown name.
        raise KeyError("Unsupported TaskWorker backend '%s'; expected one of: %s"
                       % (backendName, ', '.join(sorted(backends))))
    backends[backendName](config)

    return handler.actionWork(args)
def handleResubmit(resthost, resturi, config, task, *args, **kwargs):
    """Performs the re-injection of failed jobs

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration; ``config.TaskWorker.backend``
                                      (default ``DEFAULT_BACKEND``) selects the backend
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :*args and *kwargs: extra parameters; only ``args`` is handed to ``handler.actionWork``
    :return: the result of the handler operation.
    :raises KeyError: when the configured backend is neither 'glidein' nor 'panda'."""
    server = HTTPRequests(resthost, config.TaskWorker.cmscert, config.TaskWorker.cmskey)
    handler = TaskHandler(task)
    handler.addWork(MyProxyLogon(config=config, server=server, resturi=resturi,
                                 myproxylen=60 * 60 * 24))

    def glidein(config):
        """Performs the re-injection into Glidein

        :arg WMCore.Configuration config: input configuration"""
        handler.addWork(DagmanResubmitter(config=config, server=server, resturi=resturi))

    def panda(config):
        """Performs the re-injection into PanDA

        :arg WMCore.Configuration config: input configuration"""
        handler.addWork(PanDAgetSpecs(pandaconfig=config, server=server, resturi=resturi))
        handler.addWork(PanDASpecs2Jobs(pandaconfig=config, server=server, resturi=resturi))
        handler.addWork(PanDABrokerage(pandaconfig=config, server=server, resturi=resturi))
        handler.addWork(PanDAInjection(pandaconfig=config, server=server, resturi=resturi))

    # Explicit dispatch table instead of locals(): only real backends can be
    # selected from the configuration, never another local such as `server`.
    backends = {'glidein': glidein, 'panda': panda}
    backendName = getattr(config.TaskWorker, 'backend', DEFAULT_BACKEND).lower()
    if backendName not in backends:
        # KeyError is what the locals() lookup raised for an unknown name.
        raise KeyError("Unsupported TaskWorker backend '%s'; expected one of: %s"
                       % (backendName, ', '.join(sorted(backends))))
    backends[backendName](config)

    return handler.actionWork(args)
def handleKill(resthost, resturi, config, task, procnum, *args, **kwargs):
    """Asks to kill jobs

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration; ``config.TaskWorker.backend``
                                      (default ``DEFAULT_BACKEND``) selects the backend
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters, handed over as-is to ``handler.actionWork``
    :return: the result of the handler operation.
    :raises KeyError: when the configured backend is not 'glidein', the only one
                      this handler defines."""
    server = HTTPRequests(resthost, config.TaskWorker.cmscert, config.TaskWorker.cmskey,
                          retry=2)
    handler = TaskHandler(task, procnum, server, 'handleKill')
    handler.addWork(MyProxyLogon(config=config, server=server, resturi=resturi,
                                 procnum=procnum, myproxylen=60 * 5))

    def glidein(config):
        """Performs kill of jobs sent through Glidein

        :arg WMCore.Configuration config: input configuration"""
        handler.addWork(DagmanKiller(config=config, server=server, resturi=resturi,
                                     procnum=procnum))

    # Explicit dispatch table instead of locals(): only real backends can be
    # selected from the configuration, never another local such as `server`.
    backends = {'glidein': glidein}
    backendName = getattr(config.TaskWorker, 'backend', DEFAULT_BACKEND).lower()
    if backendName not in backends:
        # KeyError is what the locals() lookup raised for an unknown name.
        raise KeyError("Unsupported TaskWorker backend '%s'; expected one of: %s"
                       % (backendName, ', '.join(sorted(backends))))
    backends[backendName](config)

    return handler.actionWork(args, kwargs)
def handleNewTask(instance, resturl, config, task, *args, **kwargs):
    """Performs the injection of a new task

    :arg str instance: the hostname where the rest interface is running
    :arg str resturl: the rest base url to contact
    :arg WMCore.Configuration config: input configuration; ``config.TaskWorker.backend``
                                      (default ``DEFAULT_BACKEND``) selects the backend
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :*args and *kwargs: extra parameters; only ``args`` is handed to ``handler.actionWork``
    :return: the value returned by ``handler.actionWork(args)``.
    :raises KeyError: when the configured backend is neither 'glidein' nor 'panda'
    :raises NotImplementedError: when the 'glidein' backend is selected."""
    server = HTTPRequests(instance, config.TaskWorker.cmscert, config.TaskWorker.cmskey,
                          version=__version__)
    handler = TaskHandler(task)
    handler.addWork(MyProxyLogon(config=config, server=server, resturl=resturl,
                                 myproxylen=60 * 60 * 24))

    if task['tm_job_type'] == 'Analysis':
        handler.addWork(DBSDataDiscovery(config=config, server=server, resturl=resturl))
        handler.addWork(LumiMaskBuilder(config=config, server=server, resturl=resturl))
    elif task['tm_job_type'] == 'PrivateMC':
        handler.addWork(MakeFakeFileSet(config=config, server=server, resturl=resturl))
    handler.addWork(Splitter(config=config, server=server, resturl=resturl))

    def glidein(config):
        """Performs the injection of a new task into Glidein

        :arg WMCore.Configuration config: input configuration"""
        # Not supported here; the original carried a disabled
        # DagmanCreator(glideinconfig=config, server=server, resturl=resturl) call.
        raise NotImplementedError

    def panda(config):
        """Performs the injection into PanDA of a new task

        :arg WMCore.Configuration config: input configuration"""
        handler.addWork(PanDABrokerage(pandaconfig=config, server=server, resturl=resturl))
        handler.addWork(PanDAInjection(pandaconfig=config, server=server, resturl=resturl))

    # Explicit dispatch table instead of locals(): only real backends can be
    # selected from the configuration, never another local such as `server`.
    backends = {'glidein': glidein, 'panda': panda}
    backendName = getattr(config.TaskWorker, 'backend', DEFAULT_BACKEND).lower()
    if backendName not in backends:
        # KeyError is what the locals() lookup raised for an unknown name.
        raise KeyError("Unsupported TaskWorker backend '%s'; expected one of: %s"
                       % (backendName, ', '.join(sorted(backends))))
    backends[backendName](config)

    return handler.actionWork(args)
def _execute(self, resthost, resturi, config, task):
    """Set TAPERECALL tasks whose DDM request is completed back to NEW.

    Fetches the tasks in TAPERECALL status through a MasterWorker, queries the
    DDM server for each task's request and, when the request status is
    "completed", updates the task to NEW and deletes its warnings.

    :arg resthost: not used by this implementation
    :arg resturi: not used by this implementation
    :arg WMCore.Configuration config: input configuration (``config.TaskWorker`` is read)
    :arg task: not used by this implementation
    :return: None."""
    mw = MasterWorker(config, quiet=False, debug=True, test=False)

    tapeRecallStatus = 'TAPERECALL'
    self.logger.info("Retrieving %s tasks", tapeRecallStatus)
    recallingTasks = mw.getWork(limit=999999, getstatus=tapeRecallStatus)
    if not recallingTasks:
        return

    self.logger.info("Retrieved a total of %d %s tasks", len(recallingTasks), tapeRecallStatus)
    self.logger.debug("Retrieved the following %s tasks: \n%s", tapeRecallStatus, str(recallingTasks))

    for recallingTask in recallingTasks:
        reqId = recallingTask['tm_DDM_reqid']
        taskName = recallingTask['tm_taskname']
        if not reqId:
            self.logger.debug("tm_DDM_reqid' is not defined for task %s, skipping such task", taskName)
            continue

        ddmRequest = statusRequest(reqId, config.TaskWorker.DDMServer,
                                   config.TaskWorker.cmscert, config.TaskWorker.cmskey,
                                   verbose=False)
        self.logger.info("Contacted %s using %s and %s, got:\n%s",
                         config.TaskWorker.DDMServer, config.TaskWorker.cmscert,
                         config.TaskWorker.cmskey, ddmRequest)

        # The query above returns a JSON with a format
        # {"result": "OK", "message": "Request found", "data": [{"request_id": 14, "site": <site>,
        #  "item": [<list of blocks>], "group": "AnalysisOps", "n": 1, "status": "new",
        #  "first_request": "2018-02-26 23:25:41", "last_request": "2018-02-26 23:25:41", "request_count": 1}]}
        # An answer without that shape must not abort the remaining tasks.
        try:
            status = ddmRequest["data"][0]["status"]
        except (KeyError, IndexError, TypeError):
            self.logger.error("Unexpected answer from %s for request %s of task %s, skipping such task",
                              config.TaskWorker.DDMServer, reqId, taskName)
            continue

        # possible values: new, activated, updated, completed, rejected, cancelled
        if status != "completed":
            continue

        self.logger.info("Request %d is completed, setting status of task %s to NEW", reqId, taskName)
        mw.updateWork(taskName, recallingTask['tm_task_command'], 'NEW')

        # Delete all task warnings (the tapeRecallStatus added a dataset warning which is no longer valid now)
        server = HTTPRequests(config.TaskWorker.resturl, config.TaskWorker.cmscert,
                              config.TaskWorker.cmskey, retry=20, logger=self.logger)
        mpl = MyProxyLogon(config=config, server=server,
                           resturi=config.TaskWorker.restURInoAPI,
                           myproxylen=self.pollingTime)
        mpl.execute(task=recallingTask)  # this adds 'user_proxy' to recallingTask
        mpl.deleteWarnings(recallingTask['user_proxy'], taskName)
def _execute(self, resthost, resturi, config, task):
    """Set TAPERECALL tasks whose DDM request is completed back to NEW.

    For every task in TAPERECALL status the input sandbox is downloaded from the
    task's cache url (best effort, failures are only logged), then the DDM
    server is queried. When the request status is "completed" the task is
    updated to NEW and its warnings are deleted.

    :arg resthost: not used by this implementation
    :arg resturi: not used by this implementation
    :arg WMCore.Configuration config: input configuration (``config.TaskWorker`` is read)
    :arg task: not used by this implementation
    :return: None."""
    mw = MasterWorker(config, logWarning=False, logDebug=False, sequential=True, console=False)

    tapeRecallStatus = 'TAPERECALL'
    self.logger.info("Retrieving %s tasks", tapeRecallStatus)
    recallingTasks = mw.getWork(limit=999999, getstatus=tapeRecallStatus)
    if not recallingTasks:
        self.logger.info("No %s task retrieved.", tapeRecallStatus)
        return

    self.logger.info("Retrieved a total of %d %s tasks", len(recallingTasks), tapeRecallStatus)
    self.logger.debug("Retrieved the following %s tasks: \n%s", tapeRecallStatus, str(recallingTasks))

    # Imported once here rather than on every loop iteration.
    from WMCore.Services.UserFileCache.UserFileCache import UserFileCache

    for recallingTask in recallingTasks:
        reqId = recallingTask['tm_DDM_reqid']
        taskName = recallingTask['tm_taskname']
        if not reqId:
            self.logger.debug("tm_DDM_reqid' is not defined for task %s, skipping such task", taskName)
            continue

        # Make sure the task sandbox in the crabcache is not deleted until the tape recall is completed
        ufc = UserFileCache({'endpoint': recallingTask['tm_cache_url'], "pycurl": True})
        sandbox = recallingTask['tm_user_sandbox'].replace(".tar.gz", "")
        try:
            ufc.download(sandbox, sandbox, recallingTask['tm_username'])
        except Exception as ex:  # deliberate best effort: retried at the next occurrence
            self.logger.exception(ex)
            self.logger.info("The CRAB3 server backend could not download the input sandbox (%s) from the frontend (%s) using the '%s' username."
                             " This could be a temporary glitch, will try again in next occurrence of the recurring action."
                             " Error reason:\n%s",
                             sandbox, recallingTask['tm_cache_url'], recallingTask['tm_username'], str(ex))
        finally:
            # Remove the local copy even when the download failed half way.
            if os.path.exists(sandbox):
                os.remove(sandbox)

        ddmRequest = statusRequest(reqId, config.TaskWorker.DDMServer,
                                   config.TaskWorker.cmscert, config.TaskWorker.cmskey,
                                   verbose=False)
        self.logger.info("Contacted %s using %s and %s, got:\n%s",
                         config.TaskWorker.DDMServer, config.TaskWorker.cmscert,
                         config.TaskWorker.cmskey, ddmRequest)

        # The query above returns a JSON with a format
        # {"result": "OK", "message": "Request found", "data": [{"request_id": 14, "site": <site>,
        #  "item": [<list of blocks>], "group": "AnalysisOps", "n": 1, "status": "new",
        #  "first_request": "2018-02-26 23:25:41", "last_request": "2018-02-26 23:25:41", "request_count": 1}]}
        # An answer without that shape must not abort the remaining tasks.
        try:
            status = ddmRequest["data"][0]["status"]
        except (KeyError, IndexError, TypeError):
            self.logger.error("Unexpected answer from %s for request %s of task %s, skipping such task",
                              config.TaskWorker.DDMServer, reqId, taskName)
            continue

        # possible values: new, activated, updated, completed, rejected, cancelled
        if status != "completed":
            continue

        self.logger.info("Request %d is completed, setting status of task %s to NEW", reqId, taskName)
        mw.updateWork(taskName, recallingTask['tm_task_command'], 'NEW')

        # Delete all task warnings (the tapeRecallStatus added a dataset warning which is no longer valid now)
        server = HTTPRequests(config.TaskWorker.resturl, config.TaskWorker.cmscert,
                              config.TaskWorker.cmskey, retry=20, logger=self.logger)
        mpl = MyProxyLogon(config=config, server=server,
                           resturi=config.TaskWorker.restURInoAPI,
                           myproxylen=self.pollingTime)
        mpl.execute(task=recallingTask)  # this adds 'user_proxy' to recallingTask
        mpl.deleteWarnings(recallingTask['user_proxy'], taskName)
def _execute(self, resthost, resturi, config, task):
    """Follow up the DDM requests of tasks in TAPERECALL status.

    For each such task: fail it when it is older than MAX_DAYS_FOR_TAPERECALL
    days; otherwise obtain a user proxy, download the input and debug sandboxes
    (best effort), and query the DDM server. A "completed" request sets the
    task to NEW and deletes its warnings, a "rejected" one fails the task, and
    a request that is not found uploads a warning.

    :arg resthost: not used by this implementation
    :arg resturi: not used by this implementation
    :arg WMCore.Configuration config: input configuration (``config.TaskWorker`` is read)
    :arg task: not used by this implementation
    :return: None."""
    mw = MasterWorker(config, logWarning=False, logDebug=False, sequential=True, console=False)

    tapeRecallStatus = 'TAPERECALL'
    self.logger.info("Retrieving %s tasks", tapeRecallStatus)
    recallingTasks = mw.getWork(limit=999999, getstatus=tapeRecallStatus, ignoreTWName=True)
    if not recallingTasks:
        self.logger.info("No %s task retrieved.", tapeRecallStatus)
        return

    self.logger.info("Retrieved a total of %d %s tasks", len(recallingTasks), tapeRecallStatus)
    for recallingTask in recallingTasks:
        taskName = recallingTask['tm_taskname']
        self.logger.info("Working on task %s", taskName)

        reqId = recallingTask['tm_DDM_reqid']
        if not reqId:
            self.logger.debug("tm_DDM_reqid' is not defined for task %s, skipping such task", taskName)
            continue

        server = HTTPRequests(config.TaskWorker.resturl, config.TaskWorker.cmscert,
                              config.TaskWorker.cmskey, retry=20, logger=self.logger)

        if time.time() - getTimeFromTaskname(str(taskName)) > MAX_DAYS_FOR_TAPERECALL * 24 * 60 * 60:
            self.logger.info("Task %s is older than %d days, setting its status to FAILED",
                             taskName, MAX_DAYS_FOR_TAPERECALL)
            msg = "The disk replica request (ID: %d) for the input dataset did not complete in %d days." % (
                reqId, MAX_DAYS_FOR_TAPERECALL)
            failTask(taskName, server, config.TaskWorker.restURInoAPI + 'workflowdb',
                     msg, self.logger, 'FAILED')
            continue

        mpl = MyProxyLogon(config=config, server=server,
                           resturi=config.TaskWorker.restURInoAPI,
                           myproxylen=self.pollingTime)
        user_proxy = True
        try:
            mpl.execute(task=recallingTask)  # this adds 'user_proxy' to recallingTask
        except TaskWorkerException as twe:
            user_proxy = False
            self.logger.exception(twe)

        # Make sure the task sandbox in the crabcache is not deleted until the tape recall is completed
        if user_proxy:
            from WMCore.Services.UserFileCache.UserFileCache import UserFileCache
            ufc = UserFileCache({'cert': recallingTask['user_proxy'],
                                 'key': recallingTask['user_proxy'],
                                 'endpoint': recallingTask['tm_cache_url'],
                                 "pycurl": True})
            sandbox = recallingTask['tm_user_sandbox'].replace(".tar.gz", "")
            debugFiles = recallingTask['tm_debug_files'].replace(".tar.gz", "")
            sandboxPath = os.path.join("/tmp", sandbox)
            debugFilesPath = os.path.join("/tmp", debugFiles)
            try:
                ufc.download(sandbox, sandboxPath, recallingTask['tm_username'])
                ufc.download(debugFiles, debugFilesPath, recallingTask['tm_username'])
                self.logger.info("Successfully touched input and debug sandboxes (%s and %s) of task %s (frontend: %s) using the '%s' username (request_id = %d).",
                                 sandbox, debugFiles, taskName, recallingTask['tm_cache_url'],
                                 recallingTask['tm_username'], reqId)
            except Exception as ex:  # deliberate best effort: retried at the next occurrence
                self.logger.info("The CRAB3 server backend could not download the input and/or debug sandbox (%s and/or %s) of task %s from the frontend (%s) using the '%s' username (request_id = %d)."
                                 " This could be a temporary glitch, will try again in next occurrence of the recurring action."
                                 " Error reason:\n%s",
                                 sandbox, debugFiles, taskName, recallingTask['tm_cache_url'],
                                 recallingTask['tm_username'], reqId, str(ex))
            finally:
                if os.path.exists(sandboxPath):
                    os.remove(sandboxPath)
                if os.path.exists(debugFilesPath):
                    os.remove(debugFilesPath)

        ddmRequest = statusRequest(reqId, config.TaskWorker.DDMServer,
                                   config.TaskWorker.cmscert, config.TaskWorker.cmskey,
                                   verbose=False)
        # The query above returns a JSON with a format
        # {"result": "OK", "message": "Request found", "data": [{"request_id": 14, "site": <site>,
        #  "item": [<list of blocks>], "group": "AnalysisOps", "n": 1, "status": "new",
        #  "first_request": "2018-02-26 23:25:41", "last_request": "2018-02-26 23:25:41", "request_count": 1}]}
        self.logger.info("Contacted %s using %s and %s for request_id = %d, got:\n%s",
                         config.TaskWorker.DDMServer, config.TaskWorker.cmscert,
                         config.TaskWorker.cmskey, reqId, ddmRequest)

        if ddmRequest["message"] == "Request found":
            status = ddmRequest["data"][0]["status"]
            # possible values: new, activated, updated, completed, rejected, cancelled
            if status == "completed":
                self.logger.info("Request %d is completed, setting status of task %s to NEW", reqId, taskName)
                mw.updateWork(taskName, recallingTask['tm_task_command'], 'NEW')
                # Delete all task warnings (the tapeRecallStatus added a dataset warning which is no longer valid now)
                if user_proxy:
                    mpl.deleteWarnings(recallingTask['user_proxy'], taskName)
            elif status == "rejected":
                msg = "The DDM request (ID: %d) has been rejected with this reason: %s" % (
                    reqId, ddmRequest["data"][0]["reason"])
                # msg holds server-supplied text: pass it as an argument so a '%'
                # in the reason cannot break the log formatting.
                self.logger.info("%s\nSetting status of task %s to FAILED", msg, taskName)
                failTask(taskName, server, config.TaskWorker.restURInoAPI + 'workflowdb',
                         msg, self.logger, 'FAILED')
        else:
            msg = "DDM request_id %d not found. Please report to experts" % reqId
            self.logger.info(msg)
            if user_proxy:
                mpl.uploadWarning(msg, recallingTask['user_proxy'], taskName)
def _execute(self, config, task):
    """Follow up the Rucio rules of tasks in TAPERECALL status.

    Sets up the logger, retrieves the TAPERECALL tasks through a MasterWorker
    and, for each task that has a rule id:
      * fails it when it is older than MAX_DAYS_FOR_TAPERECALL days;
      * when the cache url does not contain 'S3', obtains a user proxy and
        refreshes the sandbox;
      * looks up the replication rule: state 'OK' sets the task to NEW (and
        deletes its warnings when a proxy is available), an expired rule fails
        the task, a missing rule is logged (and uploaded as a warning when a
        proxy is available).

    :arg WMCore.Configuration config: input configuration (``config.TaskWorker`` is read)
    :arg task: not used by this implementation
    :return: None."""
    # setup logger
    if not self.logger:
        self.logger = logging.getLogger(__name__)
        logHandler = logging.StreamHandler(sys.stdout)
        logHandler.setFormatter(logging.Formatter("%(asctime)s:%(levelname)s:%(module)s %(message)s"))
        self.logger.addHandler(logHandler)
        self.logger.setLevel(logging.DEBUG)
    else:
        # do not use BaseRecurringAction logger but create a new logger
        # which writes to config.TaskWorker.logsDir/tasks/recurring/TapeRecallStatus_YYMMDD-HHMM.log
        self.logger = logging.getLogger('TapeRecallStatus')
        logDir = config.TaskWorker.logsDir + '/tasks/recurring/'
        if not os.path.exists(logDir):
            os.makedirs(logDir)
        timeStamp = time.strftime('%y%m%d-%H%M', time.localtime())
        logFile = 'TapeRecallStatus_' + timeStamp + '.log'
        # logging.getLogger returns the same object on every run: drop the file
        # handlers of previous runs so they are closed and records are not
        # duplicated into every older log file.
        for staleHandler in list(self.logger.handlers):
            if isinstance(staleHandler, logging.FileHandler):
                self.logger.removeHandler(staleHandler)
                staleHandler.close()
        logHandler = logging.FileHandler(logDir + logFile)
        logHandler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s:%(module)s:%(message)s'))
        self.logger.addHandler(logHandler)

    mw = MasterWorker(config, logWarning=False, logDebug=False, sequential=True,
                      console=False, name='masterForTapeRecall')

    tapeRecallStatus = 'TAPERECALL'
    self.logger.info("Retrieving %s tasks", tapeRecallStatus)
    recallingTasks = mw.getWork(limit=999999, getstatus=tapeRecallStatus, ignoreTWName=True)
    if not recallingTasks:
        self.logger.info("No %s task retrieved.", tapeRecallStatus)
        return

    self.logger.info("Retrieved a total of %d %s tasks", len(recallingTasks), tapeRecallStatus)
    crabserver = mw.crabserver
    for recallingTask in recallingTasks:
        taskName = recallingTask['tm_taskname']
        self.logger.info("Working on task %s", taskName)

        reqId = recallingTask['tm_DDM_reqid']
        if not reqId:
            self.logger.debug("tm_DDM_reqid' is not defined for task %s, skipping such task", taskName)
            continue
        self.logger.info("Task points to Rucio RuleId: %s ", reqId)

        if (time.time() - getTimeFromTaskname(str(taskName))) > MAX_DAYS_FOR_TAPERECALL * 24 * 60 * 60:
            self.logger.info("Task %s is older than %d days, setting its status to FAILED",
                             taskName, MAX_DAYS_FOR_TAPERECALL)
            msg = "The disk replica request (ID: %s) for the input dataset did not complete in %d days." % (
                reqId, MAX_DAYS_FOR_TAPERECALL)
            failTask(taskName, crabserver, msg, self.logger, 'FAILED')
            continue

        # Reset per task: both names are read further down even when the S3
        # branch below is skipped, and must never leak from a previous task.
        mpl = None
        user_proxy = False
        if 'S3' not in recallingTask['tm_cache_url'].upper():
            # when using old crabcache had to worry about sandbox purging after 3 days
            mpl = MyProxyLogon(config=config, crabserver=crabserver, myproxylen=self.pollingTime)
            try:
                mpl.execute(task=recallingTask)  # this adds 'user_proxy' to recallingTask
                user_proxy = True
            except TaskWorkerException as twe:
                self.logger.exception(twe)
            # Make sure the task sandbox in the crabcache is not deleted until the tape recall is completed
            if user_proxy:
                self.refreshSandbox(recallingTask)

        # Retrieve status of recall request
        if not self.rucioClient:
            self.rucioClient = getNativeRucioClient(config=config, logger=self.logger)
        try:
            ddmRequest = self.rucioClient.get_replication_rule(reqId)
        except RuleNotFound:
            msg = "Rucio rule id %s not found. Please report to experts" % reqId
            self.logger.error(msg)
            if user_proxy:
                mpl.uploadWarning(msg, recallingTask['user_proxy'], taskName)
            # Nothing to inspect for this task; falling through would read an
            # unbound ddmRequest or the one of the previous task.
            continue

        if ddmRequest['state'] == 'OK':
            self.logger.info("Request %s is completed, setting status of task %s to NEW", reqId, taskName)
            mw.updateWork(taskName, recallingTask['tm_task_command'], 'NEW')
            # Delete all task warnings (the tapeRecallStatus added a dataset warning which is no longer valid now)
            # NOTE(review): with an S3 cache no proxy is obtained, so warnings are left in place — confirm intended.
            if user_proxy:
                mpl.deleteWarnings(recallingTask['user_proxy'], taskName)
        else:
            expiration = ddmRequest['expires_at']  # this is a datetime.datetime object
            # NOTE(review): presumably None for a rule without lifetime; treated as "not expired" — confirm.
            if expiration is not None and expiration < datetime.datetime.now():
                # give up waiting
                msg = ("Replication request %s for task %s expired. Setting its status to FAILED"
                       % (reqId, taskName))
                self.logger.info(msg)
                failTask(taskName, crabserver, msg, self.logger, 'FAILED')
def _execute(self, resthost, resturi, config, task):
    """Follow up the DDM requests of tasks in TAPERECALL status.

    Sets up the logger, retrieves the TAPERECALL tasks through a MasterWorker
    and, for each task that has a request id: fails it when it is older than
    MAX_DAYS_FOR_TAPERECALL days; otherwise obtains a user proxy, downloads the
    input and debug sandboxes (best effort) and queries the DDM server. A
    "completed" request sets the task to NEW and deletes its warnings, a
    "rejected" one fails the task, and a request that is not found uploads a
    warning.

    :arg str resthost: host handed to HTTPRequests
    :arg str resturi: rest uri handed to failTask and MyProxyLogon
    :arg WMCore.Configuration config: input configuration (``config.TaskWorker`` is read)
    :arg task: not used by this implementation
    :return: None."""
    # setup logger
    if not self.logger:
        self.logger = logging.getLogger(__name__)
        logHandler = logging.StreamHandler(sys.stdout)
        logHandler.setFormatter(logging.Formatter("%(asctime)s:%(levelname)s:%(module)s %(message)s"))
        self.logger.addHandler(logHandler)
        self.logger.setLevel(logging.DEBUG)
    else:
        # do not use BaseRecurringAction logger but create a new logger
        # which writes to config.TaskWorker.logsDir/tasks/recurring/TapeRecallStatus_YYMMDD-HHMM.log
        self.logger = logging.getLogger('TapeRecallStatus')
        logDir = config.TaskWorker.logsDir + '/tasks/recurring/'
        if not os.path.exists(logDir):
            os.makedirs(logDir)
        timeStamp = time.strftime('%y%m%d-%H%M', time.localtime())
        logFile = 'TapeRecallStatus_' + timeStamp + '.log'
        # logging.getLogger returns the same object on every run: drop the file
        # handlers of previous runs so they are closed and records are not
        # duplicated into every older log file.
        for staleHandler in list(self.logger.handlers):
            if isinstance(staleHandler, logging.FileHandler):
                self.logger.removeHandler(staleHandler)
                staleHandler.close()
        logHandler = logging.FileHandler(logDir + logFile)
        logHandler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s:%(module)s:%(message)s'))
        self.logger.addHandler(logHandler)

    mw = MasterWorker(config, logWarning=False, logDebug=False, sequential=True,
                      console=False, name='masterForTapeRecall')

    tapeRecallStatus = 'TAPERECALL'
    self.logger.info("Retrieving %s tasks", tapeRecallStatus)
    recallingTasks = mw.getWork(limit=999999, getstatus=tapeRecallStatus, ignoreTWName=True)
    if not recallingTasks:
        self.logger.info("No %s task retrieved.", tapeRecallStatus)
        return

    self.logger.info("Retrieved a total of %d %s tasks", len(recallingTasks), tapeRecallStatus)
    for recallingTask in recallingTasks:
        taskName = recallingTask['tm_taskname']
        self.logger.info("Working on task %s", taskName)

        reqId = recallingTask['tm_DDM_reqid']
        if not reqId:
            self.logger.debug("tm_DDM_reqid' is not defined for task %s, skipping such task", taskName)
            continue

        server = HTTPRequests(resthost, config.TaskWorker.cmscert, config.TaskWorker.cmskey,
                              retry=20, logger=self.logger)

        if time.time() - getTimeFromTaskname(str(taskName)) > MAX_DAYS_FOR_TAPERECALL * 24 * 60 * 60:
            self.logger.info("Task %s is older than %d days, setting its status to FAILED",
                             taskName, MAX_DAYS_FOR_TAPERECALL)
            msg = "The disk replica request (ID: %d) for the input dataset did not complete in %d days." % (
                reqId, MAX_DAYS_FOR_TAPERECALL)
            failTask(taskName, server, resturi, msg, self.logger, 'FAILED')
            continue

        mpl = MyProxyLogon(config=config, server=server, resturi=resturi,
                           myproxylen=self.pollingTime)
        user_proxy = True
        try:
            mpl.execute(task=recallingTask)  # this adds 'user_proxy' to recallingTask
        except TaskWorkerException as twe:
            user_proxy = False
            self.logger.exception(twe)

        # Make sure the task sandbox in the crabcache is not deleted until the tape recall is completed
        if user_proxy:
            from WMCore.Services.UserFileCache.UserFileCache import UserFileCache
            ufc = UserFileCache({'cert': recallingTask['user_proxy'],
                                 'key': recallingTask['user_proxy'],
                                 'endpoint': recallingTask['tm_cache_url'],
                                 "pycurl": True})
            sandbox = recallingTask['tm_user_sandbox'].replace(".tar.gz", "")
            debugFiles = recallingTask['tm_debug_files'].replace(".tar.gz", "")
            sandboxPath = os.path.join("/tmp", sandbox)
            debugFilesPath = os.path.join("/tmp", debugFiles)
            try:
                ufc.download(sandbox, sandboxPath, recallingTask['tm_username'])
                ufc.download(debugFiles, debugFilesPath, recallingTask['tm_username'])
                self.logger.info("Successfully touched input and debug sandboxes (%s and %s) of task %s (frontend: %s) using the '%s' username (request_id = %d).",
                                 sandbox, debugFiles, taskName, recallingTask['tm_cache_url'],
                                 recallingTask['tm_username'], reqId)
            except Exception as ex:  # deliberate best effort: retried at the next occurrence
                self.logger.info("The CRAB3 server backend could not download the input and/or debug sandbox (%s and/or %s) of task %s from the frontend (%s) using the '%s' username (request_id = %d)."
                                 " This could be a temporary glitch, will try again in next occurrence of the recurring action."
                                 " Error reason:\n%s",
                                 sandbox, debugFiles, taskName, recallingTask['tm_cache_url'],
                                 recallingTask['tm_username'], reqId, str(ex))
            finally:
                if os.path.exists(sandboxPath):
                    os.remove(sandboxPath)
                if os.path.exists(debugFilesPath):
                    os.remove(debugFilesPath)

        ddmRequest = statusRequest(reqId, config.TaskWorker.DDMServer,
                                   config.TaskWorker.cmscert, config.TaskWorker.cmskey,
                                   verbose=False)
        # The query above returns a JSON with a format
        # {"result": "OK", "message": "Request found", "data": [{"request_id": 14, "site": <site>,
        #  "item": [<list of blocks>], "group": "AnalysisOps", "n": 1, "status": "new",
        #  "first_request": "2018-02-26 23:25:41", "last_request": "2018-02-26 23:25:41", "request_count": 1}]}
        self.logger.info("Contacted %s using %s and %s for request_id = %d, got:\n%s",
                         config.TaskWorker.DDMServer, config.TaskWorker.cmscert,
                         config.TaskWorker.cmskey, reqId, ddmRequest)

        if ddmRequest["message"] == "Request found":
            status = ddmRequest["data"][0]["status"]
            # possible values: new, activated, updated, completed, rejected, cancelled
            if status == "completed":
                self.logger.info("Request %d is completed, setting status of task %s to NEW", reqId, taskName)
                mw.updateWork(taskName, recallingTask['tm_task_command'], 'NEW')
                # Delete all task warnings (the tapeRecallStatus added a dataset warning which is no longer valid now)
                if user_proxy:
                    mpl.deleteWarnings(recallingTask['user_proxy'], taskName)
            elif status == "rejected":
                msg = "The DDM request (ID: %d) has been rejected with this reason: %s" % (
                    reqId, ddmRequest["data"][0]["reason"])
                # msg holds server-supplied text: pass it as an argument so a '%'
                # in the reason cannot break the log formatting.
                self.logger.info("%s\nSetting status of task %s to FAILED", msg, taskName)
                failTask(taskName, server, resturi, msg, self.logger, 'FAILED')
        else:
            msg = "DDM request_id %d not found. Please report to experts" % reqId
            self.logger.info(msg)
            if user_proxy:
                mpl.uploadWarning(msg, recallingTask['user_proxy'], taskName)