Beispiel #1
0
def handleNewTask(resthost, resturi, config, task, procnum, *args, **kwargs):
    """Queue every action needed to inject a new task, then run them.

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters, handed as-is to ``actionWork``
    :return: the value returned by ``handler.actionWork(args, kwargs)``."""
    server = HTTPRequests(resthost,
                          config.TaskWorker.cmscert,
                          config.TaskWorker.cmskey,
                          retry=20,
                          logger=logging.getLogger(str(procnum)))
    handler = TaskHandler(task, procnum, server, config, 'handleNewTask', createTempDir=True)

    def enqueue(actionClass, **extra):
        """Instantiate actionClass with the shared arguments and queue it on the handler."""
        handler.addWork(actionClass(config=config, server=server, resturi=resturi,
                                    procnum=procnum, **extra))

    enqueue(MyProxyLogon, myproxylen=60 * 60 * 24)
    enqueue(StageoutCheck)

    # The data discovery step depends on the job type; any other job type
    # gets no discovery step at all.
    jobType = task['tm_job_type']
    if jobType == 'Analysis':
        enqueue(UserDataDiscovery if task.get('tm_user_files') else DBSDataDiscovery)
    elif jobType == 'PrivateMC':
        enqueue(MakeFakeFileSet)

    enqueue(Splitter)
    enqueue(DagmanCreator)
    # tm_dry_run == 'T' selects DryRunUploader in place of DagmanSubmitter.
    enqueue(DryRunUploader if task['tm_dry_run'] == 'T' else DagmanSubmitter)

    return handler.actionWork(args, kwargs)
Beispiel #2
0
def handleKill(resthost, dbInstance, config, task, procnum, *args, **kwargs):
    """Queue the actions that kill the jobs of a task, then run them.

    :arg str resthost: the hostname where the rest interface is running
    :arg str dbInstance: value handed to ``crabserver.setDbInstance``
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters, handed as-is to ``actionWork``
    :return: the result of the handler operation."""
    twConfig = config.TaskWorker
    crabserver = CRABRest(resthost, twConfig.cmscert, twConfig.cmskey,
                          retry=20, logger=logging.getLogger(str(procnum)),
                          userAgent='CRABTaskWorker', version=__version__)
    crabserver.setDbInstance(dbInstance)

    handler = TaskHandler(task, procnum, crabserver, config, 'handleKill')

    # Arguments common to both actions queued below.
    shared = {'config': config, 'crabserver': crabserver, 'procnum': procnum}
    proxyLogon = MyProxyLogon(myproxylen=60 * 5, **shared)
    handler.addWork(proxyLogon)
    killer = DagmanKiller(**shared)
    handler.addWork(killer)

    return handler.actionWork(args, kwargs)
Beispiel #3
0
def handleKill(resthost, resturi, config, task, procnum, *args, **kwargs):
    """Queue a proxy logon followed by a DAGMan kill for the task and run them.

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters, handed as-is to ``actionWork``
    :return: the result of the handler operation."""
    workerCfg = config.TaskWorker
    server = HTTPRequests(resthost, workerCfg.cmscert, workerCfg.cmskey,
                          retry=20, logger=logging.getLogger(str(procnum)))
    handler = TaskHandler(task, procnum, server, config, 'handleKill')

    def enqueue(actionClass, **extra):
        """Build actionClass with the arguments every action shares and queue it."""
        action = actionClass(config=config, server=server, resturi=resturi,
                             procnum=procnum, **extra)
        handler.addWork(action)

    enqueue(MyProxyLogon, myproxylen=60 * 5)
    enqueue(DagmanKiller)

    return handler.actionWork(args, kwargs)
Beispiel #4
0
def handleKill(instance, resturl, config, task, *args, **kwargs):
    """Asks to kill jobs

    :arg str instance: the hostname where the rest interface is running
    :arg str resturl: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :*args and *kwargs: extra parameters, handed as-is to ``actionWork``
    :return: the result of the handler operation.
    :raises KeyError: if the configured backend is neither 'glidein' nor 'panda'
    :raises NotImplementedError: if the configured backend is 'glidein'"""
    server = HTTPRequests(instance,
                          config.TaskWorker.cmscert,
                          config.TaskWorker.cmskey,
                          version=__version__)
    handler = TaskHandler(task)
    handler.addWork(
        MyProxyLogon(config=config,
                     server=server,
                     resturl=resturl,
                     myproxylen=60 * 5))

    def glidein(backendConfig):
        """Performs kill of jobs sent through Glidein
        :arg WMCore.Configuration backendConfig: input configuration"""
        # Placeholder kept from the original code:
        # handler.addWork( DagmanKiller(glideinconfig=config, server=server, resturl=resturl) )
        raise NotImplementedError

    def panda(backendConfig):
        """Queues the PanDA kill action
        :arg WMCore.Configuration backendConfig: input configuration"""
        handler.addWork(
            PanDAKill(pandaconfig=backendConfig, server=server, resturl=resturl))

    # Explicit dispatch table: only real backends can be selected. Looking the
    # name up in locals() would let a configured value such as 'server' or
    # 'task' pick an unrelated local variable and try to call it.
    backends = {'glidein': glidein, 'panda': panda}
    backendName = getattr(config.TaskWorker, 'backend', DEFAULT_BACKEND).lower()
    backends[backendName](config)
    return handler.actionWork(args, kwargs)
Beispiel #5
0
def handleNewTask(resthost, resturi, config, task, *args, **kwargs):
    """Performs the injection of a new task

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :*args and *kwargs: extra parameters; only args is handed to ``actionWork``
    :return: the value returned by ``handler.actionWork(args)``.
    :raises KeyError: if the configured backend is neither 'glidein' nor 'panda'"""
    server = HTTPRequests(resthost, config.TaskWorker.cmscert, config.TaskWorker.cmskey)
    handler = TaskHandler(task)
    handler.addWork(MyProxyLogon(config=config, server=server, resturi=resturi, myproxylen=60*60*24))
    # The data discovery step depends on the job type; other job types get none.
    if task['tm_job_type'] == 'Analysis':
        if task.get('tm_arguments', {}).get('userfiles'):
            handler.addWork(UserDataDiscovery(config=config, server=server, resturi=resturi))
        else:
            handler.addWork(DBSDataDiscovery(config=config, server=server, resturi=resturi))
    elif task['tm_job_type'] == 'PrivateMC':
        handler.addWork(MakeFakeFileSet(config=config, server=server, resturi=resturi))
    handler.addWork(Splitter(config=config, server=server, resturi=resturi))

    def glidein(backendConfig):
        """Performs the injection of a new task into Glidein
        :arg WMCore.Configuration backendConfig: input configuration"""
        handler.addWork(DagmanCreator(config=backendConfig, server=server, resturi=resturi))
        handler.addWork(DagmanSubmitter(config=backendConfig, server=server, resturi=resturi))

    def panda(backendConfig):
        """Performs the injection into PanDA of a new task
        :arg WMCore.Configuration backendConfig: input configuration"""
        handler.addWork(PanDABrokerage(pandaconfig=backendConfig, server=server, resturi=resturi))
        handler.addWork(PanDAInjection(pandaconfig=backendConfig, server=server, resturi=resturi))

    # Explicit dispatch table: only real backends can be selected. Looking the
    # name up in locals() would let a configured value such as 'server' or
    # 'task' pick an unrelated local variable and try to call it.
    backends = {'glidein': glidein, 'panda': panda}
    backendName = getattr(config.TaskWorker, 'backend', DEFAULT_BACKEND).lower()
    backends[backendName](config)
    return handler.actionWork(args)
Beispiel #6
0
def handleResubmit(resthost, resturi, config, task, *args, **kwargs):
    """Performs the re-injection of failed jobs

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :*args and *kwargs: extra parameters; only args is handed to ``actionWork``
    :return: the result of the handler operation.
    :raises KeyError: if the configured backend is neither 'glidein' nor 'panda'"""
    server = HTTPRequests(resthost, config.TaskWorker.cmscert, config.TaskWorker.cmskey)
    handler = TaskHandler(task)
    handler.addWork(MyProxyLogon(config=config, server=server, resturi=resturi, myproxylen=60*60*24))

    def glidein(backendConfig):
        """Performs the re-injection into Glidein
        :arg WMCore.Configuration backendConfig: input configuration"""
        handler.addWork(DagmanResubmitter(config=backendConfig, server=server, resturi=resturi))

    def panda(backendConfig):
        """Performs the re-injection into PanDA
        :arg WMCore.Configuration backendConfig: input configuration"""
        handler.addWork(PanDAgetSpecs(pandaconfig=backendConfig, server=server, resturi=resturi))
        handler.addWork(PanDASpecs2Jobs(pandaconfig=backendConfig, server=server, resturi=resturi))
        handler.addWork(PanDABrokerage(pandaconfig=backendConfig, server=server, resturi=resturi))
        handler.addWork(PanDAInjection(pandaconfig=backendConfig, server=server, resturi=resturi))

    # Explicit dispatch table: only real backends can be selected. Looking the
    # name up in locals() would let a configured value such as 'server' or
    # 'task' pick an unrelated local variable and try to call it.
    backends = {'glidein': glidein, 'panda': panda}
    backendName = getattr(config.TaskWorker, 'backend', DEFAULT_BACKEND).lower()
    backends[backendName](config)
    return handler.actionWork(args)
Beispiel #7
0
def handleKill(resthost, resturi, config, task, procnum, *args, **kwargs):
    """Asks to kill jobs

    :arg str resthost: the hostname where the rest interface is running
    :arg str resturi: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :arg int procnum: the process number taking care of the work
    :*args and *kwargs: extra parameters, handed as-is to ``actionWork``
    :return: the result of the handler operation.
    :raises KeyError: if the configured backend is not 'glidein'"""
    server = HTTPRequests(resthost,
                          config.TaskWorker.cmscert,
                          config.TaskWorker.cmskey,
                          retry=2)
    handler = TaskHandler(task, procnum, server, 'handleKill')
    handler.addWork(
        MyProxyLogon(config=config,
                     server=server,
                     resturi=resturi,
                     procnum=procnum,
                     myproxylen=60 * 5))

    def glidein(backendConfig):
        """Performs kill of jobs sent through Glidein
        :arg WMCore.Configuration backendConfig: input configuration"""
        handler.addWork(
            DagmanKiller(config=backendConfig,
                         server=server,
                         resturi=resturi,
                         procnum=procnum))

    # Explicit dispatch table: only real backends can be selected. Looking the
    # name up in locals() would let a configured value such as 'server' or
    # 'task' pick an unrelated local variable and try to call it.
    backends = {'glidein': glidein}
    backendName = getattr(config.TaskWorker, 'backend', DEFAULT_BACKEND).lower()
    backends[backendName](config)
    return handler.actionWork(args, kwargs)
Beispiel #8
0
def handleNewTask(instance, resturl, config, task, *args, **kwargs):
    """Performs the injection of a new task

    :arg str instance: the hostname where the rest interface is running
    :arg str resturl: the rest base url to contact
    :arg WMCore.Configuration config: input configuration
    :arg TaskWorker.DataObjects.Task task: the task to work on
    :*args and *kwargs: extra parameters; only args is handed to ``actionWork``
    :return: the value returned by ``handler.actionWork(args)``.
    :raises KeyError: if the configured backend is neither 'glidein' nor 'panda'
    :raises NotImplementedError: if the configured backend is 'glidein'"""
    server = HTTPRequests(instance,
                          config.TaskWorker.cmscert,
                          config.TaskWorker.cmskey,
                          version=__version__)
    handler = TaskHandler(task)
    handler.addWork(
        MyProxyLogon(config=config,
                     server=server,
                     resturl=resturl,
                     myproxylen=60 * 60 * 24))
    # The data discovery step depends on the job type; other job types get none.
    if task['tm_job_type'] == 'Analysis':
        handler.addWork(
            DBSDataDiscovery(config=config, server=server, resturl=resturl))
        handler.addWork(
            LumiMaskBuilder(config=config, server=server, resturl=resturl))
    elif task['tm_job_type'] == 'PrivateMC':
        handler.addWork(
            MakeFakeFileSet(config=config, server=server, resturl=resturl))
    handler.addWork(Splitter(config=config, server=server, resturl=resturl))

    def glidein(backendConfig):
        """Performs the injection of a new task into Glidein
        :arg WMCore.Configuration backendConfig: input configuration"""
        # Placeholder kept from the original code:
        # handler.addWork( DagmanCreator(glideinconfig=config, server=server, resturl=resturl) )
        raise NotImplementedError

    def panda(backendConfig):
        """Performs the injection into PanDA of a new task
        :arg WMCore.Configuration backendConfig: input configuration"""
        handler.addWork(
            PanDABrokerage(pandaconfig=backendConfig, server=server, resturl=resturl))
        handler.addWork(
            PanDAInjection(pandaconfig=backendConfig, server=server, resturl=resturl))

    # Explicit dispatch table: only real backends can be selected. Looking the
    # name up in locals() would let a configured value such as 'server' or
    # 'task' pick an unrelated local variable and try to call it.
    backends = {'glidein': glidein, 'panda': panda}
    backendName = getattr(config.TaskWorker, 'backend', DEFAULT_BACKEND).lower()
    backends[backendName](config)
    return handler.actionWork(args)
Beispiel #9
0
    def _execute(self, resthost, resturi, config, task):
        """Set TAPERECALL tasks whose DDM request is completed back to NEW.

        Retrieves the tasks in TAPERECALL status through a MasterWorker and
        asks the DDM server for the status of each task's request. When that
        status is "completed" the task is set to NEW and its warnings are
        deleted. Tasks without a tm_DDM_reqid are skipped.

        resthost, resturi and task are not used by this method.
        """
        mw = MasterWorker(config, quiet=False, debug=True, test=False)

        tapeRecallStatus = 'TAPERECALL'
        self.logger.info("Retrieving %s tasks", tapeRecallStatus)
        recallingTasks = mw.getWork(limit=999999, getstatus=tapeRecallStatus)
        if len(recallingTasks) == 0:
            return

        self.logger.info("Retrieved a total of %d %s tasks", len(recallingTasks), tapeRecallStatus)
        self.logger.debug("Retrieved the following %s tasks: \n%s", tapeRecallStatus, str(recallingTasks))

        for recall in recallingTasks:
            reqId = recall['tm_DDM_reqid']
            if not reqId:
                self.logger.debug("tm_DDM_reqid' is not defined for task %s, skipping such task",
                                  recall['tm_taskname'])
                continue

            twConfig = config.TaskWorker
            ddmRequest = statusRequest(reqId, twConfig.DDMServer, twConfig.cmscert,
                                       twConfig.cmskey, verbose=False)
            self.logger.info("Contacted %s using %s and %s, got:\n%s",
                             twConfig.DDMServer, twConfig.cmscert, twConfig.cmskey, ddmRequest)

            # The reply is a JSON like {"result": "OK", "message": "Request found",
            # "data": [{"request_id": 14, ..., "status": "new", ...}]}.
            # Possible status values: new, activated, updated, completed, rejected, cancelled
            requestStatus = ddmRequest["data"][0]["status"]
            if requestStatus != "completed":
                continue

            self.logger.info("Request %d is completed, setting status of task %s to NEW",
                             reqId, recall['tm_taskname'])
            mw.updateWork(recall['tm_taskname'], recall['tm_task_command'], 'NEW')

            # Delete all task warnings (the tapeRecallStatus added a dataset
            # warning which is no longer valid now)
            server = HTTPRequests(twConfig.resturl, twConfig.cmscert, twConfig.cmskey,
                                  retry=20, logger=self.logger)
            proxyLogon = MyProxyLogon(config=config, server=server,
                                      resturi=twConfig.restURInoAPI,
                                      myproxylen=self.pollingTime)
            proxyLogon.execute(task=recall)  # this adds 'user_proxy' to the task
            proxyLogon.deleteWarnings(recall['user_proxy'], recall['tm_taskname'])
Beispiel #10
0
    def _execute(self, resthost, resturi, config, task):
        """Refresh sandboxes of TAPERECALL tasks and release the completed ones.

        For every task in TAPERECALL status that has a tm_DDM_reqid, the input
        sandbox is downloaded from the task's cache and removed again, then
        the DDM server is asked for the request status. When that status is
        "completed" the task is set to NEW and its warnings are deleted.

        resthost, resturi and task are not used by this method.
        """
        mw = MasterWorker(config, logWarning=False, logDebug=False, sequential=True, console=False)

        tapeRecallStatus = 'TAPERECALL'
        self.logger.info("Retrieving %s tasks", tapeRecallStatus)
        recallingTasks = mw.getWork(limit=999999, getstatus=tapeRecallStatus)
        if len(recallingTasks) == 0:
            self.logger.info("No %s task retrieved.", tapeRecallStatus)
            return

        self.logger.info("Retrieved a total of %d %s tasks", len(recallingTasks), tapeRecallStatus)
        self.logger.debug("Retrieved the following %s tasks: \n%s", tapeRecallStatus, str(recallingTasks))

        for recall in recallingTasks:
            reqId = recall['tm_DDM_reqid']
            if not reqId:
                self.logger.debug("tm_DDM_reqid' is not defined for task %s, skipping such task",
                                  recall['tm_taskname'])
                continue

            # Make sure the task sandbox in the crabcache is not deleted until the tape recall is completed
            from WMCore.Services.UserFileCache.UserFileCache import UserFileCache
            ufc = UserFileCache({'endpoint': recall['tm_cache_url'], "pycurl": True})
            sandbox = recall['tm_user_sandbox'].replace(".tar.gz", "")
            try:
                ufc.download(sandbox, sandbox, recall['tm_username'])
                os.remove(sandbox)
            except Exception as ex:
                # Best effort: log the failure and carry on with the status check.
                self.logger.exception(ex)
                self.logger.info(
                    "The CRAB3 server backend could not download the input sandbox (%s) from the frontend (%s) using the '%s' username."
                    " This could be a temporary glitch, will try again in next occurrence of the recurring action."
                    " Error reason:\n%s",
                    sandbox, recall['tm_cache_url'], recall['tm_username'], str(ex))

            twConfig = config.TaskWorker
            ddmRequest = statusRequest(reqId, twConfig.DDMServer, twConfig.cmscert,
                                       twConfig.cmskey, verbose=False)
            self.logger.info("Contacted %s using %s and %s, got:\n%s",
                             twConfig.DDMServer, twConfig.cmscert, twConfig.cmskey, ddmRequest)

            # The reply is a JSON like {"result": "OK", "message": "Request found",
            # "data": [{"request_id": 14, ..., "status": "new", ...}]}.
            # Possible status values: new, activated, updated, completed, rejected, cancelled
            requestStatus = ddmRequest["data"][0]["status"]
            if requestStatus != "completed":
                continue

            self.logger.info("Request %d is completed, setting status of task %s to NEW",
                             reqId, recall['tm_taskname'])
            mw.updateWork(recall['tm_taskname'], recall['tm_task_command'], 'NEW')

            # Delete all task warnings (the tapeRecallStatus added a dataset
            # warning which is no longer valid now)
            server = HTTPRequests(twConfig.resturl, twConfig.cmscert, twConfig.cmskey,
                                  retry=20, logger=self.logger)
            proxyLogon = MyProxyLogon(config=config, server=server,
                                      resturi=twConfig.restURInoAPI,
                                      myproxylen=self.pollingTime)
            proxyLogon.execute(task=recall)  # this adds 'user_proxy' to the task
            proxyLogon.deleteWarnings(recall['user_proxy'], recall['tm_taskname'])
Beispiel #11
0
    def _execute(self, resthost, resturi, config, task):
        """Follow up the DDM request of every task in TAPERECALL status.

        For each retrieved task with a tm_DDM_reqid:
          * tasks older than MAX_DAYS_FOR_TAPERECALL days are failed;
          * if a user proxy can be obtained, the input and debug sandboxes are
            downloaded to /tmp and removed again;
          * the DDM server is asked for the request status: "completed" sets
            the task to NEW and deletes its warnings, "rejected" fails the
            task, and a request that is not found uploads a warning.

        resthost, resturi and task are not used by this method.
        """
        mw = MasterWorker(config, logWarning=False, logDebug=False, sequential=True, console=False)

        tapeRecallStatus = 'TAPERECALL'
        self.logger.info("Retrieving %s tasks", tapeRecallStatus)
        recallingTasks = mw.getWork(limit=999999, getstatus=tapeRecallStatus, ignoreTWName=True)
        if len(recallingTasks) > 0:
            self.logger.info("Retrieved a total of %d %s tasks", len(recallingTasks), tapeRecallStatus)
            for recallingTask in recallingTasks:
                taskName = recallingTask['tm_taskname']
                self.logger.info("Working on task %s", taskName)

                reqId = recallingTask['tm_DDM_reqid']
                if not reqId:
                    self.logger.debug("tm_DDM_reqid' is not defined for task %s, skipping such task", taskName)
                    continue

                server = HTTPRequests(config.TaskWorker.resturl, config.TaskWorker.cmscert, config.TaskWorker.cmskey, retry=20, logger=self.logger)
                if (time.time() - getTimeFromTaskname(str(taskName)) > MAX_DAYS_FOR_TAPERECALL*24*60*60):
                    self.logger.info("Task %s is older than %d days, setting its status to FAILED", taskName, MAX_DAYS_FOR_TAPERECALL)
                    msg = "The disk replica request (ID: %d) for the input dataset did not complete in %d days." % (reqId, MAX_DAYS_FOR_TAPERECALL)
                    failTask(taskName, server, config.TaskWorker.restURInoAPI+'workflowdb', msg, self.logger, 'FAILED')
                    continue

                mpl = MyProxyLogon(config=config, server=server, resturi=config.TaskWorker.restURInoAPI, myproxylen=self.pollingTime)
                # user_proxy records whether the proxy logon succeeded; the
                # proxy-dependent steps below are skipped when it did not.
                user_proxy = True
                try:
                    mpl.execute(task=recallingTask) # this adds 'user_proxy' to recallingTask
                except TaskWorkerException as twe:
                    user_proxy = False
                    self.logger.exception(twe)

                # Make sure the task sandbox in the crabcache is not deleted until the tape recall is completed
                if user_proxy:
                    from WMCore.Services.UserFileCache.UserFileCache import UserFileCache
                    ufc = UserFileCache({'cert': recallingTask['user_proxy'], 'key': recallingTask['user_proxy'], 'endpoint': recallingTask['tm_cache_url'], "pycurl": True})
                    sandbox = recallingTask['tm_user_sandbox'].replace(".tar.gz","")
                    debugFiles = recallingTask['tm_debug_files'].replace(".tar.gz","")
                    sandboxPath = os.path.join("/tmp", sandbox)
                    debugFilesPath = os.path.join("/tmp", debugFiles)
                    try:
                        ufc.download(sandbox, sandboxPath, recallingTask['tm_username'])
                        ufc.download(debugFiles, debugFilesPath, recallingTask['tm_username'])
                        self.logger.info("Successfully touched input and debug sandboxes (%s and %s) of task %s (frontend: %s) using the '%s' username (request_id = %d).",
                                         sandbox, debugFiles, taskName, recallingTask['tm_cache_url'], recallingTask['tm_username'], reqId)
                    except Exception as ex:
                        # Best effort: a failed download is only logged.
                        self.logger.info("The CRAB3 server backend could not download the input and/or debug sandbox (%s and/or %s) of task %s from the frontend (%s) using the '%s' username (request_id = %d)."+\
                                         " This could be a temporary glitch, will try again in next occurrence of the recurring action."+\
                                         " Error reason:\n%s", sandbox, debugFiles, taskName, recallingTask['tm_cache_url'], recallingTask['tm_username'], reqId, str(ex))
                    finally:
                        # The downloaded copies are not needed; remove whatever was written.
                        if os.path.exists(sandboxPath): os.remove(sandboxPath)
                        if os.path.exists(debugFilesPath): os.remove(debugFilesPath)

                ddmRequest = statusRequest(reqId, config.TaskWorker.DDMServer, config.TaskWorker.cmscert, config.TaskWorker.cmskey, verbose=False)
                # The query above returns a JSON with a format {"result": "OK", "message": "Request found", "data": [{"request_id": 14, "site": <site>, "item": [<list of blocks>], "group": "AnalysisOps", "n": 1, "status": "new", "first_request": "2018-02-26 23:25:41", "last_request": "2018-02-26 23:25:41", "request_count": 1}]}
                self.logger.info("Contacted %s using %s and %s for request_id = %d, got:\n%s", config.TaskWorker.DDMServer, config.TaskWorker.cmscert, config.TaskWorker.cmskey, reqId, ddmRequest)

                if ddmRequest["message"] == "Request found":
                    status = ddmRequest["data"][0]["status"]
                    if status == "completed": # possible values: new, activated, updated, completed, rejected, cancelled
                        self.logger.info("Request %d is completed, setting status of task %s to NEW", reqId, taskName)
                        mw.updateWork(taskName, recallingTask['tm_task_command'], 'NEW')
                        # Delete all task warnings (the tapeRecallStatus added a dataset warning which is no longer valid now)
                        if user_proxy: mpl.deleteWarnings(recallingTask['user_proxy'], taskName)
                    elif status == "rejected":
                        msg = "The DDM request (ID: %d) has been rejected with this reason: %s" % (reqId, ddmRequest["data"][0]["reason"])
                        # msg embeds the reason text returned by DDM, so it is passed
                        # as an argument rather than used as part of the format
                        # string: a '%' in the reason would otherwise break formatting.
                        self.logger.info("%s\nSetting status of task %s to FAILED", msg, taskName)
                        failTask(taskName, server, config.TaskWorker.restURInoAPI+'workflowdb', msg, self.logger, 'FAILED')

                else:
                    msg = "DDM request_id %d not found. Please report to experts" % reqId
                    self.logger.info(msg)
                    if user_proxy: mpl.uploadWarning(msg, recallingTask['user_proxy'], taskName)

        else:
            self.logger.info("No %s task retrieved.", tapeRecallStatus)
    def _execute(self, config, task):
        """Follow up the Rucio rule of every task in TAPERECALL status.

        For each retrieved task with a tm_DDM_reqid:
          * tasks older than MAX_DAYS_FOR_TAPERECALL days are failed;
          * unless the task's cache URL contains 'S3', a user proxy is
            obtained and the sandbox is refreshed;
          * the replication rule is looked up: state 'OK' sets the task to NEW
            and deletes its warnings; otherwise an expired rule fails the
            task. A rule that is not found is logged (and a warning is
            uploaded when a proxy is available) and the task is skipped.

        :arg WMCore.Configuration config: input configuration
        :arg task: not used by this method
        """

        # setup logger
        if not self.logger:
            self.logger = logging.getLogger(__name__)
            handler = logging.StreamHandler(sys.stdout)  # pylint: disable=redefined-outer-name
            formatter = logging.Formatter(
                "%(asctime)s:%(levelname)s:%(module)s %(message)s")  # pylint: disable=redefined-outer-name
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.DEBUG)
        else:
            # do not use BaseRecurringAction logger but create a new logger
            # which writes to config.TaskWorker.logsDir/tasks/recurring/TapeRecallStatus_YYMMDD-HHMM.log
            # NOTE(review): a new FileHandler is added on every call and none is
            # removed here - confirm whether handlers accumulate across runs.
            self.logger = logging.getLogger('TapeRecallStatus')
            logDir = config.TaskWorker.logsDir + '/tasks/recurring/'
            if not os.path.exists(logDir):
                os.makedirs(logDir)
            timeStamp = time.strftime('%y%m%d-%H%M', time.localtime())
            logFile = 'TapeRecallStatus_' + timeStamp + '.log'
            handler = logging.FileHandler(logDir + logFile)
            formatter = logging.Formatter(
                '%(asctime)s:%(levelname)s:%(module)s:%(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

        mw = MasterWorker(config,
                          logWarning=False,
                          logDebug=False,
                          sequential=True,
                          console=False,
                          name='masterForTapeRecall')

        tapeRecallStatus = 'TAPERECALL'
        self.logger.info("Retrieving %s tasks", tapeRecallStatus)
        recallingTasks = mw.getWork(limit=999999,
                                    getstatus=tapeRecallStatus,
                                    ignoreTWName=True)
        if not recallingTasks:
            self.logger.info("No %s task retrieved.", tapeRecallStatus)
            return

        self.logger.info("Retrieved a total of %d %s tasks",
                         len(recallingTasks), tapeRecallStatus)
        crabserver = mw.crabserver
        for recallingTask in recallingTasks:
            taskName = recallingTask['tm_taskname']
            self.logger.info("Working on task %s", taskName)

            reqId = recallingTask['tm_DDM_reqid']
            if not reqId:
                self.logger.debug(
                    "tm_DDM_reqid' is not defined for task %s, skipping such task",
                    taskName)
                continue
            msg = "Task points to Rucio RuleId:  %s " % reqId
            self.logger.info(msg)

            if (time.time() - getTimeFromTaskname(
                    str(taskName))) > MAX_DAYS_FOR_TAPERECALL * 24 * 60 * 60:
                self.logger.info(
                    "Task %s is older than %d days, setting its status to FAILED",
                    taskName, MAX_DAYS_FOR_TAPERECALL)
                msg = "The disk replica request (ID: %s) for the input dataset did not complete in %d days." % (
                    reqId, MAX_DAYS_FOR_TAPERECALL)
                failTask(taskName, crabserver, msg, self.logger, 'FAILED')
                continue

            # Reset the proxy state for every task. These names are read further
            # down even when the S3 branch below is skipped, so without the
            # reset they would be unbound on the first task or carry over the
            # previous task's values.
            user_proxy = False
            mpl = None
            if 'S3' not in recallingTask['tm_cache_url'].upper():
                # when using old crabcache had to worry about sandbox purging after 3 days
                mpl = MyProxyLogon(config=config,
                                   crabserver=crabserver,
                                   myproxylen=self.pollingTime)
                user_proxy = True
                try:
                    mpl.execute(task=recallingTask
                                )  # this adds 'user_proxy' to recallingTask
                except TaskWorkerException as twe:
                    user_proxy = False
                    self.logger.exception(twe)

                # Make sure the task sandbox in the crabcache is not deleted until the tape recall is completed
                if user_proxy:
                    self.refreshSandbox(recallingTask)

            # Retrieve status of recall request
            if not self.rucioClient:
                self.rucioClient = getNativeRucioClient(config=config,
                                                        logger=self.logger)
            try:
                ddmRequest = self.rucioClient.get_replication_rule(reqId)
            except RuleNotFound:
                msg = "Rucio rule id %s not found. Please report to experts" % reqId
                self.logger.error(msg)
                if user_proxy:
                    mpl.uploadWarning(msg, recallingTask['user_proxy'],
                                      taskName)
                # There is no rule to inspect for this task; falling through
                # would read an unbound or stale ddmRequest.
                continue

            if ddmRequest['state'] == 'OK':
                self.logger.info(
                    "Request %s is completed, setting status of task %s to NEW",
                    reqId, taskName)
                mw.updateWork(taskName, recallingTask['tm_task_command'],
                              'NEW')
                # Delete all task warnings (the tapeRecallStatus added a dataset warning which is no longer valid now)
                if user_proxy:
                    mpl.deleteWarnings(recallingTask['user_proxy'], taskName)
            else:
                expiration = ddmRequest[
                    'expires_at']  # this is a datetime.datetime object
                # NOTE(review): compared against local naive time - confirm
                # which timezone expires_at uses and whether it can be None.
                if expiration < datetime.datetime.now():
                    # give up waiting
                    msg = (
                        "Replication request %s for task %s expired. Setting its status to FAILED"
                        % (reqId, taskName))
                    self.logger.info(msg)
                    failTask(taskName, crabserver, msg, self.logger, 'FAILED')
Beispiel #13
0
    def _execute(self, resthost, resturi, config, task):
        """Check the DDM request of every task in TAPERECALL status and act on it.

        For each retrieved task with a DDM request id this method:

        * sets the task to FAILED (via ``failTask``) when the timestamp encoded
          in its name is older than ``MAX_DAYS_FOR_TAPERECALL`` days;
        * otherwise tries to get a user proxy through ``MyProxyLogon`` and, when
          that works, downloads the input and debug sandboxes from the user
          file cache so that they are not deleted while the recall is pending
          (the downloaded copies are removed right away);
        * queries the DDM server with ``statusRequest`` and moves the task to
          NEW when the request is "completed", to FAILED when it is "rejected",
          or uploads a warning to the task when the request is not found.

        :arg str resthost: the hostname where the rest interface is running
        :arg str resturi: the rest base url to contact
        :arg WMCore.Configuration config: input configuration; the attributes
            ``TaskWorker.logsDir``, ``TaskWorker.cmscert``, ``TaskWorker.cmskey``
            and ``TaskWorker.DDMServer`` are read here
        :arg task: not used by this method
        :return: None
        """

        # setup logger
        if not self.logger:
            self.logger = logging.getLogger(__name__)
            handler = logging.StreamHandler(sys.stdout)
            formatter = logging.Formatter(
                "%(asctime)s:%(levelname)s:%(module)s %(message)s")
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.DEBUG)
        else:
            # do not use BaseRecurringAction logger but create a new logger
            # which writes to config.TaskWorker.logsDir/tasks/recurring/TapeRecallStatus_YYMMDD-HHMM.log
            self.logger = logging.getLogger('TapeRecallStatus')
            # logging.getLogger() hands back the same logger object for a given
            # name within a process, so file handlers attached by earlier runs
            # are still there. Detach and close them, otherwise every record is
            # duplicated into all the older log files and their descriptors leak.
            for staleHandler in list(self.logger.handlers):
                self.logger.removeHandler(staleHandler)
                staleHandler.close()
            logDir = config.TaskWorker.logsDir + '/tasks/recurring/'
            if not os.path.exists(logDir):
                os.makedirs(logDir)
            timeStamp = time.strftime('%y%m%d-%H%M', time.localtime())
            logFile = 'TapeRecallStatus_' + timeStamp + '.log'
            handler = logging.FileHandler(logDir + logFile)
            formatter = logging.Formatter(
                '%(asctime)s:%(levelname)s:%(module)s:%(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

        # The MasterWorker instance is only used to read tasks (getWork) and to
        # change their status (updateWork).
        mw = MasterWorker(config,
                          logWarning=False,
                          logDebug=False,
                          sequential=True,
                          console=False,
                          name='masterForTapeRecall')

        tapeRecallStatus = 'TAPERECALL'
        self.logger.info("Retrieving %s tasks", tapeRecallStatus)
        recallingTasks = mw.getWork(limit=999999,
                                    getstatus=tapeRecallStatus,
                                    ignoreTWName=True)
        if not recallingTasks:
            self.logger.info("No %s task retrieved.", tapeRecallStatus)
            return

        self.logger.info("Retrieved a total of %d %s tasks",
                         len(recallingTasks), tapeRecallStatus)
        for recallingTask in recallingTasks:
            taskName = recallingTask['tm_taskname']
            self.logger.info("Working on task %s", taskName)

            # NOTE(review): reqId is formatted with %d below, so it is
            # presumably an integer - confirm against the task DB schema.
            reqId = recallingTask['tm_DDM_reqid']
            if not reqId:
                self.logger.debug(
                    "tm_DDM_reqid' is not defined for task %s, skipping such task",
                    taskName)
                continue

            server = HTTPRequests(resthost,
                                  config.TaskWorker.cmscert,
                                  config.TaskWorker.cmskey,
                                  retry=20,
                                  logger=self.logger)

            # Give up on tasks which have been waiting for too long.
            taskAge = time.time() - getTimeFromTaskname(str(taskName))
            if taskAge > MAX_DAYS_FOR_TAPERECALL * 24 * 60 * 60:
                self.logger.info(
                    "Task %s is older than %d days, setting its status to FAILED",
                    taskName, MAX_DAYS_FOR_TAPERECALL)
                msg = "The disk replica request (ID: %d) for the input dataset did not complete in %d days." % (
                    reqId, MAX_DAYS_FOR_TAPERECALL)
                failTask(taskName, server, resturi, msg, self.logger,
                         'FAILED')
                continue

            mpl = MyProxyLogon(config=config,
                               server=server,
                               resturi=resturi,
                               myproxylen=self.pollingTime)
            # user_proxy tells the rest of the iteration whether
            # recallingTask['user_proxy'] can be used; without a proxy the
            # request status is still checked, only the proxy-based steps
            # (sandbox touch, warnings handling) are skipped.
            user_proxy = True
            try:
                mpl.execute(task=recallingTask
                            )  # this adds 'user_proxy' to recallingTask
            except TaskWorkerException as twe:
                user_proxy = False
                self.logger.exception(twe)

            # Make sure the task sandbox in the crabcache is not deleted until the tape recall is completed
            if user_proxy:
                from WMCore.Services.UserFileCache.UserFileCache import UserFileCache
                ufc = UserFileCache({
                    'cert': recallingTask['user_proxy'],
                    'key': recallingTask['user_proxy'],
                    'endpoint': recallingTask['tm_cache_url'],
                    "pycurl": True
                })
                sandbox = recallingTask['tm_user_sandbox'].replace(
                    ".tar.gz", "")
                debugFiles = recallingTask['tm_debug_files'].replace(
                    ".tar.gz", "")
                sandboxPath = os.path.join("/tmp", sandbox)
                debugFilesPath = os.path.join("/tmp", debugFiles)
                try:
                    # The downloaded content is not needed: the files are
                    # fetched only to "touch" them in the cache.
                    ufc.download(sandbox, sandboxPath,
                                 recallingTask['tm_username'])
                    ufc.download(debugFiles, debugFilesPath,
                                 recallingTask['tm_username'])
                    self.logger.info(
                        "Successfully touched input and debug sandboxes (%s and %s) of task %s (frontend: %s) using the '%s' username (request_id = %d).",
                        sandbox, debugFiles, taskName,
                        recallingTask['tm_cache_url'],
                        recallingTask['tm_username'], reqId)
                except Exception as ex:
                    # Deliberately best effort: a failed touch is only logged
                    # and retried at the next occurrence of this action.
                    self.logger.info(
                        "The CRAB3 server backend could not download the input and/or debug sandbox (%s and/or %s) of task %s from the frontend (%s) using the '%s' username (request_id = %d)."
                        " This could be a temporary glitch, will try again in next occurrence of the recurring action."
                        " Error reason:\n%s",
                        sandbox, debugFiles, taskName,
                        recallingTask['tm_cache_url'],
                        recallingTask['tm_username'], reqId, str(ex))
                finally:
                    # Never leave the downloaded copies behind in /tmp.
                    if os.path.exists(sandboxPath):
                        os.remove(sandboxPath)
                    if os.path.exists(debugFilesPath):
                        os.remove(debugFilesPath)

            ddmRequest = statusRequest(reqId,
                                       config.TaskWorker.DDMServer,
                                       config.TaskWorker.cmscert,
                                       config.TaskWorker.cmskey,
                                       verbose=False)
            # The query above returns a JSON with a format {"result": "OK", "message": "Request found", "data": [{"request_id": 14, "site": <site>, "item": [<list of blocks>], "group": "AnalysisOps", "n": 1, "status": "new", "first_request": "2018-02-26 23:25:41", "last_request": "2018-02-26 23:25:41", "request_count": 1}]}
            self.logger.info(
                "Contacted %s using %s and %s for request_id = %d, got:\n%s",
                config.TaskWorker.DDMServer, config.TaskWorker.cmscert,
                config.TaskWorker.cmskey, reqId, ddmRequest)

            if ddmRequest["message"] == "Request found":
                # possible values: new, activated, updated, completed, rejected, cancelled
                # Only "completed" and "rejected" trigger an action here; with
                # any other status the task is left untouched.
                status = ddmRequest["data"][0]["status"]
                if status == "completed":
                    self.logger.info(
                        "Request %d is completed, setting status of task %s to NEW",
                        reqId, taskName)
                    mw.updateWork(taskName,
                                  recallingTask['tm_task_command'], 'NEW')
                    # Delete all task warnings (the tapeRecallStatus added a dataset warning which is no longer valid now)
                    if user_proxy:
                        mpl.deleteWarnings(recallingTask['user_proxy'],
                                           taskName)
                elif status == "rejected":
                    msg = "The DDM request (ID: %d) has been rejected with this reason: %s" % (
                        reqId, ddmRequest["data"][0]["reason"])
                    self.logger.info(
                        msg + "\nSetting status of task %s to FAILED",
                        taskName)
                    failTask(taskName, server, resturi, msg, self.logger,
                             'FAILED')
            else:
                msg = "DDM request_id %d not found. Please report to experts" % reqId
                self.logger.info(msg)
                if user_proxy:
                    mpl.uploadWarning(msg, recallingTask['user_proxy'],
                                      taskName)