Example #1
    def killAll(self):

        # We need to keep the ROOT DAG on hold until the periodic remove kicks in.
        # See DagmanSubmitter.py#L390 (dagAd["PeriodicRemove"])
        # This is needed in case the user wants to resubmit.
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(self.workflow)
        # Holding the DAG job does not remove its node jobs;
        # that must be done separately.
        # --------------------------------------
        # From HTCondor documentation
        # http://research.cs.wisc.edu/htcondor/manual/v8.3/2_10DAGMan_Applications.html#SECTION003107000000000000000
        # --------------------------------------
        # After placing the condor_dagman job on hold, no new node jobs will be submitted,
        # and no PRE or POST scripts will be run. Any node jobs already in the HTCondor queue
        # will continue undisturbed. If the condor_dagman job is left on hold, it will remain
        # in the HTCondor queue after all of the currently running node jobs are finished.
        # --------------------------------------
        # TODO: Remove the jobConst query once the HTCondor ticket below is resolved:
        # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=5175
        jobConst = "TaskType =!= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(self.workflow)

        with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
            if not parent:
                with self.schedd.transaction() as tsc:
                    self.schedd.act(htcondor.JobAction.Hold, rootConst)
                    self.schedd.act(htcondor.JobAction.Remove, jobConst)
        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("The CRAB3 server backend could not kill the task because the Grid scheduler answered with an error\n"\
                                      "This is probably a temporary glitch, please try it again and contact an expert if the error persist\n"+\
                                      "Error reason %s" % results)
Example #2
    def killAll(self):

        # We need to keep the ROOT DAG on hold until the periodic remove kicks in.
        # See DagmanSubmitter.py#L390 (dagAd["PeriodicRemove"])
        # This is needed in case the user wants to resubmit.
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(self.workflow)
        # Holding the DAG job does not remove its node jobs;
        # that must be done separately.
        # --------------------------------------
        # From HTCondor documentation
        # http://research.cs.wisc.edu/htcondor/manual/v8.3/2_10DAGMan_Applications.html#SECTION003107000000000000000
        # --------------------------------------
        # After placing the condor_dagman job on hold, no new node jobs will be submitted,
        # and no PRE or POST scripts will be run. Any node jobs already in the HTCondor queue
        # will continue undisturbed. If the condor_dagman job is left on hold, it will remain
        # in the HTCondor queue after all of the currently running node jobs are finished.
        # --------------------------------------
        # TODO: Remove the jobConst query once the HTCondor ticket below is resolved:
        # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=5175
        jobConst = "TaskType =!= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(self.workflow)

        with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
            if not parent:
                with self.schedd.transaction() as tsc:
                    self.schedd.act(htcondor.JobAction.Hold, rootConst)
                    self.schedd.act(htcondor.JobAction.Remove, jobConst)
        results = rpipe.read()
        if results != "OK":
            msg  = "The CRAB server backend was not able to kill the task,"
            msg += " because the Grid scheduler answered with an error."
            msg += " This is probably a temporary glitch. Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Error reason: %s" % (results)
            raise TaskWorkerException(msg)
Example #3
 def killJobs(self, ids):
     ad = classad.ClassAd()
     ad['foo'] = ids
     const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (
         HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
     with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent,
                                                                rpipe):
         if not parent:
             self.schedd.act(htcondor.JobAction.Remove, const)
     results = rpipe.read()
     if results != "OK":
         raise TaskWorkerException("The CRAB3 server backend could not kill jobs [%s]. because the Grid scheduler answered with an error\n" % ", ".join(ids)+\
                                   "This is probably a temporary glitch, please try it again and contact an expert if the error persist\n"+\
                                   "Error reason %s" % results)
Example #4
    def killAll(self):

        # Search for and hold the DAG
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(
            self.workflow)

        with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent,
                                                                   rpipe):
            if not parent:
                self.schedd.act(htcondor.JobAction.Hold, rootConst)
        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("The CRAB3 server backend could not kill the task because the Grid scheduler answered with an error\n"\
                                      "This is probably a temporary glitch, please try it again and contact an expert if the error persist\n"+\
                                      "Error reason %s" % results)
Example #5
 def killJobs(self, ids):
     ad = classad.ClassAd()
     ad['foo'] = ids
     const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
     with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
         if not parent:
             self.schedd.act(htcondor.JobAction.Remove, const)
     results = rpipe.read()
     if results != "OK":
         msg  = "The CRAB server backend was not able to kill these jobs %s," % (ids)
         msg += " because the Grid scheduler answered with an error."
         msg += " This is probably a temporary glitch. Please try again later."
         msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
         msg += " Error reason: %s" % (results)
         raise TaskWorkerException(msg)
Example #6
 def getScheddObj(self, name):
     """
     Return a tuple (schedd, address) containing an object representing the
     remote schedd and its corresponding address.
     """
     info = name.split("_")
     if len(info) > 3:
         name = info[2]
     else:
         name = self.getSchedd()
     if name == "localhost":
         schedd = htcondor.Schedd()
         with open(htcondor.param['SCHEDD_ADDRESS_FILE']) as fd:
             address = fd.read().split("\n")[0]
     else:
         info = name.split(":")
         pool = "localhost"
         if len(info) == 3:
             pool = info[1]
         htcondor.param['COLLECTOR_HOST'] = self.getCollector(pool)
         coll = htcondor.Collector()
         schedds = coll.query(htcondor.AdTypes.Schedd, 'regexp(%s, Name)' % HTCondorUtils.quote(info[0]))
         if not schedds:
             raise Exception("Unable to locate schedd %s" % info[0])
         self.scheddAd = schedds[0]
         address = self.scheddAd['MyAddress']
         schedd = htcondor.Schedd(self.scheddAd)
     return schedd, address
Example #7
 def getScheddObj(self, name):
     """
     Return a tuple (schedd, address) containing an object representing the
     remote schedd and its corresponding address.
      Still required for OLD tasks. TODO: remove it later.
     """
     info = name.split("_")
     if len(info) > 3:
         name = info[2]
     else:
         name = self.getSchedd()
     if name == "localhost":
         schedd = htcondor.Schedd()
         with open(htcondor.param['SCHEDD_ADDRESS_FILE']) as fd:
             address = fd.read().split("\n")[0]
     else:
         info = name.split(":")
         pool = "localhost"
         if len(info) == 3:
             pool = info[1]
         htcondor.param['COLLECTOR_HOST'] = self.getCollector(pool)
         coll = htcondor.Collector()
         schedds = coll.query(htcondor.AdTypes.Schedd, 'regexp(%s, Name)' % HTCondorUtils.quote(info[0]))
         self.scheddAd = ""
         if not schedds:
             self.scheddAd = self.getCachedCollectorOutput(info[0])
         else:
             self.cacheCollectorOutput(info[0], schedds[0])
             self.scheddAd = self.getCachedCollectorOutput(info[0])
         address = self.scheddAd['MyAddress']
         schedd = htcondor.Schedd(self.scheddAd)
     return schedd, address
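
A minimal usage sketch of the locator above, under the same assumptions as the surrounding examples (a backendurls dictionary carrying at least the 'htcondorPool' entry used by getCollector); the collector host and task name are hypothetical.

# Usage sketch only (hypothetical names): locate the schedd serving a CRAB task.
import HTCondorLocator

backendurls = {'htcondorPool': 'cmsgwms-collector-global.cern.ch'}   # hypothetical collector
workflow = "140101_120000:someuser_crab_sometask"                    # hypothetical task name
loc = HTCondorLocator.HTCondorLocator(backendurls)
schedd, address = loc.getScheddObj(workflow)
print("Task %s is served by the schedd at %s" % (workflow, address))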
Example #8
    def executeInternal(self, apmon, *args, **kw):

        if 'task' not in kw:
            raise ValueError("No task specified.")
        self.task = kw['task']
        if 'tm_taskname' not in self.task:
            raise ValueError("No taskname specified")
        self.workflow = self.task['tm_taskname']
        if 'user_proxy' not in self.task:
            raise ValueError("No proxy provided")
        self.proxy = self.task['user_proxy']

        try:
            self.killTransfers(apmon)
        except:
            self.logger.exception("Failed to kill transfers; suppressing error until functionality is confirmed")

        self.logger.info("About to kill workflow: %s. Getting status first." % self.workflow)

        self.workflow = str(self.workflow)
        if not WORKFLOW_RE.match(self.workflow):
            raise Exception("Invalid workflow name.")

        # Query HTCondor for information about running jobs and update Dashboard appropriately
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)
        self.schedd, address = loc.getScheddObj(self.workflow)

        ad = classad.ClassAd()
        ad['foo'] = self.task['kill_ids']
        try:
            hostname = socket.getfqdn()
        except:
            hostname = ''

        const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
        try:
            for ad in self.schedd.query(const, ['CRAB_Id', 'CRAB_Retry']):
                if ('CRAB_Id' not in ad) or ('CRAB_Retry' not in ad):
                    continue
                jobid = str(ad.eval('CRAB_Id'))
                jobretry = str(ad.eval('CRAB_Retry'))
                jinfo = {'jobId': ("%s_https://glidein.cern.ch/%s/%s_%s" % (jobid, jobid, self.workflow, jobretry)),
                         'sid': "https://glidein.cern.ch/%s%s" % (jobid, self.workflow),
                         'broker': hostname,
                         'bossId': jobid,
                         'StatusValue' : 'killed',
                        }
                self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
                apmon.sendToML(jinfo)
        except:
            self.logger.exception("Failed to notify Dashboard of job kills")

        # Note that we can not send kills for jobs not in queue at this time; we'll need the
        # DAG FINAL node to be fixed and the node status to include retry number.

        if self.task['kill_all']:
            return self.killAll()
        else:
            return self.killJobs(self.task['kill_ids'])
Example #9
    def executeInternal(self, apmon, *args, **kwargs):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        self.task = kwargs['task']
        if 'tm_taskname' not in self.task:
            raise ValueError("No taskname specified")
        self.workflow = self.task['tm_taskname']
        if 'user_proxy' not in self.task:
            raise ValueError("No proxy provided")
        self.proxy = self.task['user_proxy']

        self.logger.info("About to kill workflow: %s." % self.workflow)

        self.workflow = str(self.workflow)
        if not WORKFLOW_RE.match(self.workflow):
            raise Exception("Invalid workflow name.")

        # Query HTCondor for information about running jobs and update Dashboard appropriately
        if self.task['tm_collector']:
            self.backendurls['htcondorPool'] = self.task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        address = ""
        try:
            self.schedd, address = loc.getScheddObjNew(self.task['tm_schedd'])
        except Exception as exp:
            msg  = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s" % (self.workflow, msg))
            raise TaskWorkerException(msg)

        try:
            hostname = socket.getfqdn()
        except:
            hostname = ''

        const = 'CRAB_ReqName =?= %s && TaskType=?="Job"' % HTCondorUtils.quote(self.workflow)
        try:
            for ad in list(self.schedd.xquery(const, ['CRAB_Id', 'CRAB_Retry'])):
                if ('CRAB_Id' not in ad) or ('CRAB_Retry' not in ad):
                    continue
                jobid = str(ad.eval('CRAB_Id'))
                jobretry = str(ad.eval('CRAB_Retry'))
                jinfo = {'broker': hostname,
                         'bossId': jobid,
                         'StatusValue': 'killed',
                        }
                insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
                self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
                apmon.sendToML(jinfo)
        except:
            self.logger.exception("Failed to notify Dashboard of job kills") #warning

        # Note that we can not send kills for jobs not in queue at this time; we'll need the
        # DAG FINAL node to be fixed and the node status to include retry number.
        return self.killAll(const)
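
The insertJobIdSid helper used just above is not shown in this listing; a hedged sketch, inferred from the inline Dashboard fields built in Examples #8 and #25, could look like this (the exact field layout is an assumption).

# Sketch only, inferred from the jobId/sid strings built inline in Examples #8 and #25.
def insertJobIdSid(jinfo, jobid, workflow, jobretry):
    # Dashboard identifiers use the workflow with '_' replaced by ':' (as in Example #25).
    wf = workflow.replace("_", ":")
    jinfo['jobId'] = "%s_https://glidein.cern.ch/%s/%s_%s" % (jobid, jobid, wf, jobretry)
    jinfo['sid'] = "https://glidein.cern.ch/%s%s" % (jobid, wf)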
Example #10
    def executeInternal(self, apmon, *args, **kwargs):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        self.task = kwargs['task']
        if 'tm_taskname' not in self.task:
            raise ValueError("No taskname specified")
        self.workflow = self.task['tm_taskname']
        if 'user_proxy' not in self.task:
            raise ValueError("No proxy provided")
        self.proxy = self.task['user_proxy']

        self.logger.info("About to kill workflow: %s." % self.workflow)

        self.workflow = str(self.workflow)
        if not WORKFLOW_RE.match(self.workflow):
            raise Exception("Invalid workflow name.")

        # Query HTCondor for information about running jobs and update Dashboard appropriately
        if self.task['tm_collector']:
            self.backendurls['htcondorPool'] = self.task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        address = ""
        try:
            self.schedd, address = loc.getScheddObjNew(self.task['tm_schedd'])
        except Exception as exp:
            msg  = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s" % (self.workflow, msg))
            raise TaskWorkerException(msg)

        try:
            hostname = socket.getfqdn()
        except:
            hostname = ''

        const = 'CRAB_ReqName =?= %s && TaskType=?="Job"' % HTCondorUtils.quote(self.workflow)
        try:
            for ad in list(self.schedd.xquery(const, ['CRAB_Id', 'CRAB_Retry'])):
                if ('CRAB_Id' not in ad) or ('CRAB_Retry' not in ad):
                    continue
                jobid = str(ad.eval('CRAB_Id'))
                jobretry = str(ad.eval('CRAB_Retry'))
                jinfo = {'broker': hostname,
                         'bossId': jobid,
                         'StatusValue': 'killed',
                        }
                insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
                self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
                apmon.sendToML(jinfo)
        except:
            self.logger.exception("Failed to notify Dashboard of job kills") #warning

        # Note that we can not send kills for jobs not in queue at this time; we'll need the
        # DAG FINAL node to be fixed and the node status to include retry number.
        return self.killAll(const)
Example #11
 def killJobs(self, ids):
     ad = classad.ClassAd()
     ad['foo'] = ids
     const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
     with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
         if not parent:
            self.schedd.act(htcondor.JobAction.Remove, const)
     results = rpipe.read()
     if results != "OK":
         raise Exception("Failure when killing jobs [%s]: %s" % (", ".join(ids), results))
Example #12
    def killAll(self):

        # Search for and hold the DAG
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(self.workflow)

        with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
            if not parent:
               self.schedd.act(htcondor.JobAction.Hold, rootConst)
        results = rpipe.read()
        if results != "OK":
            raise Exception("Failure when killing task: %s" % results)
Example #13
 def push_new_proxy_to_schedd(self, schedd, ad, proxy):
     if not hasattr(schedd, 'refreshGSIProxy'):
         raise NotImplementedError()
     with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
         if not parent:
             schedd.refreshGSIProxy(ad['ClusterId'], ad['ProcID'], proxy,
                                    -1)
     results = rpipe.read()
     if results != "OK":
         raise Exception("Failure when renewing HTCondor task proxy: '%s'" %
                         results)
Example #14
 def renew_proxy(self, schedd, ad, proxy):
     now = time.time()
     self.logger.info("Renewing proxy for task %s." % ad['CRAB_ReqName'])
     if not hasattr(schedd, 'refreshGSIProxy'):
         raise NotImplementedError()
     with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
         if not parent:
             lifetime = schedd.refreshGSIProxy(ad['ClusterId'], ad['ProcID'], proxy, -1)
             schedd.edit(['%s.%s' % (ad['ClusterId'], ad['ProcId'])], 'x509userproxyexpiration', str(int(now+lifetime)))
     results = rpipe.read()
     if results != "OK":
         raise Exception("Failure when renewing HTCondor task proxy: '%s'" % results)
Example #15
 def killJobs(self, ids):
     ad = classad.ClassAd()
     ad['foo'] = ids
     const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
     with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
         if not parent:
             self.schedd.act(htcondor.JobAction.Remove, const)
     results = rpipe.read()
     if results != "OK":
         raise TaskWorkerException("The CRAB3 server backend could not kill jobs [%s]. because the Grid scheduler answered with an error\n" % ", ".join(ids)+\
                                   "This is probably a temporary glitch, please try it again and contact an expert if the error persist\n"+\
                                   "Error reason %s" % results)
Example #16
    def killAll(self):

        # Search for and hold the DAG
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(self.workflow)

        with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
            if not parent:
                self.schedd.act(htcondor.JobAction.Hold, rootConst)
        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("The CRAB3 server backend could not kill the task because the Grid scheduler answered with an error\n"\
                                      "This is probably a temporary glitch, please try it again and contact an expert if the error persist\n"+\
                                      "Error reason %s" % results)
Example #17
    def duplicateCheck(self, task):
        """
        Look to see if the task we are about to submit is already in the schedd.
        If so, assume that TaskWorker already ran this task successfully but was killed
        before it could update the frontend.
        """
        workflow = task["tm_taskname"]

        if task["tm_collector"]:
            self.backendurls["htcondorPool"] = task["tm_collector"]
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        try:
            self.logger.debug("Duplicate check is getting the schedd obj. Collector is: %s", task["tm_collector"])
            schedd, dummyAddress = loc.getScheddObjNew(task["tm_schedd"])
            self.logger.debug("Got schedd obj for %s ", task["tm_schedd"])
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg)

        rootConst = (
            'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || CRAB_Attempt == 0)'
            % HTCondorUtils.quote(workflow)
        )

        self.logger.debug("Duplicate check is querying the schedd: %s", rootConst)
        results = list(schedd.xquery(rootConst, []))
        self.logger.debug("Schedd queried %s", results)

        if not results:
            # Task not already in schedd
            return None

        configreq = {"workflow": workflow, "status": "SUBMITTED", "subresource": "success"}
        self.logger.warning(
            "Task %s already submitted to HTCondor; pushing information centrally: %s", workflow, str(configreq)
        )
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data=data)

        # Note that we don't re-send Dashboard jobs; we assume this is a rare occurrence and
        # don't want to upset any info already in the Dashboard.

        return Result.Result(task=task, result=(-1))
Example #18
 def killJobs(self, ids):
     ad = classad.ClassAd()
     ad['foo'] = ids
     const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
     with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
         if not parent:
             self.schedd.act(htcondor.JobAction.Remove, const)
     results = rpipe.read()
     if results != "OK":
         msg  = "The CRAB server backend was not able to kill these jobs %s," % (ids)
         msg += " because the Grid scheduler answered with an error."
         msg += " This is probably a temporary glitch. Please try again later."
         msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
         msg += " Error reason: %s" % (results)
         raise TaskWorkerException(msg)
Example #19
    def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
        """
        Submit directly to the schedd using the HTCondor module
        """
        dagAd = classad.ClassAd()
        addCRABInfoToClassAd(dagAd, info)

        groups = CMSGroupMapper.map_user_to_groups(dagAd["CRAB_UserHN"])
        if groups:
            dagAd["CMSGroups"] = groups

        # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
        dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
        dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
        dagAd["CRAB_Attempt"] = 0
        # We switched from local to scheduler universe.  Why?  It seems there's no way in the
        # local universe to change the hold signal at runtime.  That's fairly important for our
        # resubmit implementation.
        #dagAd["JobUniverse"] = 12
        dagAd["JobUniverse"] = 7
        dagAd["HoldKillSig"] = "SIGUSR1"
        dagAd["Cmd"] = cmd
        dagAd['Args'] = arg
        dagAd["TransferInput"] = str(info['inputFilesString'])
        dagAd["LeaveJobInQueue"] = classad.ExprTree("(JobStatus == 4) && ((StageOutFinish =?= UNDEFINED) || (StageOutFinish == 0))")
        dagAd["PeriodicRemove"] = classad.ExprTree("(JobStatus == 5) && (time()-EnteredCurrentStatus > 30*86400)")
        dagAd["TransferOutput"] = info['outputFilesString']
        dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
        dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
        dagAd["RemoveKillSig"] = "SIGUSR1"
        dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
        dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
        dagAd["RemoteCondorSetup"] = info['remote_condor_setup']
        dagAd["Requirements"] = classad.ExprTree('true || false')
        dagAd["TaskType"] = "ROOT"
        dagAd["X509UserProxy"] = info['user_proxy']

        with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy']) as (parent, rpipe):
            if not parent:
                resultAds = []
                schedd.submit(dagAd, 1, True, resultAds)
                schedd.spool(resultAds)
                if resultAds:
                    id = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                    schedd.edit([id], "LeaveJobInQueue", classad.ExprTree("(JobStatus == 4) && (time()-EnteredCurrentStatus < 30*86400)"))
        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results)
Example #20
 def getScheddObjNew(self, schedd):
     """
     Return a tuple (schedd, address) containing an object representing the
     remote schedd and its corresponding address.
     """
     htcondor.param['COLLECTOR_HOST'] = self.getCollector().encode('ascii', 'ignore')
     coll = htcondor.Collector()
     schedds = coll.query(htcondor.AdTypes.Schedd, 'regexp(%s, Name)' % HTCondorUtils.quote(schedd.encode('ascii', 'ignore')))
     self.scheddAd = ""
     if not schedds:
         self.scheddAd = self.getCachedCollectorOutput(schedd)
     else:
         self.cacheCollectorOutput(schedd, schedds[0])
         self.scheddAd = self.getCachedCollectorOutput(schedd)
     address = self.scheddAd['MyAddress']
     scheddObj = htcondor.Schedd(self.scheddAd)
     return scheddObj, address
Example #21
    def executeInternal(self, *args, **kwargs):  # pylint: disable=unused-argument
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        self.task = kwargs['task']  # pylint: disable=attribute-defined-outside-init
        if 'tm_taskname' not in self.task:
            raise ValueError("No taskname specified")
        self.workflow = self.task['tm_taskname']  # pylint: disable=attribute-defined-outside-init
        if 'user_proxy' not in self.task:
            raise ValueError("No proxy provided")
        self.proxy = self.task['user_proxy']  # pylint: disable=attribute-defined-outside-init

        self.logger.info("About to kill workflow: %s.", self.workflow)

        self.workflow = str(self.workflow)  # pylint: disable=attribute-defined-outside-init
        if not WORKFLOW_RE.match(self.workflow):
            raise Exception("Invalid workflow name.")

        # Query HTCondor for information about running jobs and update Dashboard appropriately
        if self.task['tm_collector']:
            self.backendurls['htcondorPool'] = self.task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        try:
            self.schedd, _ = loc.getScheddObjNew(self.task['tm_schedd'])  # pylint: disable=attribute-defined-outside-init
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (
                FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", self.workflow, msg)
            raise TaskWorkerException(msg)

        try:
            hostname = socket.getfqdn()
        except Exception:
            hostname = ''

        const = 'CRAB_ReqName =?= %s && TaskType=?="Job"' % HTCondorUtils.quote(
            self.workflow)

        # Note that we can not send kills for jobs not in queue at this time; we'll need the
        # DAG FINAL node to be fixed and the node status to include retry number.
        return self.killAll(const)
Example #22
 def getScheddObjNew(self, schedd):
     """
     Return a tuple (schedd, address) containing an object representing the
     remote schedd and its corresponding address.
     """
     htcondor.param['COLLECTOR_HOST'] = self.getCollector().encode('ascii', 'ignore')
     coll = htcondor.Collector()
     schedds = coll.query(htcondor.AdTypes.Schedd, 'Name=?=%s' % HTCondorUtils.quote(schedd.encode('ascii', 'ignore')),
                          ["AddressV1", "CondorPlatform", "CondorVersion", "Machine", "MyAddress", "Name", "MyType", "ScheddIpAddr", "RemoteCondorSetup"])
     self.scheddAd = ""
     if not schedds:
         self.scheddAd = self.getCachedCollectorOutput(schedd)
     else:
         self.cacheCollectorOutput(schedd, schedds[0])
         self.scheddAd = self.getCachedCollectorOutput(schedd)
     address = self.scheddAd['MyAddress']
     scheddObj = htcondor.Schedd(self.scheddAd)
     return scheddObj, address
Example #23
 def getScheddObjNew(self, schedd):
     """
     Return a tuple (schedd, address) containing an object representing the
     remote schedd and its corresponding address.
     """
     htcondor.param['COLLECTOR_HOST'] = self.getCollector().encode(
         'ascii', 'ignore')
     coll = htcondor.Collector()
     schedds = coll.query(
         htcondor.AdTypes.Schedd, 'Name=?=%s' %
         HTCondorUtils.quote(schedd.encode('ascii', 'ignore')), [
             "AddressV1", "CondorPlatform", "CondorVersion", "Machine",
             "MyAddress", "Name", "MyType", "ScheddIpAddr",
             "RemoteCondorSetup"
         ])
     if not schedds:
         self.scheddAd = self.getCachedCollectorOutput(schedd)
     else:
         self.cacheCollectorOutput(schedd, schedds[0])
         self.scheddAd = self.getCachedCollectorOutput(schedd)
     address = self.scheddAd['MyAddress']
     scheddObj = htcondor.Schedd(self.scheddAd)
     return scheddObj, address
Example #24
        workflow = task['tm_taskname']

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        address = ""
        schedd = ""
        try:
            schedd, address = loc.getScheddObjNew(task['tm_schedd'])
        except Exception, exp:
            msg = ("%s: The CRAB3 server backend is not able to contact Grid scheduler. Please, retry later. Message from the scheduler: %s") % (workflow, str(exp))
            self.logger.exception(msg)
            raise TaskWorkerException(msg)

        rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || CRAB_Attempt == 0)' % HTCondorUtils.quote(workflow)

        results = list(schedd.xquery(rootConst, []))

        if not results:
            # Task not already in schedd
            return None

        configreq = {'workflow': workflow,
                     'status': "SUBMITTED",
                     'jobset': "-1",
                     'subresource': 'success',
                    }
        self.logger.warning("Task %s already submitted to HTCondor; pushing information centrally: %s" % (workflow, str(configreq)))
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data = data)
Example #25
    def executeInternal(self, apmon, *args, **kw):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kw:
            raise ValueError("No task specified.")
        self.task = kw['task']
        if 'tm_taskname' not in self.task:
            raise ValueError("No taskname specified")
        self.workflow = self.task['tm_taskname']
        if 'user_proxy' not in self.task:
            raise ValueError("No proxy provided")
        self.proxy = self.task['user_proxy']

        try:
            self.killTransfers(apmon)
        except:
            self.logger.exception("Failed to kill transfers; suppressing error until functionality is confirmed") #TODO send a warning?

        self.logger.info("About to kill workflow: %s." % self.workflow)

        self.workflow = str(self.workflow)
        if not WORKFLOW_RE.match(self.workflow):
            raise Exception("Invalid workflow name.")

        # Query HTCondor for information about running jobs and update Dashboard appropriately
        if self.task['tm_collector']:
            self.backendurls['htcondorPool'] = self.task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        address = ""
        try:
            self.schedd, address = loc.getScheddObjNew(self.task['tm_schedd'])
        except Exception as exp:
            msg = ("%s: The CRAB3 server backend is not able to contact Grid scheduler. Please, retry later. Message from the scheduler: %s") % (self.workflow, str(exp))
            self.logger.exception(msg)
            raise TaskWorkerException(msg)

        ad = classad.ClassAd()
        ad['foo'] = self.task['kill_ids']
        try:
            hostname = socket.getfqdn()
        except:
            hostname = ''

        const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
        try:
            for ad in list(self.schedd.xquery(const, ['CRAB_Id', 'CRAB_Retry'])):
                if ('CRAB_Id' not in ad) or ('CRAB_Retry' not in ad):
                    continue
                jobid = str(ad.eval('CRAB_Id'))
                jobretry = str(ad.eval('CRAB_Retry'))
                jinfo = {'jobId': ("%s_https://glidein.cern.ch/%s/%s_%s" % (jobid, jobid, self.workflow.replace("_", ":"), jobretry)),
                         'sid': "https://glidein.cern.ch/%s%s" % (jobid, self.workflow.replace("_", ":")),
                         'broker': hostname,
                         'bossId': jobid,
                         'StatusValue' : 'killed',
                        }
                self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
                apmon.sendToML(jinfo)
        except:
            self.logger.exception("Failed to notify Dashboard of job kills") #warning

        # Note that we can not send kills for jobs not in queue at this time; we'll need the
        # DAG FINAL node to be fixed and the node status to include retry number.

        if self.task['kill_all']:
            return self.killAll()
        else:
            return self.killJobs(self.task['kill_ids'])
Example #26
    def duplicateCheck(self, task):
        """
        Look to see if the task we are about to submit is already in the schedd.
        If so, assume that TaskWorker already ran this task successfully but was killed
        before it could update the frontend.
        """
        workflow = task['tm_taskname']

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        try:
            self.logger.debug("Duplicate check is getting the schedd obj. Collector is: %s", task['tm_collector'])
            schedd, _address = loc.getScheddObjNew(task['tm_schedd'])
            self.logger.debug("Got schedd obj for %s ", task['tm_schedd'])
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg)

        rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || CRAB_Attempt == 0)' % HTCondorUtils.quote(workflow)

        self.logger.debug("Duplicate check is querying the schedd: %s", rootConst)
        results = list(schedd.xquery(rootConst, []))
        self.logger.debug("Schedd queried %s", results)

        if not results:
            # Task not already in schedd
            return None

        configreq = {'workflow': workflow,
                     'status': "SUBMITTED",
                     'jobset': "-1",
                     'subresource': 'success',
                    }
        self.logger.warning("Task %s already submitted to HTCondor; pushing information centrally: %s", workflow, str(configreq))
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data=data)

        # Note that we don't re-send Dashboard jobs; we assume this is a rare occurrence and
        # don't want to upset any info already in the Dashboard.

        return Result.Result(task=task, result=(-1))
Example #27
def bootstrap():
    print "Entering TaskManagerBootstrap with args: %s" % sys.argv
    command = sys.argv[1]
    if command == "POSTJOB":
        return PostJob.PostJob().execute(*sys.argv[2:])
    elif command == "PREJOB":
        return PreJob.PreJob().execute(*sys.argv[2:])
    elif command == "FINAL":
        return Final.Final().execute(*sys.argv[2:])
    elif command == "ASO":
        return ASO.async_stageout(*sys.argv[2:])

    infile, outfile = sys.argv[2:]

    adfile = os.environ["_CONDOR_JOB_AD"]
    print "Parsing classad"
    with open(adfile, "r") as fd:
        ad = classad.parseOld(fd)
    print "..done"
    in_args = []
    if infile != "None":
        with open(infile, "r") as fd:
            in_args = pickle.load(fd)

    config = Configuration.Configuration()
    config.section_("Services")
    config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/'

    ad['tm_taskname'] = ad.eval("CRAB_Workflow")
    ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo")
    ad['tm_dbs_url'] = ad.eval("CRAB_DBSUrl")
    ad['tm_input_dataset'] = ad.eval("CRAB_InputData")
    ad['tm_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_AdditionalOutputFiles"))
    ad['tm_tfile_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_TFileOutputFiles"))
    ad['tm_edm_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_EDMOutputFiles"))
    ad['tm_site_whitelist'] = HTCondorUtils.unquote(
        ad.eval("CRAB_SiteWhitelist"))
    ad['tm_site_blacklist'] = HTCondorUtils.unquote(
        ad.eval("CRAB_SiteBlacklist"))
    ad['tm_job_type'] = 'Analysis'
    print "TaskManager got this raw ad"
    print ad
    pure_ad = {}
    for key in ad:
        try:
            pure_ad[key] = ad.eval(key)
            if isinstance(pure_ad[key], classad.Value):
                del pure_ad[key]
            if isinstance(pure_ad[key], types.ListType):
                pure_ad[key] = [i.eval() for i in pure_ad[key]]
        except:
            pass
    ad = pure_ad
    ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"])
    ad['tm_split_args'] = ad["CRAB_AlgoArgs"]
    ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '')
    print "TaskManagerBootstrap got this ad:"
    pprint.pprint(ad)
    if command == "DBS":
        task = DBSDataDiscovery.DBSDataDiscovery(config)
    elif command == "SPLIT":
        task = Splitter.Splitter(config)
        print "Got this result from the splitter"
        pprint.pprint(task)
    results = task.execute(in_args, task=ad).result
    if command == "SPLIT":
        results = DagmanCreator.create_subdag(results, task=ad)

    print results
    with open(outfile, "w") as fd:
        pickle.dump(results, fd)

    return 0
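
For reference, a hedged illustration (the sample values are invented) of the "old" ClassAd syntax that the _CONDOR_JOB_AD file parsed above is assumed to use: one 'Attribute = value' pair per line, which classad.parseOld turns into an ad usable with ad.eval(...).

# Demo only, with made-up attribute values.
import classad

sample = (
    'CRAB_Workflow = "140101_120000:someuser_crab_sometask"\n'
    'CRAB_SplitAlgo = "FileBased"\n'
)
ad = classad.parseOld(sample)
print(ad.eval("CRAB_Workflow"))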
Example #28
def bootstrap():
    print("Entering TaskManagerBootstrap with args: %s" % sys.argv)
    command = sys.argv[1]
    if command == "POSTJOB":
        return PostJob.PostJob().execute(*sys.argv[2:])
    elif command == "PREJOB":
        return PreJob.PreJob().execute(*sys.argv[2:])
    elif command == "PREDAG":
        return PreDAG.PreDAG().execute(*sys.argv[2:])

    infile, outfile = sys.argv[2:]

    adfile = os.environ["_CONDOR_JOB_AD"]
    print("Parsing classad")
    with open(adfile, "r") as fd:
        ad = classad.parseOld(fd)
    print("..done")
    in_args = []
    if infile != "None":
        with open(infile, "r") as fd:
            in_args = pickle.load(fd)

    config = Configuration.Configuration()
    config.section_("Services")
    config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/'

    ad['tm_taskname'] = ad.eval("CRAB_Workflow")
    ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo")
    ad['tm_dbs_url'] = ad.eval("CRAB_DBSURL")
    ad['tm_input_dataset'] = ad.eval("DESIRED_CMSDataset")
    ad['tm_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_AdditionalOutputFiles"))
    ad['tm_tfile_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_TFileOutputFiles"))
    ad['tm_edm_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_EDMOutputFiles"))
    ad['tm_site_whitelist'] = HTCondorUtils.unquote(
        ad.eval("CRAB_SiteWhitelist"))
    ad['tm_site_blacklist'] = HTCondorUtils.unquote(
        ad.eval("CRAB_SiteBlacklist"))
    ad['tm_job_type'] = 'Analysis'
    print("TaskManager got this raw ad")
    print(ad)
    pure_ad = {}
    for key in ad:
        try:
            pure_ad[key] = ad.eval(key)
            if isinstance(pure_ad[key], classad.Value):
                del pure_ad[key]
            if isinstance(pure_ad[key], list):
                pure_ad[key] = [i.eval() for i in pure_ad[key]]
        except:
            pass
    ad = pure_ad
    ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"])
    ad['tm_split_args'] = ad["CRAB_AlgoArgs"]
    ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '')
    print("TaskManagerBootstrap got this ad:")
    pprint.pprint(ad)

    results = task.execute(in_args, task=ad).result

    print(results)
    with open(outfile, "w") as fd:
        pickle.dump(results, fd)

    return 0
Example #29
    def executeInternal(self, *args, **kwargs):  # pylint: disable=unused-argument
        # Marco: I guess these value errors only happen for development instances
        if "task" not in kwargs:
            raise ValueError("No task specified.")
        task = kwargs["task"]
        if "tm_taskname" not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task["tm_taskname"])
        if "user_proxy" not in task:
            raise ValueError("No proxy provided")
        proxy = task["user_proxy"]

        if task.get("resubmit_publication", False):
            resubmitWhat = "publications"
        else:
            resubmitWhat = "jobs"

        self.logger.info("About to resubmit %s for workflow: %s." % (resubmitWhat, workflow))
        self.logger.debug("Task info: %s" % str(task))

        if task.get("resubmit_publication", False):
            asourl = task.get("tm_asourl", None)
            # Let's not assume the DB has been updated (mostly for devs): default asodb to asynctransfer.
            # The "or" also takes care of the case where the new code is executed on an old task,
            # i.e. tm_asodb is there but empty.
            asodb = task.get("tm_asodb", "asynctransfer") or "asynctransfer"
            if not asourl:
                msg = "ASO URL not set. Can not resubmit publication."
                raise TaskWorkerException(msg)
            self.logger.info("Will resubmit failed publications")
            self.resubmitPublication(asourl, asodb, proxy, workflow)
            return

        if task["tm_collector"]:
            self.backendurls["htcondorPool"] = task["tm_collector"]
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        dummyAddress = ""
        try:
            schedd, dummyAddress = loc.getScheddObjNew(task["tm_schedd"])
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s" % (workflow, msg))
            raise TaskWorkerException(msg)

        # Check memory and walltime
        stdmaxjobruntime = 2800
        stdmaxmemory = 2500
        if task["resubmit_maxjobruntime"] is not None and task["resubmit_maxjobruntime"] > stdmaxjobruntime:
            msg = "Task requests %s minutes of walltime, but only %s are guaranteed to be available." % (
                task["resubmit_maxjobruntime"],
                stdmaxjobruntime,
            )
            msg += " Jobs may not find a site where to run."
            msg += " CRAB has changed this value to %s minutes." % (stdmaxjobruntime)
            self.logger.warning(msg)
            task["resubmit_maxjobruntime"] = str(stdmaxjobruntime)
            self.uploadWarning(msg, proxy, kwargs["task"]["tm_taskname"])
        if task["resubmit_maxmemory"] is not None and task["resubmit_maxmemory"] > stdmaxmemory:
            msg = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (
                task["resubmit_maxmemory"],
                stdmaxmemory,
            )
            msg += " Jobs may not find a site where to run and stay idle forever."
            self.logger.warning(msg)
            self.uploadWarning(msg, proxy, kwargs["task"]["tm_taskname"])

        # Release the DAG
        rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s' % HTCondorUtils.quote(workflow)

        ## Calculate new parameters for resubmitted jobs. These parameters will
        ## be (re)written in the _CONDOR_JOB_AD when we do schedd.edit() below.
        ad = classad.ClassAd()
        params = {
            "CRAB_ResubmitList": "jobids",
            "CRAB_SiteBlacklist": "site_blacklist",
            "CRAB_SiteWhitelist": "site_whitelist",
            "MaxWallTimeMins": "maxjobruntime",
            "RequestMemory": "maxmemory",
            "RequestCpus": "numcores",
            "JobPrio": "priority",
        }
        overwrite = False
        for taskparam in params.values():
            if ("resubmit_" + taskparam in task) and task["resubmit_" + taskparam] != None:
                # In case resubmission parameters contain a list of unicode strings,
                # convert it to a list of ascii strings because of HTCondor unicode
                # incompatibility.
                # Note that unicode strings that are not in a list are not handled,
                # but so far they don't exist in this part of the code.
                if isinstance(task["resubmit_" + taskparam], list):
                    nonUnicodeList = []
                    for p in task["resubmit_" + taskparam]:
                        if isinstance(p, unicode):
                            nonUnicodeList.append(p.encode("ascii", "ignore"))
                        else:
                            nonUnicodeList.append(p)
                    ad[taskparam] = nonUnicodeList
                if taskparam != "jobids":
                    overwrite = True

        if ("resubmit_jobids" in task) and task["resubmit_jobids"]:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", "SIGKILL")
                    ## Overwrite parameters in the os.environ[_CONDOR_JOB_AD] file. This will affect
                    ## all the jobs, not only the ones we want to resubmit. That's why the pre-job
                    ## is saving the values of the parameters for each job retry in text files (the
                    ## files are in the directory resubmit_info in the schedd).
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            schedd.edit(rootConst, adparam, ad.lookup(taskparam))
                        elif task["resubmit_" + taskparam] != None:
                            schedd.edit(rootConst, adparam, str(task["resubmit_" + taskparam]))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", "SIGUSR1")
                    schedd.act(htcondor.JobAction.Release, rootConst)
        elif overwrite:
            self.logger.debug("Resubmitting under condition overwrite = True")
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            if taskparam == "jobids" and len(list(ad[taskparam])) == 0:
                                self.logger.debug("Setting %s = True in the task ad." % (adparam))
                                schedd.edit(rootConst, adparam, classad.ExprTree("true"))
                            else:
                                schedd.edit(rootConst, adparam, ad.lookup(taskparam))
                        elif task["resubmit_" + taskparam] != None:
                            schedd.edit(rootConst, adparam, str(task["resubmit_" + taskparam]))
                    schedd.act(htcondor.JobAction.Release, rootConst)
        else:
            ## This should actually not occur anymore in CRAB 3.3.16 or above, because
            ## starting from CRAB 3.3.16 the resubmission parameters are written to the
            ## Task DB with value != None, so the overwrite variable should never be False.
            self.logger.debug("Resubmitting under condition overwrite = False")
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", "SIGKILL")
                    schedd.edit(rootConst, "CRAB_ResubmitList", classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", "SIGUSR1")
                    schedd.act(htcondor.JobAction.Release, rootConst)

        results = rpipe.read()
        if results != "OK":
            msg = "The CRAB server backend was not able to resubmit the task,"
            msg += " because the Grid scheduler answered with an error."
            msg += " This is probably a temporary glitch. Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Error reason: %s" % (results)
            raise TaskWorkerException(msg)
Example #30
    def duplicateCheck(self, task):
        """
        Look to see if the task we are about to submit is already in the schedd.
        If so, assume that TaskWorker already ran this task successfully but was killed
        before it could update the frontend.
        """
        workflow = task['tm_taskname']

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        oldx509 = os.environ.get("X509_USER_PROXY", None)
        try:
            self.logger.debug("Duplicate check is getting the schedd obj. Collector is: %s", task['tm_collector'])
            os.environ["X509_USER_PROXY"] = task['user_proxy']
            schedd, dummyAddress = loc.getScheddObjNew(task['tm_schedd'])
            self.logger.debug("Got schedd obj for %s ", task['tm_schedd'])

            rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || '\
                        'CRAB_Attempt == 0)' % HTCondorUtils.quote(workflow.encode('ascii', 'ignore'))

            self.logger.debug("Duplicate check is querying the schedd: %s", rootConst)
            results = list(schedd.xquery(rootConst, []))
            self.logger.debug("Schedd queried %s", results)
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg, retry=True)
        finally:
            if oldx509:
                os.environ["X509_USER_PROXY"] = oldx509
            else:
                del os.environ["X509_USER_PROXY"]

        if not results:
            # Task not already in schedd
            return None

        # Need to double check that JobStatus is 1 (idle) or 2 (running).
        # Any other status means that the task was not submitted or failed to be submitted.
        # There was an issue with spooling files to the scheduler: the duplicate check found the dagman
        # on the scheduler, but the files had not been transferred correctly.
        if results[0]['JobStatus'] not in [1,2]:
            # If the state of the dag is not idle or running, we raise an exception and let
            # the dagman submitter retry later. Hopefully, after seven minutes the dag gets removed
            # from the schedd and the submission succeeds.
            retry = results[0]['JobStatus'] == 5 #5==Held
            msg = "Task %s already found on schedd %s " % (workflow, task['tm_schedd'])
            if retry:
                msg += "Going to retry submission later since the dag status is Held and the task should be removed on the schedd"
            else:
                msg += "Aborting submission since the task is in state %s" % results[0]['JobStatus']
            raise TaskWorkerException(msg, retry)
        else:
            self.logger.debug("Task seems to be submitted correctly. Classads got from scheduler: %s", results)

        configreq = {'workflow': workflow,
                     'status': "SUBMITTED",
                     'subresource': 'success',
                     'clusterid': results[0]['ClusterId']
                    }
        self.logger.warning("Task %s already submitted to HTCondor; pushing information centrally: %s", workflow, str(configreq))
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data=data)

        # Note that we don't re-send Dashboard jobs; we assume this is a rare occurrence and
        # don't want to upset any info already in the Dashboard.

        return Result.Result(task=task, result=(-1))
    def parseJobLog(self, fp, nodes):
        node_map = {}
        count = 0
        for event in HTCondorUtils.readEvents(fp):
            count += 1
            eventtime = time.mktime(time.strptime(event['EventTime'], "%Y-%m-%dT%H:%M:%S"))
            if event['MyType'] == 'SubmitEvent':
                m = self.node_name_re.match(event['LogNotes'])
                if m:
                    node = m.groups()[0]
                    proc = event['Cluster'], event['Proc']
                    info = nodes.setdefault(node, {'Retries': 0, 'Restarts': 0, 'SiteHistory': [], 'ResidentSetSize': [], 'SubmitTimes': [], 'StartTimes': [],
                                                'EndTimes': [], 'TotalUserCpuTimeHistory': [], 'TotalSysCpuTimeHistory': [], 'WallDurations': [], 'JobIds': []})
                    info['State'] = 'idle'
                    info['JobIds'].append("%d.%d" % proc)
                    info['RecordedSite'] = False
                    info['SubmitTimes'].append(eventtime)
                    info['TotalUserCpuTimeHistory'].append(0)
                    info['TotalSysCpuTimeHistory'].append(0)
                    info['WallDurations'].append(0)
                    info['ResidentSetSize'].append(0)
                    info['Retries'] = len(info['SubmitTimes'])-1
                    node_map[proc] = node
            elif event['MyType'] == 'ExecuteEvent':
                node = node_map[event['Cluster'], event['Proc']]
                nodes[node]['StartTimes'].append(eventtime)
                nodes[node]['State'] = 'running'
                nodes[node]['RecordedSite'] = False
            elif event['MyType'] == 'JobTerminatedEvent':
                node = node_map[event['Cluster'], event['Proc']]
                nodes[node]['EndTimes'].append(eventtime)
                nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                self.insertCpu(event, nodes[node])
                if event['TerminatedNormally']:
                    if event['ReturnValue'] == 0:
                        nodes[node]['State'] = 'transferring'
                    else:
                        nodes[node]['State'] = 'cooloff'
                else:
                    nodes[node]['State']  = 'cooloff'
            elif event['MyType'] == 'PostScriptTerminatedEvent':
                m = self.node_name2_re.match(event['DAGNodeName'])
                if m:
                    node = m.groups()[0]
                    if event['TerminatedNormally']:
                        if event['ReturnValue'] == 0:
                            nodes[node]['State'] = 'finished'
                        elif event['ReturnValue'] == 2:
                            nodes[node]['State'] = 'failed'
                        else:
                            nodes[node]['State'] = 'cooloff'
                    else:
                        nodes[node]['State']  = 'cooloff'
            elif event['MyType'] == 'ShadowExceptionEvent' or event["MyType"] == "JobReconnectFailedEvent" or event['MyType'] == 'JobEvictedEvent':
                node = node_map[event['Cluster'], event['Proc']]
                if nodes[node]['State'] != 'idle':
                    nodes[node]['EndTimes'].append(eventtime)
                    if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                        nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                    nodes[node]['State'] = 'idle'
                    self.insertCpu(event, nodes[node])
                    nodes[node]['TotalUserCpuTimeHistory'].append(0)
                    nodes[node]['TotalSysCpuTimeHistory'].append(0)
                    nodes[node]['WallDurations'].append(0)
                    nodes[node]['ResidentSetSize'].append(0)
                    nodes[node]['SubmitTimes'].append(-1)
                    nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                    nodes[node]['Restarts'] += 1
            elif event['MyType'] == 'JobAbortedEvent':
                node = node_map[event['Cluster'], event['Proc']]
                if nodes[node]['State'] == "idle" or nodes[node]['State'] == "held":
                    nodes[node]['StartTimes'].append(-1)
                    if not nodes[node]['RecordedSite']:
                        nodes[node]['SiteHistory'].append("Unknown")
                nodes[node]['State'] = 'killed'
                self.insertCpu(event, nodes[node])
            elif event['MyType'] == 'JobHeldEvent':
                node = node_map[event['Cluster'], event['Proc']]
                if nodes[node]['State'] == 'running':
                    nodes[node]['EndTimes'].append(eventtime)
                    if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                        nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                    self.insertCpu(event, nodes[node])
                    nodes[node]['TotalUserCpuTimeHistory'].append(0)
                    nodes[node]['TotalSysCpuTimeHistory'].append(0)
                    nodes[node]['WallDurations'].append(0)
                    nodes[node]['ResidentSetSize'].append(0)
                    nodes[node]['SubmitTimes'].append(-1)
                    nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                    nodes[node]['Restarts'] += 1
                nodes[node]['State'] = 'held'
            elif event['MyType'] == 'JobReleaseEvent':
                node = node_map[event['Cluster'], event['Proc']]
                nodes[node]['State'] = 'idle'
            elif event['MyType'] == 'JobAdInformationEvent':
                node = node_map[event['Cluster'], event['Proc']]
                if (not nodes[node]['RecordedSite']) and ('JOBGLIDEIN_CMSSite' in event) and not event['JOBGLIDEIN_CMSSite'].startswith("$$"):
                    nodes[node]['SiteHistory'].append(event['JOBGLIDEIN_CMSSite'])
                    nodes[node]['RecordedSite'] = True
                self.insertCpu(event, nodes[node])
            elif event['MyType'] == 'JobImageSizeEvent':
                # Look up the node for this cluster/proc, as the other event branches do
                node = node_map[event['Cluster'], event['Proc']]
                nodes[node]['ResidentSetSize'][-1] = int(event['ResidentSetSize'])
                if nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = eventtime - nodes[node]['StartTimes'][-1]
                self.insertCpu(event, nodes[node])
            elif event["MyType"] == "JobDisconnectedEvent" or event["MyType"] == "JobReconnectedEvent":
                # These events don't really affect the node status
                pass
            else:
                self.logger.warning("Unknown event type: %s" % event['MyType'])

        self.logger.debug("There were %d events in the job log." % count)
        now = time.time()
        for node, info in nodes.items():
            last_start = now
            if info['StartTimes']:
                last_start = info['StartTimes'][-1]
            while len(info['WallDurations']) < len(info['SiteHistory']):
                info['WallDurations'].append(now - last_start)
            while len(info['WallDurations']) > len(info['SiteHistory']):
                info['SiteHistory'].append("Unknown")
    def getRootTasks(self, workflow, schedd):
        rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || CRAB_Attempt == 0)' % HTCondorUtils.quote(workflow)
        rootAttrList = ["JobStatus", "ExitCode", 'CRAB_JobCount', 'CRAB_ReqName', 'TaskType', "HoldReason", "HoldReasonCode", "CRAB_UserWebDir",
                        "CRAB_SiteWhitelist", "CRAB_SiteBlacklist", "DagmanHoldReason"]

        # Note: may throw if the schedd is down.  We may want to think about wrapping the
        # status function and have it catch / translate HTCondor errors.
        results = list(schedd.xquery(rootConst, rootAttrList))

        if not results:
            self.logger.info("An invalid workflow name was requested: %s" % workflow)
            raise InvalidParameter("An invalid workflow name was requested: %s" % workflow)
        return results
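The ROOT-DAG constraint built above is reused across several of these snippets and handed to Schedd.xquery() together with a projection of attribute names. Below is a minimal stand-alone sketch of that query pattern using only the stock HTCondor Python bindings; the schedd handle, the workflow name and the use of classad.quote() in place of HTCondorUtils.quote() are illustrative assumptions, not code from the original module. The JobStatus values tested by duplicateCheck are the standard HTCondor codes (1 = Idle, 2 = Running, 5 = Held).

import classad
import htcondor

schedd = htcondor.Schedd()  # hypothetical: the original code locates a remote schedd via HTCondorLocator
workflow = "170101_120000:user_crab_mytask"  # illustrative CRAB_ReqName

# classad.quote() stands in for HTCondorUtils.quote(): it wraps the value in double quotes
# and escapes it so it can be embedded safely in the constraint expression.
rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || CRAB_Attempt == 0)' % classad.quote(workflow)

# The second argument of xquery() is a projection: only these attributes are returned.
for ad in schedd.xquery(rootConst, ["JobStatus", "ClusterId"]):
    # Standard HTCondor JobStatus codes: 1 = Idle, 2 = Running, 5 = Held
    print("%s %s" % (ad.get("JobStatus"), ad.get("ClusterId")))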
Example #33
0
    def executeInternal(self, *args, **kwargs):  #pylint: disable=unused-argument
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        task = kwargs['task']
        if 'tm_taskname' not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task['tm_taskname'])
        if 'user_proxy' not in task:
            raise ValueError("No proxy provided")
        proxy = task['user_proxy']

        if task.get('resubmit_publication', False):
            resubmitWhat = "publications"
        else:
            resubmitWhat = "jobs"

        self.logger.info("About to resubmit %s for workflow: %s.",
                         resubmitWhat, workflow)
        self.logger.debug("Task info: %s", str(task))

        if task.get('resubmit_publication', False):
            asourl = task.get('tm_asourl', None)
            #Let's not assume the db has been updated (mostly for devs), let's default asodb to asynctransfer!
            #Also the "or" takes care of the case where the new code is executed on an old task,
            #i.e.: tm_asodb is there but empty.
            asodb = task.get('tm_asodb', 'asynctransfer') or 'asynctransfer'
            if not asourl:
                msg = "ASO URL not set. Can not resubmit publication."
                raise TaskWorkerException(msg)
            self.logger.info("Will resubmit failed publications")
            self.resubmitPublication(asourl, asodb, proxy, workflow)
            return

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        dummyAddress = ""
        try:
            schedd, dummyAddress = loc.getScheddObjNew(task['tm_schedd'])
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (
                FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg)

        # Check memory and walltime
        stdmaxjobruntime = 2800
        stdmaxmemory = 2500
        if task['resubmit_maxjobruntime'] is not None and task[
                'resubmit_maxjobruntime'] > stdmaxjobruntime:
            msg = "Task requests %s minutes of walltime, but only %s are guaranteed to be available." % (
                task['resubmit_maxjobruntime'], stdmaxjobruntime)
            msg += " Jobs may not find a site where to run."
            msg += " CRAB has changed this value to %s minutes." % (
                stdmaxjobruntime)
            self.logger.warning(msg)
            task['resubmit_maxjobruntime'] = str(stdmaxjobruntime)
            self.uploadWarning(msg, proxy, kwargs['task']['tm_taskname'])
        if task['resubmit_maxmemory'] is not None and task[
                'resubmit_maxmemory'] > stdmaxmemory:
            msg = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (
                task['resubmit_maxmemory'], stdmaxmemory)
            msg += " Jobs may not find a site where to run and stay idle forever."
            self.logger.warning(msg)
            self.uploadWarning(msg, proxy, kwargs['task']['tm_taskname'])

        # Find only the originally submitted DAG to hold and release: this
        # will re-trigger the scripts and adjust retries and other
        # resubmission parameters.
        #
        # Processing and tail DAGs will be restarted by these scripts on the
        # schedd after the modifications are made.
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(
            workflow)

        ## Calculate new parameters for resubmitted jobs. These parameters will
        ## be (re)written in the _CONDOR_JOB_AD when we do schedd.edit() below.
        ad = classad.ClassAd()
        params = {
            'CRAB_ResubmitList': 'jobids',
            'CRAB_SiteBlacklist': 'site_blacklist',
            'CRAB_SiteWhitelist': 'site_whitelist',
            'MaxWallTimeMins': 'maxjobruntime',
            'RequestMemory': 'maxmemory',
            'RequestCpus': 'numcores',
            'JobPrio': 'priority'
        }
        overwrite = False
        for taskparam in params.values():
            if ('resubmit_' + taskparam
                    in task) and task['resubmit_' + taskparam] != None:
                # In case resubmission parameters contain a list of unicode strings,
                # convert it to a list of ascii strings because of HTCondor unicode
                # incompatibility.
                # Note that unicode strings that are not in a list are not handled,
                # but so far they don't exist in this part of the code.
                if isinstance(task['resubmit_' + taskparam], list):
                    nonUnicodeList = []
                    for p in task['resubmit_' + taskparam]:
                        if isinstance(p, unicode):
                            nonUnicodeList.append(p.encode('ascii', 'ignore'))
                        else:
                            nonUnicodeList.append(p)
                    ad[taskparam] = nonUnicodeList
                if taskparam != 'jobids':
                    overwrite = True

        if ('resubmit_jobids' in task) and task['resubmit_jobids']:
            with HTCondorUtils.AuthenticatedSubprocess(
                    proxy, logger=self.logger) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    ## Overwrite parameters in the os.environ[_CONDOR_JOB_AD] file. This will affect
                    ## all the jobs, not only the ones we want to resubmit. That's why the pre-job
                    ## is saving the values of the parameters for each job retry in text files (the
                    ## files are in the directory resubmit_info in the schedd).
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            schedd.edit(rootConst, adparam,
                                        ad.lookup(taskparam))
                        elif task['resubmit_' + taskparam] != None:
                            schedd.edit(rootConst, adparam,
                                        str(task['resubmit_' + taskparam]))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)
        elif overwrite:
            self.logger.debug("Resubmitting under condition overwrite = True")
            with HTCondorUtils.AuthenticatedSubprocess(
                    proxy, logger=self.logger) as (parent, rpipe):
                if not parent:
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            if taskparam == 'jobids' and len(
                                    list(ad[taskparam])) == 0:
                                self.logger.debug(
                                    "Setting %s = True in the task ad.",
                                    adparam)
                                schedd.edit(rootConst, adparam,
                                            classad.ExprTree("true"))
                            else:
                                schedd.edit(rootConst, adparam,
                                            ad.lookup(taskparam))
                        elif task['resubmit_' + taskparam] != None:
                            schedd.edit(rootConst, adparam,
                                        str(task['resubmit_' + taskparam]))
                    schedd.act(htcondor.JobAction.Release, rootConst)
        else:
            ## This should actually not occur anymore in CRAB 3.3.16 or above, because
            ## starting from CRAB 3.3.16 the resubmission parameters are written to the
            ## Task DB with value != None, so the overwrite variable should never be False.
            self.logger.debug("Resubmitting under condition overwrite = False")
            with HTCondorUtils.AuthenticatedSubprocess(
                    proxy, logger=self.logger) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList",
                                classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)
        try:
            results = rpipe.read()
        except EOFError:
            results = "Timeout while executing condor commands for resubmission"
        if results != "OK":
            msg = "The CRAB server backend was not able to resubmit the task,"
            msg += " because the Grid scheduler answered with an error."
            msg += " This is probably a temporary glitch. Please try again later."
            msg += " If the error persists send an e-mail to %s." % (
                FEEDBACKMAIL)
            msg += " Error reason: %s" % (results)
            raise TaskWorkerException(msg)
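For orientation, this is a minimal sketch of the hold / edit / release cycle that the resubmission code above runs inside AuthenticatedSubprocess. The schedd handle, constraint and attribute values are illustrative, the rpipe error-reporting protocol is omitted, and string values are quoted with classad.quote() so they end up as ClassAd string literals; it is a sketch of the mechanism, not a drop-in replacement for the snippet.

import classad
import htcondor

schedd = htcondor.Schedd()  # hypothetical schedd handle
rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s' % classad.quote("mytask")  # illustrative

# Set the hold signal, rewrite the resubmission parameters in the DAG's job ad,
# then hold and release the DAG so DAGMan restarts the node jobs with the new values.
schedd.edit(rootConst, "HoldKillSig", classad.quote("SIGKILL"))
schedd.edit(rootConst, "CRAB_ResubmitList", classad.ExprTree("true"))
schedd.edit(rootConst, "RequestMemory", "2500")   # plain numbers can be passed as expression strings
schedd.act(htcondor.JobAction.Hold, rootConst)
schedd.edit(rootConst, "HoldKillSig", classad.quote("SIGUSR1"))
schedd.act(htcondor.JobAction.Release, rootConst)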
Example #34
0
    def execute_internal(self, *args, **kw):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kw:
            raise ValueError("No task specified.")
        task = kw['task']
        if 'tm_taskname' not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task['tm_taskname'])
        if 'user_proxy' not in task:
            raise ValueError("No proxy provided")
        proxy = task['user_proxy']

        self.logger.info("About to resubmit workflow: %s." % workflow)
        self.logger.info("Task info: %s" % str(task))

        loc = HTCondorLocator.HTCondorLocator(self.backendurls)
        schedd, address = loc.getScheddObj(workflow)  #TODO wrap

        # Release the DAG
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(
            workflow)

        # Calculate a new white/blacklist
        ad = classad.ClassAd()
        ad['whitelist'] = task['resubmit_site_whitelist']
        ad['blacklist'] = task['resubmit_site_blacklist']

        if ('resubmit_ids' in task) and task['resubmit_ids']:
            ad['resubmit'] = task['resubmit_ids']
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList", ad['resubmit'])
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)

        elif task['resubmit_site_whitelist'] or task['resubmit_site_blacklist'] or \
                task['resubmit_priority'] != None or task['resubmit_maxmemory'] != None or \
                task['resubmit_numcores'] != None or task['resubmit_maxjobruntime'] != None:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    if task['resubmit_site_blacklist']:
                        schedd.edit(rootConst, "CRAB_SiteResubmitBlacklist",
                                    ad['blacklist'])
                    if task['resubmit_site_whitelist']:
                        schedd.edit(rootConst, "CRAB_SiteResubmitWhitelist",
                                    ad['whitelist'])
                    if task['resubmit_priority'] != None:
                        schedd.edit(rootConst, "JobPrio",
                                    task['resubmit_priority'])
                    if task['resubmit_numcores'] != None:
                        schedd.edit(rootConst, "RequestCpus",
                                    task['resubmit_numcores'])
                    if task['resubmit_maxjobruntime'] != None:
                        schedd.edit(rootConst, "MaxWallTimeMins",
                                    task['resubmit_maxjobruntime'])
                    if task['resubmit_maxmemory'] != None:
                        schedd.edit(rootConst, "RequestMemory",
                                    task['resubmit_maxmemory'])
                    schedd.act(htcondor.JobAction.Release, rootConst)

        else:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList",
                                classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)

        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("The CRAB3 server backend could not reubmit your task because the Grid scheduler answered with an error\n"+\
                                      "This is probably a temporary glitch, please try it again and contact an expert if the error persist\n"+\
                                      "Error reason %s" % results)
Example #35
0
    def executeInternal(self, *args, **kwargs):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        task = kwargs['task']
        if 'tm_taskname' not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task['tm_taskname'])
        if 'user_proxy' not in task:
            raise ValueError("No proxy provided")
        proxy = task['user_proxy']

        if task.get('resubmit_publication', False):
            resubmitWhat = "publications"
        else:
            resubmitWhat = "jobs"

        self.logger.info("About to resubmit %s for workflow: %s." %
                         (resubmitWhat, workflow))
        self.logger.info("Task info: %s" % str(task))

        if task.get('resubmit_publication', False):
            asourl = task.get('tm_asourl', None)
            if not asourl:
                msg = "ASO URL not set. Can not resubmit publication."
                raise TaskWorkerException(msg)
            self.logger.info("Will resubmit failed publications")
            self.resubmitPublication(asourl, proxy, workflow)
            return

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        address = ""
        try:
            schedd, address = loc.getScheddObjNew(task['tm_schedd'])
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (
                FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s" % (workflow, msg))
            raise TaskWorkerException(msg)

        # Check memory and walltime
        stdmaxjobruntime = 2800
        stdmaxmemory = 2500
        if task['resubmit_maxjobruntime'] is not None and task[
                'resubmit_maxjobruntime'] > stdmaxjobruntime:
            msg = "Task requests %s minutes of walltime, but only %s are guaranteed to be available." % (
                task['resubmit_maxjobruntime'], stdmaxjobruntime)
            msg += " Jobs may not find a site where to run."
            msg += " CRAB has changed this value to %s minutes." % (
                stdmaxjobruntime)
            self.logger.warning(msg)
            task['resubmit_maxjobruntime'] = str(stdmaxjobruntime)
            self.uploadWarning(msg, proxy, kwargs['task']['tm_taskname'])
        if task['resubmit_maxmemory'] is not None and task[
                'resubmit_maxmemory'] > stdmaxmemory:
            msg = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (
                task['resubmit_maxmemory'], stdmaxmemory)
            msg += " Jobs may not find a site where to run and stay idle forever."
            self.logger.warning(msg)
            self.uploadWarning(msg, proxy, kwargs['task']['tm_taskname'])

        # Release the DAG
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(
            workflow)

        ## Calculate new parameters for resubmitted jobs. These parameters will
        ## be (re)written in the _CONDOR_JOB_AD when we do schedd.edit() below.
        ad = classad.ClassAd()
        params = {
            'CRAB_ResubmitList': 'jobids',
            'CRAB_SiteBlacklist': 'site_blacklist',
            'CRAB_SiteWhitelist': 'site_whitelist',
            'MaxWallTimeMins': 'maxjobruntime',
            'RequestMemory': 'maxmemory',
            'RequestCpus': 'numcores',
            'JobPrio': 'priority'
        }
        overwrite = False
        for taskparam in params.values():
            if ('resubmit_' + taskparam
                    in task) and task['resubmit_' + taskparam] != None:
                if isinstance(task['resubmit_' + taskparam], list):
                    ad[taskparam] = task['resubmit_' + taskparam]
                if taskparam != 'jobids':
                    overwrite = True

        if ('resubmit_jobids' in task) and task['resubmit_jobids']:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    ## Overwrite parameters in the os.environ[_CONDOR_JOB_AD] file. This will affect
                    ## all the jobs, not only the ones we want to resubmit. That's why the pre-job
                    ## is saving the values of the parameters for each job retry in text files (the
                    ## files are in the directory resubmit_info in the schedd).
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            schedd.edit(rootConst, adparam, ad[taskparam])
                        elif task['resubmit_' + taskparam] != None:
                            schedd.edit(rootConst, adparam,
                                        str(task['resubmit_' + taskparam]))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)
        elif overwrite:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    self.logger.debug(
                        "Resubmitting under condition overwrite = True")
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            if taskparam == 'jobids' and len(
                                    list(ad[taskparam])) == 0:
                                self.logger.debug(
                                    "Setting %s = True in the task ad." %
                                    (adparam))
                                schedd.edit(rootConst, adparam,
                                            classad.ExprTree("true"))
                            else:
                                schedd.edit(rootConst, adparam, ad[taskparam])
                        elif task['resubmit_' + taskparam] != None:
                            schedd.edit(rootConst, adparam,
                                        str(task['resubmit_' + taskparam]))
                    schedd.act(htcondor.JobAction.Release, rootConst)
        else:
            ## This should actually not occur anymore in CRAB 3.3.16 or above, because
            ## starting from CRAB 3.3.16 the resubmission parameters are written to the
            ## Task DB with value != None, so the overwrite variable should never be False.
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    self.logger.debug(
                        "Resubmitting under condition overwrite = False")
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList",
                                classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)

        results = rpipe.read()
        if results != "OK":
            msg = "The CRAB server backend was not able to resubmit the task,"
            msg += " because the Grid scheduler answered with an error."
            msg += " This is probably a temporary glitch. Please try again later."
            msg += " If the error persists send an e-mail to %s." % (
                FEEDBACKMAIL)
            msg += " Error reason: %s" % (results)
            raise TaskWorkerException(msg)
Example #36
0
    def executeInternal(self, *args, **kwargs): #pylint: disable=unused-argument
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        task = kwargs['task']
        if 'tm_taskname' not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task['tm_taskname'])
        if 'user_proxy' not in task:
            raise ValueError("No proxy provided")
        proxy = task['user_proxy']

        if task.get('resubmit_publication', False):
            resubmitWhat = "publications"
        else:
            resubmitWhat = "jobs"

        self.logger.info("About to resubmit %s for workflow: %s.", resubmitWhat, workflow)
        self.logger.debug("Task info: %s", str(task))

        if task.get('resubmit_publication', False):
            asourl = task.get('tm_asourl', None)
            #Let's not assume the db has been updated (mostly for devs), let's default asodb to asynctransfer!
            #Also the "or" takes care of the case where the new code is executed on an old task,
            #i.e.: tm_asodb is there but empty.
            asodb = task.get('tm_asodb', 'asynctransfer') or 'asynctransfer'
            if not asourl:
                msg = "ASO URL not set. Can not resubmit publication."
                raise TaskWorkerException(msg)
            self.logger.info("Will resubmit failed publications")
            self.resubmitPublication(asourl, asodb, proxy, workflow)
            return

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        dummyAddress = ""
        try:
            schedd, dummyAddress = loc.getScheddObjNew(task['tm_schedd'])
        except Exception as exp:
            msg  = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg)

        # Check memory and walltime
        checkMemoryWalltime(None, task, 'resubmit', self.logger, self.uploadWarning)


        # Find only the originally submitted DAG to hold and release: this
        # will re-trigger the scripts and adjust retries and other
        # resubmission parameters.
        #
        # Processing and tail DAGs will be restarted by these scripts on the
        # schedd after the modifications are made.
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(workflow)

        ## Calculate new parameters for resubmitted jobs. These parameters will
        ## be (re)written in the _CONDOR_JOB_AD when we do schedd.edit() below.
        ad = classad.ClassAd()
        params = {'CRAB_ResubmitList'  : 'jobids',
                  'CRAB_SiteBlacklist' : 'site_blacklist',
                  'CRAB_SiteWhitelist' : 'site_whitelist',
                  'MaxWallTimeMinsRun' : 'maxjobruntime',
                  'RequestMemory'      : 'maxmemory',
                  'RequestCpus'        : 'numcores',
                  'JobPrio'            : 'priority'
                 }
        overwrite = False
        for taskparam in params.values():
            if ('resubmit_'+taskparam in task) and task['resubmit_'+taskparam] != None:
                # In case resubmission parameters contain a list of unicode strings,
                # convert it to a list of ascii strings because of HTCondor unicode
                # incompatibility.
                # Note that unicode strings that are not in a list are not handled,
                # but so far they don't exist in this part of the code.
                if isinstance(task['resubmit_'+taskparam], list):
                    nonUnicodeList = []
                    for p in task['resubmit_'+taskparam]:
                        if isinstance(p, unicode):
                            nonUnicodeList.append(p.encode('ascii', 'ignore'))
                        else:
                            nonUnicodeList.append(p)
                    ad[taskparam] = nonUnicodeList
                if taskparam != 'jobids':
                    overwrite = True

        if ('resubmit_jobids' in task) and task['resubmit_jobids']:
            with HTCondorUtils.AuthenticatedSubprocess(proxy, logger=self.logger) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    ## Overwrite parameters in the os.environ[_CONDOR_JOB_AD] file. This will affect
                    ## all the jobs, not only the ones we want to resubmit. That's why the pre-job
                    ## is saving the values of the parameters for each job retry in text files (the
                    ## files are in the directory resubmit_info in the schedd).
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            schedd.edit(rootConst, adparam, ad.lookup(taskparam))
                        elif task['resubmit_'+taskparam] != None:
                            schedd.edit(rootConst, adparam, str(task['resubmit_'+taskparam]))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)
        elif overwrite:
            self.logger.debug("Resubmitting under condition overwrite = True")
            with HTCondorUtils.AuthenticatedSubprocess(proxy, logger=self.logger) as (parent, rpipe):
                if not parent:
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            if taskparam == 'jobids' and len(list(ad[taskparam])) == 0:
                                self.logger.debug("Setting %s = True in the task ad.", adparam)
                                schedd.edit(rootConst, adparam, classad.ExprTree("true"))
                            else:
                                schedd.edit(rootConst, adparam, ad.lookup(taskparam))
                        elif task['resubmit_'+taskparam] != None:
                            schedd.edit(rootConst, adparam, str(task['resubmit_'+taskparam]))
                    schedd.act(htcondor.JobAction.Release, rootConst)
        else:
            ## This should actually not occur anymore in CRAB 3.3.16 or above, because
            ## starting from CRAB 3.3.16 the resubmission parameters are written to the
            ## Task DB with value != None, so the overwrite variable should never be False.
            self.logger.debug("Resubmitting under condition overwrite = False")
            with HTCondorUtils.AuthenticatedSubprocess(proxy, logger=self.logger) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList", classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)
        try:
            results = rpipe.read()
        except EOFError:
            results = "Timeout while executing condor commands for resubmission"
        if results != "OK":
            msg  = "The CRAB server backend was not able to resubmit the task,"
            msg += " because the Grid scheduler answered with an error."
            msg += " This is probably a temporary glitch. Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Error reason: %s" % (results)
            raise TaskWorkerException(msg)
Example #37
0
    def duplicateCheck(self, task):
        """
        Look to see if the task we are about to submit is already in the schedd.
        If so, assume that this task in TaskWorker was run successfully, but killed
        before it could update the frontend.
        """
        workflow = task['tm_taskname']

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        address = ""
        schedd = ""
        try:
            self.logger.debug("Duplicate check is getting the schedd obj. Collector is: %s" % task['tm_collector'])
            schedd, address = loc.getScheddObjNew(task['tm_schedd'])
            self.logger.debug("Got schedd obj for %s " % task['tm_schedd'])
        except Exception as exp:
            msg = ("%s: The CRAB3 server backend is not able to contact Grid scheduler. Please, retry later. Message from the scheduler: %s") % (workflow, str(exp))
            self.logger.exception(msg)
            raise TaskWorkerException(msg)

        rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || CRAB_Attempt == 0)' % HTCondorUtils.quote(workflow)

        self.logger.debug("Duplicate check is querying the schedd: %s" % rootConst)
        results = list(schedd.xquery(rootConst, []))
        self.logger.debug("Schedd queried %s" % results)

        if not results:
            # Task not already in schedd
            return None

        configreq = {'workflow': workflow,
                     'status': "SUBMITTED",
                     'jobset': "-1",
                     'subresource': 'success',
                    }
        self.logger.warning("Task %s already submitted to HTCondor; pushing information centrally: %s" % (workflow, str(configreq)))
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data = data)

        # Note that we don't re-send Dashboard jobs; we assume this is a rare occurrence and
        # don't want to upset any info already in the Dashboard.

        return Result.Result(task=task, result=(-1))
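The status push at the end of duplicateCheck is a plain form-encoded POST to the CRAB REST interface. A small sketch of the encoding step follows (Python 2 spelling, as in the snippet; urllib.parse.urlencode in Python 3), with illustrative values:

import urllib

configreq = {'workflow': '170101_120000:user_crab_mytask',  # illustrative task name
             'status': 'SUBMITTED',
             'jobset': '-1',
             'subresource': 'success'}
body = urllib.urlencode(configreq)
# body looks like 'status=SUBMITTED&subresource=success&jobset=-1&workflow=170101_120000%3Auser_crab_mytask'
# (key order is not guaranteed); duplicateCheck then POSTs it with self.server.post(self.resturi, data=body).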
Example #38
0
    def killAll(self, jobConst):

        # We need to keep ROOT, PROCESSING, and TAIL DAGs in hold until periodic remove kicks in.
        # This is needed in case user wants to resubmit.
        rootConst = 'stringListMember(TaskType, "ROOT PROCESSING TAIL", " ") && CRAB_ReqName =?= %s' % HTCondorUtils.quote(self.workflow)

        # Holding DAG job does not mean that it will remove all jobs
        # and this must be done separately
        # --------------------------------------
        # From HTCondor documentation
        # http://research.cs.wisc.edu/htcondor/manual/v8.3/2_10DAGMan_Applications.html#SECTION003107000000000000000
        # --------------------------------------
        # After placing the condor_dagman job on hold, no new node jobs will be submitted,
        # and no PRE or POST scripts will be run. Any node jobs already in the HTCondor queue
        # will continue undisturbed. If the condor_dagman job is left on hold, it will remain
        # in the HTCondor queue after all of the currently running node jobs are finished.
        # --------------------------------------
        # TODO: Remove jobConst query when htcondor ticket is solved
        # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=5175

        with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
            if not parent:
                with self.schedd.transaction() as dummytsc:
                    self.schedd.act(htcondor.JobAction.Hold, rootConst)
                    self.schedd.act(htcondor.JobAction.Remove, jobConst)
        results = rpipe.read()
        if results != "OK":
            msg  = "The CRAB server backend was not able to kill the task,"
            msg += " because the Grid scheduler answered with an error."
            msg += " This is probably a temporary glitch. Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Error reason: %s" % (results)
            raise TaskWorkerException(msg)
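The kill constraint above relies on the ClassAd function stringListMember() to match the ROOT, PROCESSING and TAIL DAGs with a single expression. A two-line sketch of how that function evaluates, using the classad module directly (the membership values are illustrative):

import classad

# stringListMember(member, list, delimiter) is true when `member` occurs in the
# delimiter-separated `list`.
print(classad.ExprTree('stringListMember("PROCESSING", "ROOT PROCESSING TAIL", " ")').eval())  # True
print(classad.ExprTree('stringListMember("POSTJOB", "ROOT PROCESSING TAIL", " ")').eval())     # False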
Example #39
0
    def getRootTasks(self, workflow, schedd):
        rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || CRAB_Attempt == 0)' % HTCondorUtils.quote(
            workflow)
        rootAttrList = [
            "JobStatus", "ExitCode", 'CRAB_JobCount', 'CRAB_ReqName',
            'TaskType', "HoldReason", "HoldReasonCode", "CRAB_UserWebDir",
            "CRAB_SiteWhitelist", "CRAB_SiteBlacklist", "DagmanHoldReason"
        ]

        # Note: may throw if the schedd is down.  We may want to think about wrapping the
        # status function and have it catch / translate HTCondor errors.
        results = list(schedd.xquery(rootConst, rootAttrList))

        if not results:
            self.logger.info("An invalid workflow name was requested: %s" %
                             workflow)
            raise InvalidParameter(
                "An invalid workflow name was requested: %s" % workflow)
        return results[-1]
Example #40
0
    def duplicateCheck(self, task):
        """
        Look to see if the task we are about to submit is already in the schedd.
        If so, assume that this task in TaskWorker was run successfully, but killed
        before it could update the frontend.
        """
        workflow = task["tm_taskname"]

        if task["tm_collector"]:
            self.backendurls["htcondorPool"] = task["tm_collector"]
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        try:
            self.logger.debug("Duplicate check is getting the schedd obj. Collector is: %s", task["tm_collector"])
            schedd, dummyAddress = loc.getScheddObjNew(task["tm_schedd"])
            self.logger.debug("Got schedd obj for %s ", task["tm_schedd"])
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg, retry=True)

        rootConst = (
            'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || '
            "CRAB_Attempt == 0)" % HTCondorUtils.quote(workflow.encode("ascii", "ignore"))
        )

        self.logger.debug("Duplicate check is querying the schedd: %s", rootConst)
        results = list(schedd.xquery(rootConst, []))
        self.logger.debug("Schedd queried %s", results)

        if not results:
            # Task not already in schedd
            return None

        # Need to double check if JobStatus is 1 (idle) or 2 (running).
        # All other statuses mean that the task was not submitted or failed to be submitted.
        # There was an issue with spooling files to the scheduler where the duplicate check found
        # the dagman on the scheduler but the files were not correctly transferred.
        if results[0]["JobStatus"] not in [1, 2]:
            # If the state of the dag is not idle or running then we raise an exception and let
            # the dagman submitter retry later. Hopefully after seven minutes the dag gets removed
            # from the schedd and the submission succeeds.
            retry = results[0]["JobStatus"] == 5  # 5==Held
            msg = "Task %s already found on schedd %s " % (workflow, task["tm_schedd"])
            if retry:
                msg += "Going to retry submission later since the dag status is Held and the task should be removed on the schedd"
            else:
                msg += "Aborting submission since the task is in state %s" % results[0]["JobStatus"]
            raise TaskWorkerException(msg, retry)
        else:
            self.logger.debug("Task seems to be submitted correctly. Classads got from scheduler: %s", results)

        configreq = {"workflow": workflow, "status": "SUBMITTED", "subresource": "success"}
        self.logger.warning(
            "Task %s already submitted to HTCondor; pushing information centrally: %s", workflow, str(configreq)
        )
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data=data)

        # Note that we don't re-send Dashboard jobs; we assume this is a rare occurrence and
        # don't want to upset any info already in the Dashboard.

        return Result.Result(task=task, result=(-1))
Example #41
0
    def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
        """
        Submit directly to the schedd using the HTCondor module
        """
        dagAd = classad.ClassAd()
        addCRABInfoToClassAd(dagAd, info)

        if info["CMSGroups"]:
            dagAd["CMSGroups"] = ','.join(info["CMSGroups"])
        else:
            dagAd["CMSGroups"] = classad.Value.Undefined

        # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
        dagAd["CRAB_Attempt"] = 0
        # We switched from local to scheduler universe.  Why?  It seems there's no way in the
        # local universe to change the hold signal at runtime.  That's fairly important for our
        # resubmit implementation.
        #dagAd["JobUniverse"] = 12
        dagAd["JobUniverse"] = 7
        dagAd["HoldKillSig"] = "SIGUSR1"
        dagAd["X509UserProxy"] = info['user_proxy']
        dagAd["Requirements"] = classad.ExprTree('true || false')
        dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
        dagAd["RemoteCondorSetup"] = info['remote_condor_setup']

        dagAd["CRAB_TaskSubmitTime"] = classad.ExprTree("%s" % info["start_time"].encode('ascii', 'ignore'))
        dagAd['CRAB_TaskLifetimeDays'] = TASKLIFETIME // 24 // 60 // 60
        dagAd['CRAB_TaskEndTime'] = int(info["start_time"]) + TASKLIFETIME
        #For task management info see https://github.com/dmwm/CRABServer/issues/4681#issuecomment-302336451
        dagAd["LeaveJobInQueue"] = classad.ExprTree("true")
        dagAd["PeriodicHold"] = classad.ExprTree("time() > CRAB_TaskEndTime")
        dagAd["TransferOutput"] = info['outputFilesString']
        dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
        dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
        dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
        dagAd["RemoveKillSig"] = "SIGUSR1"

        with open('subdag.ad', 'w') as fd:
            for k, v in dagAd.items():
                if k == 'X509UserProxy':
                    v = os.path.basename(v)
                if isinstance(v, basestring):
                    value = classad.quote(v)
                elif isinstance(v, classad.ExprTree):
                    value = repr(v)
                elif isinstance(v, list):
                    value = "{{{0}}}".format(json.dumps(v)[1:-1])
                else:
                    value = v
                fd.write('+{0} = {1}\n'.format(k, value))

        dagAd["TaskType"] = "ROOT"
        dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
        dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
        dagAd["Cmd"] = cmd
        dagAd['Args'] = arg
        dagAd["TransferInput"] = str(info['inputFilesString'])

        condorIdDict = {}
        with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy'], pickleOut=True, outputObj=condorIdDict, logger=self.logger) as (parent, rpipe):
            if not parent:
                resultAds = []
                condorIdDict['ClusterId'] = schedd.submit(dagAd, 1, True, resultAds)
                schedd.spool(resultAds)
                # editing the LeaveJobInQueue since the remote submit overwrites it
                # see https://github.com/dmwm/CRABServer/pull/5212#issuecomment-216519749
                if resultAds:
                    id_ = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                    schedd.edit([id_], "LeaveJobInQueue", classad.ExprTree("true"))

        try:
            results = pickle.load(rpipe)
        except EOFError:
            #Do not want to retry this since the error may happen after the submit (during the edit for example),
            #and this could cause the task to be submitted twice (although we have a protection in duplicateCheck).
            raise TaskWorkerException("Timeout executing condor submit command.", retry=False)

        #Notice that the ClusterId might be set even if there was a failure: this happens if schedd.submit succeeded but the spool call failed.
        if 'ClusterId' in results.outputObj:
            self.logger.debug("Condor cluster ID just submitted is: %s", results.outputObj['ClusterId'])
        if results.outputMessage != "OK":
            self.logger.debug("Now printing the environment used for submission:\n" + "-"*70 + "\n" + results.environmentStr + "-"*70)
            raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results.outputMessage, retry=True)

        #If we didn't raise an exception above, the id is here.
        return results.outputObj['ClusterId']
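The subdag.ad writer inside submitDirect serialises each ClassAd value differently depending on its type: strings are quoted, expressions are written via repr(), and Python lists become ClassAd lists. A self-contained sketch of that serialisation, with made-up attribute values, is shown below; it mirrors the loop above rather than adding new behaviour.

import json
import classad

# Illustrative attribute/value pairs of the kinds the loop above handles.
examples = {
    "CRAB_ReqName": "mytask",                                        # string -> classad.quote()
    "PeriodicHold": classad.ExprTree("time() > CRAB_TaskEndTime"),   # expression -> repr()
    "CRAB_SiteWhitelist": ["T2_XX_Example", "T2_YY_Example"],        # list -> {...} ClassAd list
    "CRAB_Attempt": 0,                                               # anything else -> written as-is
}
for k, v in examples.items():
    if isinstance(v, str):
        value = classad.quote(v)
    elif isinstance(v, classad.ExprTree):
        value = repr(v)
    elif isinstance(v, list):
        value = "{{{0}}}".format(json.dumps(v)[1:-1])
    else:
        value = v
    print('+{0} = {1}'.format(k, value))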
Example #42
0
    def parseJobLog(self, fp, nodes):
        node_map = {}
        count = 0
        for event in HTCondorUtils.readEvents(fp):
            count += 1
            eventtime = time.mktime(time.strptime(event['EventTime'], "%Y-%m-%dT%H:%M:%S"))
            if event['MyType'] == 'SubmitEvent':
                m = self.node_name_re.match(event['LogNotes'])
                if m:
                    node = m.groups()[0]
                    proc = event['Cluster'], event['Proc']
                    info = nodes.setdefault(node, {'Retries': 0, 'Restarts': 0, 'SiteHistory': [], 'ResidentSetSize': [], 'SubmitTimes': [], 'StartTimes': [],
                                                'EndTimes': [], 'TotalUserCpuTimeHistory': [], 'TotalSysCpuTimeHistory': [], 'WallDurations': [], 'JobIds': []})
                    info['State'] = 'idle'
                    info['JobIds'].append("%d.%d" % proc)
                    info['RecordedSite'] = False
                    info['SubmitTimes'].append(eventtime)
                    info['TotalUserCpuTimeHistory'].append(0)
                    info['TotalSysCpuTimeHistory'].append(0)
                    info['WallDurations'].append(0)
                    info['ResidentSetSize'].append(0)
                    info['Retries'] = len(info['SubmitTimes'])-1
                    node_map[proc] = node
            elif event['MyType'] == 'ExecuteEvent':
                node = node_map[event['Cluster'], event['Proc']]
                nodes[node]['StartTimes'].append(eventtime)
                nodes[node]['State'] = 'running'
                nodes[node]['RecordedSite'] = False
            elif event['MyType'] == 'JobTerminatedEvent':
                node = node_map[event['Cluster'], event['Proc']]
                nodes[node]['EndTimes'].append(eventtime)
                nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                self.insertCpu(event, nodes[node])
                if event['TerminatedNormally']:
                    if event['ReturnValue'] == 0:
                        nodes[node]['State'] = 'transferring'
                    else:
                        nodes[node]['State'] = 'cooloff'
                else:
                    nodes[node]['State']  = 'cooloff'
            elif event['MyType'] == 'PostScriptTerminatedEvent':
                m = self.node_name2_re.match(event['DAGNodeName'])
                if m:
                    node = m.groups()[0]
                    if event['TerminatedNormally']:
                        if event['ReturnValue'] == 0:
                            nodes[node]['State'] = 'finished'
                        elif event['ReturnValue'] == 2:
                            nodes[node]['State'] = 'failed'
                        else:
                            nodes[node]['State'] = 'cooloff'
                    else:
                        nodes[node]['State']  = 'cooloff'
            elif event['MyType'] == 'ShadowExceptionEvent' or event["MyType"] == "JobReconnectFailedEvent" or event['MyType'] == 'JobEvictedEvent':
                node = node_map[event['Cluster'], event['Proc']]
                if nodes[node]['State'] != 'idle':
                    nodes[node]['EndTimes'].append(eventtime)
                    if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                        nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                    nodes[node]['State'] = 'idle'
                    self.insertCpu(event, nodes[node])
                    nodes[node]['TotalUserCpuTimeHistory'].append(0)
                    nodes[node]['TotalSysCpuTimeHistory'].append(0)
                    nodes[node]['WallDurations'].append(0)
                    nodes[node]['ResidentSetSize'].append(0)
                    nodes[node]['SubmitTimes'].append(-1)
                    nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                    nodes[node]['Restarts'] += 1
            elif event['MyType'] == 'JobAbortedEvent':
                node = node_map[event['Cluster'], event['Proc']]
                if nodes[node]['State'] == "idle" or nodes[node]['State'] == "held":
                    nodes[node]['StartTimes'].append(-1)
                    if not nodes[node]['RecordedSite']:
                        nodes[node]['SiteHistory'].append("Unknown")
                nodes[node]['State'] = 'killed'
                self.insertCpu(event, nodes[node])
            elif event['MyType'] == 'JobHeldEvent':
                node = node_map[event['Cluster'], event['Proc']]
                if nodes[node]['State'] == 'running':
                    nodes[node]['EndTimes'].append(eventtime)
                    if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                        nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                    self.insertCpu(event, nodes[node])
                    nodes[node]['TotalUserCpuTimeHistory'].append(0)
                    nodes[node]['TotalSysCpuTimeHistory'].append(0)
                    nodes[node]['WallDurations'].append(0)
                    nodes[node]['ResidentSetSize'].append(0)
                    nodes[node]['SubmitTimes'].append(-1)
                    nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                    nodes[node]['Restarts'] += 1
                nodes[node]['State'] = 'held'
            elif event['MyType'] == 'JobReleaseEvent':
                node = node_map[event['Cluster'], event['Proc']]
                nodes[node]['State'] = 'idle'
            elif event['MyType'] == 'JobAdInformationEvent':
                node = node_map[event['Cluster'], event['Proc']]
                if (not nodes[node]['RecordedSite']) and ('JOBGLIDEIN_CMSSite' in event) and not event['JOBGLIDEIN_CMSSite'].startswith("$$"):
                    nodes[node]['SiteHistory'].append(event['JOBGLIDEIN_CMSSite'])
                    nodes[node]['RecordedSite'] = True
                self.insertCpu(event, nodes[node])
            elif event['MyType'] == 'JobImageSizeEvent':
                node = node_map[event['Cluster'], event['Proc']]
                nodes[node]['ResidentSetSize'][-1] = int(event['ResidentSetSize'])
                if nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = eventtime - nodes[node]['StartTimes'][-1]
                self.insertCpu(event, nodes[node])
            elif event["MyType"] == "JobDisconnectedEvent" or event["MyType"] == "JobReconnectedEvent":
                # These events don't really affect the node status
                pass
            else:
                self.logger.warning("Unknown event type: %s" % event['MyType'])

        self.logger.debug("There were %d events in the job log." % count)
        now = time.time()
        for node, info in nodes.items():
            last_start = now
            if info['StartTimes']:
                last_start = info['StartTimes'][-1]
            while len(info['WallDurations']) < len(info['SiteHistory']):
                info['WallDurations'].append(now - last_start)
            while len(info['WallDurations']) > len(info['SiteHistory']):
                info['SiteHistory'].append("Unknown")

    def execute_internal(self, *args, **kw):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kw:
            raise ValueError("No task specified.")
        task = kw['task']
        if 'tm_taskname' not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task['tm_taskname'])
        if 'user_proxy' not in task:
            raise ValueError("No proxy provided")
        proxy = task['user_proxy']

        self.logger.info("About to resubmit workflow: %s." % workflow)
        self.logger.info("Task info: %s" % str(task))

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        address = ""
        try:
            schedd, address = loc.getScheddObjNew(task['tm_schedd'])
        except Exception as exp:
            msg = ("%s: The CRAB3 server backend is not able to contact Grid scheduler. Please, retry later. Message from the scheduler: %s") % (workflow, str(exp))
            self.logger.exception(msg)
            raise TaskWorkerException(msg)

        # Check memory and walltime
        if task['resubmit_maxjobruntime'] != None and task['resubmit_maxjobruntime'] > 2800:
            msg = "task requests %s minutes of walltime but only %s is guaranteed to be available. Jobs may not find a site where to run. CRAB3 have changed this value to %s minutes" % (task['resubmit_maxjobruntime'], '2800', '2800')
            self.logger.warning(msg)
            task['resubmit_maxjobruntime'] = '2800'
            self.uploadWarning(msg, kw['task']['user_proxy'], kw['task']['tm_taskname'])
        if task['resubmit_maxmemory'] != None and task['resubmit_maxmemory'] > 2500:
            msg = "task requests %s memory but only %s is guaranteed to be available. Jobs may not find a site where to run and stay idle forever" % (task['resubmit_maxmemory'], '2500')
            self.logger.warning(msg)
            self.uploadWarning(msg, kw['task']['user_proxy'], kw['task']['tm_taskname'])

        # Release the DAG
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(workflow)

        ## Calculate new parameters for resubmitted jobs. These parameters will
        ## be (re)written in the _CONDOR_JOB_AD when we do schedd.edit() below.
        ad = classad.ClassAd()
        params = {'CRAB_ResubmitList'  : 'jobids',
                  'CRAB_SiteBlacklist' : 'site_blacklist',
                  'CRAB_SiteWhitelist' : 'site_whitelist',
                  'MaxWallTimeMins'    : 'maxjobruntime',
                  'RequestMemory'      : 'maxmemory',
                  'RequestCpus'        : 'numcores',
                  'JobPrio'            : 'priority'
                 }
        overwrite = False
        for taskparam in params.values():
            if ('resubmit_'+taskparam in task) and task['resubmit_'+taskparam] != None:
                if isinstance(task['resubmit_'+taskparam], list):
                    ad[taskparam] = task['resubmit_'+taskparam]
                if taskparam != 'jobids':
                    overwrite = True

        if ('resubmit_jobids' in task) and task['resubmit_jobids']:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
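                    ## Hedged note: condor_dagman normally interprets its soft-kill signal
                    ## (SIGUSR1) as a request to abort the DAG, so HoldKillSig is presumably
                    ## switched to SIGKILL before the Hold below and restored to SIGUSR1
                    ## before the Release, to keep the hold from aborting the DAG itself.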
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    ## Overwrite parameters in the os.environ[_CONDOR_JOB_AD] file. This will affect
                    ## all the jobs, not only the ones we want to resubmit. That's why the pre-job
                    ## is saving the values of the parameters for each job retry in text files (the
                    ## files are in the directory resubmit_info in the schedd).
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            schedd.edit(rootConst, adparam, ad[taskparam])
                        elif task['resubmit_'+taskparam] != None:
                            schedd.edit(rootConst, adparam, str(task['resubmit_'+taskparam]))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)
        elif overwrite:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    self.logger.debug("Resubmitting under condition overwrite = True")
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            if taskparam == 'jobids' and len(list(ad[taskparam])) == 0:
                                self.logger.debug("Setting %s = True in the task ad." % (adparam))
                                schedd.edit(rootConst, adparam, classad.ExprTree("true"))
                            else:
                                schedd.edit(rootConst, adparam, ad[taskparam])
                        elif task['resubmit_'+taskparam] != None:
                            schedd.edit(rootConst, adparam, str(task['resubmit_'+taskparam]))
                    schedd.act(htcondor.JobAction.Release, rootConst)
        else:
            ## This should actually not occur anymore in CRAB 3.3.16 or above, because
            ## starting from CRAB 3.3.16 the resubmission parameters are written to the
            ## Task DB with value != None, so the overwrite variable should never be False.
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    self.logger.debug("Resubmitting under condition overwrite = False")
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList", classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)

        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("The CRAB3 server backend could not resubmit your task because the Grid scheduler answered with an error.\n"+\
                                      "This is probably a temporary glitch, please try it again and contact an expert if the error persist.\n"+\
                                      "Error reason: %s" % (results))

def bootstrap():
    print "Entering TaskManagerBootstrap with args: %s" % sys.argv
    command = sys.argv[1]
    if command == "POSTJOB":
        return PostJob.PostJob().execute(*sys.argv[2:])
    elif command == "PREJOB":
        return PreJob.PreJob().execute(*sys.argv[2:])
    elif command == "FINAL":
        return Final.Final().execute(*sys.argv[2:])
    elif command == "ASO":
        return ASO.async_stageout(*sys.argv[2:])

    infile, outfile = sys.argv[2:]

    adfile = os.environ["_CONDOR_JOB_AD"]
    print "Parsing classad"
    with open(adfile, "r") as fd:
        ad = classad.parseOld(fd)
    print "..done"
    in_args = []
    if infile != "None":
        with open(infile, "r") as fd:
            in_args = pickle.load(fd)

    config = Configuration.Configuration()
    config.section_("Services")
    config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/'
    
    ad['tm_taskname'] = ad.eval("CRAB_Workflow")
    ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo")
    ad['tm_dbs_url'] = ad.eval("CRAB_DBSURL")
    ad['tm_input_dataset'] = ad.eval("CRAB_InputData")
    ad['tm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_AdditionalOutputFiles"))
    ad['tm_tfile_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_TFileOutputFiles"))
    ad['tm_edm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_EDMOutputFiles"))
    ad['tm_site_whitelist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteWhitelist"))
    ad['tm_site_blacklist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteBlacklist"))
    ad['tm_job_type'] = 'Analysis'
    print "TaskManager got this raw ad"
    print ad
    pure_ad = {}
    for key in ad:
        try:
            pure_ad[key] = ad.eval(key)
            if isinstance(pure_ad[key], classad.Value):
                del pure_ad[key]
            if isinstance(pure_ad[key], types.ListType):
                pure_ad[key] = [i.eval() for i in pure_ad[key]]
        except:
            pass
    ad = pure_ad
    ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"])
    ad['tm_split_args'] = ad["CRAB_AlgoArgs"]
    ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '')
    print "TaskManagerBootstrap got this ad:"
    pprint.pprint(ad)
    if command == "DBS":
        task = DBSDataDiscovery.DBSDataDiscovery(config)
    elif command == "SPLIT":
        task = Splitter.Splitter(config)
        print "Got this result from the splitter"
        pprint.pprint(task)
    results = task.execute(in_args, task=ad).result
    if command == "SPLIT":
        results = DagmanCreator.create_subdag(results, task=ad)

    print results
    with open(outfile, "w") as fd:
        pickle.dump(results, fd)

    return 0
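
# Hedged usage sketch (the paths and task name below are illustrative assumptions): the
# bootstrap above is meant to be run on the schedd with the HTCondor job ad exported, e.g.
#   _CONDOR_JOB_AD=.job.ad python TaskManagerBootstrap.py SPLIT input.pkl output.pkl
# argv[1] selects the branch (POSTJOB/PREJOB/FINAL/ASO return early), and for DBS/SPLIT
# argv[2:] are the pickled input file and the output file where the results are dumped.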
Example #45
0
def parseJobLog(fp, nodes, nodeMap):
    count = 0
    for event in HTCondorUtils.readEvents(fp):
        count += 1
        eventtime = time.mktime(
            time.strptime(event['EventTime'], "%Y-%m-%dT%H:%M:%S"))
        if event['MyType'] == 'SubmitEvent':
            m = nodeNameRe.match(event['LogNotes'])
            if m:
                node = m.groups()[0]
                proc = event['Cluster'], event['Proc']
                info = nodes.setdefault(node, copy.deepcopy(NODE_DEFAULTS))
                info['State'] = 'idle'
                info['JobIds'].append("%d.%d" % proc)
                info['RecordedSite'] = False
                info['SubmitTimes'].append(eventtime)
                info['TotalUserCpuTimeHistory'].append(0)
                info['TotalSysCpuTimeHistory'].append(0)
                info['WallDurations'].append(0)
                info['ResidentSetSize'].append(0)
                info['Retries'] = len(info['SubmitTimes']) - 1
                nodeMap[proc] = node
        elif event['MyType'] == 'ExecuteEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['StartTimes'].append(eventtime)
            nodes[node]['State'] = 'running'
            nodes[node]['RecordedSite'] = False
        elif event['MyType'] == 'JobTerminatedEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['EndTimes'].append(eventtime)
            # at times HTCondor does not log the ExecuteEvent and there's no StartTime
            if nodes[node]['StartTimes']:
                nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][
                    -1] - nodes[node]['StartTimes'][-1]
            else:
                nodes[node]['WallDurations'][-1] = 0
            insertCpu(event, nodes[node])
            if event['TerminatedNormally']:
                if event['ReturnValue'] == 0:
                    nodes[node]['State'] = 'transferring'
                else:
                    nodes[node]['State'] = 'cooloff'
            else:
                nodes[node]['State'] = 'cooloff'
        elif event['MyType'] == 'PostScriptTerminatedEvent':
            m = nodeName2Re.match(event['DAGNodeName'])
            if m:
                node = m.groups()[0]
                if event['TerminatedNormally']:
                    if event['ReturnValue'] == 0:
                        nodes[node]['State'] = 'finished'
                    elif event['ReturnValue'] == 2:
                        nodes[node]['State'] = 'failed'
                    else:
                        nodes[node]['State'] = 'cooloff'
                else:
                    nodes[node]['State'] = 'cooloff'
        elif event['MyType'] == 'ShadowExceptionEvent' or event[
                "MyType"] == "JobReconnectFailedEvent" or event[
                    'MyType'] == 'JobEvictedEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            if nodes[node]['State'] != 'idle':
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['WallDurations'] and nodes[node][
                        'EndTimes'] and nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][
                        -1] - nodes[node]['StartTimes'][-1]
                nodes[node]['State'] = 'idle'
                insertCpu(event, nodes[node])
                nodes[node]['TotalUserCpuTimeHistory'].append(0)
                nodes[node]['TotalSysCpuTimeHistory'].append(0)
                nodes[node]['WallDurations'].append(0)
                nodes[node]['ResidentSetSize'].append(0)
                nodes[node]['SubmitTimes'].append(-1)
                nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                nodes[node]['Restarts'] += 1
        elif event['MyType'] == 'JobAbortedEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            if nodes[node]['State'] == "idle" or nodes[node]['State'] == "held":
                nodes[node]['StartTimes'].append(-1)
                if not nodes[node]['RecordedSite']:
                    nodes[node]['SiteHistory'].append("Unknown")
            if nodes[node]['State'] == 'running':
                nodes[node]['EndTimes'].append(eventtime)
                # nodes[node]['State'] can be 'running' only if an ExecuteEvent was found, so StartTime must be defined
                nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][
                    -1] - nodes[node]['StartTimes'][-1]
            nodes[node]['State'] = 'killed'
            insertCpu(event, nodes[node])
        elif event['MyType'] == 'JobHeldEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            if nodes[node]['State'] == 'running':
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['WallDurations'] and nodes[node][
                        'EndTimes'] and nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][
                        -1] - nodes[node]['StartTimes'][-1]
                insertCpu(event, nodes[node])
                nodes[node]['TotalUserCpuTimeHistory'].append(0)
                nodes[node]['TotalSysCpuTimeHistory'].append(0)
                nodes[node]['WallDurations'].append(0)
                nodes[node]['ResidentSetSize'].append(0)
                nodes[node]['SubmitTimes'].append(-1)
                nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                nodes[node]['Restarts'] += 1
            nodes[node]['State'] = 'held'
        elif event['MyType'] == 'JobReleaseEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['State'] = 'idle'
        elif event['MyType'] == 'JobAdInformationEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            if (not nodes[node]['RecordedSite']) and (
                    'JOBGLIDEIN_CMSSite' in event
            ) and not event['JOBGLIDEIN_CMSSite'].startswith("$$"):
                nodes[node]['SiteHistory'].append(event['JOBGLIDEIN_CMSSite'])
                nodes[node]['RecordedSite'] = True
            insertCpu(event, nodes[node])
        elif event['MyType'] == 'JobImageSizeEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['ResidentSetSize'][-1] = int(event['ResidentSetSize'])
            if nodes[node]['StartTimes']:
                nodes[node]['WallDurations'][
                    -1] = eventtime - nodes[node]['StartTimes'][-1]
            insertCpu(event, nodes[node])
        elif event["MyType"] == "JobDisconnectedEvent" or event[
                "MyType"] == "JobReconnectedEvent":
            # These events don't really affect the node status
            pass
        else:
            logging.warning("Unknown event type: %s", event['MyType'])

    logging.debug("There were %d events in the job log.", count)
    now = time.time()
    for node, info in nodes.items():
        if node == 'DagStatus':
            # StartTimes and WallDurations are not present, though crab status2 uses this record to get the DagStatus.
            continue
        lastStart = now
        if info['StartTimes']:
            lastStart = info['StartTimes'][-1]
        while len(info['WallDurations']) < len(info['SiteHistory']):
            info['WallDurations'].append(now - lastStart)
        while len(info['WallDurations']) > len(info['SiteHistory']):
            info['SiteHistory'].append("Unknown")
Example #46
0
    def executeInternal(self, apmon, *args, **kw):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kw:
            raise ValueError("No task specified.")
        self.task = kw['task']
        if 'tm_taskname' not in self.task:
            raise ValueError("No taskname specified")
        self.workflow = self.task['tm_taskname']
        if 'user_proxy' not in self.task:
            raise ValueError("No proxy provided")
        self.proxy = self.task['user_proxy']

        try:
            self.killTransfers(apmon)
        except:
            self.logger.exception(
                "Failed to kill transfers; suppressing error until functionality is confirmed"
            )  #TODO send a warning?

        self.logger.info("About to kill workflow: %s." % self.workflow)

        self.workflow = str(self.workflow)
        if not WORKFLOW_RE.match(self.workflow):
            raise Exception("Invalid workflow name.")

        # Query HTCondor for information about running jobs and update Dashboard appropriately
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)
        self.schedd, address = loc.getScheddObj(
            self.workflow
        )  #TODO wrap this with a try/except. Copy from HTCondorDataWf

        ad = classad.ClassAd()
        ad['foo'] = self.task['kill_ids']
        try:
            hostname = socket.getfqdn()
        except:
            hostname = ''

        const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (
            HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
        try:
            for ad in list(self.schedd.xquery(const,
                                              ['CRAB_Id', 'CRAB_Retry'])):
                if ('CRAB_Id' not in ad) or ('CRAB_Retry' not in ad):
                    continue
                jobid = str(ad.eval('CRAB_Id'))
                jobretry = str(ad.eval('CRAB_Retry'))
                jinfo = {
                    'jobId': "%s_https://glidein.cern.ch/%s/%s_%s" % (jobid, jobid, self.workflow, jobretry),
                    'sid': "https://glidein.cern.ch/%s%s" % (jobid, self.workflow),
                    'broker': hostname,
                    'bossId': jobid,
                    'StatusValue': 'killed',
                }
                self.logger.info("Sending kill info to Dashboard: %s" %
                                 str(jinfo))
                apmon.sendToML(jinfo)
        except:
            self.logger.exception(
                "Failed to notify Dashboard of job kills")  #warning

        # Note that we cannot send kills for jobs that are not in the queue at this time; we'll need the
        # DAG FINAL node to be fixed and the node status to include the retry number.

        if self.task['kill_all']:
            return self.killAll()
        else:
            return self.killJobs(self.task['kill_ids'])
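
        # Hedged illustration: for kill_ids = [1, 2, 3] and a workflow named "mytask", the
        # constraint built above renders roughly as
        #   CRAB_ReqName =?= "mytask" && member(CRAB_Id, { 1,2,3 })
        # since ad.lookup("foo").__repr__() prints the Python list in classad list syntax.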
Example #47
0
def parseJobLog(fp, nodes, node_map):
    count = 0
    for event in HTCondorUtils.readEvents(fp):
        count += 1
        eventtime = time.mktime(time.strptime(event["EventTime"], "%Y-%m-%dT%H:%M:%S"))
        if event["MyType"] == "SubmitEvent":
            m = node_name_re.match(event["LogNotes"])
            if m:
                node = m.groups()[0]
                proc = event["Cluster"], event["Proc"]
                info = nodes.setdefault(node, copy.deepcopy(NODE_DEFAULTS))  # deepcopy so nodes do not share one mutable default
                info["State"] = "idle"
                info["JobIds"].append("%d.%d" % proc)
                info["RecordedSite"] = False
                info["SubmitTimes"].append(eventtime)
                info["TotalUserCpuTimeHistory"].append(0)
                info["TotalSysCpuTimeHistory"].append(0)
                info["WallDurations"].append(0)
                info["ResidentSetSize"].append(0)
                info["Retries"] = len(info["SubmitTimes"]) - 1
                node_map[proc] = node
        elif event["MyType"] == "ExecuteEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            nodes[node]["StartTimes"].append(eventtime)
            nodes[node]["State"] = "running"
            nodes[node]["RecordedSite"] = False
        elif event["MyType"] == "JobTerminatedEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            nodes[node]["EndTimes"].append(eventtime)
            nodes[node]["WallDurations"][-1] = nodes[node]["EndTimes"][-1] - nodes[node]["StartTimes"][-1]
            insertCpu(event, nodes[node])
            if event["TerminatedNormally"]:
                if event["ReturnValue"] == 0:
                    nodes[node]["State"] = "transferring"
                else:
                    nodes[node]["State"] = "cooloff"
            else:
                nodes[node]["State"] = "cooloff"
        elif event["MyType"] == "PostScriptTerminatedEvent":
            m = node_name2_re.match(event["DAGNodeName"])
            if m:
                node = m.groups()[0]
                if event["TerminatedNormally"]:
                    if event["ReturnValue"] == 0:
                        nodes[node]["State"] = "finished"
                    elif event["ReturnValue"] == 2:
                        nodes[node]["State"] = "failed"
                    else:
                        nodes[node]["State"] = "cooloff"
                else:
                    nodes[node]["State"] = "cooloff"
        elif (
            event["MyType"] == "ShadowExceptionEvent"
            or event["MyType"] == "JobReconnectFailedEvent"
            or event["MyType"] == "JobEvictedEvent"
        ):
            node = node_map[event["Cluster"], event["Proc"]]
            if nodes[node]["State"] != "idle":
                nodes[node]["EndTimes"].append(eventtime)
                if nodes[node]["WallDurations"] and nodes[node]["EndTimes"] and nodes[node]["StartTimes"]:
                    nodes[node]["WallDurations"][-1] = nodes[node]["EndTimes"][-1] - nodes[node]["StartTimes"][-1]
                nodes[node]["State"] = "idle"
                insertCpu(event, nodes[node])
                nodes[node]["TotalUserCpuTimeHistory"].append(0)
                nodes[node]["TotalSysCpuTimeHistory"].append(0)
                nodes[node]["WallDurations"].append(0)
                nodes[node]["ResidentSetSize"].append(0)
                nodes[node]["SubmitTimes"].append(-1)
                nodes[node]["JobIds"].append(nodes[node]["JobIds"][-1])
                nodes[node]["Restarts"] += 1
        elif event["MyType"] == "JobAbortedEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            if nodes[node]["State"] == "idle" or nodes[node]["State"] == "held":
                nodes[node]["StartTimes"].append(-1)
                if not nodes[node]["RecordedSite"]:
                    nodes[node]["SiteHistory"].append("Unknown")
            nodes[node]["State"] = "killed"
            insertCpu(event, nodes[node])
        elif event["MyType"] == "JobHeldEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            if nodes[node]["State"] == "running":
                nodes[node]["EndTimes"].append(eventtime)
                if nodes[node]["WallDurations"] and nodes[node]["EndTimes"] and nodes[node]["StartTimes"]:
                    nodes[node]["WallDurations"][-1] = nodes[node]["EndTimes"][-1] - nodes[node]["StartTimes"][-1]
                insertCpu(event, nodes[node])
                nodes[node]["TotalUserCpuTimeHistory"].append(0)
                nodes[node]["TotalSysCpuTimeHistory"].append(0)
                nodes[node]["WallDurations"].append(0)
                nodes[node]["ResidentSetSize"].append(0)
                nodes[node]["SubmitTimes"].append(-1)
                nodes[node]["JobIds"].append(nodes[node]["JobIds"][-1])
                nodes[node]["Restarts"] += 1
            nodes[node]["State"] = "held"
        elif event["MyType"] == "JobReleaseEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            nodes[node]["State"] = "idle"
        elif event["MyType"] == "JobAdInformationEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            if (
                (not nodes[node]["RecordedSite"])
                and ("JOBGLIDEIN_CMSSite" in event)
                and not event["JOBGLIDEIN_CMSSite"].startswith("$$")
            ):
                nodes[node]["SiteHistory"].append(event["JOBGLIDEIN_CMSSite"])
                nodes[node]["RecordedSite"] = True
            insertCpu(event, nodes[node])
        elif event["MyType"] == "JobImageSizeEvent":
            node = node_map[event["Cluster"], event["Proc"]]
            nodes[node]["ResidentSetSize"][-1] = int(event["ResidentSetSize"])
            if nodes[node]["StartTimes"]:
                nodes[node]["WallDurations"][-1] = eventtime - nodes[node]["StartTimes"][-1]
            insertCpu(event, nodes[node])
        elif event["MyType"] == "JobDisconnectedEvent" or event["MyType"] == "JobReconnectedEvent":
            # These events don't really affect the node status
            pass
        else:
            logging.warning("Unknown event type: %s" % event["MyType"])

    logging.debug("There were %d events in the job log." % count)
    now = time.time()
    for node, info in nodes.items():
        if node == "DagStatus":
            # StartTimes and WallDurations are not present, though crab status2 uses this record to get the DagStatus.
            continue
        last_start = now
        if info["StartTimes"]:
            last_start = info["StartTimes"][-1]
        while len(info["WallDurations"]) < len(info["SiteHistory"]):
            info["WallDurations"].append(now - last_start)
        while len(info["WallDurations"]) > len(info["SiteHistory"]):
            info["SiteHistory"].append("Unknown")
Example #48
0
    def execute_internal(self, *args, **kw):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kw:
            raise ValueError("No task specified.")
        task = kw['task']
        if 'tm_taskname' not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task['tm_taskname'])
        if 'user_proxy' not in task:
            raise ValueError("No proxy provided")
        proxy = task['user_proxy']

        self.logger.info("About to resubmit workflow: %s." % workflow)
        self.logger.info("Task info: %s" % str(task))

        loc = HTCondorLocator.HTCondorLocator(self.backendurls)
        schedd, address = loc.getScheddObj(workflow) #TODO wrap

        # Release the DAG
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(workflow)

        # Calculate a new white/blacklist
        ad = classad.ClassAd()
        ad['whitelist'] = task['resubmit_site_whitelist']
        ad['blacklist'] = task['resubmit_site_blacklist']

        if ('resubmit_ids' in task) and task['resubmit_ids']:
            ad['resubmit'] = task['resubmit_ids']
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList", ad['resubmit'])
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)

        elif task['resubmit_site_whitelist'] or task['resubmit_site_blacklist'] or \
                task['resubmit_priority'] != None or task['resubmit_maxmemory'] != None or \
                task['resubmit_numcores'] != None or task['resubmit_maxjobruntime'] != None:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    if task['resubmit_site_blacklist']:
                        schedd.edit(rootConst, "CRAB_SiteResubmitBlacklist", ad['blacklist'])
                    if task['resubmit_site_whitelist']:
                        schedd.edit(rootConst, "CRAB_SiteResubmitWhitelist", ad['whitelist'])
                    if task['resubmit_priority'] != None:
                        schedd.edit(rootConst, "JobPrio", task['resubmit_priority'])
                    if task['resubmit_numcores'] != None:
                        schedd.edit(rootConst, "RequestCpus", task['resubmit_numcores'])
                    if task['resubmit_maxjobruntime'] != None:
                        schedd.edit(rootConst, "MaxWallTimeMins", task['resubmit_maxjobruntime'])
                    if task['resubmit_maxmemory'] != None:
                        schedd.edit(rootConst, "RequestMemory", task['resubmit_maxmemory'])
                    schedd.act(htcondor.JobAction.Release, rootConst)

        else:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList", classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)

        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("The CRAB3 server backend could not reubmit your task because the Grid scheduler answered with an error\n"+\
                                      "This is probably a temporary glitch, please try it again and contact an expert if the error persist\n"+\
                                      "Error reason %s" % results)

def bootstrap():
    print("Entering TaskManagerBootstrap with args: %s" % sys.argv)
    command = sys.argv[1]
    if command == "POSTJOB":
        return PostJob.PostJob().execute(*sys.argv[2:])
    elif command == "PREJOB":
        return PreJob.PreJob().execute(*sys.argv[2:])
    elif command == "PREDAG":
        return PreDAG.PreDAG().execute(*sys.argv[2:])

    infile, outfile = sys.argv[2:]

    adfile = os.environ["_CONDOR_JOB_AD"]
    print("Parsing classad")
    with open(adfile, "r") as fd:
        ad = classad.parseOld(fd)
    print("..done")
    in_args = []
    if infile != "None":
        with open(infile, "r") as fd:
            in_args = pickle.load(fd)

    config = Configuration.Configuration()
    config.section_("Services")
    config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/'
    
    ad['tm_taskname'] = ad.eval("CRAB_Workflow")
    ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo")
    ad['tm_dbs_url'] = ad.eval("CRAB_DBSURL")
    ad['tm_input_dataset'] = ad.eval("DESIRED_CMSDataset")
    ad['tm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_AdditionalOutputFiles"))
    ad['tm_tfile_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_TFileOutputFiles"))
    ad['tm_edm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_EDMOutputFiles"))
    ad['tm_site_whitelist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteWhitelist"))
    ad['tm_site_blacklist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteBlacklist"))
    ad['tm_job_type'] = 'Analysis'
    print("TaskManager got this raw ad")
    print(ad)
    pure_ad = {}
    for key in ad:
        try:
            pure_ad[key] = ad.eval(key)
            if isinstance(pure_ad[key], classad.Value):
                del pure_ad[key]
            if isinstance(pure_ad[key], list):
                pure_ad[key] = [i.eval() for i in pure_ad[key]]
        except:
            pass
    ad = pure_ad
    ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"])
    ad['tm_split_args'] = ad["CRAB_AlgoArgs"]
    ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '')
    print("TaskManagerBootstrap got this ad:")
    pprint.pprint(ad)
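    # Note (hedged): 'task' is never constructed in this snippet; presumably the DBS/SPLIT
    # branches that instantiate DBSDataDiscovery or Splitter (as in the earlier bootstrap
    # example) were dropped when this version was excerpted.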

    results = task.execute(in_args, task=ad).result

    print(results)
    with open(outfile, "w") as fd:
        pickle.dump(results, fd)

    return 0
Example #50
0
def parseJobLog(fp, nodes, nodeMap):
    count = 0
    for event in HTCondorUtils.readEvents(fp):
        count += 1
        eventtime = time.mktime(time.strptime(event['EventTime'], "%Y-%m-%dT%H:%M:%S"))
        if event['MyType'] == 'SubmitEvent':
            m = nodeNameRe.match(event['LogNotes'])
            if m:
                node = m.groups()[0]
                proc = event['Cluster'], event['Proc']
                info = nodes.setdefault(node, copy.deepcopy(NODE_DEFAULTS))
                info['State'] = 'idle'
                info['JobIds'].append("%d.%d" % proc)
                info['RecordedSite'] = False
                info['SubmitTimes'].append(eventtime)
                info['TotalUserCpuTimeHistory'].append(0)
                info['TotalSysCpuTimeHistory'].append(0)
                info['WallDurations'].append(0)
                info['ResidentSetSize'].append(0)
                info['Retries'] = len(info['SubmitTimes'])-1
                nodeMap[proc] = node
        elif event['MyType'] == 'ExecuteEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['StartTimes'].append(eventtime)
            nodes[node]['State'] = 'running'
            nodes[node]['RecordedSite'] = False
        elif event['MyType'] == 'JobTerminatedEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['EndTimes'].append(eventtime)
            # at times HTCondor does not log the ExecuteEvent and there's no StartTime
            if nodes[node]['StartTimes']:
                nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
            else:
                nodes[node]['WallDurations'][-1] = 0
            insertCpu(event, nodes[node])
            if event['TerminatedNormally']:
                if event['ReturnValue'] == 0:
                    nodes[node]['State'] = 'transferring'
                else:
                    nodes[node]['State'] = 'cooloff'
            else:
                nodes[node]['State'] = 'cooloff'
        elif event['MyType'] == 'PostScriptTerminatedEvent':
            m = nodeName2Re.match(event['DAGNodeName'])
            if m:
                node = m.groups()[0]
                if event['TerminatedNormally']:
                    if event['ReturnValue'] == 0:
                        nodes[node]['State'] = 'finished'
                    elif event['ReturnValue'] == 2:
                        nodes[node]['State'] = 'failed'
                    else:
                        nodes[node]['State'] = 'cooloff'
                else:
                    nodes[node]['State'] = 'cooloff'
        elif event['MyType'] == 'ShadowExceptionEvent' or event["MyType"] == "JobReconnectFailedEvent" or event['MyType'] == 'JobEvictedEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            if nodes[node]['State'] != 'idle':
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                nodes[node]['State'] = 'idle'
                insertCpu(event, nodes[node])
                nodes[node]['TotalUserCpuTimeHistory'].append(0)
                nodes[node]['TotalSysCpuTimeHistory'].append(0)
                nodes[node]['WallDurations'].append(0)
                nodes[node]['ResidentSetSize'].append(0)
                nodes[node]['SubmitTimes'].append(-1)
                nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                nodes[node]['Restarts'] += 1
        elif event['MyType'] == 'JobAbortedEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            if nodes[node]['State'] == "idle" or nodes[node]['State'] == "held":
                nodes[node]['StartTimes'].append(-1)
                if not nodes[node]['RecordedSite']:
                    nodes[node]['SiteHistory'].append("Unknown")
            nodes[node]['State'] = 'killed'
            insertCpu(event, nodes[node])
        elif event['MyType'] == 'JobHeldEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            if nodes[node]['State'] == 'running':
                nodes[node]['EndTimes'].append(eventtime)
                if nodes[node]['WallDurations'] and nodes[node]['EndTimes'] and nodes[node]['StartTimes']:
                    nodes[node]['WallDurations'][-1] = nodes[node]['EndTimes'][-1] - nodes[node]['StartTimes'][-1]
                insertCpu(event, nodes[node])
                nodes[node]['TotalUserCpuTimeHistory'].append(0)
                nodes[node]['TotalSysCpuTimeHistory'].append(0)
                nodes[node]['WallDurations'].append(0)
                nodes[node]['ResidentSetSize'].append(0)
                nodes[node]['SubmitTimes'].append(-1)
                nodes[node]['JobIds'].append(nodes[node]['JobIds'][-1])
                nodes[node]['Restarts'] += 1
            nodes[node]['State'] = 'held'
        elif event['MyType'] == 'JobReleaseEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['State'] = 'idle'
        elif event['MyType'] == 'JobAdInformationEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            if (not nodes[node]['RecordedSite']) and ('JOBGLIDEIN_CMSSite' in event) and not event['JOBGLIDEIN_CMSSite'].startswith("$$"):
                nodes[node]['SiteHistory'].append(event['JOBGLIDEIN_CMSSite'])
                nodes[node]['RecordedSite'] = True
            insertCpu(event, nodes[node])
        elif event['MyType'] == 'JobImageSizeEvent':
            node = nodeMap[event['Cluster'], event['Proc']]
            nodes[node]['ResidentSetSize'][-1] = int(event['ResidentSetSize'])
            if nodes[node]['StartTimes']:
                nodes[node]['WallDurations'][-1] = eventtime - nodes[node]['StartTimes'][-1]
            insertCpu(event, nodes[node])
        elif event["MyType"] == "JobDisconnectedEvent" or event["MyType"] == "JobReconnectedEvent":
            # These events don't really affect the node status
            pass
        else:
            logging.warning("Unknown event type: %s", event['MyType'])

    logging.debug("There were %d events in the job log.", count)
    now = time.time()
    for node, info in nodes.items():
        if node == 'DagStatus':
            # StartTimes and WallDurations are not present, though crab status2 uses this record to get the DagStatus.
            continue
        lastStart = now
        if info['StartTimes']:
            lastStart = info['StartTimes'][-1]
        while len(info['WallDurations']) < len(info['SiteHistory']):
            info['WallDurations'].append(now - lastStart)
        while len(info['WallDurations']) > len(info['SiteHistory']):
            info['SiteHistory'].append("Unknown")
    def executeInternal(self, *args, **kwargs):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        task = kwargs['task']
        if 'tm_taskname' not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task['tm_taskname'])
        if 'user_proxy' not in task:
            raise ValueError("No proxy provided")
        proxy = task['user_proxy']

        if task.get('resubmit_publication', False):
            resubmitWhat = "publications"
        else:
            resubmitWhat = "jobs"

        self.logger.info("About to resubmit %s for workflow: %s." % (resubmitWhat, workflow))
        self.logger.info("Task info: %s" % str(task))

        if task.get('resubmit_publication', False):
            asourl = task.get('tm_asourl', None)
            #Let's not assume the db has been updated (mostly for devs), let's default asodb to asynctransfer!
            #Also the "or" takes care of the case were the new code is executed on old task
            #i.e.: tm_asodb is there but empty.
            asodb = task.get('tm_asodb', 'asynctransfer') or 'asynctransfer'
            if not asourl:
                msg = "ASO URL not set. Can not resubmit publication."
                raise TaskWorkerException(msg)
            self.logger.info("Will resubmit failed publications")
            self.resubmitPublication(asourl, asodb, proxy, workflow)
            return

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        address = ""
        try:
            schedd, address = loc.getScheddObjNew(task['tm_schedd'])
        except Exception as exp:
            msg  = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s" % (workflow, msg))
            raise TaskWorkerException(msg)

        # Check memory and walltime
        stdmaxjobruntime = 2800
        stdmaxmemory = 2500
        if task['resubmit_maxjobruntime'] is not None and task['resubmit_maxjobruntime'] > stdmaxjobruntime:
            msg  = "Task requests %s minutes of walltime, but only %s are guaranteed to be available." % (task['resubmit_maxjobruntime'], stdmaxjobruntime)
            msg += " Jobs may not find a site where to run."
            msg += " CRAB has changed this value to %s minutes." % (stdmaxjobruntime)
            self.logger.warning(msg)
            task['resubmit_maxjobruntime'] = str(stdmaxjobruntime)
            self.uploadWarning(msg, proxy, kwargs['task']['tm_taskname'])
        if task['resubmit_maxmemory'] is not None and task['resubmit_maxmemory'] > stdmaxmemory:
            msg  = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (task['resubmit_maxmemory'], stdmaxmemory)
            msg += " Jobs may not find a site where to run and stay idle forever."
            self.logger.warning(msg)
            self.uploadWarning(msg, proxy, kwargs['task']['tm_taskname'])

        # Release the DAG
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(workflow)

        ## Calculate new parameters for resubmitted jobs. These parameters will
        ## be (re)written in the _CONDOR_JOB_AD when we do schedd.edit() below.
        ad = classad.ClassAd()
        params = {'CRAB_ResubmitList'  : 'jobids',
                  'CRAB_SiteBlacklist' : 'site_blacklist',
                  'CRAB_SiteWhitelist' : 'site_whitelist',
                  'MaxWallTimeMins'    : 'maxjobruntime',
                  'RequestMemory'      : 'maxmemory',
                  'RequestCpus'        : 'numcores',
                  'JobPrio'            : 'priority'
                 }
        overwrite = False
        for taskparam in params.values():
            if ('resubmit_'+taskparam in task) and task['resubmit_'+taskparam] != None:
                if isinstance(task['resubmit_'+taskparam], list):
                    ad[taskparam] = task['resubmit_'+taskparam]
                if taskparam != 'jobids':
                    overwrite = True

        if ('resubmit_jobids' in task) and task['resubmit_jobids']:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    ## Overwrite parameters in the os.environ[_CONDOR_JOB_AD] file. This will affect
                    ## all the jobs, not only the ones we want to resubmit. That's why the pre-job
                    ## is saving the values of the parameters for each job retry in text files (the
                    ## files are in the directory resubmit_info in the schedd).
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            schedd.edit(rootConst, adparam, ad[taskparam])
                        elif task['resubmit_'+taskparam] != None:
                            schedd.edit(rootConst, adparam, str(task['resubmit_'+taskparam]))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)
        elif overwrite:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    self.logger.debug("Resubmitting under condition overwrite = True")
                    for adparam, taskparam in params.iteritems():
                        if taskparam in ad:
                            if taskparam == 'jobids' and len(list(ad[taskparam])) == 0:
                                self.logger.debug("Setting %s = True in the task ad." % (adparam))
                                schedd.edit(rootConst, adparam, classad.ExprTree("true"))
                            else:
                                schedd.edit(rootConst, adparam, ad[taskparam])
                        elif task['resubmit_'+taskparam] != None:
                            schedd.edit(rootConst, adparam, str(task['resubmit_'+taskparam]))
                    schedd.act(htcondor.JobAction.Release, rootConst)
        else:
            ## This should actually not occur anymore in CRAB 3.3.16 or above, because
            ## starting from CRAB 3.3.16 the resubmission parameters are written to the
            ## Task DB with value != None, so the overwrite variable should never be False.
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
                if not parent:
                    self.logger.debug("Resubmitting under condition overwrite = False")
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList", classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)

        results = rpipe.read()
        if results != "OK":
            msg  = "The CRAB server backend was not able to resubmit the task,"
            msg += " because the Grid scheduler answered with an error."
            msg += " This is probably a temporary glitch. Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Error reason: %s" % (results)
            raise TaskWorkerException(msg)
Example #52
0
    def duplicateCheck(self, task):
        """
        Look to see if the task we are about to submit is already in the schedd.
        If so, assume that this task in TaskWorker was run successfully, but killed
        before it could update the frontend.
        """
        workflow = task['tm_taskname']

        loc = HTCondorLocator.HTCondorLocator(self.backendurls)
        schedd, address = loc.getScheddObj(workflow)

        rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || CRAB_Attempt == 0)' % HTCondorUtils.quote(
            workflow)

        results = list(schedd.xquery(rootConst, []))

        if not results:
            # Task not already in schedd
            return None

        configreq = {
            'workflow': workflow,
            'status': "SUBMITTED",
            'jobset': "-1",
            'subresource': 'success',
        }
        self.logger.warning(
            "Task %s already submitted to HTCondor; pushing information centrally: %s"
            % (workflow, str(configreq)))
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data=data)
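        # Hedged sketch of the request issued above: a POST to self.resturi whose
        # urlencoded body, for this configreq, is
        #   workflow=<taskname>&status=SUBMITTED&jobset=-1&subresource=success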

        # Note that we don't re-send Dashboard jobs; we assume this is a rare occurrence and
        # don't want to upset any info already in the Dashboard.

        return Result.Result(task=task, result=(-1))