Example No. 1
    def test_getting_a_target(self):
        retriever = arc.ComputingServiceRetriever(self.usercfg)
        self.expect(retriever).to_be_empty()
        retriever.addEndpoint(self.ce)
        retriever.wait()
        self.expect(retriever).to_have(1).target()
        etlist = retriever.GetExecutionTargets()
        self.expect(etlist).to_have(1).target()
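The two test methods on this page (this one and Example No. 12) rely on fixtures created elsewhere in the test case. A minimal, illustrative setUp is sketched below; the host and interface names are placeholders and this is not the actual ARC test suite:

    def setUp(self):
        # Credentials for the retriever and a computing-element endpoint to query (placeholders).
        self.usercfg = arc.UserConfig()
        self.ce = arc.Endpoint("ce.example.org", arc.Endpoint.COMPUTINGINFO,
                               "org.nordugrid.ldapglue2")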
Example No. 2
  def getCEStatus(self):
    """ Method to return information on running and pending jobs.
        We hope to satisfy both instances that use robot proxies and those which use proper configurations.
    """

    result = self._prepareProxy()
    if not result['OK']:
      gLogger.error('ARCComputingElement: failed to set up proxy', result['Message'])
      return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # Try to find out which VO we are running for.
    vo = ''
    res = getVOfromProxyGroup()
    if res['OK']:
      vo = res['Value']

    result = S_OK()
    result['SubmittedJobs'] = 0
    if not vo:
      # Presumably the really proper way forward once the infosys-discuss WG comes up with a solution
      # and it is implemented. Needed for DIRAC instances which use robot certificates for pilots.
      endpoints = [arc.Endpoint("ldap://" + self.ceHost + "/MDS-Vo-name=local,o=grid",
                                arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng')]
      retriever = arc.ComputingServiceRetriever(self.usercfg, endpoints)
      retriever.wait()  # Takes a bit of time to get and parse the ldap information
      targets = retriever.GetExecutionTargets()
      ceStats = targets[0].ComputingShare
      gLogger.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
      gLogger.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))
      result['RunningJobs'] = ceStats.RunningJobs
      result['WaitingJobs'] = ceStats.WaitingJobs
    else:
      # The system which works properly at present for ARC CEs that are configured correctly.
      # But for this we need the VO to be known - ask me (Raja) for the whole story if interested.
      cmd = 'ldapsearch -x -LLL -H ldap://%s:2135 -b mds-vo-name=resource,o=grid "(GlueVOViewLocalID=%s)"' % (
          self.ceHost, vo.lower())
      res = shellCall(0, cmd)
      if not res['OK']:
        gLogger.debug("Could not query CE %s - is it down?" % self.ceHost)
        return res
      try:
        ldapValues = res['Value'][1].split("\n")
        running = [lValue for lValue in ldapValues if 'GlueCEStateRunningJobs' in lValue]
        waiting = [lValue for lValue in ldapValues if 'GlueCEStateWaitingJobs' in lValue]
        result['RunningJobs'] = int(running[0].split(":")[1])
        result['WaitingJobs'] = int(waiting[0].split(":")[1])
      except IndexError:
        res = S_ERROR('Unknown ldap failure for site %s' % self.ceHost)
        return res

    return result
Example No. 3
def retrieve(uc, endpoints):
    # The ComputingServiceRetriever needs the UserConfig to know which credentials
    # to use in case of HTTPS connections
    retriever = arc.ComputingServiceRetriever(uc, endpoints)
    # the constructor of the ComputingServiceRetriever returns immediately
    sys.stdout.write('\n')
    sys.stdout.write(
        "ComputingServiceRetriever created with the following endpoints:\n")
    for endpoint in endpoints:
        sys.stdout.write("- %s\n" % endpoint.str())
    # here we want to wait until all the results arrive
    sys.stdout.write("Waiting for the results...\n")
    retriever.wait()
    return retriever
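A brief usage sketch for the retrieve() helper above; it is not part of the original example. The proxy path, CA directory and endpoint mirror the setup shown in Example No. 4:

import os
import sys

import arc

uc = arc.UserConfig()
uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
uc.CACertificatesDirectory("/etc/grid-security/certificates")

# One LDAP GLUE2 information endpoint; the host is the same public CE used in Example No. 4.
endpoints = [arc.Endpoint("piff.hep.lu.se", arc.Endpoint.COMPUTINGINFO,
                          "org.nordugrid.ldapglue2")]

retriever = retrieve(uc, endpoints)
for target in retriever.GetExecutionTargets():
    sys.stdout.write("%s (%s)\n" % (target.ComputingEndpoint.URLString,
                                    target.ComputingEndpoint.InterfaceName))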
Example No. 4
import os
import random
import sys

import arc


def example():
    # Creating a UserConfig object with the user's proxy
    # and the path of the trusted CA certificates
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Creating an endpoint for a Computing Element
    endpoint = arc.Endpoint("piff.hep.lu.se", arc.Endpoint.COMPUTINGINFO,
                            "org.nordugrid.ldapglue2")

    # Get the ExecutionTargets of this ComputingElement
    retriever = arc.ComputingServiceRetriever(uc, [endpoint])
    retriever.wait()
    targets = retriever.GetExecutionTargets()

    # Shuffle the targets to simulate a random broker
    targets = list(targets)
    random.shuffle(targets)

    # Create a JobDescription
    jobdesc = arc.JobDescription()
    jobdesc.Application.Executable.Path = "/bin/hostname"
    jobdesc.Application.Output = "stdout.txt"

    # create an empty job object which will contain our submitted job
    job = arc.Job()
    success = False
    # Submit job directly to the execution targets, without a broker
    for target in targets:
        sys.stdout.write("Trying to submit to %s (%s) ... " %
                         (target.ComputingEndpoint.URLString,
                          target.ComputingEndpoint.InterfaceName))
        sys.stdout.flush()
        success = target.Submit(uc, jobdesc, job)
        if success:
            sys.stdout.write("succeeded!\n")
            break
        else:
            sys.stdout.write("failed!\n")
    if success:
        sys.stdout.write("Job was submitted:\n")
        job.SaveToStream(arc.CPyOstream(sys.stdout), False)
    else:
        sys.stdout.write("Job submission failed\n")
Example No. 5
    def getCEStatus(self):
        """Method to return information on running and pending jobs.
        We hope to satisfy both instances that use robot proxies and those which use proper configurations.
        """

        result = self._prepareProxy()
        if not result["OK"]:
            self.log.error("ARCComputingElement: failed to set up proxy", result["Message"])
            return result
        self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

        # Creating an endpoint
        endpoint = arc.Endpoint(self.ceHost, arc.Endpoint.COMPUTINGINFO, "org.nordugrid.ldapglue2")

        # Get the ExecutionTargets of the ComputingElement (Can be REST, EMI-ES or GRIDFTP)
        retriever = arc.ComputingServiceRetriever(self.usercfg, [endpoint])
        retriever.wait()  # Takes a bit of time to get and parse the ldap information
        targetsWithQueues = retriever.GetExecutionTargets()

        # Targets also include queues
        # Some of them might be used by different VOs
        targets = []
        for target in targetsWithQueues:
            if target.ComputingShare.Name == self.arcQueue:
                self.log.debug(
                    "Adding target:",
                    "%s (%s)" % (target.ComputingEndpoint.URLString, target.ComputingEndpoint.InterfaceName),
                )
                targets.append(target)

        # We extract stats from the AREX service (targets[0])
        ceStats = targets[0].ComputingShare
        self.log.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
        self.log.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))

        result = S_OK()
        result["SubmittedJobs"] = 0
        result["RunningJobs"] = ceStats.RunningJobs
        result["WaitingJobs"] = ceStats.WaitingJobs

        return result
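A minimal sketch of how a caller might consume the dictionary returned by getCEStatus(); `ce` stands for an instance of the computing element class and is assumed, not taken from the example:

result = ce.getCEStatus()
if result["OK"]:
    print("Running: %s, Waiting: %s" % (result["RunningJobs"], result["WaitingJobs"]))
else:
    print("Status query failed:", result["Message"])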
Example No. 6
    def _arc_submit(self, xrsl, arcces, userconfig, log):
        '''Check the available CEs and submit'''

        queuelist = []

        for arcce in arcces:
            (ce_endpoint, ce_queue) = arcce
            aris = arc.URL(str(ce_endpoint))
            ce_host = aris.Host()
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [
                    arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                 'org.ogf.glue.emies.resourceinfo')
                ]
            else:
                aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                infoendpoints = [
                    arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                 'org.nordugrid.ldapng')
                ]

            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(userconfig,
                                                      infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            for target in targets:
                if not target.ComputingService.ID:
                    log.info(
                        "Target {0} does not have ComputingService ID defined, skipping"
                        .format(target.ComputingService.Name))
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
                  and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                    log.debug(
                        "Rejecting target interface {0} because not EMI-ES".
                        format(target.ComputingEndpoint.InterfaceName))
                    continue
                # Check for matching host and queue
                targethost = re.sub(
                    ':arex$', '',
                    re.sub('urn:ogf:ComputingService:', '',
                           target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if targethost != ce_host:
                    log.debug(
                        'Rejecting target host {0} as it does not match {1}'.
                        format(targethost, ce_host))
                    continue
                if targetqueue != ce_queue:
                    log.debug(
                        'Rejecting target queue {0} as it does not match {1}'.
                        format(targetqueue, ce_queue))
                    continue

                queuelist.append(target)
                log.debug("Adding target {0}:{1}".format(
                    targethost, targetqueue))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            raise Exception("No free queues available")

        log.debug("preparing submission")
        jobdescs = arc.JobDescriptionList()
        if not arc.JobDescription_Parse(str(xrsl), jobdescs):
            raise Exception("Failed to prepare job description")

        # Run the submission in a separate thread
        thr = SubmitThr(queuelist, jobdescs, userconfig)
        return self._run_submit(thr)
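A hedged sketch of how _arc_submit might be called from another method of the same class; the endpoint URL, queue name and xRSL string are placeholders, not values from the original code:

    def _example_submit_call(self, log):
        # Illustrative only: each CE is an (endpoint, queue) pair, as unpacked in _arc_submit.
        xrsl = '&(executable="/bin/hostname")(stdout="stdout.txt")'
        arcces = [('https://ce.example.org:443/arex', 'grid')]
        uc = arc.UserConfig()
        uc.ProxyPath('/tmp/x509up_u%s' % os.getuid())
        return self._arc_submit(xrsl, arcces, uc, log)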
Example No. 7
  def getCEStatus(self):
    """ Method to return information on running and pending jobs.
        We hope to satisfy both instances that use robot proxies and those which use proper configurations.
    """

    result = self._prepareProxy()
    if not result['OK']:
      self.log.error('ARCComputingElement: failed to set up proxy', result['Message'])
      return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # Try to find out which VO we are running for.
    vo = ''
    res = getVOfromProxyGroup()
    if res['OK']:
      vo = res['Value']

    result = S_OK()
    result['SubmittedJobs'] = 0
    if not vo:
      # Presumably the really proper way forward once the infosys-discuss WG comes up with a solution
      # and it is implemented. Needed for DIRAC instances which use robot certificates for pilots.
      endpoints = [arc.Endpoint(str("ldap://" + self.ceHost + "/MDS-Vo-name=local,o=grid"),
                                arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng')]
      retriever = arc.ComputingServiceRetriever(self.usercfg, endpoints)
      retriever.wait()  # Takes a bit of time to get and parse the ldap information
      targets = retriever.GetExecutionTargets()
      ceStats = targets[0].ComputingShare
      self.log.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
      self.log.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))
      result['RunningJobs'] = ceStats.RunningJobs
      result['WaitingJobs'] = ceStats.WaitingJobs
    else:
      # The system which works properly at present for ARC CEs that are configured correctly.
      # But for this we need the VO to be known - ask me (Raja) for the whole story if interested.
      # cmd = 'ldapsearch -x -LLL -H ldap://%s:2135 -b mds-vo-name=resource,o=grid "(GlueVOViewLocalID=%s)"' % (
      #     self.ceHost, vo.lower())
      if not self.queue:
        self.log.error('ARCComputingElement: No queue ...')
        res = S_ERROR('Unknown queue (%s) failure for site %s' % (self.queue, self.ceHost))
        return res
      cmd1 = "ldapsearch -x -o ldif-wrap=no -LLL -h %s:2135  -b \'o=glue\' " % self.ceHost
      cmd2 = '"(&(objectClass=GLUE2MappingPolicy)(GLUE2PolicyRule=vo:%s))"' % vo.lower()
      cmd3 = ' | grep GLUE2MappingPolicyShareForeignKey | grep %s' % (self.queue.split("-")[-1])
      cmd4 = ' | sed \'s/GLUE2MappingPolicyShareForeignKey: /GLUE2ShareID=/\' '
      cmd5 = ' | xargs -L1 ldapsearch -x -o ldif-wrap=no -LLL -h %s:2135 -b \'o=glue\' ' % self.ceHost
      cmd6 = ' | egrep \'(ShareWaiting|ShareRunning)\''
      res = shellCall(0, cmd1 + cmd2 + cmd3 + cmd4 + cmd5 + cmd6)
      if not res['OK']:
        self.log.debug("Could not query CE %s - is it down?" % self.ceHost)
        return res
      try:
        ldapValues = res['Value'][1].split("\n")
        running = [lValue for lValue in ldapValues if 'GLUE2ComputingShareRunningJobs' in lValue]
        waiting = [lValue for lValue in ldapValues if 'GLUE2ComputingShareWaitingJobs' in lValue]
        result['RunningJobs'] = int(running[0].split(":")[1])
        result['WaitingJobs'] = int(waiting[0].split(":")[1])
      except IndexError:
        res = S_ERROR('Unknown ldap failure for site %s' % self.ceHost)
        return res

    return result
Example No. 8
    def submit(self):
        """
        Main function to submit jobs.
        """

        global queuelist

        # check for stopsubmission flag
        if self.conf.get(['downtime', 'stopsubmission']) == "true":
            self.log.info('Submission suspended due to downtime')
            return 0

        # Get cluster host and queue: cluster/queue
        clusterhost = clusterqueue = None
        if self.cluster:
            cluster = self.cluster
            if cluster.find('://') == -1:
                cluster = 'gsiftp://' + cluster
            clusterurl = arc.URL(cluster)
            clusterhost = clusterurl.Host()
            clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

        # Apply fair-share
        if self.cluster:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist like '%" + self.cluster +
                "%'", ['fairshare'])
        else:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist=''", ['fairshare'])

        if not fairshares:
            self.log.info('Nothing to submit')
            return 0

        fairshares = list(set([p['fairshare'] for p in fairshares]))
        # For EMI-ES proxy bug - see below
        shuffle(fairshares)
        count = 0

        for fairshare in fairshares:

            try:
                # catch any exceptions here to avoid leaving lock
                if self.cluster:
                    # Lock row for update in case multiple clusters are specified
                    #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' limit 10"
                        .format(self.cluster, fairshare),
                        columns=[
                            "id", "jobdesc", "appjobid", "priority", "proxyid"
                        ],
                        lock=True)
                    if jobs:
                        self.log.debug("started lock for writing %d jobs" %
                                       len(jobs))
                else:
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and clusterlist='' and fairshare='{0}' limit 10"
                        .format(fairshare),
                        columns=["id", "jobdesc", "appjobid", "priority"])
                # mark submitting in db
                jobs_taken = []
                for j in jobs:
                    jd = {
                        'cluster': self.cluster,
                        'arcstate': 'submitting',
                        'tarcstate': self.db.getTimeStamp()
                    }
                    self.db.updateArcJobLazy(j['id'], jd)
                    jobs_taken.append(j)
                jobs = jobs_taken

            finally:
                if self.cluster:
                    try:
                        self.db.Commit(lock=True)
                        self.log.debug("ended lock")
                    except:
                        self.log.warning("Failed to release DB lock")
                else:
                    self.db.Commit()

            if len(jobs) == 0:
                #self.log.debug("No jobs to submit")
                continue
            self.log.info("Submitting %d jobs for fairshare %s" %
                          (len(jobs), fairshare))

            # max waiting priority
            try:
                maxpriowaiting = max(jobs,
                                     key=lambda x: x['priority'])['priority']
            except:
                maxpriowaiting = 0
            self.log.info("Maximum priority of waiting jobs: %d" %
                          maxpriowaiting)

            # Query infosys - either local or index
            if self.cluster:
                if self.cluster.find('://') != -1:
                    aris = arc.URL(self.cluster)
                else:
                    aris = arc.URL('gsiftp://%s' % self.cluster)
                if aris.Protocol() == 'https':
                    aris.ChangePath('/arex')
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.ogf.glue.emies.resourceinfo')
                    ]
                elif aris.Protocol() == 'local':
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.local')
                    ]
                else:
                    aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                    infoendpoints = [
                        arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.ldapng')
                    ]
            else:
                giises = self.conf.getList(['atlasgiis', 'item'])
                infoendpoints = []
                for g in giises:
                    # Specify explicitly EGIIS
                    infoendpoints.append(
                        arc.Endpoint(str(g), arc.Endpoint.REGISTRY,
                                     "org.nordugrid.ldapegiis"))

            # Set UserConfig credential for each proxy. Assumes that any proxy
            # in the fairshare can query the CE infosys
            self.uc.CredentialString(self.db.getProxy(jobs[0]['proxyid']))
            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            queuelist = []
            for target in targets:
                if not target.ComputingService.ID:
                    self.log.info(
                        "Target %s does not have ComputingService ID defined, skipping"
                        % target.ComputingService.Name)
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if (infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo'
                        and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation'):
                    self.log.debug(
                        "Rejecting target interface %s because not EMI-ES" %
                        target.ComputingEndpoint.InterfaceName)
                    continue
                # Check for matching host and queue
                targethost = re.sub(
                    ':arex$', '',
                    re.sub('urn:ogf:ComputingService:', '',
                           target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if clusterhost and targethost != clusterhost:
                    self.log.debug(
                        'Rejecting target host %s as it does not match %s' %
                        (targethost, clusterhost))
                    continue
                if clusterqueue and targetqueue != clusterqueue:
                    self.log.debug(
                        'Rejecting target queue %s as it does not match %s' %
                        (targetqueue, clusterqueue))
                    continue
                if targetqueue in self.conf.getList(['queuesreject', 'item']):
                    self.log.debug(
                        'Rejecting target queue %s in queuesreject list' %
                        targetqueue)
                    continue
                elif targethost in self.conf.getList(
                    ['clustersreject', 'item']):
                    self.log.debug(
                        'Rejecting target host %s in clustersreject list' %
                        targethost)
                    continue
                else:
                    # tmp hack
                    target.ComputingShare.LocalWaitingJobs = 0
                    target.ComputingShare.PreLRMSWaitingJobs = 0
                    target.ExecutionEnvironment.CPUClockSpeed = 2000
                    qjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='submitted' and fairshare='%s'" %
                        fairshare, ['id', 'priority'])
                    rjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='running' and fairshare='%s'" %
                        fairshare, ['id'])

                    # max queued priority
                    try:
                        maxprioqueued = max(
                            qjobs, key=lambda x: x['priority'])['priority']
                    except:
                        maxprioqueued = 0
                    self.log.info("Max priority queued: %d" % maxprioqueued)

                    # Set number of submitted jobs to running * 0.15 + 100/num of shares
                    # Note: assumes only a few shares are used
                    jlimit = len(rjobs) * 0.15 + 100 / len(fairshares)
                    if str(self.cluster).find('arc-boinc-0') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    if str(self.cluster).find('XXXpikolit') != -1:
                        jlimit = len(rjobs) * 0.15 + 100
                    if str(self.cluster).find('arc05.lcg') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                    if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued)
                                               and (maxpriowaiting > 10)):
                        if maxpriowaiting > maxprioqueued:
                            self.log.info(
                                "Overriding limit, maxpriowaiting: %d > maxprioqueued: %d"
                                % (maxpriowaiting, maxprioqueued))
                        queuelist.append(target)
                        self.log.debug("Adding target %s:%s" %
                                       (targethost, targetqueue))
                    else:
                        self.log.info(
                            "%s/%s already at limit of submitted jobs for fairshare %s"
                            % (targethost, targetqueue, fairshare))

            # check if any queues are available, if not leave and try again next time
            if not queuelist:
                self.log.info("No free queues available")
                self.db.Commit()
                # EMI-ES proxy problem - see bug 3685
                if self.cluster and self.cluster.startswith('https://'):
                    raise ExceptInterrupt(15)
                continue

            self.log.info("start submitting")

            # Just run one thread for each job in sequence. Strange things happen
            # when trying to create a new UserConfig object for each thread.
            for j in jobs:
                self.log.debug("%s: preparing submission" % j['appjobid'])
                jobdescstr = str(
                    self.db.getArcJobDescription(str(j['jobdesc'])))
                jobdescs = arc.JobDescriptionList()
                if not jobdescstr or not arc.JobDescription_Parse(
                        jobdescstr, jobdescs):
                    self.log.error("%s: Failed to prepare job description" %
                                   j['appjobid'])
                    continue
                # TODO: might not work if proxies are different within a share
                # since same uc object is shared among threads
                self.uc.CredentialString(self.db.getProxy(j['proxyid']))
                t = SubmitThr(Submit, j['id'], j['appjobid'], jobdescs,
                              self.uc, self.log)
                self.RunThreadsSplit([t], 1)
                count = count + 1

            self.log.info("threads finished")
            # commit transaction to release row locks
            self.db.Commit()

            # EMI-ES proxy problem - see bug 3685
            if self.cluster and self.cluster.startswith('https://'):
                raise ExceptInterrupt(15)

        self.log.info("end submitting")

        return count
Example No. 9
    def submit(self):
        """
        Main function to submit jobs.
        """

        global queuelist

        # check for stopsubmission flag
        if self.conf.get(['downtime', 'stopsubmission']) == "true":
            self.log.info('Submission suspended due to downtime')
            return

        # check for any site-specific limits or status
        clusterstatus = self.conf.getCond(["sites", "site"],
                                          f"endpoint={self.cluster}",
                                          ["status"]) or 'online'
        if clusterstatus == 'offline':
            self.log.info('Site status is offline')
            return

        clustermaxjobs = int(
            self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}",
                              ["maxjobs"]) or 999999)
        nsubmitted = self.db.getNArcJobs(f"cluster='{self.cluster}'")
        if nsubmitted >= clustermaxjobs:
            self.log.info(
                f'{nsubmitted} submitted jobs is greater than or equal to max jobs {clustermaxjobs}'
            )
            return

        # Get cluster host and queue: cluster/queue
        clusterhost = clusterqueue = None
        if self.cluster:
            cluster = self.cluster
            if cluster.find('://') == -1:
                cluster = 'gsiftp://' + cluster
            clusterurl = arc.URL(cluster)
            clusterhost = clusterurl.Host()
            clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

        # Apply fair-share
        if self.cluster:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist like '%" + self.cluster +
                "%'", ['fairshare', 'proxyid'])
        else:
            fairshares = self.db.getArcJobsInfo(
                "arcstate='tosubmit' and clusterlist=''",
                ['fairshare', 'proxyid'])

        if not fairshares:
            self.log.info('Nothing to submit')
            return

        # split by proxy for GU queues
        fairshares = list(
            set([(p['fairshare'], p['proxyid']) for p in fairshares]))
        # For proxy bug - see below
        shuffle(fairshares)

        for fairshare, proxyid in fairshares:

            # apply maxjobs limit (check above should make sure greater than zero)
            # Note: relies on exit after first loop
            limit = min(clustermaxjobs - nsubmitted, 10)
            try:
                # catch any exceptions here to avoid leaving lock
                if self.cluster:
                    # Lock row for update in case multiple clusters are specified
                    #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' and proxyid='{2}' limit {3}"
                        .format(self.cluster, fairshare, proxyid, limit),
                        columns=[
                            "id", "jobdesc", "appjobid", "priority", "proxyid",
                            "clusterlist"
                        ],
                        lock=True)
                    if jobs:
                        self.log.debug("started lock for writing %d jobs" %
                                       len(jobs))
                else:
                    jobs = self.db.getArcJobsInfo(
                        "arcstate='tosubmit' and clusterlist='' and fairshare='{0}' and proxyid='{1}' limit {2}"
                        .format(fairshare, proxyid, limit),
                        columns=[
                            "id", "jobdesc", "appjobid", "priority", "proxyid",
                            "clusterlist"
                        ])
                # mark submitting in db
                jobs_taken = []
                for j in jobs:
                    jd = {
                        'cluster': self.cluster,
                        'arcstate': 'submitting',
                        'tarcstate': self.db.getTimeStamp()
                    }
                    self.db.updateArcJobLazy(j['id'], jd)
                    jobs_taken.append(j)
                jobs = jobs_taken

            finally:
                if self.cluster:
                    try:
                        self.db.Commit(lock=True)
                        self.log.debug("ended lock")
                    except:
                        self.log.warning("Failed to release DB lock")
                else:
                    self.db.Commit()

            if len(jobs) == 0:
                #self.log.debug("No jobs to submit")
                continue
            self.log.info(
                "Submitting %d jobs for fairshare %s and proxyid %d" %
                (len(jobs), fairshare, proxyid))

            # max waiting priority
            try:
                maxpriowaiting = max(jobs,
                                     key=lambda x: x['priority'])['priority']
            except:
                maxpriowaiting = 0
            self.log.info("Maximum priority of waiting jobs: %d" %
                          maxpriowaiting)

            # Query infosys - either local or index
            if self.cluster:
                if self.cluster.find('://') != -1:
                    aris = arc.URL(self.cluster)
                else:
                    aris = arc.URL('gsiftp://%s' % self.cluster)
                if aris.Protocol() == 'https':
                    aris.ChangePath('/arex')
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.ogf.glue.emies.resourceinfo')
                    ]
                elif aris.Protocol() == 'local':
                    infoendpoints = [
                        arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.local')
                    ]
                else:
                    aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                    infoendpoints = [
                        arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                     'org.nordugrid.ldapng')
                    ]
            else:
                giises = self.conf.getList(['atlasgiis', 'item'])
                infoendpoints = []
                for g in giises:
                    # Specify explicitly EGIIS
                    infoendpoints.append(
                        arc.Endpoint(str(g), arc.Endpoint.REGISTRY,
                                     "org.nordugrid.ldapegiis"))

            # Set UserConfig credential for querying infosys
            proxystring = str(self.db.getProxy(proxyid))
            self.uc.CredentialString(proxystring)
            global usercred
            usercred = self.uc
            # retriever contains a list of CE endpoints
            retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
            retriever.wait()
            # targets is the list of queues
            # parse target.ComputingService.ID for the CE hostname
            # target.ComputingShare.Name is the queue name
            targets = retriever.GetExecutionTargets()

            # Filter only sites for this process
            queuelist = []
            for target in targets:
                if not target.ComputingService.ID:
                    self.log.info(
                        "Target %s does not have ComputingService ID defined, skipping"
                        % target.ComputingService.Name)
                    continue
                # If EMI-ES infoendpoint, force EMI-ES submission
                if (infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo'
                        and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation'):
                    self.log.debug(
                        "Rejecting target interface %s because not EMI-ES" %
                        target.ComputingEndpoint.InterfaceName)
                    continue
                # Check for matching host and queue
                targethost = re.sub(
                    ':arex$', '',
                    re.sub('urn:ogf:ComputingService:', '',
                           target.ComputingService.ID))
                targetqueue = target.ComputingShare.Name
                if clusterhost and targethost != clusterhost:
                    self.log.debug(
                        'Rejecting target host %s as it does not match %s' %
                        (targethost, clusterhost))
                    continue
                if clusterqueue and targetqueue != clusterqueue:
                    self.log.debug(
                        'Rejecting target queue %s as it does not match %s' %
                        (targetqueue, clusterqueue))
                    continue
                if targetqueue in self.conf.getList(['queuesreject', 'item']):
                    self.log.debug(
                        'Rejecting target queue %s in queuesreject list' %
                        targetqueue)
                    continue
                elif targethost in self.conf.getList(
                    ['clustersreject', 'item']):
                    self.log.debug(
                        'Rejecting target host %s in clustersreject list' %
                        targethost)
                    continue
                else:
                    # tmp hack
                    target.ComputingShare.LocalWaitingJobs = 0
                    target.ComputingShare.PreLRMSWaitingJobs = 0
                    target.ExecutionEnvironment.CPUClockSpeed = 2000
                    qjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='submitted' and fairshare='%s'" %
                        fairshare, ['id', 'priority'])
                    rjobs = self.db.getArcJobsInfo(
                        "cluster='" + str(self.cluster) +
                        "' and  arcstate='running' and fairshare='%s'" %
                        fairshare, ['id'])

                    # max queued priority
                    try:
                        maxprioqueued = max(
                            qjobs, key=lambda x: x['priority'])['priority']
                    except:
                        maxprioqueued = 0
                    self.log.info("Max priority queued: %d" % maxprioqueued)

                    # Limit number of submitted jobs using configuration or default (0.15 + 100/num of shares)
                    # Note: assumes only a few shares are used
                    qfraction = float(self.conf.get([
                        'jobs', 'queuefraction'
                    ])) if self.conf.get(['jobs', 'queuefraction']) else 0.15
                    qoffset = int(self.conf.get([
                        'jobs', 'queueoffset'
                    ])) if self.conf.get(['jobs', 'queueoffset']) else 100
                    jlimit = len(rjobs) * qfraction + qoffset / len(fairshares)
                    self.log.debug("running %d, queued %d, queue limit %d" %
                                   (len(rjobs), len(qjobs), jlimit))
                    if str(self.cluster).find('arc-boinc-0') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    if str(self.cluster).find('XXXpikolit') != -1:
                        jlimit = len(rjobs) * 0.15 + 100
                    if str(self.cluster).find('arc05.lcg') != -1:
                        jlimit = len(rjobs) * 0.15 + 400
                    target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                    if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued)
                                               and (maxpriowaiting > 10)):
                        if maxpriowaiting > maxprioqueued:
                            self.log.info(
                                "Overriding limit, maxpriowaiting: %d > maxprioqueued: %d"
                                % (maxpriowaiting, maxprioqueued))
                        queuelist.append(target)
                        self.log.debug("Adding target %s:%s" %
                                       (targethost, targetqueue))
                    else:
                        self.log.info(
                            "%s/%s already at limit of submitted jobs for fairshare %s"
                            % (targethost, targetqueue, fairshare))

            # check if any queues are available, if not leave and try again next time
            if not queuelist:
                self.log.info("No free queues available")
                self.db.Commit()
                continue

            self.log.info("start submitting")

            # Just run one thread for each job in sequence. Strange things happen
            # when trying to create a new UserConfig object for each thread.
            tasks = []
            for j in jobs:
                self.log.debug("%s: preparing submission" % j['appjobid'])
                jobdescstr = str(
                    self.db.getArcJobDescription(str(j['jobdesc'])))
                jobdescs = arc.JobDescriptionList()
                if not jobdescstr or not arc.JobDescription_Parse(
                        jobdescstr, jobdescs):
                    self.log.error("%s: Failed to prepare job description" %
                                   j['appjobid'])
                    continue
                tasks.append((j['id'], j['appjobid'], jobdescstr, proxystring,
                              int(self.conf.get(['atlasgiis', 'timeout']))))

            npools = 1
            if any(s in self.cluster
                   for s in self.conf.getList(['parallelsubmit', 'item'])):
                npools = int(self.conf.get(['parallelsubmit', 'npools']))
            self.log.debug("Starting submitters: %s" % npools)

            pool = multiprocessing.Pool(npools)
            #results = []
            #for task in tasks:
            #    result = pool.apply_async(Submit,(task))
            #    results.append(result)
            # Submit in workers
            results = [pool.apply_async(Submit, (t)) for t in tasks]

            # timeout per submission
            timeout = 60
            stopflag = False
            for result, task in zip(results, tasks):
                try:
                    jdb = result.get(timeout)
                    jconv = JobConv()
                    job = jconv.db2job(jdb)
                except multiprocessing.TimeoutError:
                    self.log.error(
                        "%s: submission timeout: exit and try again" % task[1])
                    # abort submission if Submit process is stuck
                    #pool.terminate()
                    KillPool(pool)
                    pool.join()
                    stopflag = True
                    # reduce timeout to finish quickly
                    timeout = 0.1
                    continue
                if job is None:
                    self.log.error("%s: no job defined for %d" %
                                   (task[1], task[0]))
                    continue
                jd = {}
                jd['arcstate'] = 'submitted'
                # initial offset to 1 minute to force first status check
                jd['tarcstate'] = self.db.getTimeStamp(
                    time.time() -
                    int(self.conf.get(['jobs', 'checkinterval'])) + 120)
                jd['tstate'] = self.db.getTimeStamp()
                # extract hostname of cluster (depends on JobID being a URL)
                self.log.info("%s: job id %s" % (task[1], job.JobID))
                jd['cluster'] = self.cluster
                self.db.updateArcJobLazy(task[0], jd, job)
            if not stopflag:
                pool.terminate()
                pool.join()
            else:
                # stop submitting, gsiftp connection problem likely
                raise ExceptInterrupt(15)

            self.log.info("threads finished")
            # commit transaction to release row locks
            self.db.Commit()

            # still proxy bug - exit if there are multiple proxies
            if len(self.db.getProxiesInfo('TRUE', ['id'])) > 1:
                raise ExceptInterrupt(15)

        self.log.info("end submitting")

        return
Example No. 10
    def submit_job(self, executable, args=[], input_files=[]):
        """
        Submit a job and return the job ID

        :param executable:  The command to run on the LOTUS cluster
        :param args:        List of arguments to pass to the executable
        :param input_files: A list of paths to local files to copy to the remote session directory
                            (the directory the job will run from on JASMIN)

        :raises InputFileError:          if any of the specified input files do not exist or are
                                         directories
        :raises NoTargetsAvailableError: if no execution targets can be found on the ARC server
        :raises JobSubmissionError:      if the job cannot be submitted to any targets

        :return: Job ID
        """
        endpoint = arc.Endpoint(self.config.ARC_SERVER,
                                arc.Endpoint.COMPUTINGINFO)

        user_config = self.get_user_config()

        # Get the ExecutionTargets of this ComputingElement
        retriever = arc.ComputingServiceRetriever(user_config, [endpoint])
        retriever.wait()
        targets = retriever.GetExecutionTargets()

        if len(targets) == 0:
            raise NoTargetsAvailableError("No targets available")

        input_files_map = {}  # Map local paths to destination file names
        for filename in input_files:
            if not os.path.isfile(filename):
                raise InputFileError("{} is not a file".format(filename))

            # Use absolute local path
            input_files_map[os.path.abspath(filename)] = os.path.basename(
                filename)

        template = self.env.get_template("job_template.xml")
        jsdl = template.render({
            "name": "ARC job",  # TODO: Use sensible name or omit
            "executable": executable,
            "arguments": args,
            "input_files_map": input_files_map,
            "output_file": self.config.OUTPUT_FILE
        })
        job_descriptions = self.get_job_descriptions(jsdl)

        # Create an empty job object which will contain our submitted job
        job = arc.Job()

        # Submit job directly to the execution targets, without a broker
        # Try each target until successfully submitted
        for target in targets:
            msg = "Attempting to submit job to {} ({})".format(
                target.ComputingEndpoint.URLString,
                target.ComputingEndpoint.InterfaceName)
            self.logger.msg(arc.DEBUG, msg)

            if target.Submit(user_config, job_descriptions[0], job):
                break
            else:
                self.logger.msg(arc.DEBUG, "Failed to submit job")
        else:
            raise JobSubmissionError(
                "Could not submit job to any of the {} available target(s)".
                format(len(targets)))

        self.logger.msg(arc.INFO, "Started job with ID: {}".format(job.JobID))

        # Write information on submitted job to local job list so standard arc tools (arcstat,
        # arcget etc) can be used with this job
        job_list = arc.JobInformationStorageBDB(self.config.JOBS_INFO_FILE)
        if not job_list.Write([job]):
            self.logger.msg(
                arc.WARNING, "Failed to write to local job list {}".format(
                    self.config.JOBS_INFO_FILE))

        return job.JobID
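A short usage sketch for submit_job(); the client class name and file paths are hypothetical, only the method signature comes from the example above:

client = ArcClient()  # hypothetical name for the class that defines submit_job()
job_id = client.submit_job("/bin/hostname",
                           args=["-f"],
                           input_files=["data/input.txt"])
print("Submitted job with ID:", job_id)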
Example No. 11
    def submitJob(self, executableFile, proxy, numberOfJobs=1):
        """Method to submit job"""

        # Assume that the ARC queues are always of the format nordugrid-<batchSystem>-<queue>
        # And none of our supported batch systems have a "-" in their name
        self.arcQueue = self.queue.split("-", 2)[2]
        result = self._prepareProxy()
        if not result["OK"]:
            self.log.error("ARCComputingElement: failed to set up proxy", result["Message"])
            return result
        self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

        self.log.verbose("Executable file path: %s" % executableFile)
        if not os.access(executableFile, os.R_OK | os.X_OK):
            os.chmod(executableFile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

        batchIDList = []
        stampDict = {}

        # Creating an endpoint
        endpoint = arc.Endpoint(self.ceHost, arc.Endpoint.COMPUTINGINFO, "org.nordugrid.ldapglue2")

        # Get the ExecutionTargets of the ComputingElement (Can be REST, EMI-ES or GRIDFTP)
        retriever = arc.ComputingServiceRetriever(self.usercfg, [endpoint])
        retriever.wait()
        targetsWithQueues = list(retriever.GetExecutionTargets())

        # Targets also include queues
        # To avoid losing time trying to submit to queues we cannot interact with, we only keep the interesting ones
        targets = []
        for target in targetsWithQueues:
            if target.ComputingShare.Name == self.arcQueue:
                self.log.debug(
                    "Adding target:",
                    "%s (%s)" % (target.ComputingEndpoint.URLString, target.ComputingEndpoint.InterfaceName),
                )
                targets.append(target)

        # At this point, we should have GRIDFTP and AREX (EMI-ES and REST) targets related to arcQueue
        # We intend to submit to AREX first, if it does not work, GRIDFTP is used
        submissionWorked = False
        for target in targets:
            # If the submission is already done, we stop
            if submissionWorked:
                break

            for __i in range(numberOfJobs):

                # The basic job description
                jobdescs = arc.JobDescriptionList()

                # Get the job into the ARC way
                xrslString, diracStamp = self._writeXRSL(executableFile)
                self.log.debug("XRSL string submitted : %s" % xrslString)
                self.log.debug("DIRAC stamp for job : %s" % diracStamp)

                # The arc bindings don't accept unicode objects in Python 2 so xrslString must be explicitly cast
                result = arc.JobDescription_Parse(str(xrslString), jobdescs)
                if not result:
                    self.log.error("Invalid job description", "%r, message=%s" % (xrslString, result.str()))
                    break

                # Submit the job
                job = arc.Job()
                result = target.Submit(self.usercfg, jobdescs[0], job)

                # Save the job info on success, otherwise analyse the submission error
                if result == arc.SubmissionStatus.NONE:
                    # Job successfully submitted
                    pilotJobReference = job.JobID
                    batchIDList.append(pilotJobReference)
                    stampDict[pilotJobReference] = diracStamp
                    submissionWorked = True
                    self.log.debug("Successfully submitted job %s to CE %s" % (pilotJobReference, self.ceHost))
                else:
                    self._analyzeSubmissionError(result)
                    break  # Boo hoo *sniff*

        if batchIDList:
            result = S_OK(batchIDList)
            result["PilotStampDict"] = stampDict
        else:
            result = S_ERROR("No pilot references obtained from the ARC job submission")
        return result
Example No. 12
    def test_the_constructor(self):
        retriever = arc.ComputingServiceRetriever(self.usercfg)
        self.expect(retriever).to_be_an_instance_of(arc.ComputingServiceRetriever)