def test_getting_a_target(self):
    # A retriever built without endpoints is expected to start out empty.
    service_retriever = arc.ComputingServiceRetriever(self.usercfg)
    self.expect(service_retriever).to_be_empty()

    # Register the endpoint under test and wait for the query to finish.
    service_retriever.addEndpoint(self.ce)
    service_retriever.wait()
    self.expect(service_retriever).to_have(1).target()

    # The same single target must also be visible through GetExecutionTargets().
    execution_targets = service_retriever.GetExecutionTargets()
    self.expect(execution_targets).to_have(1).target()
def getCEStatus(self):
    """ Method to return information on running and pending jobs.

        We hope to satisfy both instances that use robot proxies and those which use
        proper configurations.

        :return: S_OK structure carrying 'SubmittedJobs' (always 0), 'RunningJobs' and
                 'WaitingJobs'; the failing result of the proxy set-up or of the shell
                 call when one of those fails; S_ERROR when the CE information cannot
                 be obtained or parsed.
    """
    result = self._prepareProxy()
    if not result['OK']:
        gLogger.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # Try to find out which VO we are running for.
    vo = ''
    res = getVOfromProxyGroup()
    if res['OK']:
        vo = res['Value']

    result = S_OK()
    result['SubmittedJobs'] = 0
    if not vo:
        # Presumably the really proper way forward once the infosys-discuss WG comes up with a solution
        # and it is implemented. Needed for DIRAC instances which use robot certificates for pilots.
        endpoints = [arc.Endpoint("ldap://" + self.ceHost + "/MDS-Vo-name=local,o=grid",
                                  arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng')]
        retriever = arc.ComputingServiceRetriever(self.usercfg, endpoints)
        retriever.wait()  # Takes a bit of time to get and parse the ldap information
        targets = retriever.GetExecutionTargets()
        # An unreachable CE yields no targets; report that instead of
        # failing with an IndexError on targets[0].
        if len(targets) == 0:
            gLogger.debug("Could not query CE %s - is it down?" % self.ceHost)
            return S_ERROR('No execution targets found for site %s' % self.ceHost)
        ceStats = targets[0].ComputingShare
        gLogger.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
        gLogger.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))
        result['RunningJobs'] = ceStats.RunningJobs
        result['WaitingJobs'] = ceStats.WaitingJobs
    else:
        # The system which works properly at present for ARC CEs that are configured correctly.
        # But for this we need the VO to be known - ask me (Raja) for the whole story if interested.
        # NOTE(review): the command line is assembled by string formatting; this assumes
        # ceHost and the VO name are trusted configuration values - confirm.
        cmd = 'ldapsearch -x -LLL -H ldap://%s:2135 -b mds-vo-name=resource,o=grid "(GlueVOViewLocalID=%s)"' % (
            self.ceHost, vo.lower())
        res = shellCall(0, cmd)
        if not res['OK']:
            gLogger.debug("Could not query CE %s - is it down?" % self.ceHost)
            return res
        try:
            ldapValues = res['Value'][1].split("\n")
            running = [lValue for lValue in ldapValues if 'GlueCEStateRunningJobs' in lValue]
            waiting = [lValue for lValue in ldapValues if 'GlueCEStateWaitingJobs' in lValue]
            result['RunningJobs'] = int(running[0].split(":")[1])
            result['WaitingJobs'] = int(waiting[0].split(":")[1])
        except (IndexError, ValueError):
            # IndexError: attribute missing from the output; ValueError: value is not an integer.
            res = S_ERROR('Unknown ldap failure for site %s' % self.ceHost)
            return res

    return result
def retrieve(uc, endpoints):
    """Query ``endpoints`` with a ComputingServiceRetriever and return it once done.

    :param uc: UserConfig telling the retriever which credentials to use
               (needed in case of HTTPS connections)
    :param endpoints: iterable of endpoints to query; each must provide ``str()``
    :return: the retriever, after ``wait()`` has returned
    """
    out = sys.stdout

    # The constructor returns immediately; the results arrive later.
    service_retriever = arc.ComputingServiceRetriever(uc, endpoints)

    out.write('\n')
    out.write("ComputingServiceRetriever created with the following endpoints:\n")
    for ep in endpoints:
        out.write("- %s\n" % ep.str())

    # Block here until all the results have arrived.
    out.write("Waiting for the results...\n")
    service_retriever.wait()
    return service_retriever
def example():
    """Submit a ``/bin/hostname`` job to the first execution target that accepts it.

    Progress and the final outcome are written to standard output.
    """
    out = sys.stdout

    # UserConfig holding the user's proxy and the trusted CA certificates directory
    uc = arc.UserConfig()
    uc.ProxyPath("/tmp/x509up_u%s" % os.getuid())
    uc.CACertificatesDirectory("/etc/grid-security/certificates")

    # Endpoint describing the Computing Element to query
    ce_endpoint = arc.Endpoint("piff.hep.lu.se", arc.Endpoint.COMPUTINGINFO, "org.nordugrid.ldapglue2")

    # Fetch the ExecutionTargets of this Computing Element
    service_retriever = arc.ComputingServiceRetriever(uc, [ce_endpoint])
    service_retriever.wait()

    # Shuffle the targets to simulate a random broker
    candidates = list(service_retriever.GetExecutionTargets())
    random.shuffle(candidates)

    # The job to run
    description = arc.JobDescription()
    description.Application.Executable.Path = "/bin/hostname"
    description.Application.Output = "stdout.txt"

    # Empty job object which will be filled in by a successful submission
    submitted = arc.Job()

    # Submit directly to the execution targets, without a broker
    for candidate in candidates:
        out.write("Trying to submit to %s (%s) ... " %
                  (candidate.ComputingEndpoint.URLString, candidate.ComputingEndpoint.InterfaceName))
        out.flush()
        if candidate.Submit(uc, description, submitted):
            out.write("succeeded!\n")
            break
        out.write("failed!\n")
    else:
        # No target accepted the job (or there were no targets at all).
        out.write("Job submission failed\n")
        return

    out.write("Job was submitted:\n")
    submitted.SaveToStream(arc.CPyOstream(sys.stdout), False)
def getCEStatus(self):
    """Method to return information on running and pending jobs.

    We hope to satisfy both instances that use robot proxies and those which use
    proper configurations.

    :return: S_OK structure carrying "SubmittedJobs" (always 0), "RunningJobs" and
             "WaitingJobs"; the failing result of the proxy set-up when that fails;
             S_ERROR when no execution target matches ``self.arcQueue``.
    """
    result = self._prepareProxy()
    if not result["OK"]:
        self.log.error("ARCComputingElement: failed to set up proxy", result["Message"])
        return result
    self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

    # Creating an endpoint
    endpoint = arc.Endpoint(self.ceHost, arc.Endpoint.COMPUTINGINFO, "org.nordugrid.ldapglue2")

    # Get the ExecutionTargets of the ComputingElement (Can be REST, EMI-ES or GRIDFTP)
    retriever = arc.ComputingServiceRetriever(self.usercfg, [endpoint])
    retriever.wait()  # Takes a bit of time to get and parse the ldap information
    targetsWithQueues = retriever.GetExecutionTargets()

    # Targets also include queues
    # Some of them might be used by different VOs
    targets = []
    for target in targetsWithQueues:
        if target.ComputingShare.Name == self.arcQueue:
            self.log.debug(
                "Adding target:",
                "%s (%s)" % (target.ComputingEndpoint.URLString, target.ComputingEndpoint.InterfaceName),
            )
            targets.append(target)

    # Nothing retrieved, or nothing matching the queue: report it instead of
    # failing with an IndexError on targets[0].
    if not targets:
        self.log.error("No execution target found for queue", "%s at CE %s" % (self.arcQueue, self.ceHost))
        return S_ERROR("No execution target found for queue %s at CE %s" % (self.arcQueue, self.ceHost))

    # We extract stat from the AREX service (targets[0])
    ceStats = targets[0].ComputingShare
    self.log.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
    self.log.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))

    result = S_OK()
    result["SubmittedJobs"] = 0
    result["RunningJobs"] = ceStats.RunningJobs
    result["WaitingJobs"] = ceStats.WaitingJobs
    return result
def _arc_submit(self, xrsl, arcces, userconfig, log):
    '''Collect the execution targets matching the given CEs and submit the job to them.

    :param xrsl: job description; converted with ``str()`` before parsing
    :param arcces: iterable of (endpoint, queue) pairs
    :param userconfig: user configuration handed to the retriever and to the submit thread
    :param log: logger providing ``info`` and ``debug``
    :return: whatever ``self._run_submit`` returns for the submit thread
    :raises Exception: if no target matches or the job description cannot be parsed
    '''
    emies_info = 'org.ogf.glue.emies.resourceinfo'
    emies_submit = 'org.ogf.glue.emies.activitycreation'
    accepted = []

    for ce_endpoint, ce_queue in arcces:
        url = arc.URL(str(ce_endpoint))
        ce_host = url.Host()

        # https endpoints are queried through EMI-ES, everything else through LDAP
        if url.Protocol() == 'https':
            url.ChangePath('/arex')
            info_ep = arc.Endpoint(url.str(), arc.Endpoint.COMPUTINGINFO,
                                   'org.ogf.glue.emies.resourceinfo')
        else:
            ldap_url = 'ldap://' + url.Host() + '/mds-vo-name=local,o=grid'
            info_ep = arc.Endpoint(ldap_url, arc.Endpoint.COMPUTINGINFO,
                                   'org.nordugrid.ldapng')
        infoendpoints = [info_ep]

        # The retriever holds the CE endpoints; each returned target is a queue.
        # target.ComputingService.ID carries the CE hostname and
        # target.ComputingShare.Name is the queue name.
        retriever = arc.ComputingServiceRetriever(userconfig, infoendpoints)
        retriever.wait()

        # Keep only the targets belonging to this CE host and queue
        for candidate in retriever.GetExecutionTargets():
            service = candidate.ComputingService
            if not service.ID:
                log.info(
                    "Target {0} does not have ComputingService ID defined, skipping"
                    .format(service.Name))
                continue

            # An EMI-ES info endpoint forces EMI-ES submission
            if (infoendpoints[0].InterfaceName == emies_info
                    and candidate.ComputingEndpoint.InterfaceName != emies_submit):
                log.debug(
                    "Rejecting target interface {0} because not EMI-ES".format(
                        candidate.ComputingEndpoint.InterfaceName))
                continue

            # Strip the URN prefix and the ':arex' suffix to get the bare hostname
            without_prefix = re.sub('urn:ogf:ComputingService:', '', service.ID)
            targethost = re.sub(':arex$', '', without_prefix)
            targetqueue = candidate.ComputingShare.Name

            if targethost != ce_host:
                log.debug(
                    'Rejecting target host {0} as it does not match {1}'.format(
                        targethost, ce_host))
                continue
            if targetqueue != ce_queue:
                log.debug(
                    'Rejecting target queue {0} as it does not match {1}'.format(
                        targetqueue, ce_queue))
                continue

            accepted.append(candidate)
            log.debug("Adding target {0}:{1}".format(targethost, targetqueue))

    # Without any usable queue, leave and try again next time
    if not accepted:
        raise Exception("No free queues available")

    log.debug("preparing submission")
    descriptions = arc.JobDescriptionList()
    if not arc.JobDescription_Parse(str(xrsl), descriptions):
        raise Exception("Failed to prepare job description")

    # Run the submission in a separate thread
    return self._run_submit(SubmitThr(accepted, descriptions, userconfig))
def getCEStatus(self):
    """ Method to return information on running and pending jobs.

        We hope to satisfy both instances that use robot proxies and those which use
        proper configurations.

        :return: S_OK structure carrying 'SubmittedJobs' (always 0), 'RunningJobs' and
                 'WaitingJobs'; the failing result of the proxy set-up or of the shell
                 call when one of those fails; S_ERROR when the queue is unknown or the
                 CE information cannot be obtained or parsed.
    """
    result = self._prepareProxy()
    if not result['OK']:
        self.log.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])

    # Try to find out which VO we are running for.
    vo = ''
    res = getVOfromProxyGroup()
    if res['OK']:
        vo = res['Value']

    result = S_OK()
    result['SubmittedJobs'] = 0
    if not vo:
        # Presumably the really proper way forward once the infosys-discuss WG comes up with a solution
        # and it is implemented. Needed for DIRAC instances which use robot certificates for pilots.
        endpoints = [arc.Endpoint(str("ldap://" + self.ceHost + "/MDS-Vo-name=local,o=grid"),
                                  arc.Endpoint.COMPUTINGINFO, 'org.nordugrid.ldapng')]
        retriever = arc.ComputingServiceRetriever(self.usercfg, endpoints)
        retriever.wait()  # Takes a bit of time to get and parse the ldap information
        targets = retriever.GetExecutionTargets()
        # An unreachable CE yields no targets; report that instead of
        # failing with an IndexError on targets[0].
        if len(targets) == 0:
            self.log.debug("Could not query CE %s - is it down?" % self.ceHost)
            return S_ERROR('No execution targets found for site %s' % self.ceHost)
        ceStats = targets[0].ComputingShare
        self.log.debug("Running jobs for CE %s : %s" % (self.ceHost, ceStats.RunningJobs))
        self.log.debug("Waiting jobs for CE %s : %s" % (self.ceHost, ceStats.WaitingJobs))
        result['RunningJobs'] = ceStats.RunningJobs
        result['WaitingJobs'] = ceStats.WaitingJobs
    else:
        # The system which works properly at present for ARC CEs that are configured correctly.
        # But for this we need the VO to be known - ask me (Raja) for the whole story if interested.
        # Previous (GLUE1) query, kept for reference:
        # cmd = 'ldapsearch -x -LLL -H ldap://%s:2135 -b mds-vo-name=resource,o=grid "(GlueVOViewLocalID=%s)"' % (
        #     self.ceHost, vo.lower())
        if not self.queue:
            self.log.error('ARCComputingElement: No queue ...')
            res = S_ERROR('Unknown queue (%s) failure for site %s' % (self.queue, self.ceHost))
            return res
        # Shell pipeline: find the GLUE2 shares mapped to this VO and queue, then query
        # each share and keep only its waiting/running job counters.
        # NOTE(review): the pipeline is assembled by string formatting; this assumes
        # ceHost, the VO name and the queue name are trusted configuration values - confirm.
        cmd1 = "ldapsearch -x -o ldif-wrap=no -LLL -h %s:2135  -b \'o=glue\' " % self.ceHost
        cmd2 = '"(&(objectClass=GLUE2MappingPolicy)(GLUE2PolicyRule=vo:%s))"' % vo.lower()
        cmd3 = ' | grep GLUE2MappingPolicyShareForeignKey | grep %s' % (self.queue.split("-")[-1])
        cmd4 = ' | sed \'s/GLUE2MappingPolicyShareForeignKey: /GLUE2ShareID=/\' '
        cmd5 = ' | xargs -L1 ldapsearch -x -o ldif-wrap=no -LLL -h %s:2135 -b \'o=glue\' ' % self.ceHost
        cmd6 = ' | egrep \'(ShareWaiting|ShareRunning)\''
        res = shellCall(0, cmd1 + cmd2 + cmd3 + cmd4 + cmd5 + cmd6)
        if not res['OK']:
            self.log.debug("Could not query CE %s - is it down?" % self.ceHost)
            return res
        try:
            ldapValues = res['Value'][1].split("\n")
            running = [lValue for lValue in ldapValues if 'GLUE2ComputingShareRunningJobs' in lValue]
            waiting = [lValue for lValue in ldapValues if 'GLUE2ComputingShareWaitingJobs' in lValue]
            result['RunningJobs'] = int(running[0].split(":")[1])
            result['WaitingJobs'] = int(waiting[0].split(":")[1])
        except (IndexError, ValueError):
            # IndexError: attribute missing from the output; ValueError: value is not an integer.
            res = S_ERROR('Unknown ldap failure for site %s' % self.ceHost)
            return res

    return result
def submit(self):
    """
    Main function to submit jobs.

    For every fairshare with jobs in state 'tosubmit', up to 10 jobs are marked
    'submitting' in the database, the information system is queried for
    execution targets, the targets are filtered (host, queue, reject lists,
    per-share queue limit) and each job is then submitted in its own thread,
    one after the other.

    :return: number of jobs handed to a submit thread (0 when submission is
             suspended or there is nothing to submit)
    :raises ExceptInterrupt: after handling a fairshare on an https:// cluster
             (EMI-ES proxy problem, bug 3685)
    """

    global queuelist

    # check for stopsubmission flag
    if self.conf.get(['downtime', 'stopsubmission']) == "true":
        self.log.info('Submission suspended due to downtime')
        return 0

    # Get cluster host and queue: cluster/queue
    clusterhost = clusterqueue = None
    if self.cluster:
        cluster = self.cluster
        if cluster.find('://') == -1:
            cluster = 'gsiftp://' + cluster
        clusterurl = arc.URL(cluster)
        clusterhost = clusterurl.Host()
        clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

    # Apply fair-share
    if self.cluster:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist like '%" + self.cluster + "%'",
            ['fairshare'])
    else:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist=''", ['fairshare'])

    if not fairshares:
        self.log.info('Nothing to submit')
        return 0

    fairshares = list(set([p['fairshare'] for p in fairshares]))
    # For EMI-ES proxy bug - see below
    shuffle(fairshares)
    count = 0

    for fairshare in fairshares:

        try:
            # catch any exceptions here to avoid leaving lock
            if self.cluster:
                # Lock row for update in case multiple clusters are specified
                #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' limit 10"
                    .format(self.cluster, fairshare),
                    columns=["id", "jobdesc", "appjobid", "priority", "proxyid"],
                    lock=True)
                if jobs:
                    self.log.debug("started lock for writing %d jobs" % len(jobs))
            else:
                # proxyid must be selected here as well: it is read below both for
                # the infosys query and for every submission.
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and clusterlist='' and fairshare='{0}' limit 10"
                    .format(fairshare),
                    columns=["id", "jobdesc", "appjobid", "priority", "proxyid"])
            # mark submitting in db
            jobs_taken = []
            for j in jobs:
                jd = {
                    'cluster': self.cluster,
                    'arcstate': 'submitting',
                    'tarcstate': self.db.getTimeStamp()
                }
                self.db.updateArcJobLazy(j['id'], jd)
                jobs_taken.append(j)
            jobs = jobs_taken

        finally:
            if self.cluster:
                try:
                    self.db.Commit(lock=True)
                    self.log.debug("ended lock")
                except:
                    self.log.warning("Failed to release DB lock")
            else:
                self.db.Commit()

        if len(jobs) == 0:
            #self.log.debug("No jobs to submit")
            continue
        self.log.info("Submitting %d jobs for fairshare %s" % (len(jobs), fairshare))

        # max waiting priority
        try:
            maxpriowaiting = max(jobs, key=lambda x: x['priority'])['priority']
        except:
            maxpriowaiting = 0
        self.log.info("Maximum priority of waiting jobs: %d" % maxpriowaiting)

        # Query infosys - either local or index
        if self.cluster:
            if self.cluster.find('://') != -1:
                aris = arc.URL(self.cluster)
            else:
                aris = arc.URL('gsiftp://%s' % self.cluster)
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [
                    arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                 'org.ogf.glue.emies.resourceinfo')
                ]
            elif aris.Protocol() == 'local':
                infoendpoints = [
                    arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                 'org.nordugrid.local')
                ]
            else:
                aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                infoendpoints = [
                    arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                 'org.nordugrid.ldapng')
                ]
        else:
            giises = self.conf.getList(['atlasgiis', 'item'])
            infoendpoints = []
            for g in giises:
                # Specify explicitly EGIIS
                infoendpoints.append(
                    arc.Endpoint(str(g), arc.Endpoint.REGISTRY,
                                 "org.nordugrid.ldapegiis"))

        # Set UserConfig credential for each proxy. Assumes that any proxy
        # in the fairshare can query the CE infosys
        self.uc.CredentialString(self.db.getProxy(jobs[0]['proxyid']))
        # retriever contains a list of CE endpoints
        retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
        retriever.wait()
        # targets is the list of queues
        # parse target.ComputingService.ID for the CE hostname
        # target.ComputingShare.Name is the queue name
        targets = retriever.GetExecutionTargets()

        # Filter only sites for this process
        queuelist = []
        for target in targets:
            if not target.ComputingService.ID:
                self.log.info(
                    "Target %s does not have ComputingService ID defined, skipping"
                    % target.ComputingService.Name)
                continue
            # If EMI-ES infoendpoint, force EMI-ES submission
            if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                self.log.debug(
                    "Rejecting target interface %s because not EMI-ES"
                    % target.ComputingEndpoint.InterfaceName)
                continue
            # Check for matching host and queue
            targethost = re.sub(
                ':arex$', '',
                re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID))
            targetqueue = target.ComputingShare.Name
            if clusterhost and targethost != clusterhost:
                self.log.debug(
                    'Rejecting target host %s as it does not match %s'
                    % (targethost, clusterhost))
                continue
            if clusterqueue and targetqueue != clusterqueue:
                self.log.debug(
                    'Rejecting target queue %s as it does not match %s'
                    % (targetqueue, clusterqueue))
                continue
            if targetqueue in self.conf.getList(['queuesreject', 'item']):
                self.log.debug(
                    'Rejecting target queue %s in queuesreject list' % targetqueue)
                continue
            elif targethost in self.conf.getList(['clustersreject', 'item']):
                self.log.debug(
                    'Rejecting target host %s in clustersreject list' % targethost)
                continue
            else:
                # tmp hack
                target.ComputingShare.LocalWaitingJobs = 0
                target.ComputingShare.PreLRMSWaitingJobs = 0
                target.ExecutionEnvironment.CPUClockSpeed = 2000
                qjobs = self.db.getArcJobsInfo(
                    "cluster='" + str(self.cluster) +
                    "' and  arcstate='submitted' and fairshare='%s'" % fairshare,
                    ['id', 'priority'])
                rjobs = self.db.getArcJobsInfo(
                    "cluster='" + str(self.cluster) +
                    "' and  arcstate='running' and fairshare='%s'" % fairshare,
                    ['id'])

                # max queued priority
                try:
                    maxprioqueued = max(qjobs, key=lambda x: x['priority'])['priority']
                except:
                    maxprioqueued = 0
                self.log.info("Max priority queued: %d" % maxprioqueued)

                # Set number of submitted jobs to running * 0.15 + 100/num of shares
                # (a few clusters matched by name below get a fixed offset instead)
                # Note: assumes only a few shares are used
                jlimit = len(rjobs) * 0.15 + 100 / len(fairshares)
                if str(self.cluster).find('arc-boinc-0') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                if str(self.cluster).find('XXXpikolit') != -1:
                    jlimit = len(rjobs) * 0.15 + 100
                if str(self.cluster).find('arc05.lcg') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                # Accept the target while below the limit, or when a waiting job has a
                # higher priority (and above 10) than anything already queued.
                if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued) and (maxpriowaiting > 10)):
                    if maxpriowaiting > maxprioqueued:
                        self.log.info(
                            "Overriding limit, maxpriowaiting: %d > maxprioqueued: %d"
                            % (maxpriowaiting, maxprioqueued))
                    queuelist.append(target)
                    self.log.debug("Adding target %s:%s" % (targethost, targetqueue))
                else:
                    self.log.info(
                        "%s/%s already at limit of submitted jobs for fairshare %s"
                        % (targethost, targetqueue, fairshare))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            self.log.info("No free queues available")
            self.db.Commit()
            # EMI-ES proxy problem - see bug 3685
            if self.cluster and self.cluster.startswith('https://'):
                raise ExceptInterrupt(15)
            continue

        self.log.info("start submitting")

        # Just run one thread for each job in sequence. Strange things happen
        # when trying to create a new UserConfig object for each thread.
        for j in jobs:
            self.log.debug("%s: preparing submission" % j['appjobid'])
            jobdescstr = str(self.db.getArcJobDescription(str(j['jobdesc'])))
            jobdescs = arc.JobDescriptionList()
            if not jobdescstr or not arc.JobDescription_Parse(jobdescstr, jobdescs):
                self.log.error("%s: Failed to prepare job description" % j['appjobid'])
                continue
            # TODO: might not work if proxies are different within a share
            # since same uc object is shared among threads
            self.uc.CredentialString(self.db.getProxy(j['proxyid']))
            t = SubmitThr(Submit, j['id'], j['appjobid'], jobdescs, self.uc, self.log)
            self.RunThreadsSplit([t], 1)
            count = count + 1

        self.log.info("threads finished")
        # commit transaction to release row locks
        self.db.Commit()

        # EMI-ES proxy problem - see bug 3685
        if self.cluster and self.cluster.startswith('https://'):
            raise ExceptInterrupt(15)

    self.log.info("end submitting")

    return count
def submit(self):
    """
    Main function to submit jobs.

    After checking downtime, site status and the site's maximum number of
    jobs, every (fairshare, proxyid) pair with jobs in state 'tosubmit' is
    handled in turn: jobs are marked 'submitting' in the database, the
    information system is queried for execution targets, the targets are
    filtered (host, queue, reject lists, per-share queue limit) and the jobs
    are submitted through a multiprocessing pool.

    :return: None
    :raises ExceptInterrupt: when a submission timed out, or after handling a
             pair while more than one proxy is present in the database
    """

    global queuelist
    global usercred

    # check for stopsubmission flag
    if self.conf.get(['downtime', 'stopsubmission']) == "true":
        self.log.info('Submission suspended due to downtime')
        return

    # check for any site-specific limits or status
    clusterstatus = self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}", ["status"]) or 'online'
    if clusterstatus == 'offline':
        self.log.info('Site status is offline')
        return

    clustermaxjobs = int(
        self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}", ["maxjobs"]) or 999999)
    nsubmitted = self.db.getNArcJobs(f"cluster='{self.cluster}'")
    if nsubmitted >= clustermaxjobs:
        self.log.info(
            f'{nsubmitted} submitted jobs is greater than or equal to max jobs {clustermaxjobs}'
        )
        return

    # Get cluster host and queue: cluster/queue
    clusterhost = clusterqueue = None
    if self.cluster:
        cluster = self.cluster
        if cluster.find('://') == -1:
            cluster = 'gsiftp://' + cluster
        clusterurl = arc.URL(cluster)
        clusterhost = clusterurl.Host()
        clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

    # Apply fair-share
    if self.cluster:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist like '%" + self.cluster + "%'",
            ['fairshare', 'proxyid'])
    else:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist=''", ['fairshare', 'proxyid'])

    if not fairshares:
        self.log.info('Nothing to submit')
        return

    # split by proxy for GU queues
    fairshares = list(set([(p['fairshare'], p['proxyid']) for p in fairshares]))
    # For proxy bug - see below
    shuffle(fairshares)

    for fairshare, proxyid in fairshares:

        # apply maxjobs limit (check above should make sure greater than zero)
        # Note: relies on exit after first loop
        limit = min(clustermaxjobs - nsubmitted, 10)
        try:
            # catch any exceptions here to avoid leaving lock
            if self.cluster:
                # Lock row for update in case multiple clusters are specified
                #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' and proxyid='{2}' limit {3}"
                    .format(self.cluster, fairshare, proxyid, limit),
                    columns=["id", "jobdesc", "appjobid", "priority", "proxyid", "clusterlist"],
                    lock=True)
                if jobs:
                    self.log.debug("started lock for writing %d jobs" % len(jobs))
            else:
                # The closing quote of the fairshare value belongs right after {0};
                # otherwise the proxyid condition ends up inside the string literal.
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and clusterlist='' and fairshare='{0}' and proxyid='{1}' limit {2}"
                    .format(fairshare, proxyid, limit),
                    columns=["id", "jobdesc", "appjobid", "priority", "proxyid", "clusterlist"])
            # mark submitting in db
            jobs_taken = []
            for j in jobs:
                jd = {
                    'cluster': self.cluster,
                    'arcstate': 'submitting',
                    'tarcstate': self.db.getTimeStamp()
                }
                self.db.updateArcJobLazy(j['id'], jd)
                jobs_taken.append(j)
            jobs = jobs_taken

        finally:
            if self.cluster:
                try:
                    self.db.Commit(lock=True)
                    self.log.debug("ended lock")
                except:
                    self.log.warning("Failed to release DB lock")
            else:
                self.db.Commit()

        if len(jobs) == 0:
            #self.log.debug("No jobs to submit")
            continue
        self.log.info(
            "Submitting %d jobs for fairshare %s and proxyid %d"
            % (len(jobs), fairshare, proxyid))

        # max waiting priority
        try:
            maxpriowaiting = max(jobs, key=lambda x: x['priority'])['priority']
        except:
            maxpriowaiting = 0
        self.log.info("Maximum priority of waiting jobs: %d" % maxpriowaiting)

        # Query infosys - either local or index
        if self.cluster:
            if self.cluster.find('://') != -1:
                aris = arc.URL(self.cluster)
            else:
                aris = arc.URL('gsiftp://%s' % self.cluster)
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [
                    arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                 'org.ogf.glue.emies.resourceinfo')
                ]
            elif aris.Protocol() == 'local':
                infoendpoints = [
                    arc.Endpoint(aris.str(), arc.Endpoint.COMPUTINGINFO,
                                 'org.nordugrid.local')
                ]
            else:
                aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                infoendpoints = [
                    arc.Endpoint(aris, arc.Endpoint.COMPUTINGINFO,
                                 'org.nordugrid.ldapng')
                ]
        else:
            giises = self.conf.getList(['atlasgiis', 'item'])
            infoendpoints = []
            for g in giises:
                # Specify explicitly EGIIS
                infoendpoints.append(
                    arc.Endpoint(str(g), arc.Endpoint.REGISTRY,
                                 "org.nordugrid.ldapegiis"))

        # Set UserConfig credential for querying infosys
        proxystring = str(self.db.getProxy(proxyid))
        self.uc.CredentialString(proxystring)
        usercred = self.uc
        # retriever contains a list of CE endpoints
        retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
        retriever.wait()
        # targets is the list of queues
        # parse target.ComputingService.ID for the CE hostname
        # target.ComputingShare.Name is the queue name
        targets = retriever.GetExecutionTargets()

        # Filter only sites for this process
        queuelist = []
        for target in targets:
            if not target.ComputingService.ID:
                self.log.info(
                    "Target %s does not have ComputingService ID defined, skipping"
                    % target.ComputingService.Name)
                continue
            # If EMI-ES infoendpoint, force EMI-ES submission
            if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                self.log.debug(
                    "Rejecting target interface %s because not EMI-ES"
                    % target.ComputingEndpoint.InterfaceName)
                continue
            # Check for matching host and queue
            targethost = re.sub(
                ':arex$', '',
                re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID))
            targetqueue = target.ComputingShare.Name
            if clusterhost and targethost != clusterhost:
                self.log.debug(
                    'Rejecting target host %s as it does not match %s'
                    % (targethost, clusterhost))
                continue
            if clusterqueue and targetqueue != clusterqueue:
                self.log.debug(
                    'Rejecting target queue %s as it does not match %s'
                    % (targetqueue, clusterqueue))
                continue
            if targetqueue in self.conf.getList(['queuesreject', 'item']):
                self.log.debug(
                    'Rejecting target queue %s in queuesreject list' % targetqueue)
                continue
            elif targethost in self.conf.getList(['clustersreject', 'item']):
                self.log.debug(
                    'Rejecting target host %s in clustersreject list' % targethost)
                continue
            else:
                # tmp hack
                target.ComputingShare.LocalWaitingJobs = 0
                target.ComputingShare.PreLRMSWaitingJobs = 0
                target.ExecutionEnvironment.CPUClockSpeed = 2000
                qjobs = self.db.getArcJobsInfo(
                    "cluster='" + str(self.cluster) +
                    "' and  arcstate='submitted' and fairshare='%s'" % fairshare,
                    ['id', 'priority'])
                rjobs = self.db.getArcJobsInfo(
                    "cluster='" + str(self.cluster) +
                    "' and  arcstate='running' and fairshare='%s'" % fairshare,
                    ['id'])

                # max queued priority
                try:
                    maxprioqueued = max(qjobs, key=lambda x: x['priority'])['priority']
                except:
                    maxprioqueued = 0
                self.log.info("Max priority queued: %d" % maxprioqueued)

                # Limit number of submitted jobs using configuration or default (0.15 + 100/num of shares)
                # Note: assumes only a few shares are used
                qfraction = float(self.conf.get(['jobs', 'queuefraction'])) if self.conf.get(['jobs', 'queuefraction']) else 0.15
                qoffset = int(self.conf.get(['jobs', 'queueoffset'])) if self.conf.get(['jobs', 'queueoffset']) else 100
                jlimit = len(rjobs) * qfraction + qoffset / len(fairshares)
                self.log.debug("running %d, queued %d, queue limit %d"
                               % (len(rjobs), len(qjobs), jlimit))
                # A few clusters matched by name override the configured limit.
                if str(self.cluster).find('arc-boinc-0') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                if str(self.cluster).find('XXXpikolit') != -1:
                    jlimit = len(rjobs) * 0.15 + 100
                if str(self.cluster).find('arc05.lcg') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                # Accept the target while below the limit, or when a waiting job has a
                # higher priority (and above 10) than anything already queued.
                if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued) and (maxpriowaiting > 10)):
                    if maxpriowaiting > maxprioqueued:
                        self.log.info(
                            "Overriding limit, maxpriowaiting: %d > maxprioqueued: %d"
                            % (maxpriowaiting, maxprioqueued))
                    queuelist.append(target)
                    self.log.debug("Adding target %s:%s" % (targethost, targetqueue))
                else:
                    self.log.info(
                        "%s/%s already at limit of submitted jobs for fairshare %s"
                        % (targethost, targetqueue, fairshare))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            self.log.info("No free queues available")
            self.db.Commit()
            continue

        self.log.info("start submitting")

        # Build one task tuple per job whose description can be parsed; the
        # tuple is unpacked as the positional arguments of Submit.
        tasks = []
        for j in jobs:
            self.log.debug("%s: preparing submission" % j['appjobid'])
            jobdescstr = str(self.db.getArcJobDescription(str(j['jobdesc'])))
            jobdescs = arc.JobDescriptionList()
            if not jobdescstr or not arc.JobDescription_Parse(jobdescstr, jobdescs):
                self.log.error("%s: Failed to prepare job description" % j['appjobid'])
                continue
            tasks.append((j['id'], j['appjobid'], jobdescstr, proxystring,
                          int(self.conf.get(['atlasgiis', 'timeout']))))

        npools = 1
        if any(s in self.cluster for s in self.conf.getList(['parallelsubmit', 'item'])):
            npools = int(self.conf.get(['parallelsubmit', 'npools']))
        self.log.debug("Starting submitters: %s" % npools)

        pool = multiprocessing.Pool(npools)
        # Submit in workers
        results = [pool.apply_async(Submit, (t)) for t in tasks]

        # timeout per submission
        timeout = 60
        stopflag = False
        for result, task in zip(results, tasks):
            try:
                jdb = result.get(timeout)
                jconv = JobConv()
                job = jconv.db2job(jdb)
            except multiprocessing.TimeoutError:
                self.log.error("%s: submission timeout: exit and try again" % task[1])
                # abort submission if Submit process is stuck
                #pool.terminate()
                KillPool(pool)
                pool.join()
                stopflag = True
                # reduce timeout to finish quickly
                timeout = 0.1
                continue
            if job is None:
                self.log.error("%s: no job defined for %d" % (task[1], task[0]))
                continue
            jd = {}
            jd['arcstate'] = 'submitted'
            # initial offset to 1 minute to force first status check
            jd['tarcstate'] = self.db.getTimeStamp(
                time.time() - int(self.conf.get(['jobs', 'checkinterval'])) + 120)
            jd['tstate'] = self.db.getTimeStamp()
            # extract hostname of cluster (depends on JobID being a URL)
            self.log.info("%s: job id %s" % (task[1], job.JobID))
            jd['cluster'] = self.cluster
            self.db.updateArcJobLazy(task[0], jd, job)
        if not stopflag:
            pool.terminate()
            pool.join()
        else:
            # stop submitting, gsiftp connection problem likely
            raise ExceptInterrupt(15)

        self.log.info("threads finished")
        # commit transaction to release row locks
        self.db.Commit()

        # still proxy bug - exit if there are multiple proxies
        if len(self.db.getProxiesInfo('TRUE', ['id'])) > 1:
            raise ExceptInterrupt(15)

    self.log.info("end submitting")

    return
def submit_job(self, executable, args=None, input_files=None):
    """
    Submit a job and return the job ID

    :param executable:  The command to run on the LOTUS cluster
    :param args:        List of arguments to pass to the executable
                        (``None`` is treated as an empty list)
    :param input_files: A list of paths to local files to copy to the remote session directory
                        (the directory the job will run from on JASMIN); ``None`` is treated
                        as an empty list

    :raises InputFileError:          if any of the specified input files do not exist or are directories
    :raises NoTargetsAvailableError: if no execution targets can be found on the ARC server
    :raises JobSubmissionError:      if the job cannot be submitted to any targets

    :return: Job ID
    """
    # None sentinels replace the former mutable ``[]`` defaults, which are
    # shared between calls.
    args = [] if args is None else args
    input_files = [] if input_files is None else input_files

    endpoint = arc.Endpoint(self.config.ARC_SERVER, arc.Endpoint.COMPUTINGINFO)

    user_config = self.get_user_config()

    # Get the ExecutionTargets of this ComputingElement
    retriever = arc.ComputingServiceRetriever(user_config, [endpoint])
    retriever.wait()
    targets = retriever.GetExecutionTargets()

    if len(targets) == 0:
        raise NoTargetsAvailableError("No targets available")

    input_files_map = {}  # Map local paths to destination file names
    for filename in input_files:
        if not os.path.isfile(filename):
            raise InputFileError("{} is not a file".format(filename))

        # Use absolute local path
        input_files_map[os.path.abspath(filename)] = os.path.basename(filename)

    template = self.env.get_template("job_template.xml")
    jsdl = template.render({
        "name": "ARC job",  # TODO: Use sensible name or omit
        "executable": executable,
        "arguments": args,
        "input_files_map": input_files_map,
        "output_file": self.config.OUTPUT_FILE
    })
    job_descriptions = self.get_job_descriptions(jsdl)

    # Create an empty job object which will contain our submitted job
    job = arc.Job()

    # Submit job directly to the execution targets, without a broker
    # Try each target until successfully submitted
    for target in targets:
        msg = "Attempting to submit job to {} ({})".format(
            target.ComputingEndpoint.URLString,
            target.ComputingEndpoint.InterfaceName)
        self.logger.msg(arc.DEBUG, msg)

        if target.Submit(user_config, job_descriptions[0], job):
            break
        self.logger.msg(arc.DEBUG, "Failed to submit job")
    else:
        # Loop ended without a break: every target refused the job.
        raise JobSubmissionError(
            "Could not submit job to any of the {} available target(s)".format(len(targets)))

    self.logger.msg(arc.INFO, "Started job with ID: {}".format(job.JobID))

    # Write information on submitted job to local job list so standard arc tools (arcstat,
    # arcget etc) can be used with this job
    job_list = arc.JobInformationStorageBDB(self.config.JOBS_INFO_FILE)
    if not job_list.Write([job]):
        self.logger.msg(
            arc.WARNING,
            "Failed to write to local job list {}".format(self.config.JOBS_INFO_FILE))

    return job.JobID
def submitJob(self, executableFile, proxy, numberOfJobs=1):
    """Submit ``numberOfJobs`` jobs running ``executableFile`` to the ARC CE.

    :param executableFile: path of the executable; made executable if it is not already
    :param proxy: not used directly in this method
    :param numberOfJobs: number of jobs to submit to the first target that works
    :return: S_OK(list of job IDs) with an extra "PilotStampDict" entry mapping each
             job ID to its DIRAC stamp; the failing proxy set-up result; or S_ERROR
             when no job could be submitted
    """
    # ARC queues are assumed to always look like nordugrid-<batchSystem>-<queue>,
    # and no supported batch system has a "-" in its name.
    self.arcQueue = self.queue.split("-", 2)[2]

    proxyResult = self._prepareProxy()
    if not proxyResult["OK"]:
        self.log.error("ARCComputingElement: failed to set up proxy", proxyResult["Message"])
        return proxyResult
    self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

    self.log.verbose("Executable file path: %s" % executableFile)
    # Mode 5 is read (4) plus execute (1) permission.
    if not os.access(executableFile, 5):
        wantedMode = stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH + stat.S_IXOTH
        os.chmod(executableFile, wantedMode)

    batchIDList = []
    stampDict = {}

    # Ask the CE for its ExecutionTargets (can be REST, EMI-ES or GRIDFTP)
    infoEndpoint = arc.Endpoint(self.ceHost, arc.Endpoint.COMPUTINGINFO, "org.nordugrid.ldapglue2")
    retriever = arc.ComputingServiceRetriever(self.usercfg, [infoEndpoint])
    retriever.wait()
    allTargets = list(retriever.GetExecutionTargets())

    # Targets also include queues: keep only those of our queue, so that no time
    # is lost trying to submit to queues we cannot interact with.
    queueTargets = [candidate for candidate in allTargets if candidate.ComputingShare.Name == self.arcQueue]
    for candidate in queueTargets:
        self.log.debug(
            "Adding target:",
            "%s (%s)" % (candidate.ComputingEndpoint.URLString, candidate.ComputingEndpoint.InterfaceName),
        )

    # queueTargets should now hold the GRIDFTP and AREX (EMI-ES and REST) targets of
    # arcQueue; AREX is meant to be tried first, GRIDFTP only if that does not work.
    submissionWorked = False
    for candidate in queueTargets:
        for __i in range(numberOfJobs):
            # The basic job description
            descriptions = arc.JobDescriptionList()

            # Get the job into the ARC way
            xrslString, diracStamp = self._writeXRSL(executableFile)
            self.log.debug("XRSL string submitted : %s" % xrslString)
            self.log.debug("DIRAC stamp for job : %s" % diracStamp)

            # The arc bindings don't accept unicode objects in Python 2,
            # so xrslString must be explicitly cast
            parseStatus = arc.JobDescription_Parse(str(xrslString), descriptions)
            if not parseStatus:
                self.log.error("Invalid job description", "%r, message=%s" % (xrslString, parseStatus.str()))
                break

            # Submit the job
            job = arc.Job()
            submitStatus = candidate.Submit(self.usercfg, descriptions[0], job)
            if not (submitStatus == arc.SubmissionStatus.NONE):
                # Submission failed: report it and give up on this target
                self._analyzeSubmissionError(submitStatus)
                break

            # Job successfully submitted
            pilotJobReference = job.JobID
            batchIDList.append(pilotJobReference)
            stampDict[pilotJobReference] = diracStamp
            submissionWorked = True
            self.log.debug("Successfully submitted job %s to CE %s" % (pilotJobReference, self.ceHost))

        # Once one target has accepted a job, the remaining targets are not tried
        if submissionWorked:
            break

    if not batchIDList:
        return S_ERROR("No pilot references obtained from the ARC job submission")

    outcome = S_OK(batchIDList)
    outcome["PilotStampDict"] = stampDict
    return outcome
def test_the_constructor(self):
    # Constructing with only a user config must yield a ComputingServiceRetriever.
    expected_type = arc.ComputingServiceRetriever
    instance = expected_type(self.usercfg)
    self.expect(instance).to_be_an_instance_of(expected_type)