def insertJobAndDescription(self, jobdesc, proxyid, siteName, lazy=False):
    """
    Insert job into clientjobs and job description into jobdescriptions.

    This function also inserts the job description. It is meant for clients
    that can insert everything at the same time.

    Args:
        jobdesc: A string with the xRSL job description.
        proxyid: ID from the proxies table of the proxy that the job will be
            submitted with.
        siteName: A string with the name of a site in the configuration that
            the job will be submitted to.
        lazy: A boolean that determines whether the transaction should be
            committed after the operation.

    Returns:
        ID of the inserted job.
    """
    c = self.db.getCursor()

    # first, insert the job description and retrieve its ID
    try:
        query = 'INSERT INTO jobdescriptions (jobdescription) VALUES (%s)'
        c.execute(query, [jobdesc])
        c.execute('SELECT LAST_INSERT_ID()')
        jobdescid = c.fetchone()['LAST_INSERT_ID()']
    except Exception:
        self.log.exception('Error inserting job description')
        raise

    # get the job name from the xRSL
    jobdescs = arc.JobDescriptionList()
    arc.JobDescription_Parse(str(jobdesc), jobdescs)
    jobname = jobdescs[0].Identification.JobName

    # insert the job
    query = """
        INSERT INTO clientjobs (created, jobname, jobdesc, siteName, proxyid)
        VALUES (%s, %s, %s, %s, %s)
    """
    c = self.db.getCursor()
    try:
        c.execute(query, [self.getTimeStamp(), jobname, jobdescid, siteName, proxyid])
        c.execute('SELECT LAST_INSERT_ID()')
        jobid = c.fetchone()['LAST_INSERT_ID()']
    except Exception:
        self.log.exception('Error while inserting new job')
        raise
    else:
        if not lazy:
            self.Commit()
    return jobid
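A minimal usage sketch may help here. Everything below is hypothetical: `db` stands for a connected instance of this client DB class, proxy row 1 and the site name 'mysite' are assumed to exist.

# Hypothetical usage sketch for insertJobAndDescription.
xrsl = '&(executable=/bin/hostname)(stdout=stdout)(jobname=demo)'
jobid = db.insertJobAndDescription(xrsl, proxyid=1, siteName='mysite')
print('inserted client job', jobid)

# With lazy=True, several inserts can share a single transaction:
ids = [db.insertJobAndDescription(xrsl, 1, 'mysite', lazy=True) for _ in range(3)]
db.Commit()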
def Submit(id, appjobid, jobdescstr, ucproxy, timeout):

    global queuelist
    global usercred

    # get the submission logger
    #log = logger()
    log = logging.getLogger()

    if len(queuelist) == 0:
        log.error("%s: no cluster free for submission" % appjobid)
        return None

    #cred_type=arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)
    #uc=arc.UserConfig(cred_type)
    uc = usercred
    uc.CredentialString(ucproxy)

    jobdescs = arc.JobDescriptionList()
    if not jobdescstr or not arc.JobDescription_Parse(jobdescstr, jobdescs):
        log.error("%s: Failed to prepare job description" % appjobid)
        return None

    # Do brokering among the available queues
    jobdesc = jobdescs[0]
    broker = arc.Broker(uc, jobdesc, "Random")
    targetsorter = arc.ExecutionTargetSorter(broker)
    for target in queuelist:
        log.debug("%s: considering target %s:%s" %
                  (appjobid, target.ComputingService.Name, target.ComputingShare.Name))
        # Adding an entity performs matchmaking and brokering
        targetsorter.addEntity(target)

    if len(targetsorter.getMatchingTargets()) == 0:
        log.error("%s: no clusters satisfied job description requirements" % appjobid)
        return None

    targetsorter.reset()  # required to reset iterator, otherwise we get a seg fault
    selectedtarget = targetsorter.getCurrentTarget()

    # Job object will contain the submitted job
    job = arc.Job()
    submitter = arc.Submitter(uc)
    if submitter.Submit(selectedtarget, jobdesc, job) != arc.SubmissionStatus.NONE:
        log.error("%s: Submission failed" % appjobid)
        return None

    jconv = JobConv()
    return jconv.job2db(job)
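A hedged sketch of how this worker is typically driven: the caller fills the `queuelist` and `usercred` module globals (as the submitter snippets later in this section do) and then calls Submit once per job. All names and paths below are illustrative placeholders, not part of the module.

import arc

# Placeholders for illustration; real values come from infosys brokering
# and from aCT's proxies table.
selected_targets = []   # list of arc.ExecutionTarget from a prior infosys query
xrsl_string = '&(executable=/bin/hostname)(stdout=stdout)'

usercred = arc.UserConfig(
    arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials))
queuelist = selected_targets

with open('/tmp/x509up_u1000') as f:   # hypothetical proxy location
    proxy_pem = f.read()

dbrow = Submit(42, 'app-1', xrsl_string, proxy_pem, timeout=60)
if dbrow is None:
    print('submission failed, job stays in tosubmit state')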
def insertArcJob(self, jobdesc, jobdescid, proxyid='', maxattempts=0,
                 clusterlist='', appjobid='', downloadfiles='', fairshare=''):
    '''
    Insert job into arcjobs table.

    This function is a modified version of insertArcJobDescription from the
    aCTDBArc module. Because the client engine uses the jobdescriptions table
    to store job descriptions, it cannot use the job insertion functions from
    aCTDBArc to pass jobs to the ARC engine: those functions insert the job
    description themselves, which would duplicate it. The function is
    deliberately kept similar to the original, so for now it violates some
    conventions (e.g. exception handling) of the other functions in this
    module (clientdb).
    '''
    # extract priority from job desc (also checks if desc is valid)
    jobdescs = arc.JobDescriptionList()
    if not arc.JobDescription_Parse(str(jobdesc), jobdescs):
        self.log.error("%s: Failed to prepare job description" % appjobid)
        return None
    priority = jobdescs[0].Application.Priority
    if priority == -1:  # use nicer default priority
        priority = 50

    c = self.db.getCursor()
    desc = {}
    desc['created'] = self.getTimeStamp()
    desc['arcstate'] = "tosubmit"
    desc['tarcstate'] = desc['created']
    desc['tstate'] = desc['created']
    desc['cluster'] = ''
    desc['clusterlist'] = clusterlist
    desc['jobdesc'] = jobdescid
    desc['attemptsleft'] = maxattempts
    desc['proxyid'] = proxyid
    desc['appjobid'] = appjobid
    desc['downloadfiles'] = downloadfiles
    desc['priority'] = priority
    desc['fairshare'] = fairshare

    s = "insert into arcjobs (" + ",".join(desc.keys()) + ") values (" + \
        ",".join(["%s"] * len(desc)) + ")"
    c.execute(s, list(desc.values()))
    c.execute("SELECT LAST_INSERT_ID()")
    row = c.fetchone()
    self.Commit()
    return row
def insertArcJobDescription(self, jobdesc, proxyid='', maxattempts=0,
                            clusterlist='', appjobid='', downloadfiles='', fairshare=''):
    '''
    Add a new job description for the ARC engine to process. If specified,
    the job will be sent to a cluster in the given list.
    '''
    # extract priority from job desc (also checks if desc is valid)
    jobdescs = arc.JobDescriptionList()
    if not arc.JobDescription_Parse(str(jobdesc), jobdescs):
        self.log.error("%s: Failed to prepare job description" % appjobid)
        return None
    priority = jobdescs[0].Application.Priority
    if priority == -1:  # use nicer default priority
        priority = 50

    # todo: find some useful default for proxyid
    c = self.db.getCursor()

    s = "insert into jobdescriptions (jobdescription) values (%s)"
    c.execute(s, [jobdesc])
    c.execute("SELECT LAST_INSERT_ID()")
    jobdescid = c.fetchone()['LAST_INSERT_ID()']

    desc = {}
    desc['created'] = self.getTimeStamp()
    desc['arcstate'] = "tosubmit"
    desc['tarcstate'] = desc['created']
    desc['tstate'] = desc['created']
    desc['cluster'] = ''
    desc['clusterlist'] = clusterlist
    desc['jobdesc'] = jobdescid
    desc['attemptsleft'] = maxattempts
    desc['proxyid'] = proxyid
    desc['appjobid'] = appjobid
    desc['downloadfiles'] = downloadfiles
    desc['priority'] = priority
    desc['fairshare'] = fairshare

    s = "insert into arcjobs (" + ",".join(desc.keys()) + ") values (" + \
        ",".join(["%s"] * len(desc)) + ")"
    c.execute(s, list(desc.values()))
    c.execute("SELECT LAST_INSERT_ID()")
    row = c.fetchone()
    self.Commit()
    return row
def checkJobDesc(jobdesc):
    """
    Check whether a job description is valid.

    This part is taken from aCTDBArc.py and should be kept updated.

    Args:
        jobdesc: A string with the xRSL job description.

    Raises:
        InvalidJobDescriptionError: Job description is invalid.
    """
    jobdescs = arc.JobDescriptionList()
    if not arc.JobDescription_Parse(str(jobdesc), jobdescs):
        logger.error('Job description is not valid')
        raise errors.InvalidJobDescriptionError()
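A short caller-side sketch (assuming the `errors` module this function already imports; the xRSL string is illustrative):

# Validate a description before storing it, rejecting bad input early.
try:
    checkJobDesc('&(executable=/bin/true)')
except errors.InvalidJobDescriptionError:
    print('rejecting request: invalid xRSL')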
def insertJob(self, jobdesc, proxyid, siteName, lazy=False):
    """
    Insert job into clientjobs table.

    This function does not insert the job description into the database; it
    has to be inserted separately. However, the job description is still
    needed to determine the name of the job. This function is meant for
    clients that need to perform additional work on job descriptions.

    Args:
        jobdesc: A string with the xRSL job description.
        proxyid: ID from the proxies table of the proxy that the job will be
            submitted with.
        siteName: A string with the name of a site in the configuration that
            the job will be submitted to.
        lazy: A boolean that determines whether the transaction should be
            committed after the operation.

    Returns:
        ID of the inserted job.
    """
    # get the job name from the xRSL
    jobdescs = arc.JobDescriptionList()
    # Error is not checked because the caller (actsub.py) already checked
    # the validity of the xRSL.
    arc.JobDescription_Parse(str(jobdesc), jobdescs)
    jobname = jobdescs[0].Identification.JobName

    # insert the job
    query = """
        INSERT INTO clientjobs (created, jobname, jobdesc, siteName, proxyid)
        VALUES (%s, %s, %s, %s, %s)
    """
    c = self.db.getCursor()
    try:
        c.execute(query, [self.getTimeStamp(), jobname, None, siteName, proxyid])
        c.execute('SELECT LAST_INSERT_ID()')
        jobid = c.fetchone()['LAST_INSERT_ID()']
    except Exception:
        self.log.exception('Error while inserting new job')
        raise
    else:
        if not lazy:
            self.Commit()
    return jobid
def main(lrms, grami, conf="/etc/arc.conf"):

    lrms = get_lrms_module(lrms)
    gridid = grami.split('.')[-2]
    is_parsed = False
    try:
        jds = arc.JobDescriptionList()
        with open(grami, 'r+') as jobdesc:
            content = jobdesc.read()
            is_parsed = JobDescriptionParserGRAMi.Parse(content, jds)
            jd = jds[0]
            localid = lrms.Submit(conf, jd)
            assert isinstance(localid, str)
            jobdesc.write('joboption_jobid=%s\n' % localid)
        return 0
    except (ArcError, AssertionError):
        pass
    except IOError:
        error('%s: Failed to access GRAMi file' % gridid, 'pySubmit')
    except Exception:
        error('Unexpected exception:\n%s' % traceback.format_exc(), 'pySubmit')
    return 1
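A plausible invocation sketch; the LRMS name and GRAMi path are hypothetical. Note that the function derives the grid ID from the second-to-last dot-separated component of the filename, so the file is assumed to be named job.<gridid>.grami.

import sys

# Hypothetical call: submit the job described in the GRAMi file via the
# SLURM LRMS module, then propagate the 0/1 exit code to the shell.
sys.exit(main('slurm', '/var/spool/nordugrid/jobstatus/job.ABC123.grami'))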
def get_job_descriptions(self, jsdl):
    """
    Return an instance of ``arc.JobDescriptionList`` containing the job
    described by the given JSDL.

    :param jsdl: String containing the job description in JSDL format
    """
    job_descriptions = arc.JobDescriptionList()
    temp_filename = None
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_filename = temp_file.name
        temp_file.write(jsdl)
    try:
        if not arc.JobDescription_ParseFromFile(temp_filename, job_descriptions):
            raise InvalidJobDescription("Could not parse job description XML")
    finally:
        # Delete the temp file - the finally clause runs even if an exception is raised
        os.unlink(temp_filename)
    return job_descriptions
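A hedged usage sketch: `handler` stands for an instance of the class defining get_job_descriptions, `InvalidJobDescription` is the exception it raises, and the JSDL payload is a placeholder, not a valid document.

jsdl_string = '<JobDefinition>...</JobDefinition>'  # placeholder JSDL

try:
    descs = handler.get_job_descriptions(jsdl_string)
    print('parsed %d job description(s)' % len(descs))
except InvalidJobDescription as exc:
    print('bad JSDL:', exc)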
def _arc_submit(self, xrsl, arcces, userconfig, log):
    '''Check the available CEs and submit'''

    queuelist = []
    for arcce in arcces:
        (ce_endpoint, ce_queue) = arcce
        aris = arc.URL(str(ce_endpoint))
        ce_host = aris.Host()
        if aris.Protocol() == 'https':
            aris.ChangePath('/arex')
            infoendpoints = [arc.Endpoint(aris.str(),
                                          arc.Endpoint.COMPUTINGINFO,
                                          'org.ogf.glue.emies.resourceinfo')]
        else:
            aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
            infoendpoints = [arc.Endpoint(aris,
                                          arc.Endpoint.COMPUTINGINFO,
                                          'org.nordugrid.ldapng')]

        # retriever contains a list of CE endpoints
        retriever = arc.ComputingServiceRetriever(userconfig, infoendpoints)
        retriever.wait()
        # targets is the list of queues
        # parse target.ComputingService.ID for the CE hostname
        # target.ComputingShare.Name is the queue name
        targets = retriever.GetExecutionTargets()

        # Filter only sites for this process
        for target in targets:
            if not target.ComputingService.ID:
                log.info("Target {0} does not have ComputingService ID defined, skipping"
                         .format(target.ComputingService.Name))
                continue
            # If EMI-ES infoendpoint, force EMI-ES submission
            if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
               and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                log.debug("Rejecting target interface {0} because not EMI-ES"
                          .format(target.ComputingEndpoint.InterfaceName))
                continue
            # Check for matching host and queue
            targethost = re.sub(':arex$', '',
                                re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID))
            targetqueue = target.ComputingShare.Name
            if targethost != ce_host:
                log.debug('Rejecting target host {0} as it does not match {1}'
                          .format(targethost, ce_host))
                continue
            if targetqueue != ce_queue:
                log.debug('Rejecting target queue {0} as it does not match {1}'
                          .format(targetqueue, ce_queue))
                continue

            queuelist.append(target)
            log.debug("Adding target {0}:{1}".format(targethost, targetqueue))

    # check if any queues are available, if not leave and try again next time
    if not queuelist:
        raise Exception("No free queues available")

    log.debug("preparing submission")
    jobdescs = arc.JobDescriptionList()
    if not arc.JobDescription_Parse(str(xrsl), jobdescs):
        raise Exception("Failed to prepare job description")

    # Run the submission in a separate thread
    thr = SubmitThr(queuelist, jobdescs, userconfig)
    return self._run_submit(thr)
def submitJob(self, executableFile, proxy, numberOfJobs=1):
    """ Method to submit job
    """
    # Assume that the ARC queues are always of the format nordugrid-<batchSystem>-<queue>
    # And none of our supported batch systems have a "-" in their name
    self.arcQueue = self.queue.split("-", 2)[2]
    result = self._prepareProxy()
    self.usercfg.ProxyPath(os.environ['X509_USER_PROXY'])
    if not result['OK']:
        gLogger.error('ARCComputingElement: failed to set up proxy', result['Message'])
        return result

    gLogger.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, 5):
        os.chmod(executableFile,
                 stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

    batchIDList = []
    stampDict = {}

    endpoint = arc.Endpoint(self.ceHost + ":2811/jobs",
                            arc.Endpoint.JOBSUBMIT,
                            "org.nordugrid.gridftpjob")

    # Submit jobs iteratively for now. Tentatively easier than mucking around with the JobSupervisor class
    for __i in range(numberOfJobs):
        # The basic job description
        jobdescs = arc.JobDescriptionList()
        # Get the job into the ARC way
        xrslString, diracStamp = self.__writeXRSL(executableFile)
        gLogger.debug("XRSL string submitted : %s" % xrslString)
        gLogger.debug("DIRAC stamp for job : %s" % diracStamp)
        if not arc.JobDescription_Parse(xrslString, jobdescs):
            gLogger.error("Invalid job description")
            break
        # Submit the job
        jobs = arc.JobList()  # filled by the submit process
        submitter = arc.Submitter(self.usercfg)
        result = submitter.Submit(endpoint, jobdescs, jobs)

        # Save info or else ..else.
        if result == arc.SubmissionStatus.NONE:
            # Job successfully submitted
            pilotJobReference = jobs[0].JobID
            batchIDList.append(pilotJobReference)
            stampDict[pilotJobReference] = diracStamp
            gLogger.debug("Successfully submitted job %s to CE %s" % (pilotJobReference, self.ceHost))
        else:
            message = "Failed to submit job because "
            if result.isSet(arc.SubmissionStatus.NOT_IMPLEMENTED):
                gLogger.warn("%s feature not implemented on CE? (weird I know - complain to site admins)" % message)
            if result.isSet(arc.SubmissionStatus.NO_SERVICES):
                gLogger.warn("%s no services are running on CE? (open GGUS ticket to site admins)" % message)
            if result.isSet(arc.SubmissionStatus.ENDPOINT_NOT_QUERIED):
                gLogger.warn("%s endpoint was not even queried. (network ..?)" % message)
            if result.isSet(arc.SubmissionStatus.BROKER_PLUGIN_NOT_LOADED):
                gLogger.warn("%s BROKER_PLUGIN_NOT_LOADED : ARC library installation problem?" % message)
            if result.isSet(arc.SubmissionStatus.DESCRIPTION_NOT_SUBMITTED):
                gLogger.warn("%s Job not submitted - incorrect job description? (missing field in XRSL string?)" % message)
            if result.isSet(arc.SubmissionStatus.SUBMITTER_PLUGIN_NOT_LOADED):
                gLogger.warn("%s SUBMITTER_PLUGIN_NOT_LOADED : ARC library installation problem?" % message)
            if result.isSet(arc.SubmissionStatus.AUTHENTICATION_ERROR):
                gLogger.warn("%s authentication error - screwed up / expired proxy? Renew / upload pilot proxy on machine?" % message)
            if result.isSet(arc.SubmissionStatus.ERROR_FROM_ENDPOINT):
                gLogger.warn("%s some error from the CE - possibly CE problems?" % message)
            gLogger.warn("%s ... maybe above messages will give a hint." % message)
            break  # Boo hoo *sniff*

    if batchIDList:
        result = S_OK(batchIDList)
        result['PilotStampDict'] = stampDict
    else:
        result = S_ERROR('No pilot references obtained from the ARC job submission')
    return result
# Set up logging to stdout with level VERBOSE (a lot of output will be shown)
logstdout = arc.LogStream(sys.stdout)
logstdout.setFormat(arc.ShortFormat)
arc.Logger_getRootLogger().addDestination(logstdout)
arc.Logger_getRootLogger().setThreshold(arc.VERBOSE)
logger = arc.Logger(arc.Logger_getRootLogger(), "jobsubmit")

# UserConfig contains information on credentials and default services to use.
# This form of the constructor is necessary to initialise the local job list.
usercfg = arc.UserConfig("", "")

# Simple job description which outputs hostname to stdout
jobdescstring = "&(executable=/bin/hostname)(stdout=stdout)"

# Parse job description
jobdescs = arc.JobDescriptionList()
if not arc.JobDescription_Parse(jobdescstring, jobdescs):
    logger.msg(arc.ERROR, "Invalid job description")
    sys.exit(1)
# Use 'arc.JobDescription_ParseFromFile("helloworld.xrsl", jobdescs)'
# to parse job description from file.

# Use top-level NorduGrid information index to find resources
index = arc.Endpoint("ldap://index1.nordugrid.org:2135/Mds-Vo-name=NorduGrid,o=grid",
                     arc.Endpoint.REGISTRY,
                     "org.nordugrid.ldapegiis")
services = arc.EndpointList(1, index)

# Do the submission
jobs = arc.JobList()
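The example breaks off right after creating the job list. A plausible completion, modeled on the standard ARC jobsubmit example (hedged: exact method names can vary between ARC binding versions; `Submitter.BrokeredSubmit` is how recent bindings spell brokered submission against a list of endpoints):

# Let the Submitter query the registry endpoints, broker among the
# discovered targets, and fill 'jobs' with the submitted job.
submitter = arc.Submitter(usercfg)
if submitter.BrokeredSubmit(services, jobdescs, jobs) != arc.SubmissionStatus.NONE:
    logger.msg(arc.ERROR, "Failed to submit job")
    sys.exit(1)

# Write information on the submitted job to a local job list file so
# that other tools (arcstat, arcget) can find it.
jobList = arc.JobInformationStorageXML("jobs.xml")
if not jobList.Write(jobs):
    logger.msg(arc.WARNING, "Failed to write to local job list jobs.xml")

print("Job submitted with job id %s" % jobs.front().JobID)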
def submit(self):
    """
    Main function to submit jobs.
    """

    global queuelist

    # check for stopsubmission flag
    if self.conf.get(['downtime', 'stopsubmission']) == "true":
        self.log.info('Submission suspended due to downtime')
        return 0

    # Get cluster host and queue: cluster/queue
    clusterhost = clusterqueue = None
    if self.cluster:
        cluster = self.cluster
        if cluster.find('://') == -1:
            cluster = 'gsiftp://' + cluster
        clusterurl = arc.URL(cluster)
        clusterhost = clusterurl.Host()
        clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

    # Apply fair-share
    if self.cluster:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist like '%" + self.cluster + "%'", ['fairshare'])
    else:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist=''", ['fairshare'])

    if not fairshares:
        self.log.info('Nothing to submit')
        return 0

    fairshares = list(set([p['fairshare'] for p in fairshares]))
    # For EMI-ES proxy bug - see below
    shuffle(fairshares)
    count = 0

    for fairshare in fairshares:

        try:
            # catch any exceptions here to avoid leaving lock
            if self.cluster:
                # Lock row for update in case multiple clusters are specified
                #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' limit 10"
                    .format(self.cluster, fairshare),
                    columns=["id", "jobdesc", "appjobid", "priority", "proxyid"],
                    lock=True)
                if jobs:
                    self.log.debug("started lock for writing %d jobs" % len(jobs))
            else:
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and clusterlist='' and fairshare='{0}' limit 10"
                    .format(fairshare),
                    columns=["id", "jobdesc", "appjobid", "priority"])
            # mark submitting in db
            jobs_taken = []
            for j in jobs:
                jd = {'cluster': self.cluster,
                      'arcstate': 'submitting',
                      'tarcstate': self.db.getTimeStamp()}
                self.db.updateArcJobLazy(j['id'], jd)
                jobs_taken.append(j)
            jobs = jobs_taken

        finally:
            if self.cluster:
                try:
                    self.db.Commit(lock=True)
                    self.log.debug("ended lock")
                except Exception:
                    self.log.warning("Failed to release DB lock")
            else:
                self.db.Commit()

        if len(jobs) == 0:
            #self.log.debug("No jobs to submit")
            continue
        self.log.info("Submitting %d jobs for fairshare %s" % (len(jobs), fairshare))

        # max waiting priority
        try:
            maxpriowaiting = max(jobs, key=lambda x: x['priority'])['priority']
        except Exception:
            maxpriowaiting = 0
        self.log.info("Maximum priority of waiting jobs: %d" % maxpriowaiting)

        # Query infosys - either local or index
        if self.cluster:
            if self.cluster.find('://') != -1:
                aris = arc.URL(self.cluster)
            else:
                aris = arc.URL('gsiftp://%s' % self.cluster)
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [arc.Endpoint(aris.str(),
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.ogf.glue.emies.resourceinfo')]
            elif aris.Protocol() == 'local':
                infoendpoints = [arc.Endpoint(aris.str(),
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.nordugrid.local')]
            else:
                aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                infoendpoints = [arc.Endpoint(aris,
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.nordugrid.ldapng')]
        else:
            giises = self.conf.getList(['atlasgiis', 'item'])
            infoendpoints = []
            for g in giises:
                # Specify explicitly EGIIS
                infoendpoints.append(arc.Endpoint(str(g),
                                                  arc.Endpoint.REGISTRY,
                                                  "org.nordugrid.ldapegiis"))

        # Set UserConfig credential for each proxy. Assumes that any proxy
        # in the fairshare can query the CE infosys
        self.uc.CredentialString(self.db.getProxy(jobs[0]['proxyid']))
        # retriever contains a list of CE endpoints
        retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
        retriever.wait()
        # targets is the list of queues
        # parse target.ComputingService.ID for the CE hostname
        # target.ComputingShare.Name is the queue name
        targets = retriever.GetExecutionTargets()

        # Filter only sites for this process
        queuelist = []
        for target in targets:
            if not target.ComputingService.ID:
                self.log.info("Target %s does not have ComputingService ID defined, skipping"
                              % target.ComputingService.Name)
                continue
            # If EMI-ES infoendpoint, force EMI-ES submission
            if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
               and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                self.log.debug("Rejecting target interface %s because not EMI-ES"
                               % target.ComputingEndpoint.InterfaceName)
                continue
            # Check for matching host and queue
            targethost = re.sub(':arex$', '',
                                re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID))
            targetqueue = target.ComputingShare.Name
            if clusterhost and targethost != clusterhost:
                self.log.debug('Rejecting target host %s as it does not match %s'
                               % (targethost, clusterhost))
                continue
            if clusterqueue and targetqueue != clusterqueue:
                self.log.debug('Rejecting target queue %s as it does not match %s'
                               % (targetqueue, clusterqueue))
                continue
            if targetqueue in self.conf.getList(['queuesreject', 'item']):
                self.log.debug('Rejecting target queue %s in queuesreject list' % targetqueue)
                continue
            elif targethost in self.conf.getList(['clustersreject', 'item']):
                self.log.debug('Rejecting target host %s in clustersreject list' % targethost)
                continue
            else:
                # tmp hack
                target.ComputingShare.LocalWaitingJobs = 0
                target.ComputingShare.PreLRMSWaitingJobs = 0
                target.ExecutionEnvironment.CPUClockSpeed = 2000
                qjobs = self.db.getArcJobsInfo(
                    "cluster='" + str(self.cluster) + "' and arcstate='submitted' and fairshare='%s'" % fairshare,
                    ['id', 'priority'])
                rjobs = self.db.getArcJobsInfo(
                    "cluster='" + str(self.cluster) + "' and arcstate='running' and fairshare='%s'" % fairshare,
                    ['id'])

                # max queued priority
                try:
                    maxprioqueued = max(qjobs, key=lambda x: x['priority'])['priority']
                except Exception:
                    maxprioqueued = 0
                self.log.info("Max priority queued: %d" % maxprioqueued)

                # Set number of submitted jobs to running * 0.15 + 400/num of shares
                # Note: assumes only a few shares are used
                jlimit = len(rjobs) * 0.15 + 100 / len(fairshares)
                if str(self.cluster).find('arc-boinc-0') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                if str(self.cluster).find('XXXpikolit') != -1:
                    jlimit = len(rjobs) * 0.15 + 100
                if str(self.cluster).find('arc05.lcg') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued) and (maxpriowaiting > 10)):
                    if maxpriowaiting > maxprioqueued:
                        self.log.info("Overriding limit, maxpriowaiting: %d > maxprioqueued: %d"
                                      % (maxpriowaiting, maxprioqueued))
                    queuelist.append(target)
                    self.log.debug("Adding target %s:%s" % (targethost, targetqueue))
                else:
                    self.log.info("%s/%s already at limit of submitted jobs for fairshare %s"
                                  % (targethost, targetqueue, fairshare))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            self.log.info("No free queues available")
            self.db.Commit()
            # EMI-ES proxy problem - see bug 3685
            if self.cluster and self.cluster.startswith('https://'):
                raise ExceptInterrupt(15)
            continue

        self.log.info("start submitting")

        # Just run one thread for each job in sequence. Strange things happen
        # when trying to create a new UserConfig object for each thread.
        for j in jobs:
            self.log.debug("%s: preparing submission" % j['appjobid'])
            jobdescstr = str(self.db.getArcJobDescription(str(j['jobdesc'])))
            jobdescs = arc.JobDescriptionList()
            if not jobdescstr or not arc.JobDescription_Parse(jobdescstr, jobdescs):
                self.log.error("%s: Failed to prepare job description" % j['appjobid'])
                continue
            # TODO: might not work if proxies are different within a share
            # since same uc object is shared among threads
            self.uc.CredentialString(self.db.getProxy(j['proxyid']))
            t = SubmitThr(Submit, j['id'], j['appjobid'], jobdescs, self.uc, self.log)
            self.RunThreadsSplit([t], 1)
            count = count + 1

        self.log.info("threads finished")
        # commit transaction to release row locks
        self.db.Commit()

        # EMI-ES proxy problem - see bug 3685
        if self.cluster and self.cluster.startswith('https://'):
            raise ExceptInterrupt(15)

    self.log.info("end submitting")
    return count
def submit(self):
    """
    Main function to submit jobs.
    """

    global queuelist

    # check for stopsubmission flag
    if self.conf.get(['downtime', 'stopsubmission']) == "true":
        self.log.info('Submission suspended due to downtime')
        return

    # check for any site-specific limits or status
    clusterstatus = self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}", ["status"]) or 'online'
    if clusterstatus == 'offline':
        self.log.info('Site status is offline')
        return

    clustermaxjobs = int(self.conf.getCond(["sites", "site"], f"endpoint={self.cluster}", ["maxjobs"]) or 999999)
    nsubmitted = self.db.getNArcJobs(f"cluster='{self.cluster}'")
    if nsubmitted >= clustermaxjobs:
        self.log.info(f'{nsubmitted} submitted jobs is greater than or equal to max jobs {clustermaxjobs}')
        return

    # Get cluster host and queue: cluster/queue
    clusterhost = clusterqueue = None
    if self.cluster:
        cluster = self.cluster
        if cluster.find('://') == -1:
            cluster = 'gsiftp://' + cluster
        clusterurl = arc.URL(cluster)
        clusterhost = clusterurl.Host()
        clusterqueue = clusterurl.Path()[1:]  # strip off leading slash

    # Apply fair-share
    if self.cluster:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist like '%" + self.cluster + "%'",
            ['fairshare', 'proxyid'])
    else:
        fairshares = self.db.getArcJobsInfo(
            "arcstate='tosubmit' and clusterlist=''", ['fairshare', 'proxyid'])

    if not fairshares:
        self.log.info('Nothing to submit')
        return

    # split by proxy for GU queues
    fairshares = list(set([(p['fairshare'], p['proxyid']) for p in fairshares]))
    # For proxy bug - see below
    shuffle(fairshares)

    for fairshare, proxyid in fairshares:

        # apply maxjobs limit (check above should make sure greater than zero)
        # Note: relies on exit after first loop
        limit = min(clustermaxjobs - nsubmitted, 10)
        try:
            # catch any exceptions here to avoid leaving lock
            if self.cluster:
                # Lock row for update in case multiple clusters are specified
                #jobs=self.db.getArcJobsInfo("arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' order by priority desc limit 10".format(self.cluster, fairshare),
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and ( clusterlist like '%{0}' or clusterlist like '%{0},%' ) and fairshare='{1}' and proxyid='{2}' limit {3}"
                    .format(self.cluster, fairshare, proxyid, limit),
                    columns=["id", "jobdesc", "appjobid", "priority", "proxyid", "clusterlist"],
                    lock=True)
                if jobs:
                    self.log.debug("started lock for writing %d jobs" % len(jobs))
            else:
                jobs = self.db.getArcJobsInfo(
                    "arcstate='tosubmit' and clusterlist='' and fairshare='{0}' and proxyid='{1}' limit {2}"
                    .format(fairshare, proxyid, limit),
                    columns=["id", "jobdesc", "appjobid", "priority", "proxyid", "clusterlist"])
            # mark submitting in db
            jobs_taken = []
            for j in jobs:
                jd = {'cluster': self.cluster,
                      'arcstate': 'submitting',
                      'tarcstate': self.db.getTimeStamp()}
                self.db.updateArcJobLazy(j['id'], jd)
                jobs_taken.append(j)
            jobs = jobs_taken

        finally:
            if self.cluster:
                try:
                    self.db.Commit(lock=True)
                    self.log.debug("ended lock")
                except Exception:
                    self.log.warning("Failed to release DB lock")
            else:
                self.db.Commit()

        if len(jobs) == 0:
            #self.log.debug("No jobs to submit")
            continue
        self.log.info("Submitting %d jobs for fairshare %s and proxyid %d"
                      % (len(jobs), fairshare, proxyid))

        # max waiting priority
        try:
            maxpriowaiting = max(jobs, key=lambda x: x['priority'])['priority']
        except Exception:
            maxpriowaiting = 0
        self.log.info("Maximum priority of waiting jobs: %d" % maxpriowaiting)

        # Query infosys - either local or index
        if self.cluster:
            if self.cluster.find('://') != -1:
                aris = arc.URL(self.cluster)
            else:
                aris = arc.URL('gsiftp://%s' % self.cluster)
            if aris.Protocol() == 'https':
                aris.ChangePath('/arex')
                infoendpoints = [arc.Endpoint(aris.str(),
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.ogf.glue.emies.resourceinfo')]
            elif aris.Protocol() == 'local':
                infoendpoints = [arc.Endpoint(aris.str(),
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.nordugrid.local')]
            else:
                aris = 'ldap://' + aris.Host() + '/mds-vo-name=local,o=grid'
                infoendpoints = [arc.Endpoint(aris,
                                              arc.Endpoint.COMPUTINGINFO,
                                              'org.nordugrid.ldapng')]
        else:
            giises = self.conf.getList(['atlasgiis', 'item'])
            infoendpoints = []
            for g in giises:
                # Specify explicitly EGIIS
                infoendpoints.append(arc.Endpoint(str(g),
                                                  arc.Endpoint.REGISTRY,
                                                  "org.nordugrid.ldapegiis"))

        # Set UserConfig credential for querying infosys
        proxystring = str(self.db.getProxy(proxyid))
        self.uc.CredentialString(proxystring)
        global usercred
        usercred = self.uc
        # retriever contains a list of CE endpoints
        retriever = arc.ComputingServiceRetriever(self.uc, infoendpoints)
        retriever.wait()
        # targets is the list of queues
        # parse target.ComputingService.ID for the CE hostname
        # target.ComputingShare.Name is the queue name
        targets = retriever.GetExecutionTargets()

        # Filter only sites for this process
        queuelist = []
        for target in targets:
            if not target.ComputingService.ID:
                self.log.info("Target %s does not have ComputingService ID defined, skipping"
                              % target.ComputingService.Name)
                continue
            # If EMI-ES infoendpoint, force EMI-ES submission
            if infoendpoints[0].InterfaceName == 'org.ogf.glue.emies.resourceinfo' \
               and target.ComputingEndpoint.InterfaceName != 'org.ogf.glue.emies.activitycreation':
                self.log.debug("Rejecting target interface %s because not EMI-ES"
                               % target.ComputingEndpoint.InterfaceName)
                continue
            # Check for matching host and queue
            targethost = re.sub(':arex$', '',
                                re.sub('urn:ogf:ComputingService:', '', target.ComputingService.ID))
            targetqueue = target.ComputingShare.Name
            if clusterhost and targethost != clusterhost:
                self.log.debug('Rejecting target host %s as it does not match %s'
                               % (targethost, clusterhost))
                continue
            if clusterqueue and targetqueue != clusterqueue:
                self.log.debug('Rejecting target queue %s as it does not match %s'
                               % (targetqueue, clusterqueue))
                continue
            if targetqueue in self.conf.getList(['queuesreject', 'item']):
                self.log.debug('Rejecting target queue %s in queuesreject list' % targetqueue)
                continue
            elif targethost in self.conf.getList(['clustersreject', 'item']):
                self.log.debug('Rejecting target host %s in clustersreject list' % targethost)
                continue
            else:
                # tmp hack
                target.ComputingShare.LocalWaitingJobs = 0
                target.ComputingShare.PreLRMSWaitingJobs = 0
                target.ExecutionEnvironment.CPUClockSpeed = 2000
                qjobs = self.db.getArcJobsInfo(
                    "cluster='" + str(self.cluster) + "' and arcstate='submitted' and fairshare='%s'" % fairshare,
                    ['id', 'priority'])
                rjobs = self.db.getArcJobsInfo(
                    "cluster='" + str(self.cluster) + "' and arcstate='running' and fairshare='%s'" % fairshare,
                    ['id'])

                # max queued priority
                try:
                    maxprioqueued = max(qjobs, key=lambda x: x['priority'])['priority']
                except Exception:
                    maxprioqueued = 0
                self.log.info("Max priority queued: %d" % maxprioqueued)

                # Limit number of submitted jobs using configuration or default (0.15 + 100/num of shares)
                # Note: assumes only a few shares are used
                qfraction = float(self.conf.get(['jobs', 'queuefraction'])) \
                    if self.conf.get(['jobs', 'queuefraction']) else 0.15
                qoffset = int(self.conf.get(['jobs', 'queueoffset'])) \
                    if self.conf.get(['jobs', 'queueoffset']) else 100
                jlimit = len(rjobs) * qfraction + qoffset / len(fairshares)
                self.log.debug("running %d, queued %d, queue limit %d"
                               % (len(rjobs), len(qjobs), jlimit))
                if str(self.cluster).find('arc-boinc-0') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                if str(self.cluster).find('XXXpikolit') != -1:
                    jlimit = len(rjobs) * 0.15 + 100
                if str(self.cluster).find('arc05.lcg') != -1:
                    jlimit = len(rjobs) * 0.15 + 400
                target.ComputingShare.PreLRMSWaitingJobs = len(qjobs)
                if len(qjobs) < jlimit or ((maxpriowaiting > maxprioqueued) and (maxpriowaiting > 10)):
                    if maxpriowaiting > maxprioqueued:
                        self.log.info("Overriding limit, maxpriowaiting: %d > maxprioqueued: %d"
                                      % (maxpriowaiting, maxprioqueued))
                    queuelist.append(target)
                    self.log.debug("Adding target %s:%s" % (targethost, targetqueue))
                else:
                    self.log.info("%s/%s already at limit of submitted jobs for fairshare %s"
                                  % (targethost, targetqueue, fairshare))

        # check if any queues are available, if not leave and try again next time
        if not queuelist:
            self.log.info("No free queues available")
            self.db.Commit()
            continue

        self.log.info("start submitting")

        # Just run one thread for each job in sequence. Strange things happen
        # when trying to create a new UserConfig object for each thread.
        tasks = []
        for j in jobs:
            self.log.debug("%s: preparing submission" % j['appjobid'])
            jobdescstr = str(self.db.getArcJobDescription(str(j['jobdesc'])))
            jobdescs = arc.JobDescriptionList()
            if not jobdescstr or not arc.JobDescription_Parse(jobdescstr, jobdescs):
                self.log.error("%s: Failed to prepare job description" % j['appjobid'])
                continue
            tasks.append((j['id'], j['appjobid'], jobdescstr, proxystring,
                          int(self.conf.get(['atlasgiis', 'timeout']))))

        npools = 1
        if any(s in self.cluster for s in self.conf.getList(['parallelsubmit', 'item'])):
            npools = int(self.conf.get(['parallelsubmit', 'npools']))
        self.log.debug("Starting submitters: %s" % npools)

        pool = multiprocessing.Pool(npools)
        #results = []
        #for task in tasks:
        #    result = pool.apply_async(Submit, (task))
        #    results.append(result)
        # Submit in workers
        results = [pool.apply_async(Submit, t) for t in tasks]

        # timeout per submission
        timeout = 60
        stopflag = False
        for result, task in zip(results, tasks):
            try:
                jdb = result.get(timeout)
                jconv = JobConv()
                job = jconv.db2job(jdb)
            except multiprocessing.TimeoutError:
                self.log.error("%s: submission timeout: exit and try again" % task[1])
                # abort submission if Submit process is stuck
                #pool.terminate()
                KillPool(pool)
                pool.join()
                stopflag = True
                # reduce timeout to finish quickly
                timeout = 0.1
                continue
            if job is None:
                self.log.error("%s: no job defined for %d" % (task[1], task[0]))
                continue
            jd = {}
            jd['arcstate'] = 'submitted'
            # initial offset to 1 minute to force first status check
            jd['tarcstate'] = self.db.getTimeStamp(
                time.time() - int(self.conf.get(['jobs', 'checkinterval'])) + 120)
            jd['tstate'] = self.db.getTimeStamp()
            # extract hostname of cluster (depends on JobID being a URL)
            self.log.info("%s: job id %s" % (task[1], job.JobID))
            jd['cluster'] = self.cluster
            self.db.updateArcJobLazy(task[0], jd, job)
        if not stopflag:
            pool.terminate()
            pool.join()
        else:
            # stop submitting, gsiftp connection problem likely
            raise ExceptInterrupt(15)

        self.log.info("threads finished")
        # commit transaction to release row locks
        self.db.Commit()

        # still proxy bug - exit if there are multiple proxies
        if len(self.db.getProxiesInfo('TRUE', ['id'])) > 1:
            raise ExceptInterrupt(15)

    self.log.info("end submitting")
    return
def submitJob(self, executableFile, proxy, numberOfJobs=1, inputs=None, outputs=None):
    """Method to submit job"""

    # Assume that the ARC queues are always of the format nordugrid-<batchSystem>-<queue>
    # And none of our supported batch systems have a "-" in their name
    self.arcQueue = self.queue.split("-", 2)[2]
    result = self._prepareProxy()
    if not result["OK"]:
        self.log.error("ARCComputingElement: failed to set up proxy", result["Message"])
        return result
    self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

    self.log.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, 5):
        os.chmod(executableFile,
                 stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

    executables = None
    if self.preamble:
        executables = [executableFile]
        executableFile = self._bundlePreamble(executableFile)

    batchIDList = []
    stampDict = {}

    if self.endpointType == "Gridftp":
        endpoint = arc.Endpoint(str(self.ceHost + ":2811/jobs"),
                                arc.Endpoint.JOBSUBMIT,
                                "org.nordugrid.gridftpjob")
    else:
        endpoint = arc.Endpoint(
            str("https://" + self.ceHost + ":8443/arex"),
            arc.Endpoint.JOBSUBMIT,
            "org.ogf.glue.emies.activitycreation",
        )

    # Submit jobs iteratively for now. Tentatively easier than mucking around with the JobSupervisor class
    for __i in range(numberOfJobs):
        # The basic job description
        jobdescs = arc.JobDescriptionList()
        # Get the job into the ARC way
        xrslString, diracStamp = self._writeXRSL(executableFile, inputs, outputs, executables)
        self.log.debug("XRSL string submitted : %s" % xrslString)
        self.log.debug("DIRAC stamp for job : %s" % diracStamp)
        # The arc bindings don't accept unicode objects in Python 2 so xrslString must be explicitly cast
        result = arc.JobDescription_Parse(str(xrslString), jobdescs)
        if not result:
            self.log.error("Invalid job description", "%r, message=%s" % (xrslString, result.str()))
            break
        # Submit the job
        jobs = arc.JobList()  # filled by the submit process
        submitter = arc.Submitter(self.usercfg)
        result = submitter.Submit(endpoint, jobdescs, jobs)

        # Save info or else ..else.
        if result == arc.SubmissionStatus.NONE:
            # Job successfully submitted
            pilotJobReference = jobs[0].JobID
            batchIDList.append(pilotJobReference)
            stampDict[pilotJobReference] = diracStamp
            self.log.debug("Successfully submitted job %s to CE %s" % (pilotJobReference, self.ceHost))
        else:
            self._analyzeSubmissionError(result)
            break  # Boo hoo *sniff*

    if self.preamble:
        os.unlink(executableFile)

    if batchIDList:
        result = S_OK(batchIDList)
        result["PilotStampDict"] = stampDict
    else:
        result = S_ERROR("No pilot references obtained from the ARC job submission")
    return result
def submitJob(self, executableFile, proxy, numberOfJobs=1, inputs=None, outputs=None):
    """Method to submit job"""

    # Assume that the ARC queues are always of the format nordugrid-<batchSystem>-<queue>
    # And none of our supported batch systems have a "-" in their name
    self.arcQueue = self.queue.split("-", 2)[2]
    result = self._prepareProxy()
    if not result["OK"]:
        self.log.error("ARCComputingElement: failed to set up proxy", result["Message"])
        return result
    self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

    self.log.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, 5):
        os.chmod(executableFile,
                 stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

    executables = None
    if self.preamble:
        executables = [executableFile]
        executableFile = self._bundlePreamble(executableFile)

    batchIDList = []
    stampDict = {}

    if self.endpointType == "Gridftp":
        endpoint = arc.Endpoint(str(self.ceHost + ":2811/jobs"),
                                arc.Endpoint.JOBSUBMIT,
                                "org.nordugrid.gridftpjob")
    else:
        endpoint = arc.Endpoint(
            str("https://" + self.ceHost + ":8443/arex"),
            arc.Endpoint.JOBSUBMIT,
            "org.ogf.glue.emies.activitycreation",
        )

    # Submit jobs iteratively for now. Tentatively easier than mucking around with the JobSupervisor class
    for __i in range(numberOfJobs):
        # The basic job description
        jobdescs = arc.JobDescriptionList()
        # Get the job into the ARC way
        xrslString, diracStamp = self.__writeXRSL(executableFile, inputs, outputs, executables)
        self.log.debug("XRSL string submitted : %s" % xrslString)
        self.log.debug("DIRAC stamp for job : %s" % diracStamp)
        # The arc bindings don't accept unicode objects in Python 2 so xrslString must be explicitly cast
        result = arc.JobDescription_Parse(str(xrslString), jobdescs)
        if not result:
            self.log.error("Invalid job description", "%r, message=%s" % (xrslString, result.str()))
            break
        # Submit the job
        jobs = arc.JobList()  # filled by the submit process
        submitter = arc.Submitter(self.usercfg)
        result = submitter.Submit(endpoint, jobdescs, jobs)

        # Save info or else ..else.
        if result == arc.SubmissionStatus.NONE:
            # Job successfully submitted
            pilotJobReference = jobs[0].JobID
            batchIDList.append(pilotJobReference)
            stampDict[pilotJobReference] = diracStamp
            self.log.debug("Successfully submitted job %s to CE %s" % (pilotJobReference, self.ceHost))
        else:
            message = "Failed to submit job because "
            if result.isSet(arc.SubmissionStatus.NOT_IMPLEMENTED):  # pylint: disable=no-member
                self.log.warn("%s feature not implemented on CE? (weird I know - complain to site admins)" % message)
            if result.isSet(arc.SubmissionStatus.NO_SERVICES):  # pylint: disable=no-member
                self.log.warn("%s no services are running on CE? (open GGUS ticket to site admins)" % message)
            if result.isSet(arc.SubmissionStatus.ENDPOINT_NOT_QUERIED):  # pylint: disable=no-member
                self.log.warn("%s endpoint was not even queried. (network ..?)" % message)
            if result.isSet(arc.SubmissionStatus.BROKER_PLUGIN_NOT_LOADED):  # pylint: disable=no-member
                self.log.warn("%s BROKER_PLUGIN_NOT_LOADED : ARC library installation problem?" % message)
            if result.isSet(arc.SubmissionStatus.DESCRIPTION_NOT_SUBMITTED):  # pylint: disable=no-member
                self.log.warn("%s Job not submitted - incorrect job description? (missing field in XRSL string?)" % message)
            if result.isSet(arc.SubmissionStatus.SUBMITTER_PLUGIN_NOT_LOADED):  # pylint: disable=no-member
                self.log.warn("%s SUBMITTER_PLUGIN_NOT_LOADED : ARC library installation problem?" % message)
            if result.isSet(arc.SubmissionStatus.AUTHENTICATION_ERROR):  # pylint: disable=no-member
                self.log.warn("%s authentication error - screwed up / expired proxy? Renew / upload pilot proxy on machine?" % message)
            if result.isSet(arc.SubmissionStatus.ERROR_FROM_ENDPOINT):  # pylint: disable=no-member
                self.log.warn("%s some error from the CE - possibly CE problems?" % message)
            self.log.warn("%s ... maybe above messages will give a hint." % message)
            break  # Boo hoo *sniff*

    if self.preamble:
        os.unlink(executableFile)

    if batchIDList:
        result = S_OK(batchIDList)
        result["PilotStampDict"] = stampDict
    else:
        result = S_ERROR("No pilot references obtained from the ARC job submission")
    return result
def submitJob(self, executableFile, proxy, numberOfJobs=1):
    """Method to submit job"""

    # Assume that the ARC queues are always of the format nordugrid-<batchSystem>-<queue>
    # And none of our supported batch systems have a "-" in their name
    self.arcQueue = self.queue.split("-", 2)[2]
    result = self._prepareProxy()
    if not result["OK"]:
        self.log.error("ARCComputingElement: failed to set up proxy", result["Message"])
        return result
    self.usercfg.ProxyPath(os.environ["X509_USER_PROXY"])

    self.log.verbose("Executable file path: %s" % executableFile)
    if not os.access(executableFile, 5):
        os.chmod(executableFile,
                 stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

    batchIDList = []
    stampDict = {}

    # Creating an endpoint
    endpoint = arc.Endpoint(self.ceHost, arc.Endpoint.COMPUTINGINFO, "org.nordugrid.ldapglue2")

    # Get the ExecutionTargets of the ComputingElement (Can be REST, EMI-ES or GRIDFTP)
    retriever = arc.ComputingServiceRetriever(self.usercfg, [endpoint])
    retriever.wait()
    targetsWithQueues = list(retriever.GetExecutionTargets())  # Targets also include queues

    # To avoid losing time trying to submit to queues we cannot interact with,
    # we only keep the interesting ones
    targets = []
    for target in targetsWithQueues:
        if target.ComputingShare.Name == self.arcQueue:
            self.log.debug(
                "Adding target:",
                "%s (%s)" % (target.ComputingEndpoint.URLString, target.ComputingEndpoint.InterfaceName),
            )
            targets.append(target)

    # At this point, we should have GRIDFTP and AREX (EMI-ES and REST) targets related to arcQueue.
    # We intend to submit to AREX first; if that does not work, GRIDFTP is used.
    submissionWorked = False
    for target in targets:
        # If the submission is already done, we stop
        if submissionWorked:
            break
        for __i in range(numberOfJobs):
            # The basic job description
            jobdescs = arc.JobDescriptionList()
            # Get the job into the ARC way
            xrslString, diracStamp = self._writeXRSL(executableFile)
            self.log.debug("XRSL string submitted : %s" % xrslString)
            self.log.debug("DIRAC stamp for job : %s" % diracStamp)
            # The arc bindings don't accept unicode objects in Python 2 so xrslString must be explicitly cast
            result = arc.JobDescription_Parse(str(xrslString), jobdescs)
            if not result:
                self.log.error("Invalid job description", "%r, message=%s" % (xrslString, result.str()))
                break
            # Submit the job
            job = arc.Job()
            result = target.Submit(self.usercfg, jobdescs[0], job)

            # Save info or else ..else.
            if result == arc.SubmissionStatus.NONE:
                # Job successfully submitted
                pilotJobReference = job.JobID
                batchIDList.append(pilotJobReference)
                stampDict[pilotJobReference] = diracStamp
                submissionWorked = True
                self.log.debug("Successfully submitted job %s to CE %s" % (pilotJobReference, self.ceHost))
            else:
                self._analyzeSubmissionError(result)
                break  # Boo hoo *sniff*

    if batchIDList:
        result = S_OK(batchIDList)
        result["PilotStampDict"] = stampDict
    else:
        result = S_ERROR("No pilot references obtained from the ARC job submission")
    return result
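Finally, a hedged usage sketch for these DIRAC submitJob variants. `ce` stands for a configured ARCComputingElement instance and the wrapper path is illustrative; the returned structure follows the S_OK/S_ERROR convention visible above.

result = ce.submitJob('/path/to/pilot-wrapper.sh', proxy=None, numberOfJobs=2)
if result['OK']:
    for ref in result['Value']:
        print('submitted pilot %s with stamp %s' % (ref, result['PilotStampDict'][ref]))
else:
    print('submission failed: %s' % result['Message'])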