def setUp(self):
    """Flush any Redis state and seed a fresh queue with two sample jobs."""
    if Config.USE_REDIS:
        store = redis.StrictRedis(Config.REDIS_HOSTNAME, Config.REDIS_PORT, db=0)
        store.flushall()

    # Both jobs share everything except their name/output file.
    shared = dict(
        vm="ilter.img",
        input=[],
        timeout=30,
        notifyURL="notifyMeUrl",
        maxOutputFileSize=4096,
    )
    self.job1 = TangoJob(
        name="sample_job_1", outputFile="sample_job_1_output", **shared
    )
    self.job2 = TangoJob(
        name="sample_job_2", outputFile="sample_job_2_output", **shared
    )

    self.jobQueue = JobQueue(None)
    self.jobQueue.reset()
    self.jobId1 = self.jobQueue.add(self.job1)
    self.jobId2 = self.jobQueue.add(self.job2)
def __init__(self):
    """Initialize the Tango server: logging, VMMS backend, preallocator, queue.

    Side effects: configures the root logger and, when Redis is not in use,
    starts a local JobManager thread.
    """
    self.daemon = True

    # Configure logging BEFORE anything else (notably the JobManager thread
    # started below) so that no early startup messages are lost.  The
    # original called basicConfig last, dropping those records.
    logging.basicConfig(
        filename=Config.LOGFILE,
        format="%(levelname)s|%(asctime)s|%(name)s|%(message)s",
        level=Config.LOGLEVEL,
    )

    # Select the virtualization backend named in the config.  Imports are
    # deferred so only the configured backend's dependencies are needed.
    vmms = None
    if Config.VMMS_NAME == "tashiSSH":
        from vmms.tashiSSH import TashiSSH
        vmms = TashiSSH()
    elif Config.VMMS_NAME == "ec2SSH":
        from vmms.ec2SSH import Ec2SSH
        vmms = Ec2SSH()
    elif Config.VMMS_NAME == "localDocker":
        from vmms.localDocker import LocalDocker
        vmms = LocalDocker()
    elif Config.VMMS_NAME == "distDocker":
        from vmms.distDocker import DistDocker
        vmms = DistDocker()

    self.preallocator = Preallocator({Config.VMMS_NAME: vmms})
    self.jobQueue = JobQueue(self.preallocator)
    if not Config.USE_REDIS:
        # creates a local Job Manager if there is no persistent
        # memory between processes. Otherwise, JobManager will
        # be initiated separately
        JobManager(self.jobQueue).start()

    self.start_time = time.time()
    self.log = logging.getLogger("TangoServer")
    self.log.info("Starting Tango server")
def setUp(self):
    """Create a test client and a fresh job queue, removing any stale queue file."""
    self.app = www.app.test_client()
    try:
        os.remove(quePath)
    except OSError:
        # Queue file may not exist yet; that is fine.  (The original bare
        # `except:` also swallowed KeyboardInterrupt/SystemExit.)
        pass
    self.jobQueue = JobQueue(quePath)
def setUp(self):
    """Reset Redis (when enabled) and build a queue pre-loaded with two jobs."""
    if Config.USE_REDIS:
        backing = redis.StrictRedis(Config.REDIS_HOSTNAME, Config.REDIS_PORT, db=0)
        backing.flushall()

    def build_job(label):
        # Both sample jobs differ only in name and output file.
        return TangoJob(
            name="sample_job_%d" % label,
            vm="ilter.img",
            outputFile="sample_job_%d_output" % label,
            input=[],
            timeout=30,
            notifyURL="notifyMeUrl",
            maxOutputFileSize=4096,
        )

    self.job1 = build_job(1)
    self.job2 = build_job(2)

    self.jobQueue = JobQueue(None)
    self.jobQueue.reset()
    self.jobId1 = self.jobQueue.add(self.job1)
    self.jobId2 = self.jobQueue.add(self.job2)
def setUp(self):
    """Remove any stale queue file, then create a queue and its job processor."""
    try:
        os.remove(quePath)
    except OSError:
        # No pre-existing queue file to clean up.  (Narrowed from a bare
        # `except:`, which hid real errors such as permission problems
        # raised as other exception types.)
        pass
    self.que = JobQueue(quePath)
    self.jobProcess = JobProcess(quePath)
def setUp(self):
    """Enable unit-test mode, clean the server queue file, and build a client.

    Renamed the conventional first parameter from `s` to `self`; removed
    commented-out dead code and narrowed the bare `except:`.
    """
    try:
        os.remove(serverQueuePath)
    except OSError:
        # Nothing to remove on the first run.
        pass
    www.app.config['UNIT_TEST'] = True
    self.app = www.app.test_client()
    self.que = JobQueue(quePath)
def getStatus(id, queuePath):
    """Retrieve the status and result of the given job ID.

    @param id: the job ID
    @param queuePath: the job queue path
    @returns: a dict of the form: {'status': <status>, 'result': <dict>}
              where result is None if the job is not found;
              only Success and Error may have an optional result; if
              there is no result, no result property is returned
    @raises ErrorResp: when the job ID is unknown to the queue
    """
    statusResult = JobQueue(queuePath).getStatus(id)
    # `is None` is the correct identity test (was `== None`).
    if statusResult is None:
        raise ErrorResp('unknown job ID of: ' + str(id))
    return statusResult
def __init__(self):
    """Initialize the REST front end: logging, VMMS, preallocator, queue, server.

    Side effects: configures the root logger and, when Redis is not in use,
    creates a local JobManager.
    """
    # Configure logging once.  logging.basicConfig is a no-op on any
    # subsequent call, so the second, duplicated call that used to sit
    # after TangoServer construction has been removed.
    logging.basicConfig(
        filename=self.LOGFILE,
        format="%(levelname)s|%(asctime)s|%(name)s|%(message)s",
        level=Config.LOGLEVEL,
    )

    # Select the virtualization backend named in the config; imports are
    # deferred so only the configured backend's dependencies are needed.
    vmms = None
    if Config.VMMS_NAME == "localSSH":
        from vmms.localSSH import LocalSSH
        vmms = LocalSSH()
    elif Config.VMMS_NAME == "tashiSSH":
        from vmms.tashiSSH import TashiSSH
        vmms = TashiSSH()
    elif Config.VMMS_NAME == "ec2SSH":
        from vmms.ec2SSH import Ec2SSH
        vmms = Ec2SSH()
    elif Config.VMMS_NAME == "localDocker":
        from vmms.localDocker import LocalDocker
        vmms = LocalDocker()

    self.vmms = {Config.VMMS_NAME: vmms}
    self.preallocator = Preallocator(self.vmms)
    self.queue = JobQueue(self.preallocator)
    if not Config.USE_REDIS:
        # creates a local Job Manager if there is no persistent
        # memory between processes. Otherwise, JobManager will
        # be initiated separately
        JobManager(self.queue, self.vmms, self.preallocator)
    self.tango = TangoServer(self.queue, self.preallocator, self.vmms)

    # Quiet boto's very chatty DEBUG output.
    logging.getLogger('boto').setLevel(logging.INFO)
    self.log = logging.getLogger("TangoREST")
    self.log.info("Starting RESTful Tango server")
    self.status = Status()
def add(email, operation, parms, ctx):
    """Add a job to the tail end of the job queue.

    @param email: email/username requesting the job
    @param operation: job operation to run; the python module that
                      contains the calcMain() function should be in the
                      file, <operation>_www.py
    @param parms: parameters as a python dict to be passed to
                  <operation>_www.py.calcMain()
    @param ctx: the job context holding information for the postCalc
    @returns: a dict with the queue status, job id and job-status URL
    """
    # Extract any doNotEmail flag.
    doNotEmail = parms.pop('doNotEmail', None)

    if 'map' in parms:
        ctx.map = parms['map']
    elif 'mapId' in parms:
        ctx.map = parms['mapId']
    # `is not None` is the correct identity test (was `!= None`).
    if email is not None:
        ctx.email = email

    packedTask = _packTask(operation, parms, ctx)
    queuePath = ctx.app.jobQueuePath
    # NOTE(review): the builtin function `id` is passed here as the job id —
    # presumably JobQueue.add() generates or ignores it; confirm against the
    # JobQueue API before changing.
    jobId = JobQueue(queuePath).add(id, packedTask, email, doNotEmail)

    # Sanity-check that the job landed in the queue; getStatus raises
    # ErrorResp for an unknown ID.  (Its return value was never used.)
    getStatus(jobId, queuePath)

    # Run the job now, unless we are under unit test.
    if not ctx.app.unitTest:
        _runNow(jobId, ctx.app.jobProcessPath, queuePath)

    # Return the id and status.
    return {
        'status': 'InJobQueue',
        'jobId': jobId,
        'jobStatusUrl': ctx.app.jobStatusUrl + str(jobId),
    }
def main(args):
    """Run one queued job: args is (queuePath, jobId-as-string)."""
    queuePath = args[0]
    id = int(args[1])
    # TODO these should be wrapped with a try-except because any errors here
    # will not be reported in the server log.
    jobProcess = JobProcess(queuePath)
    task = jobProcess.queue.getTask(id)
    operation, parms, ctx = jobProcess.unpackTask(task)
    try:
        status, result = jobProcess.run(id, operation, parms, ctx)
    except Exception as e:
        status = 'Error'
        result = _formatError(str(e), traceback.format_exc(100),
                              operation, parms)
    except:
        # Deliberate bare except: catches non-Exception raises
        # (BaseException subclasses) so a result is still recorded.
        status = 'Error'
        result = _formatError(None, traceback.format_exc(100),
                              operation, parms)
    # Set the completion status.
    JobQueue(queuePath).setResult(id, status, result, ctx, operation)
class TangoServer:
    """TangoServer - Implements the API functions that the server accepts."""

    def __init__(self):
        """Initialize logging, the VMMS backend, preallocator and job queue."""
        self.daemon = True
        # Configure logging BEFORE starting the JobManager thread so that
        # no early startup messages are lost (the original configured it
        # last).
        logging.basicConfig(
            filename=Config.LOGFILE,
            format="%(levelname)s|%(asctime)s|%(name)s|%(message)s",
            level=Config.LOGLEVEL,
        )
        # Select the virtualization backend named in the config; imports
        # are deferred so only the configured backend's deps are needed.
        vmms = None
        if Config.VMMS_NAME == "tashiSSH":
            from vmms.tashiSSH import TashiSSH
            vmms = TashiSSH()
        elif Config.VMMS_NAME == "ec2SSH":
            from vmms.ec2SSH import Ec2SSH
            vmms = Ec2SSH()
        elif Config.VMMS_NAME == "localDocker":
            from vmms.localDocker import LocalDocker
            vmms = LocalDocker()
        elif Config.VMMS_NAME == "distDocker":
            from vmms.distDocker import DistDocker
            vmms = DistDocker()
        self.preallocator = Preallocator({Config.VMMS_NAME: vmms})
        self.jobQueue = JobQueue(self.preallocator)
        if not Config.USE_REDIS:
            # creates a local Job Manager if there is no persistent
            # memory between processes. Otherwise, JobManager will
            # be initiated separately
            JobManager(self.jobQueue).start()
        self.start_time = time.time()
        self.log = logging.getLogger("TangoServer")
        self.log.info("Starting Tango server")

    def addJob(self, job):
        """addJob - Add a job to the job queue.

        Returns the new job id on success, -1 when validation fails (the
        job is then placed on the dead queue).
        """
        Config.job_requests += 1
        self.log.debug("Received addJob request")
        ret = self.__validateJob(job, self.preallocator.vmms)
        self.log.info("Done validating job %s" % (job.name))
        if ret == 0:
            return self.jobQueue.add(job)
        else:
            self.jobQueue.addDead(job)
            return -1

    def delJob(self, id, deadjob):
        """delJob - Delete a job

        @param id: Id of job to delete
        @param deadjob: If 0, move the job from the live queue to the
        dead queue. If non-zero, remove the job from the dead queue
        and discard. Use with caution!
        """
        self.log.debug("Received delJob(%d, %d) request" % (id, deadjob))
        return self.jobQueue.delJob(id, deadjob)

    def getJobs(self, item):
        """getJobs - Return the list of live jobs (item == 0) or the
        list of dead jobs (item == -1); [] for any other item.
        """
        try:
            self.log.debug("Received getJobs(%s) request" % (item))
            if item == -1:  # return the list of dead jobs
                return self.jobQueue.deadJobs.values()
            elif item == 0:  # return the list of live jobs
                return self.jobQueue.liveJobs.values()
            else:  # invalid parameter
                return []
        except Exception as e:
            self.log.debug("getJobs: %s" % str(e))

    def preallocVM(self, vm, num):
        """preallocVM - Set the pool size for VMs of type vm to num.

        Returns 0 on success; -1/-2/-3 on error.
        """
        self.log.debug("Received preallocVM(%s,%d)request" % (vm.name, num))
        try:
            vmms = self.preallocator.vmms[vm.vmms]
            if not vm or num < 0:
                return -2
            if vm.image not in vmms.getImages():
                self.log.error("Invalid image name")
                return -3
            (name, ext) = os.path.splitext(vm.image)
            vm.name = name
            self.preallocator.update(vm, num)
            return 0
        except Exception as err:
            self.log.error("preallocVM failed: %s" % err)
            return -1

    def getVMs(self, vmms_name):
        """getVMs - return the list of VMs managed by the service vmms_name."""
        self.log.debug("Received getVMs request(%s)" % vmms_name)
        try:
            if vmms_name in self.preallocator.vmms:
                vmms_inst = self.preallocator.vmms[vmms_name]
                return vmms_inst.getVMs()
            else:
                return []
        except Exception as err:
            self.log.error("getVMs request failed: %s" % err)
            return []

    def delVM(self, vmName, id):
        """delVM - delete a specific VM instance from a pool."""
        self.log.debug("Received delVM request(%s, %d)" % (vmName, id))
        try:
            if not vmName or vmName == "" or not id:
                return -1
            return self.preallocator.destroyVM(vmName, id)
        except Exception as err:
            self.log.error("delVM request failed: %s" % err)
            return -1

    def getPool(self, vmName):
        """getPool - Return the current members of a pool and its free list."""
        self.log.debug("Received getPool request(%s)" % (vmName))
        try:
            if not vmName or vmName == "":
                return []
            result = self.preallocator.getPool(vmName)
            return [
                "pool_size=%d" % len(result["pool"]),
                "free_size=%d" % len(result["free"]),
                "pool=%s" % result["pool"],
                "free=%s" % result["free"],
            ]
        except Exception as err:
            self.log.error("getPool request failed: %s" % err)
            return []

    def getInfo(self):
        """getInfo - return various statistics about the Tango daemon."""
        stats = {}
        stats['elapsed_secs'] = time.time() - self.start_time
        stats['job_requests'] = Config.job_requests
        stats['job_retries'] = Config.job_retries
        stats['waitvm_timeouts'] = Config.waitvm_timeouts
        stats['runjob_timeouts'] = Config.runjob_timeouts
        stats['copyin_errors'] = Config.copyin_errors
        stats['runjob_errors'] = Config.runjob_errors
        stats['copyout_errors'] = Config.copyout_errors
        # active_count() is the modern spelling of the deprecated
        # activeCount() alias.
        stats['num_threads'] = threading.active_count()
        return stats

    #
    # Helper functions
    #
    def resetTango(self, vmms):
        """resetTango - resets Tango to a clean predictable state and
        ensures that it has a working virtualization environment. A side
        effect is that also checks that each supported VMMS is actually
        running.
        """
        self.log.debug("Received resetTango request.")
        try:
            # For each supported VMM system, get the instances it knows
            # about, and kill those in the current Tango name space.
            for vmms_name in vmms:
                vobj = vmms[vmms_name]
                vms = vobj.getVMs()
                self.log.debug("Pre-existing VMs: %s" %
                               [vm.name for vm in vms])
                namelist = []
                for vm in vms:
                    if re.match("%s-" % Config.PREFIX, vm.name):
                        vobj.destroyVM(vm)
                        # Need a consistent abstraction for a vm between
                        # interfaces
                        namelist.append(vm.name)
                if namelist:
                    self.log.warning("Killed these %s VMs on restart: %s" %
                                     (vmms_name, namelist))
            # dict.iteritems() does not exist in Python 3; items() works
            # (and is equivalent here since the dict is not mutated).
            for _, job in self.jobQueue.liveJobs.items():
                if not job.isNotAssigned():
                    job.makeUnassigned()
                self.log.debug("job: %s, assigned: %s" %
                               (str(job.name), str(job.assigned)))
        except Exception as err:
            self.log.error("resetTango: Call to VMMS %s failed: %s" %
                           (vmms_name, err))
            os._exit(1)

    def __validateJob(self, job, vmms):
        """validateJob - validate the input arguments in an addJob request.

        Returns 0 when the job is acceptable, -1 otherwise.  Every problem
        found is logged and appended to the job's trace.
        """
        errors = 0

        # If this isn't a Tango job then bail with an error
        if not isinstance(job, TangoJob):
            return -1

        # Every job must have a name
        if not job.name:
            self.log.error("validateJob: Missing job.name")
            job.appendTrace("%s|validateJob: Missing job.name" %
                            (datetime.utcnow().ctime()))
            errors += 1

        # Check the virtual machine field
        if not job.vm:
            self.log.error("validateJob: Missing job.vm")
            job.appendTrace("%s|validateJob: Missing job.vm" %
                            (datetime.utcnow().ctime()))
            errors += 1
        else:
            if not job.vm.image:
                self.log.error("validateJob: Missing job.vm.image")
                job.appendTrace("%s|validateJob: Missing job.vm.image" %
                                (datetime.utcnow().ctime()))
                errors += 1
            else:
                vobj = vmms[Config.VMMS_NAME]
                imgList = vobj.getImages()
                if job.vm.image not in imgList:
                    self.log.error("validateJob: Image not found: %s" %
                                   job.vm.image)
                    job.appendTrace("%s|validateJob: Image not found: %s" %
                                    (datetime.utcnow().ctime(), job.vm.image))
                    job.appendTrace("%s|validateJob: Images available: %s" %
                                    (datetime.utcnow().ctime(), imgList))
                    errors += 1
                else:
                    (name, ext) = os.path.splitext(job.vm.image)
                    job.vm.name = name
            if not job.vm.vmms:
                self.log.error("validateJob: Missing job.vm.vmms")
                job.appendTrace("%s|validateJob: Missing job.vm.vmms" %
                                (datetime.utcnow().ctime()))
                errors += 1
            else:
                if job.vm.vmms not in vmms:
                    self.log.error("validateJob: Invalid vmms name: %s" %
                                   job.vm.vmms)
                    job.appendTrace("%s|validateJob: Invalid vmms name: %s" %
                                    (datetime.utcnow().ctime(), job.vm.vmms))
                    errors += 1

        # Check the output file
        if not job.outputFile:
            self.log.error("validateJob: Missing job.outputFile")
            job.appendTrace("%s|validateJob: Missing job.outputFile" %
                            (datetime.utcnow().ctime()))
            errors += 1
        else:
            if not os.path.exists(os.path.dirname(job.outputFile)):
                self.log.error("validateJob: Bad output path: %s",
                               job.outputFile)
                job.appendTrace("%s|validateJob: Bad output path: %s" %
                                (datetime.utcnow().ctime(), job.outputFile))
                errors += 1

        # Check for max output file size parameter
        if not job.maxOutputFileSize:
            self.log.debug("validateJob: Setting job.maxOutputFileSize "
                           "to default value: %d bytes",
                           Config.MAX_OUTPUT_FILE_SIZE)
            job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE

        # Check the list of input files
        hasMakefile = False
        for inputFile in job.input:
            if not inputFile.localFile:
                self.log.error("validateJob: Missing inputFile.localFile")
                job.appendTrace("%s|validateJob: Missing inputFile.localFile" %
                                (datetime.utcnow().ctime()))
                errors += 1
            else:
                # NOTE(review): this re-checks job.outputFile's directory;
                # it looks like it was meant to validate
                # inputFile.localFile instead — confirm before changing.
                if not os.path.exists(os.path.dirname(job.outputFile)):
                    self.log.error("validateJob: Bad output path: %s",
                                   job.outputFile)
                    job.appendTrace("%s|validateJob: Bad output path: %s" %
                                    (datetime.utcnow().ctime(),
                                     job.outputFile))
                    errors += 1
            if inputFile.destFile == 'Makefile':
                hasMakefile = True

        # Check if input files include a Makefile
        if not hasMakefile:
            self.log.error("validateJob: Missing Makefile in input files.")
            job.appendTrace("%s|validateJob: Missing Makefile in input files." %
                            (datetime.utcnow().ctime()))
            errors += 1

        # Check if job timeout has been set; If not set timeout to default
        if not job.timeout or job.timeout <= 0:
            self.log.debug("validateJob: Setting job.timeout to"
                           " default config value: %d secs",
                           Config.RUNJOB_TIMEOUT)
            job.timeout = Config.RUNJOB_TIMEOUT

        # Any problems, return an error status
        if errors > 0:
            self.log.error("validateJob: Job rejected: %d errors" % errors)
            job.appendTrace("%s|validateJob: Job rejected: %d errors" %
                            (datetime.utcnow().ctime(), errors))
            return -1
        else:
            return 0
class TangoServer:
    """TangoServer - Implements the API functions that the server accepts."""

    def __init__(self):
        """Initialize logging, the VMMS backend, preallocator and job queue."""
        self.daemon = True
        # Configure logging BEFORE starting the JobManager thread so that
        # no early startup messages are lost (the original configured it
        # last).
        logging.basicConfig(
            filename=Config.LOGFILE,
            format="%(levelname)s|%(asctime)s|%(name)s|%(message)s",
            level=Config.LOGLEVEL,
        )
        # Select the virtualization backend named in the config; imports
        # are deferred so only the configured backend's deps are needed.
        vmms = None
        if Config.VMMS_NAME == "tashiSSH":
            from vmms.tashiSSH import TashiSSH
            vmms = TashiSSH()
        elif Config.VMMS_NAME == "ec2SSH":
            from vmms.ec2SSH import Ec2SSH
            vmms = Ec2SSH()
        elif Config.VMMS_NAME == "localDocker":
            from vmms.localDocker import LocalDocker
            vmms = LocalDocker()
        elif Config.VMMS_NAME == "distDocker":
            from vmms.distDocker import DistDocker
            vmms = DistDocker()
        self.preallocator = Preallocator({Config.VMMS_NAME: vmms})
        self.jobQueue = JobQueue(self.preallocator)
        if not Config.USE_REDIS:
            # creates a local Job Manager if there is no persistent
            # memory between processes. Otherwise, JobManager will
            # be initiated separately
            JobManager(self.jobQueue).start()
        self.start_time = time.time()
        self.log = logging.getLogger("TangoServer")
        self.log.info("Starting Tango server")

    def addJob(self, job):
        """addJob - Add a job to the job queue.

        Returns the new job id on success, -1 when validation fails (the
        job is then placed on the dead queue).
        """
        Config.job_requests += 1
        self.log.debug("Received addJob request")
        ret = self.__validateJob(job, self.preallocator.vmms)
        self.log.info("Done validating job %s" % (job.name))
        if ret == 0:
            return self.jobQueue.add(job)
        else:
            self.jobQueue.addDead(job)
            return -1

    def delJob(self, id, deadjob):
        """delJob - Delete a job

        @param id: Id of job to delete
        @param deadjob: If 0, move the job from the live queue to the
        dead queue. If non-zero, remove the job from the dead queue
        and discard. Use with caution!
        """
        self.log.debug("Received delJob(%d, %d) request" % (id, deadjob))
        return self.jobQueue.delJob(id, deadjob)

    def getJobs(self, item):
        """getJobs - Return the list of live jobs (item == 0) or the
        list of dead jobs (item == -1); [] for any other item.
        """
        try:
            self.log.debug("Received getJobs(%s) request" % (item))
            if item == -1:  # return the list of dead jobs
                return self.jobQueue.deadJobs.values()
            elif item == 0:  # return the list of live jobs
                return self.jobQueue.liveJobs.values()
            else:  # invalid parameter
                return []
        except Exception as e:
            self.log.debug("getJobs: %s" % str(e))

    def preallocVM(self, vm, num):
        """preallocVM - Set the pool size for VMs of type vm to num.

        Returns 0 on success; -1/-2/-3 on error.
        """
        self.log.debug("Received preallocVM(%s,%d)request" % (vm.name, num))
        try:
            vmms = self.preallocator.vmms[vm.vmms]
            if not vm or num < 0:
                return -2
            if vm.image not in vmms.getImages():
                self.log.error("Invalid image name")
                return -3
            (name, ext) = os.path.splitext(vm.image)
            vm.name = name
            self.preallocator.update(vm, num)
            return 0
        except Exception as err:
            self.log.error("preallocVM failed: %s" % err)
            return -1

    def getVMs(self, vmms_name):
        """getVMs - return the list of VMs managed by the service vmms_name."""
        self.log.debug("Received getVMs request(%s)" % vmms_name)
        try:
            if vmms_name in self.preallocator.vmms:
                vmms_inst = self.preallocator.vmms[vmms_name]
                return vmms_inst.getVMs()
            else:
                return []
        except Exception as err:
            self.log.error("getVMs request failed: %s" % err)
            return []

    def delVM(self, vmName, id):
        """delVM - delete a specific VM instance from a pool."""
        self.log.debug("Received delVM request(%s, %d)" % (vmName, id))
        try:
            if not vmName or vmName == "" or not id:
                return -1
            return self.preallocator.destroyVM(vmName, id)
        except Exception as err:
            self.log.error("delVM request failed: %s" % err)
            return -1

    def getPool(self, vmName):
        """getPool - Return the current members of a pool and its free list."""
        self.log.debug("Received getPool request(%s)" % (vmName))
        try:
            if not vmName or vmName == "":
                return []
            result = self.preallocator.getPool(vmName)
            return ["pool_size=%d" % len(result["pool"]),
                    "free_size=%d" % len(result["free"]),
                    "pool=%s" % result["pool"],
                    "free=%s" % result["free"]]
        except Exception as err:
            self.log.error("getPool request failed: %s" % err)
            return []

    def getInfo(self):
        """getInfo - return various statistics about the Tango daemon."""
        stats = {}
        # (stray trailing semicolon removed)
        stats['elapsed_secs'] = time.time() - self.start_time
        stats['job_requests'] = Config.job_requests
        stats['job_retries'] = Config.job_retries
        stats['waitvm_timeouts'] = Config.waitvm_timeouts
        stats['runjob_timeouts'] = Config.runjob_timeouts
        stats['copyin_errors'] = Config.copyin_errors
        stats['runjob_errors'] = Config.runjob_errors
        stats['copyout_errors'] = Config.copyout_errors
        # active_count() is the modern spelling of the deprecated
        # activeCount() alias.
        stats['num_threads'] = threading.active_count()
        return stats

    #
    # Helper functions
    #
    def resetTango(self, vmms):
        """resetTango - resets Tango to a clean predictable state and
        ensures that it has a working virtualization environment. A side
        effect is that also checks that each supported VMMS is actually
        running.
        """
        self.log.debug("Received resetTango request.")
        try:
            # For each supported VMM system, get the instances it knows
            # about, and kill those in the current Tango name space.
            for vmms_name in vmms:
                vobj = vmms[vmms_name]
                vms = vobj.getVMs()
                self.log.debug("Pre-existing VMs: %s" %
                               [vm.name for vm in vms])
                namelist = []
                for vm in vms:
                    if re.match("%s-" % Config.PREFIX, vm.name):
                        vobj.destroyVM(vm)
                        # Need a consistent abstraction for a vm between
                        # interfaces
                        namelist.append(vm.name)
                if namelist:
                    self.log.warning("Killed these %s VMs on restart: %s" %
                                     (vmms_name, namelist))
            # dict.iteritems() does not exist in Python 3; items() works
            # (and is equivalent here since the dict is not mutated).
            for _, job in self.jobQueue.liveJobs.items():
                if not job.isNotAssigned():
                    job.makeUnassigned()
                self.log.debug("job: %s, assigned: %s" %
                               (str(job.name), str(job.assigned)))
        except Exception as err:
            self.log.error("resetTango: Call to VMMS %s failed: %s" %
                           (vmms_name, err))
            os._exit(1)

    def __validateJob(self, job, vmms):
        """validateJob - validate the input arguments in an addJob request.

        Returns 0 when the job is acceptable, -1 otherwise.  Every problem
        found is logged and appended to the job's trace.
        """
        errors = 0

        # If this isn't a Tango job then bail with an error
        if not isinstance(job, TangoJob):
            return -1

        # Every job must have a name
        if not job.name:
            self.log.error("validateJob: Missing job.name")
            job.appendTrace("%s|validateJob: Missing job.name" %
                            (datetime.utcnow().ctime()))
            errors += 1

        # Check the virtual machine field
        if not job.vm:
            self.log.error("validateJob: Missing job.vm")
            job.appendTrace("%s|validateJob: Missing job.vm" %
                            (datetime.utcnow().ctime()))
            errors += 1
        else:
            if not job.vm.image:
                self.log.error("validateJob: Missing job.vm.image")
                job.appendTrace("%s|validateJob: Missing job.vm.image" %
                                (datetime.utcnow().ctime()))
                errors += 1
            else:
                vobj = vmms[Config.VMMS_NAME]
                imgList = vobj.getImages()
                if job.vm.image not in imgList:
                    self.log.error("validateJob: Image not found: %s" %
                                   job.vm.image)
                    job.appendTrace("%s|validateJob: Image not found: %s" %
                                    (datetime.utcnow().ctime(), job.vm.image))
                    errors += 1
                else:
                    (name, ext) = os.path.splitext(job.vm.image)
                    job.vm.name = name
            if not job.vm.vmms:
                self.log.error("validateJob: Missing job.vm.vmms")
                job.appendTrace("%s|validateJob: Missing job.vm.vmms" %
                                (datetime.utcnow().ctime()))
                errors += 1
            else:
                if job.vm.vmms not in vmms:
                    self.log.error("validateJob: Invalid vmms name: %s" %
                                   job.vm.vmms)
                    job.appendTrace("%s|validateJob: Invalid vmms name: %s" %
                                    (datetime.utcnow().ctime(), job.vm.vmms))
                    errors += 1

        # Check the output file
        if not job.outputFile:
            self.log.error("validateJob: Missing job.outputFile")
            job.appendTrace("%s|validateJob: Missing job.outputFile" %
                            (datetime.utcnow().ctime()))
            errors += 1
        else:
            if not os.path.exists(os.path.dirname(job.outputFile)):
                self.log.error("validateJob: Bad output path: %s",
                               job.outputFile)
                job.appendTrace("%s|validateJob: Bad output path: %s" %
                                (datetime.utcnow().ctime(), job.outputFile))
                errors += 1

        # Check for max output file size parameter
        if not job.maxOutputFileSize:
            self.log.debug("validateJob: Setting job.maxOutputFileSize "
                           "to default value: %d bytes",
                           Config.MAX_OUTPUT_FILE_SIZE)
            job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE

        # Check the list of input files
        hasMakefile = False
        for inputFile in job.input:
            if not inputFile.localFile:
                self.log.error("validateJob: Missing inputFile.localFile")
                job.appendTrace("%s|validateJob: Missing inputFile.localFile" %
                                (datetime.utcnow().ctime()))
                errors += 1
            else:
                # NOTE(review): this re-checks job.outputFile's directory;
                # it looks like it was meant to validate
                # inputFile.localFile instead — confirm before changing.
                if not os.path.exists(os.path.dirname(job.outputFile)):
                    self.log.error("validateJob: Bad output path: %s",
                                   job.outputFile)
                    job.appendTrace("%s|validateJob: Bad output path: %s" %
                                    (datetime.utcnow().ctime(),
                                     job.outputFile))
                    errors += 1
            if inputFile.destFile == 'Makefile':
                hasMakefile = True

        # Check if input files include a Makefile
        if not hasMakefile:
            self.log.error("validateJob: Missing Makefile in input files.")
            job.appendTrace("%s|validateJob: Missing Makefile in input files." %
                            (datetime.utcnow().ctime()))
            errors += 1

        # Check if job timeout has been set; If not set timeout to default
        if not job.timeout or job.timeout <= 0:
            self.log.debug("validateJob: Setting job.timeout to"
                           " default config value: %d secs",
                           Config.RUNJOB_TIMEOUT)
            job.timeout = Config.RUNJOB_TIMEOUT

        # Any problems, return an error status
        if errors > 0:
            self.log.error("validateJob: Job rejected: %d errors" % errors)
            job.appendTrace("%s|validateJob: Job rejected: %d errors" %
                            (datetime.utcnow().ctime(), errors))
            return -1
        else:
            return 0
class TestJobQueue(unittest.TestCase):
    """Unit tests for JobQueue covering add/get/assign/delete/ID generation."""

    def setUp(self):
        """Flush Redis (when enabled) and seed a fresh queue with two jobs."""
        if Config.USE_REDIS:
            db = redis.StrictRedis(Config.REDIS_HOSTNAME,
                                   Config.REDIS_PORT, db=0)
            db.flushall()
        self.job1 = TangoJob(
            name="sample_job_1",
            vm="ilter.img",
            outputFile="sample_job_1_output",
            input=[],
            timeout=30,
            notifyURL="notifyMeUrl",
            maxOutputFileSize=4096,
        )
        self.job2 = TangoJob(
            name="sample_job_2",
            vm="ilter.img",
            outputFile="sample_job_2_output",
            input=[],
            timeout=30,
            notifyURL="notifyMeUrl",
            maxOutputFileSize=4096,
        )
        self.jobQueue = JobQueue(None)
        self.jobQueue.reset()
        self.jobId1 = self.jobQueue.add(self.job1)
        self.jobId2 = self.jobQueue.add(self.job2)

    def test_sharedInt(self):
        """TangoIntValue instances with the same key share one stored value."""
        if not Config.USE_REDIS:
            return  # only meaningful with a Redis backing store
        num1 = TangoIntValue("nextID", 1000)
        num2 = TangoIntValue("nextID", 3000)
        self.assertEqual(num1.get(), 1000)
        self.assertEqual(num1.get(), num2.get())

    def test_job(self):
        """Assignment state is visible both on the job and via the queue."""
        self.job1.makeUnassigned()
        self.assertTrue(self.job1.isNotAssigned())
        job = self.jobQueue.get(self.jobId1)
        self.assertTrue(job.isNotAssigned())
        self.job1.makeAssigned()
        # (stray debug print removed)
        self.assertFalse(self.job1.isNotAssigned())
        self.assertFalse(job.isNotAssigned())

    def test_add(self):
        """Both jobs added in setUp are counted."""
        info = self.jobQueue.getInfo()
        self.assertEqual(info["size"], 2)

    def test_addToUnassigned(self):
        """Newly added jobs start on the unassigned queue."""
        info = self.jobQueue.getInfo()
        self.assertEqual(info["size_unassignedjobs"], 2)

    def test_addDead(self):
        # Placeholder: addDead() is not yet exercised.
        self.assertEqual(1, 1)

    def test_delJob(self):
        """delJob soft-deletes to the dead queue, then hard-deletes from it."""
        # Soft delete: job moves from the live to the dead queue.
        self.jobQueue.delJob(self.jobId1, 0)
        info = self.jobQueue.getInfo()
        self.assertEqual(info["size"], 1)
        self.assertEqual(info["size_deadjobs"], 1)
        self.assertEqual(info["size_unassignedjobs"], 1)
        # Hard delete: job is discarded from the dead queue.
        self.jobQueue.delJob(self.jobId1, 1)
        info = self.jobQueue.getInfo()
        self.assertEqual(info["size_deadjobs"], 0)
        self.assertEqual(info["size"], 1)
        self.assertEqual(info["size_unassignedjobs"], 1)
        # (meaningless `return False` removed; unittest ignores returns)

    def test_get(self):
        """get() returns the job whose id matches the requested id."""
        ret_job_1 = self.jobQueue.get(self.jobId1)
        self.assertEqual(str(ret_job_1.id), self.jobId1)
        ret_job_2 = self.jobQueue.get(self.jobId2)
        self.assertEqual(str(ret_job_2.id), self.jobId2)

    def test_getNextPendingJob(self):
        """Unassigning a job puts it back at the head of the pending queue."""
        self.jobQueue.assignJob(self.jobId2)
        # job 2 should have been removed from unassigned queue
        info = self.jobQueue.getInfo()
        self.assertEqual(info["size_unassignedjobs"], 1)
        self.jobQueue.assignJob(self.jobId1)
        info = self.jobQueue.getInfo()
        self.assertEqual(info["size_unassignedjobs"], 0)
        self.jobQueue.unassignJob(self.jobId1)
        info = self.jobQueue.getInfo()
        self.assertEqual(info["size_unassignedjobs"], 1)
        job = self.jobQueue.getNextPendingJob()
        self.assertMultiLineEqual(str(job.id), self.jobId1)

    def test_getNextPendingJob2(self):
        """Pending jobs come back in insertion order."""
        job = self.jobQueue.getNextPendingJob()
        self.assertMultiLineEqual(str(job.id), self.jobId1)
        job = self.jobQueue.getNextPendingJob()
        self.assertMultiLineEqual(str(job.id), self.jobId2)

    def test_assignJob(self):
        """assignJob marks the job as assigned."""
        self.jobQueue.assignJob(self.jobId1)
        job = self.jobQueue.get(self.jobId1)
        self.assertFalse(job.isNotAssigned())

    def test_unassignJob(self):
        """unassignJob clears the assigned flag."""
        self.jobQueue.assignJob(self.jobId1)
        job = self.jobQueue.get(self.jobId1)
        self.assertTrue(job.assigned)
        self.jobQueue.unassignJob(self.jobId1)
        job = self.jobQueue.get(self.jobId1)
        self.assertEqual(job.assigned, False)

    def test_makeDead(self):
        """makeDead moves one job from unassigned to the dead queue."""
        info = self.jobQueue.getInfo()
        self.assertEqual(info["size_deadjobs"], 0)
        self.assertEqual(info["size_unassignedjobs"], 2)
        self.jobQueue.makeDead(self.jobId1, "test")
        info = self.jobQueue.getInfo()
        self.assertEqual(info["size_deadjobs"], 1)
        self.assertEqual(info["size_unassignedjobs"], 1)

    def test__getNextID(self):
        """Generated IDs never collide with a live job's id, even past wraparound."""
        init_id = self.jobQueue.nextID
        for i in range(1, Config.MAX_JOBID + 100):
            id = self.jobQueue._getNextID()
            self.assertNotEqual(str(id), self.jobId1)
        self.jobQueue.nextID = init_id
class TangoServer:
    """TangoServer - Implements the API functions that the server accepts."""

    def __init__(self):
        self.daemon = True

        # init logging early, or some logging will be lost
        logging.basicConfig(
            filename=Config.LOGFILE,
            format="%(levelname)s|%(asctime)s|%(name)s|%(message)s",
            level=Config.LOGLEVEL,
        )

        # Instantiate the configured VMMS backend. Unknown names leave
        # vmms as None, matching the original behavior.
        vmms = None
        if Config.VMMS_NAME == "tashiSSH":
            from vmms.tashiSSH import TashiSSH

            vmms = TashiSSH()
        elif Config.VMMS_NAME == "ec2SSH":
            from vmms.ec2SSH import Ec2SSH

            vmms = Ec2SSH()
        elif Config.VMMS_NAME == "localDocker":
            from vmms.localDocker import LocalDocker

            vmms = LocalDocker()
        elif Config.VMMS_NAME == "distDocker":
            from vmms.distDocker import DistDocker

            vmms = DistDocker()
        self.preallocator = Preallocator({Config.VMMS_NAME: vmms})
        self.jobQueue = JobQueue(self.preallocator)
        if not Config.USE_REDIS:
            # creates a local Job Manager if there is no persistent
            # memory between processes. Otherwise, JobManager will
            # be initiated separately
            JobManager(self.jobQueue).start()

        self.start_time = time.time()
        self.log = logging.getLogger("TangoServer")
        self.log.info("Starting Tango server")

    def addJob(self, job):
        """addJob - Add a job to the job queue.

        Returns the new job id on success, or -1 (and moves the job to
        the dead queue) if validation fails.
        """
        Config.job_requests += 1
        self.log.debug("Received addJob request")
        ret = self.__validateJob(job, self.preallocator.vmms)
        self.log.info("Done validating job %s" % (job.name))
        if ret == 0:
            return self.jobQueue.add(job)
        else:
            self.jobQueue.addDead(job)
            return -1

    def delJob(self, id, deadjob):
        """delJob - Delete a job.

        @param id: Id of job to delete
        @param deadjob: If 0, move the job from the live queue to the
            dead queue. If non-zero, remove the job from the dead queue
            and discard. Use with caution!
        """
        self.log.debug("Received delJob(%d, %d) request" % (id, deadjob))
        return self.jobQueue.delJob(id, deadjob)

    def cancelJobWithPath(self, outFilePath):
        """cancelJobWithPath - cancel the job owning the given output file.

        When this function returns, one of the following is true:
        1. The job with the specified output file does not exist
           -> NOT_FOUND
        2. The job has finished running normally -> ALREADY_COMPLETED
        3. The job has been cancelled -> SUCCEEDED
        4. The job was found and is running, but cancellation failed
           -> FAILED
        """
        self.log.debug("Received cancelJobWithPath(%s) request" % (outFilePath))
        id, job, job_status = self.jobQueue.findRemovingWaiting(outFilePath)
        self.log.debug(
            "cancelJobWithPath: Found a job %s with status %s" % (job, job_status)
        )

        if job_status == JobQueue.JobStatus.NOT_FOUND:
            return CancellationStatus.NOT_FOUND
        elif job_status == JobQueue.JobStatus.DEAD:
            return CancellationStatus.ALREADY_COMPLETED
        elif job_status == JobQueue.JobStatus.RUNNING:
            return self.killUntilJobComplete(id, job)
        else:
            assert job_status == JobQueue.JobStatus.WAITING
            # In this case, findRemovingWaiting has moved the live job to the
            # dead queue, and we have nothing to worry about.
            # Let's notify autolab that the job is done.
            if job.notifyURL:
                outputFileName = job.outputFile.split("/")[-1]  # get filename from path
                # was unicode(...) — a Python 2 builtin; a plain str literal
                # is the py3 equivalent
                files = {'file': 'Job was cancelled before it started.'}
                hdrs = {'Filename': outputFileName}
                self.log.debug("Sending request to %s" % job.notifyURL)

                def worker():
                    # verify=False: callback targets may use self-signed certs.
                    requests.post(
                        job.notifyURL,
                        files=files,
                        headers=hdrs,
                        data={'runningTimeSeconds': 0},
                        verify=False,
                    )

                threading.Thread(target=worker).start()
            return CancellationStatus.SUCCEEDED

    def killUntilJobComplete(self, id, job):
        """Repeatedly kill the job's VM until the job is complete.

        Here's the contract: If the job is currently running (i.e. it
        could complete at some point in the future), then this method
        returns only when the job is complete. It tries to help by
        repeatedly killing the VM process, but a compliant
        implementation could just block until the job completes on its
        own.

        On success, returns SUCCEEDED; on failure, returns FAILED.
        """
        self.log.debug("Received killUntilJobComplete request")
        vm = job.vm
        # xrange is Python 2 only — range is the py3 equivalent.
        for _ in range(0, Config.CANCEL_RETRIES):
            # Returns 0 on success.
            if self.preallocator.vmms[vm.vmms].kill(vm) == 0:
                return CancellationStatus.SUCCEEDED
        return CancellationStatus.FAILED

    def getJobs(self, item):
        """getJobs - Return the list of live jobs (item == 0) or the
        list of dead jobs (item == -1). Any other value yields [].
        """
        try:
            self.log.debug("Received getJobs(%s) request" % (item))
            if item == -1:  # return the list of dead jobs
                return self.jobQueue.deadJobs.values()
            elif item == 0:  # return the list of live jobs
                return self.jobQueue.liveJobs.values()
            else:  # invalid parameter
                return []
        except Exception as e:
            # Best-effort API: log and fall through (returns None).
            self.log.debug("getJobs: %s" % str(e))

    def preallocVM(self, vm, num):
        """preallocVM - Set the pool size for VMs of type vm to num.

        Returns 0 on success, -1 on internal error, -2 on bad
        arguments, -3 on an invalid image name.
        """
        # Guard BEFORE touching vm's attributes: the original dereferenced
        # vm.name / vm.vmms first, so a None vm raised instead of
        # returning -2.
        if not vm or num < 0:
            return -2
        self.log.debug("Received preallocVM(%s,%d)request" % (vm.name, num))
        try:
            vmms = self.preallocator.vmms[vm.vmms]
            if not vmms.isValidImage(vm.image):
                self.log.error("Invalid image name")
                return -3
            (name, ext) = os.path.splitext(vm.image)
            vm.name = name
            self.preallocator.update(vm, num)
            return 0
        except Exception as err:
            self.log.error("preallocVM failed: %s" % err)
            return -1

    def getVMs(self, vmms_name):
        """getVMs - return the list of VMs managed by the service vmms_name."""
        self.log.debug("Received getVMs request(%s)" % vmms_name)
        try:
            if vmms_name in self.preallocator.vmms:
                vmms_inst = self.preallocator.vmms[vmms_name]
                return vmms_inst.getVMs()
            else:
                return []
        except Exception as err:
            self.log.error("getVMs request failed: %s" % err)
            return []

    def delVM(self, vmName, id):
        """delVM - delete a specific VM instance from a pool.

        Returns the preallocator's result, or -1 on bad arguments or error.
        """
        self.log.debug("Received delVM request(%s, %d)" % (vmName, id))
        try:
            # NOTE(review): `not id` also rejects id == 0 — confirm ids are 1-based.
            if not vmName or vmName == "" or not id:
                return -1
            return self.preallocator.destroyVM(vmName, id)
        except Exception as err:
            self.log.error("delVM request failed: %s" % err)
            return -1

    def getPool(self, vmName):
        """getPool - Return the current members of a pool and its free list."""
        self.log.debug("Received getPool request(%s)" % (vmName))
        try:
            if not vmName or vmName == "":
                return []
            result = self.preallocator.getPool(vmName)
            return [
                "pool_size=%d" % len(result["pool"]),
                "free_size=%d" % len(result["free"]),
                "pool=%s" % result["pool"],
                "free=%s" % result["free"],
            ]
        except Exception as err:
            self.log.error("getPool request failed: %s" % err)
            return []

    def getInfo(self):
        """getInfo - return various statistics about the Tango daemon."""
        stats = {}
        stats['elapsed_secs'] = time.time() - self.start_time
        stats['job_requests'] = Config.job_requests
        stats['job_retries'] = Config.job_retries
        stats['waitvm_timeouts'] = Config.waitvm_timeouts
        stats['runjob_timeouts'] = Config.runjob_timeouts
        stats['copyin_errors'] = Config.copyin_errors
        stats['runjob_errors'] = Config.runjob_errors
        stats['copyout_errors'] = Config.copyout_errors
        stats['num_threads'] = threading.activeCount()
        return stats

    def setScaleParams(self, low_water_mark, max_pool_size):
        """setScaleParams - update autoscaling thresholds.

        NOTE(review): low_water_mark lives on the preallocator while
        max_pool_size lives on the job queue — confirm this split is
        intentional.
        """
        self.preallocator.low_water_mark.set(low_water_mark)
        self.jobQueue.max_pool_size.set(max_pool_size)
        return 0

    def runningTimeForOutputFile(self, outputFile):
        """Return the running time of the live job owning outputFile,
        or None if no such live job exists.
        """
        self.log.debug("Received runningTimeForOutputFile(%s)" % outputFile)
        liveJobTuple = self.jobQueue.liveJobs.getWrapped(outputFile)
        if liveJobTuple:
            (_, liveJob) = liveJobTuple
            self.log.debug(str(liveJob.startTime))
            return liveJob.runningTime()
        return None

    #
    # Helper functions
    #

    # NOTE: This function should be called by ONLY jobManager. The rest servers
    # shouldn't call this function.
    def resetTango(self, vmms):
        """resetTango - resets Tango to a clean predictable state and
        ensures that it has a working virtualization environment. A side
        effect is that it also checks that each supported VMMS is
        actually running.
        """

        # There are two cases this function is called: 1. Tango has a fresh
        # start. Then we want to destroy all instances in Tango's name space.
        # 2. Job Manager is restarted after a previous crash. Then we want to
        # destroy the "busy" instances prior to the crash and leave the
        # "free" ones intact.
        self.log.debug("Received resetTango request.")

        try:
            # For each supported VMM system, get the instances it knows about
            # in the current Tango name space and kill those not in free pools.
            for vmms_name in vmms:
                vobj = vmms[vmms_name]

                # Round up all instances in the free pools.
                allFreeVMs = []
                for key in self.preallocator.machines.keys():
                    freePool = self.preallocator.getPool(key)["free"]
                    for vmId in freePool:
                        vmName = vobj.instanceName(vmId, key)
                        allFreeVMs.append(vmName)
                self.log.info("vms in all free pools: %s" % allFreeVMs)

                # For each in Tango's name space, destroy the ones NOT in a
                # free pool AND remove them from Tango's internal bookkeeping.
                vms = vobj.getVMs()
                self.log.debug("Pre-existing VMs: %s" % [vm.name for vm in vms])
                destroyedList = []
                removedList = []
                for vm in vms:
                    if re.match("%s-" % Config.PREFIX, vm.name):
                        # Todo: should have a one-call interface to destroy the
                        # machine AND to keep the internal data consistent.
                        if vm.name not in allFreeVMs:
                            destroyedList.append(vm.name)
                            vobj.destroyVM(vm)

                            # also remove it from "total" set of the pool
                            (prefix, vmId, poolName) = vm.name.split("-")
                            machine = self.preallocator.machines.get(poolName)
                            if not machine:  # the pool may not exist
                                continue

                            if int(vmId) in machine[0]:
                                removedList.append(vm.name)
                                machine[0].remove(int(vmId))
                            self.preallocator.machines.set(poolName, machine)

                if destroyedList:
                    self.log.warning(
                        "Killed these %s VMs on restart: %s"
                        % (vmms_name, destroyedList)
                    )
                if removedList:
                    self.log.warning(
                        "Removed these %s VMs from their pools" % (removedList)
                    )

            # Surviving live jobs are no longer attached to a worker: mark
            # them unassigned so the scheduler picks them up again.
            # (was .iteritems() — Python 2 only; items() is the py3 spelling.
            # NOTE(review): assumes the liveJobs wrapper exposes items().)
            for _, job in self.jobQueue.liveJobs.items():
                if not job.isNotAssigned():
                    job.makeUnassigned()
                self.log.debug(
                    "job: %s, assigned: %s" % (str(job.name), str(job.assigned))
                )
        except Exception as err:
            self.log.error(
                "resetTango: Call to VMMS %s failed: %s" % (vmms_name, err)
            )
            # Without a working VMMS Tango cannot run; hard-exit on purpose.
            os._exit(1)

    def __validateJob(self, job, vmms):
        """validateJob - validate the input arguments in an addJob request.

        Returns 0 if the job is acceptable, -1 otherwise. Every problem
        found is logged and appended to the job's trace.
        """
        errors = 0

        # If this isn't a Tango job then bail with an error
        if not isinstance(job, TangoJob):
            return -1

        # Every job must have a name
        if not job.name:
            self.log.error("validateJob: Missing job.name")
            job.appendTrace("validateJob: Missing job.name")
            errors += 1

        # Check the virtual machine field
        if not job.vm:
            self.log.error("validateJob: Missing job.vm")
            job.appendTrace("validateJob: Missing job.vm")
            errors += 1
        else:
            if not job.vm.image:
                self.log.error("validateJob: Missing job.vm.image")
                job.appendTrace("validateJob: Missing job.vm.image")
                errors += 1
            else:
                vobj = vmms[Config.VMMS_NAME]
                if not vobj.isValidImage(job.vm.image):
                    self.log.error("validateJob: Image not found: %s" % job.vm.image)
                    job.appendTrace("validateJob: Image not found: %s" % job.vm.image)
                    errors += 1
                else:
                    (name, ext) = os.path.splitext(job.vm.image)
                    job.vm.name = name

            if not job.vm.vmms:
                self.log.error("validateJob: Missing job.vm.vmms")
                job.appendTrace("validateJob: Missing job.vm.vmms")
                errors += 1
            else:
                if job.vm.vmms not in vmms:
                    self.log.error("validateJob: Invalid vmms name: %s" % job.vm.vmms)
                    job.appendTrace("validateJob: Invalid vmms name: %s" % job.vm.vmms)
                    errors += 1

        # Check the output file
        if not job.outputFile:
            self.log.error("validateJob: Missing job.outputFile")
            job.appendTrace("validateJob: Missing job.outputFile")
            errors += 1
        else:
            if not os.path.exists(os.path.dirname(job.outputFile)):
                self.log.error("validateJob: Bad output path: %s" % job.outputFile)
                job.appendTrace("validateJob: Bad output path: %s" % job.outputFile)
                errors += 1

        # Check for max output file size parameter
        if not job.maxOutputFileSize:
            self.log.debug(
                "validateJob: Setting job.maxOutputFileSize "
                "to default value: %d bytes",
                Config.MAX_OUTPUT_FILE_SIZE,
            )
            job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE

        # Check the list of input files
        hasMakefile = False
        for inputFile in job.input:
            if not inputFile.localFile:
                self.log.error("validateJob: Missing inputFile.localFile")
                job.appendTrace("validateJob: Missing inputFile.localFile")
                errors += 1
            else:
                # Fixed: the original re-checked job.outputFile's directory
                # here (a duplicate no-op); the intent is to verify that the
                # named input file actually exists.
                if not os.path.exists(inputFile.localFile):
                    self.log.error(
                        "validateJob: Missing file: %s" % inputFile.localFile
                    )
                    job.appendTrace(
                        "validateJob: Missing file: %s" % inputFile.localFile
                    )
                    errors += 1
            if inputFile.destFile == 'Makefile':
                hasMakefile = True

        # Check if input files include a Makefile
        if not hasMakefile:
            self.log.error("validateJob: Missing Makefile in input files.")
            job.appendTrace("validateJob: Missing Makefile in input files.")
            errors += 1

        # Check if job timeout has been set; If not set timeout to default
        if not job.timeout or job.timeout <= 0:
            self.log.debug(
                "validateJob: Setting job.timeout to"
                " default config value: %d secs",
                Config.RUNJOB_TIMEOUT,
            )
            job.timeout = Config.RUNJOB_TIMEOUT

        # Any problems, return an error status
        if errors > 0:
            self.log.error("validateJob: Job rejected: %d errors" % errors)
            job.appendTrace("validateJob: Job rejected: %d errors" % errors)
            return -1
        else:
            return 0
def __init__(s, queuePath): s.queuePath = queuePath s._connection_cache = {} s.queue = JobQueue(queuePath)
class TestJobQueue(unittest.TestCase):
    """Unit tests for JobQueue bookkeeping (Python 2-era variant).

    NOTE(review): this class uses Python 2 constructs (`print` statement,
    `xrange`) and predates the py3 port; it duplicates much of the newer
    TestJobQueue. setUp builds a fresh two-job queue for each test.
    """

    def setUp(self):
        # Wipe shared state so size counts below are deterministic.
        if Config.USE_REDIS:
            __db = redis.StrictRedis(
                Config.REDIS_HOSTNAME, Config.REDIS_PORT, db=0)
            __db.flushall()

        # Two identical-shaped sample jobs differing only in name/output.
        self.job1 = TangoJob(
            name="sample_job_1",
            vm="ilter.img",
            outputFile="sample_job_1_output",
            input=[],
            timeout=30,
            notifyURL="notifyMeUrl",
            maxOutputFileSize=4096)
        self.job2 = TangoJob(
            name="sample_job_2",
            vm="ilter.img",
            outputFile="sample_job_2_output",
            input=[],
            timeout=30,
            notifyURL="notifyMeUrl",
            maxOutputFileSize=4096)
        # No preallocator needed for queue-only tests.
        self.jobQueue = JobQueue(None)
        self.jobQueue.reset()
        self.jobId1 = self.jobQueue.add(self.job1)
        self.jobId2 = self.jobQueue.add(self.job2)

    def test_sharedInt(self):
        """Two TangoIntValue handles with one key share a single value."""
        if Config.USE_REDIS:
            num1 = TangoIntValue("nextID", 1000)
            # Second initial value must NOT clobber the existing 1000.
            num2 = TangoIntValue("nextID", 3000)
            self.assertEqual(num1.get(), 1000)
            self.assertEqual(num1.get(), num2.get())
        else:
            # Shared ints only exist with Redis enabled.
            return

    def test_job(self):
        """Assignment state is visible through the queue's handle too."""
        self.job1.makeUnassigned()
        self.assertTrue(self.job1.isNotAssigned())
        job = self.jobQueue.get(self.jobId1)
        self.assertTrue(job.isNotAssigned())
        self.job1.makeAssigned()
        print "Checkout:"
        self.assertFalse(self.job1.isNotAssigned())
        self.assertFalse(job.isNotAssigned())

    def test_add(self):
        """setUp added two jobs, so the queue size is 2."""
        info = self.jobQueue.getInfo()
        self.assertEqual(info['size'], 2)

    def test_addDead(self):
        # Placeholder — dead-add behavior is not exercised yet.
        return self.assertEqual(1, 1)

    def test_remove(self):
        """remove() shrinks the live queue one job at a time."""
        self.jobQueue.remove(self.jobId1)
        info = self.jobQueue.getInfo()
        self.assertEqual(info['size'], 1)

        self.jobQueue.remove(self.jobId2)
        info = self.jobQueue.getInfo()
        self.assertEqual(info['size'], 0)

    def test_delJob(self):
        """delJob with deadjob=0 moves to dead queue; deadjob=1 discards."""
        self.jobQueue.delJob(self.jobId1, 0)
        info = self.jobQueue.getInfo()
        self.assertEqual(info['size'], 1)
        self.assertEqual(info['size_deadjobs'], 1)

        self.jobQueue.delJob(self.jobId1, 1)
        info = self.jobQueue.getInfo()
        self.assertEqual(info['size_deadjobs'], 0)
        # NOTE(review): return value ignored by unittest; harmless leftover.
        return False

    def test_get(self):
        """get() returns the job matching the id issued by add()."""
        ret_job_1 = self.jobQueue.get(self.jobId1)
        self.assertEqual(str(ret_job_1.id), self.jobId1)
        ret_job_2 = self.jobQueue.get(self.jobId2)
        self.assertEqual(str(ret_job_2.id), self.jobId2)

    def test_getNextPendingJob(self):
        """After assigning job 2 and unassigning job 1, job 1 is next."""
        self.jobQueue.assignJob(self.jobId2)
        self.jobQueue.unassignJob(self.jobId1)
        exp_id = self.jobQueue.getNextPendingJob()
        self.assertMultiLineEqual(exp_id, self.jobId1)

    def test_getNextPendingJobReuse(self):
        # Placeholder — reuse path is not exercised yet.
        return False

    def test_assignJob(self):
        """assignJob marks the job as assigned."""
        self.jobQueue.assignJob(self.jobId1)
        job = self.jobQueue.get(self.jobId1)
        self.assertFalse(job.isNotAssigned())

    def test_unassignJob(self):
        """unassignJob reverses assignJob."""
        self.jobQueue.assignJob(self.jobId1)
        job = self.jobQueue.get(self.jobId1)
        self.assertTrue(job.assigned)
        self.jobQueue.unassignJob(self.jobId1)
        job = self.jobQueue.get(self.jobId1)
        return self.assertEqual(job.assigned, False)

    def test_makeDead(self):
        """makeDead moves a job into the dead queue."""
        info = self.jobQueue.getInfo()
        self.assertEqual(info['size_deadjobs'], 0)
        self.jobQueue.makeDead(self.jobId1, "test")
        info = self.jobQueue.getInfo()
        self.assertEqual(info['size_deadjobs'], 1)

    def test__getNextID(self):
        """_getNextID never re-issues a live job's id, even past wraparound.

        Looping beyond Config.MAX_JOBID forces the counter to wrap.
        """
        init_id = self.jobQueue.nextID
        for i in xrange(1, Config.MAX_JOBID + 100):
            id = self.jobQueue._getNextID()
            self.assertNotEqual(str(id), self.jobId1)
        # Restore the counter for subsequent tests.
        self.jobQueue.nextID = init_id
if __name__ == "__main__": if not Config.USE_REDIS: print( "You need to have Redis running to be able to initiate stand-alone\ JobManager") else: vmms = None if Config.VMMS_NAME == "localSSH": from vmms.localSSH import LocalSSH vmms = LocalSSH() elif Config.VMMS_NAME == "tashiSSH": from vmms.tashiSSH import TashiSSH vmms = TashiSSH() elif Config.VMMS_NAME == "ec2SSH": from vmms.ec2SSH import Ec2SSH vmms = Ec2SSH() elif Config.VMMS_NAME == "localDocker": from vmms.localDocker import LocalDocker vmms = LocalDocker() vmms = {Config.VMMS_NAME: vmms} preallocator = Preallocator(vmms) queue = JobQueue(preallocator) JobManager(queue, vmms, preallocator) print("Starting the stand-alone Tango JobManager")
def getAll(queuePath): # Dump all jobs in the queue. # @param queuePath: the job queue path # @returns: an array of jobs in an object return {'jobs': JobQueue(queuePath).getAll()}