class TangoServer:

    """ TangoServer - Implements the API functions that the server accepts

    The server owns one Preallocator and one JobQueue. Most API methods
    report failure through integer status codes or empty lists instead of
    raising.
    """

    def __init__(self):
        """ __init__ - Set up logging, the configured VMMS backend, the
        preallocator and the job queue
        """
        self.daemon = True

        # Configure logging before any component is built: records emitted
        # before basicConfig() runs do not reach the log file.
        logging.basicConfig(
            filename=Config.LOGFILE,
            format="%(levelname)s|%(asctime)s|%(name)s|%(message)s",
            level=Config.LOGLEVEL,
        )

        # Only the configured backend module is imported. If VMMS_NAME
        # matches none of the names below, vmms stays None.
        vmms = None
        if Config.VMMS_NAME == "tashiSSH":
            from vmms.tashiSSH import TashiSSH
            vmms = TashiSSH()
        elif Config.VMMS_NAME == "ec2SSH":
            from vmms.ec2SSH import Ec2SSH
            vmms = Ec2SSH()
        elif Config.VMMS_NAME == "localDocker":
            from vmms.localDocker import LocalDocker
            vmms = LocalDocker()
        elif Config.VMMS_NAME == "distDocker":
            from vmms.distDocker import DistDocker
            vmms = DistDocker()

        self.preallocator = Preallocator({Config.VMMS_NAME: vmms})
        self.jobQueue = JobQueue(self.preallocator)
        if not Config.USE_REDIS:
            # creates a local Job Manager if there is no persistent
            # memory between processes. Otherwise, JobManager will
            # be initiated separately
            JobManager(self.jobQueue).start()

        self.start_time = time.time()
        self.log = logging.getLogger("TangoServer")
        self.log.info("Starting Tango server")

    def addJob(self, job):
        """ addJob - Add a job to the job queue

        Returns the result of jobQueue.add() when the job validates.
        Otherwise the job is handed to jobQueue.addDead() and -1 is
        returned.
        """
        Config.job_requests += 1
        self.log.debug("Received addJob request")
        ret = self.__validateJob(job, self.preallocator.vmms)
        self.log.info("Done validating job %s" % (job.name))
        if ret == 0:
            return self.jobQueue.add(job)
        else:
            self.jobQueue.addDead(job)
            return -1

    def delJob(self, id, deadjob):
        """ delJob - Delete a job
        @param id: Id of job to delete
        @param deadjob - If 0, move the job from the live queue to the
        dead queue. If non-zero, remove the job from the dead queue
        and discard. Use with caution!
        @return: whatever jobQueue.delJob() returns
        """
        self.log.debug("Received delJob(%d, %d) request" % (id, deadjob))
        return self.jobQueue.delJob(id, deadjob)

    def getJobs(self, item):
        """ getJobs - Return the list of live jobs (item == 0) or the
        list of dead jobs (item == -1).

        Any other value of item, and any failure while reading the
        queues, yields an empty list.
        """
        try:
            self.log.debug("Received getJobs(%s) request" % (item))

            if item == -1:  # return the list of dead jobs
                return self.jobQueue.deadJobs.values()

            elif item == 0:  # return the list of live jobs
                return self.jobQueue.liveJobs.values()

            else:  # invalid parameter
                return []
        except Exception as e:
            # Return an empty list (like getVMs/getPool) instead of
            # falling through and handing None to the caller.
            self.log.error("getJobs: %s" % str(e))
            return []

    def preallocVM(self, vm, num):
        """ preallocVM - Set the pool size for VMs of type vm to num

        Returns 0 on success, -2 when vm is missing or num is negative,
        -3 when vm.image is not among the images reported by the VMMS,
        and -1 on any other failure.
        """
        try:
            # Validate first: both the request log line and the vmms
            # lookup dereference vm.
            if not vm or num < 0:
                self.log.error("preallocVM: Invalid arguments (%s, %s)" %
                               (vm, num))
                return -2
            self.log.debug("Received preallocVM(%s,%d)request" %
                           (vm.name, num))
            vmms = self.preallocator.vmms[vm.vmms]
            if vm.image not in vmms.getImages():
                self.log.error("Invalid image name")
                return -3
            # The pool is named after the image without its extension.
            vm.name = os.path.splitext(vm.image)[0]
            self.preallocator.update(vm, num)
            return 0
        except Exception as err:
            self.log.error("preallocVM failed: %s" % err)
            return -1

    def getVMs(self, vmms_name):
        """ getVMs - return the list of VMs managed by the service vmms_name

        Returns an empty list for an unknown vmms_name or on failure.
        """
        self.log.debug("Received getVMs request(%s)" % vmms_name)
        try:
            if vmms_name in self.preallocator.vmms:
                vmms_inst = self.preallocator.vmms[vmms_name]
                return vmms_inst.getVMs()
            else:
                return []
        except Exception as err:
            self.log.error("getVMs request failed: %s" % err)
            return []

    def delVM(self, vmName, id):
        """ delVM - delete a specific VM instance from a pool

        Returns the result of preallocator.destroyVM(), or -1 when an
        argument is empty/zero or the call fails.
        """
        self.log.debug("Received delVM request(%s, %d)" % (vmName, id))
        try:
            if not vmName or not id:
                return -1
            return self.preallocator.destroyVM(vmName, id)
        except Exception as err:
            self.log.error("delVM request failed: %s" % err)
            return -1

    def getPool(self, vmName):
        """ getPool - Return the current members of a pool and its free list

        The result is a list of four "key=value" strings (pool_size,
        free_size, pool, free), or an empty list when vmName is empty or
        the lookup fails.
        """
        self.log.debug("Received getPool request(%s)" % (vmName))
        try:
            if not vmName:
                return []
            result = self.preallocator.getPool(vmName)
            return [
                "pool_size=%d" % len(result["pool"]),
                "free_size=%d" % len(result["free"]),
                "pool=%s" % result["pool"],
                "free=%s" % result["free"],
            ]
        except Exception as err:
            self.log.error("getPool request failed: %s" % err)
            return []

    def getInfo(self):
        """ getInfo - return various statistics about the Tango daemon

        Returns a dict of the counters kept on Config plus the uptime in
        seconds and the number of live threads.
        """
        stats = {}
        stats['elapsed_secs'] = time.time() - self.start_time
        stats['job_requests'] = Config.job_requests
        stats['job_retries'] = Config.job_retries
        stats['waitvm_timeouts'] = Config.waitvm_timeouts
        stats['runjob_timeouts'] = Config.runjob_timeouts
        stats['copyin_errors'] = Config.copyin_errors
        stats['runjob_errors'] = Config.runjob_errors
        stats['copyout_errors'] = Config.copyout_errors
        # active_count() is the supported spelling; activeCount() is a
        # deprecated alias.
        stats['num_threads'] = threading.active_count()
        return stats

    #
    # Helper functions
    #
    def resetTango(self, vmms):
        """ resetTango - resets Tango to a clean predictable state and
        ensures that it has a working virtualization environment. A side
        effect is that also checks that each supported VMMS is actually
        running.

        @param vmms: mapping of VMMS name to VMMS object
        Any failure is logged and terminates the process via os._exit(1).
        """
        self.log.debug("Received resetTango request.")

        # Bound up front so the except handler can always format its
        # message, even when the failure happens before the first
        # iteration of the loop.
        vmms_name = None
        try:
            # For each supported VMM system, get the instances it knows about,
            # and kill those in the current Tango name space.
            for vmms_name in vmms:
                vobj = vmms[vmms_name]
                vms = vobj.getVMs()
                self.log.debug("Pre-existing VMs: %s" %
                               [vm.name for vm in vms])
                namelist = []
                for vm in vms:
                    if re.match("%s-" % Config.PREFIX, vm.name):
                        vobj.destroyVM(vm)
                        # Need a consistent abstraction for a vm between
                        # interfaces
                        namelist.append(vm.name)
                if namelist:
                    self.log.warning("Killed these %s VMs on restart: %s" %
                                     (vmms_name, namelist))

            # Every live job goes back to the unassigned state.
            for _, job in self.jobQueue.liveJobs.iteritems():
                if not job.isNotAssigned():
                    job.makeUnassigned()
                self.log.debug("job: %s, assigned: %s" %
                               (str(job.name), str(job.assigned)))
        except Exception as err:
            self.log.error("resetTango: Call to VMMS %s failed: %s" %
                           (vmms_name, err))
            os._exit(1)

    def __validateJob(self, job, vmms):
        """ validateJob - validate the input arguments in an addJob request.

        Every problem is logged and appended (time-stamped) to the job
        trace. Missing maxOutputFileSize/timeout values are replaced with
        the Config defaults.

        @param job: the job to validate
        @param vmms: mapping of VMMS name to VMMS object
        @return: 0 when the job is acceptable, -1 otherwise
        """
        problems = []

        def fail(msg):
            # Record one validation error in the log and the job trace.
            self.log.error(msg)
            job.appendTrace("%s|%s" % (datetime.utcnow().ctime(), msg))
            problems.append(msg)

        # If this isn't a Tango job then bail with an error
        if (not isinstance(job, TangoJob)):
            return -1

        # Every job must have a name
        if not job.name:
            fail("validateJob: Missing job.name")

        # Check the virtual machine field
        if not job.vm:
            fail("validateJob: Missing job.vm")
        else:
            if not job.vm.image:
                fail("validateJob: Missing job.vm.image")
            else:
                vobj = vmms[Config.VMMS_NAME]
                imgList = vobj.getImages()
                if job.vm.image not in imgList:
                    fail("validateJob: Image not found: %s" % job.vm.image)
                    job.appendTrace("%s|validateJob: Images available: %s" %
                                    (datetime.utcnow().ctime(), imgList))
                else:
                    job.vm.name = os.path.splitext(job.vm.image)[0]

            if not job.vm.vmms:
                fail("validateJob: Missing job.vm.vmms")
            else:
                if job.vm.vmms not in vmms:
                    fail("validateJob: Invalid vmms name: %s" % job.vm.vmms)

        # Check the output file
        if not job.outputFile:
            fail("validateJob: Missing job.outputFile")
        else:
            if not os.path.exists(os.path.dirname(job.outputFile)):
                fail("validateJob: Bad output path: %s" % job.outputFile)

        # Check for max output file size parameter
        if not job.maxOutputFileSize:
            self.log.debug(
                "validateJob: Setting job.maxOutputFileSize "
                "to default value: %d bytes", Config.MAX_OUTPUT_FILE_SIZE)
            job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE

        # Check the list of input files
        # NOTE(review): this loop used to re-check the *output* path once
        # per input file, which duplicated the error above and raised when
        # job.outputFile was missing. Whether inputFile.localFile should
        # be checked for existence here instead needs confirming.
        hasMakefile = False
        for inputFile in job.input:
            if not inputFile.localFile:
                fail("validateJob: Missing inputFile.localFile")

            if inputFile.destFile == 'Makefile':
                hasMakefile = True

        # Check if input files include a Makefile
        if not hasMakefile:
            fail("validateJob: Missing Makefile in input files.")

        # Check if job timeout has been set; If not set timeout to default
        if not job.timeout or job.timeout <= 0:
            self.log.debug(
                "validateJob: Setting job.timeout to"
                " default config value: %d secs", Config.RUNJOB_TIMEOUT)
            job.timeout = Config.RUNJOB_TIMEOUT

        # Any problems, return an error status
        if problems:
            summary = "validateJob: Job rejected: %d errors" % len(problems)
            self.log.error(summary)
            job.appendTrace("%s|%s" % (datetime.utcnow().ctime(), summary))
            return -1
        else:
            return 0
class TangoServer:

    """ TangoServer - Implements the API functions that the server accepts

    The server owns one Preallocator and one JobQueue. Most API methods
    report failure through integer status codes or empty lists instead of
    raising.
    """

    def __init__(self):
        """ __init__ - Set up logging, the configured VMMS backend, the
        preallocator and the job queue
        """
        self.daemon = True

        # Configure logging before any component is built: records emitted
        # before basicConfig() runs do not reach the log file.
        logging.basicConfig(
            filename=Config.LOGFILE,
            format="%(levelname)s|%(asctime)s|%(name)s|%(message)s",
            level=Config.LOGLEVEL,
        )

        # Only the configured backend module is imported. If VMMS_NAME
        # matches none of the names below, vmms stays None.
        vmms = None
        if Config.VMMS_NAME == "tashiSSH":
            from vmms.tashiSSH import TashiSSH
            vmms = TashiSSH()
        elif Config.VMMS_NAME == "ec2SSH":
            from vmms.ec2SSH import Ec2SSH
            vmms = Ec2SSH()
        elif Config.VMMS_NAME == "localDocker":
            from vmms.localDocker import LocalDocker
            vmms = LocalDocker()
        elif Config.VMMS_NAME == "distDocker":
            from vmms.distDocker import DistDocker
            vmms = DistDocker()

        self.preallocator = Preallocator({Config.VMMS_NAME: vmms})
        self.jobQueue = JobQueue(self.preallocator)
        if not Config.USE_REDIS:
            # creates a local Job Manager if there is no persistent
            # memory between processes. Otherwise, JobManager will
            # be initiated separately
            JobManager(self.jobQueue).start()

        self.start_time = time.time()
        self.log = logging.getLogger("TangoServer")
        self.log.info("Starting Tango server")

    def addJob(self, job):
        """ addJob - Add a job to the job queue

        Returns the result of jobQueue.add() when the job validates.
        Otherwise the job is handed to jobQueue.addDead() and -1 is
        returned.
        """
        Config.job_requests += 1
        self.log.debug("Received addJob request")
        ret = self.__validateJob(job, self.preallocator.vmms)
        self.log.info("Done validating job %s" % (job.name))
        if ret == 0:
            return self.jobQueue.add(job)
        else:
            self.jobQueue.addDead(job)
            return -1

    def delJob(self, id, deadjob):
        """ delJob - Delete a job
        @param id: Id of job to delete
        @param deadjob - If 0, move the job from the live queue to the
        dead queue. If non-zero, remove the job from the dead queue
        and discard. Use with caution!
        @return: whatever jobQueue.delJob() returns
        """
        self.log.debug("Received delJob(%d, %d) request" % (id, deadjob))
        return self.jobQueue.delJob(id, deadjob)

    def getJobs(self, item):
        """ getJobs - Return the list of live jobs (item == 0) or the
        list of dead jobs (item == -1).

        Any other value of item, and any failure while reading the
        queues, yields an empty list.
        """
        try:
            self.log.debug("Received getJobs(%s) request" % (item))

            if item == -1:  # return the list of dead jobs
                return self.jobQueue.deadJobs.values()

            elif item == 0:  # return the list of live jobs
                return self.jobQueue.liveJobs.values()

            else:  # invalid parameter
                return []
        except Exception as e:
            # Return an empty list (like getVMs/getPool) instead of
            # falling through and handing None to the caller.
            self.log.error("getJobs: %s" % str(e))
            return []

    def preallocVM(self, vm, num):
        """ preallocVM - Set the pool size for VMs of type vm to num

        Returns 0 on success, -2 when vm is missing or num is negative,
        -3 when vm.image is not among the images reported by the VMMS,
        and -1 on any other failure.
        """
        try:
            # Validate first: both the request log line and the vmms
            # lookup dereference vm.
            if not vm or num < 0:
                self.log.error("preallocVM: Invalid arguments (%s, %s)" %
                               (vm, num))
                return -2
            self.log.debug("Received preallocVM(%s,%d)request" %
                           (vm.name, num))
            vmms = self.preallocator.vmms[vm.vmms]
            if vm.image not in vmms.getImages():
                self.log.error("Invalid image name")
                return -3
            # The pool is named after the image without its extension.
            vm.name = os.path.splitext(vm.image)[0]
            self.preallocator.update(vm, num)
            return 0
        except Exception as err:
            self.log.error("preallocVM failed: %s" % err)
            return -1

    def getVMs(self, vmms_name):
        """ getVMs - return the list of VMs managed by the service vmms_name

        Returns an empty list for an unknown vmms_name or on failure.
        """
        self.log.debug("Received getVMs request(%s)" % vmms_name)
        try:
            if vmms_name in self.preallocator.vmms:
                vmms_inst = self.preallocator.vmms[vmms_name]
                return vmms_inst.getVMs()
            else:
                return []
        except Exception as err:
            self.log.error("getVMs request failed: %s" % err)
            return []

    def delVM(self, vmName, id):
        """ delVM - delete a specific VM instance from a pool

        Returns the result of preallocator.destroyVM(), or -1 when an
        argument is empty/zero or the call fails.
        """
        self.log.debug("Received delVM request(%s, %d)" % (vmName, id))
        try:
            if not vmName or not id:
                return -1
            return self.preallocator.destroyVM(vmName, id)
        except Exception as err:
            self.log.error("delVM request failed: %s" % err)
            return -1

    def getPool(self, vmName):
        """ getPool - Return the current members of a pool and its free list

        The result is a list of four "key=value" strings (pool_size,
        free_size, pool, free), or an empty list when vmName is empty or
        the lookup fails.
        """
        self.log.debug("Received getPool request(%s)" % (vmName))
        try:
            if not vmName:
                return []
            result = self.preallocator.getPool(vmName)
            return [
                "pool_size=%d" % len(result["pool"]),
                "free_size=%d" % len(result["free"]),
                "pool=%s" % result["pool"],
                "free=%s" % result["free"],
            ]
        except Exception as err:
            self.log.error("getPool request failed: %s" % err)
            return []

    def getInfo(self):
        """ getInfo - return various statistics about the Tango daemon

        Returns a dict of the counters kept on Config plus the uptime in
        seconds and the number of live threads.
        """
        stats = {}
        stats['elapsed_secs'] = time.time() - self.start_time
        stats['job_requests'] = Config.job_requests
        stats['job_retries'] = Config.job_retries
        stats['waitvm_timeouts'] = Config.waitvm_timeouts
        stats['runjob_timeouts'] = Config.runjob_timeouts
        stats['copyin_errors'] = Config.copyin_errors
        stats['runjob_errors'] = Config.runjob_errors
        stats['copyout_errors'] = Config.copyout_errors
        # active_count() is the supported spelling; activeCount() is a
        # deprecated alias.
        stats['num_threads'] = threading.active_count()
        return stats

    #
    # Helper functions
    #
    def resetTango(self, vmms):
        """ resetTango - resets Tango to a clean predictable state and
        ensures that it has a working virtualization environment. A side
        effect is that also checks that each supported VMMS is actually
        running.

        @param vmms: mapping of VMMS name to VMMS object
        Any failure is logged and terminates the process via os._exit(1).
        """
        self.log.debug("Received resetTango request.")

        # Bound up front so the except handler can always format its
        # message, even when the failure happens before the first
        # iteration of the loop.
        vmms_name = None
        try:
            # For each supported VMM system, get the instances it knows about,
            # and kill those in the current Tango name space.
            for vmms_name in vmms:
                vobj = vmms[vmms_name]
                vms = vobj.getVMs()
                self.log.debug("Pre-existing VMs: %s" %
                               [vm.name for vm in vms])
                namelist = []
                for vm in vms:
                    if re.match("%s-" % Config.PREFIX, vm.name):
                        vobj.destroyVM(vm)
                        # Need a consistent abstraction for a vm between
                        # interfaces
                        namelist.append(vm.name)
                if namelist:
                    self.log.warning("Killed these %s VMs on restart: %s" %
                                     (vmms_name, namelist))

            # Every live job goes back to the unassigned state.
            for _, job in self.jobQueue.liveJobs.iteritems():
                if not job.isNotAssigned():
                    job.makeUnassigned()
                self.log.debug("job: %s, assigned: %s" %
                               (str(job.name), str(job.assigned)))
        except Exception as err:
            self.log.error("resetTango: Call to VMMS %s failed: %s" %
                           (vmms_name, err))
            os._exit(1)

    def __validateJob(self, job, vmms):
        """ validateJob - validate the input arguments in an addJob request.

        Every problem is logged and appended (time-stamped) to the job
        trace. Missing maxOutputFileSize/timeout values are replaced with
        the Config defaults.

        @param job: the job to validate
        @param vmms: mapping of VMMS name to VMMS object
        @return: 0 when the job is acceptable, -1 otherwise
        """
        problems = []

        def fail(msg):
            # Record one validation error in the log and the job trace.
            self.log.error(msg)
            job.appendTrace("%s|%s" % (datetime.utcnow().ctime(), msg))
            problems.append(msg)

        # If this isn't a Tango job then bail with an error
        if (not isinstance(job, TangoJob)):
            return -1

        # Every job must have a name
        if not job.name:
            fail("validateJob: Missing job.name")

        # Check the virtual machine field
        if not job.vm:
            fail("validateJob: Missing job.vm")
        else:
            if not job.vm.image:
                fail("validateJob: Missing job.vm.image")
            else:
                vobj = vmms[Config.VMMS_NAME]
                imgList = vobj.getImages()
                if job.vm.image not in imgList:
                    fail("validateJob: Image not found: %s" % job.vm.image)
                else:
                    job.vm.name = os.path.splitext(job.vm.image)[0]

            if not job.vm.vmms:
                fail("validateJob: Missing job.vm.vmms")
            else:
                if job.vm.vmms not in vmms:
                    fail("validateJob: Invalid vmms name: %s" % job.vm.vmms)

        # Check the output file
        if not job.outputFile:
            fail("validateJob: Missing job.outputFile")
        else:
            if not os.path.exists(os.path.dirname(job.outputFile)):
                fail("validateJob: Bad output path: %s" % job.outputFile)

        # Check for max output file size parameter
        if not job.maxOutputFileSize:
            self.log.debug(
                "validateJob: Setting job.maxOutputFileSize "
                "to default value: %d bytes", Config.MAX_OUTPUT_FILE_SIZE)
            job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE

        # Check the list of input files
        # NOTE(review): this loop used to re-check the *output* path once
        # per input file, which duplicated the error above and raised when
        # job.outputFile was missing. Whether inputFile.localFile should
        # be checked for existence here instead needs confirming.
        hasMakefile = False
        for inputFile in job.input:
            if not inputFile.localFile:
                fail("validateJob: Missing inputFile.localFile")

            if inputFile.destFile == 'Makefile':
                hasMakefile = True

        # Check if input files include a Makefile
        if not hasMakefile:
            fail("validateJob: Missing Makefile in input files.")

        # Check if job timeout has been set; If not set timeout to default
        if not job.timeout or job.timeout <= 0:
            self.log.debug(
                "validateJob: Setting job.timeout to"
                " default config value: %d secs", Config.RUNJOB_TIMEOUT)
            job.timeout = Config.RUNJOB_TIMEOUT

        # Any problems, return an error status
        if problems:
            summary = "validateJob: Job rejected: %d errors" % len(problems)
            self.log.error(summary)
            job.appendTrace("%s|%s" % (datetime.utcnow().ctime(), summary))
            return -1
        else:
            return 0
class TangoServer:

    """ TangoServer - Implements the API functions that the server accepts

    The server owns one Preallocator and one JobQueue. Most API methods
    report failure through integer status codes or empty lists instead of
    raising.
    """

    def __init__(self):
        """ __init__ - Set up logging, the configured VMMS backend, the
        preallocator and the job queue
        """
        self.daemon = True

        # init logging early, or some logging will be lost
        logging.basicConfig(
            filename=Config.LOGFILE,
            format="%(levelname)s|%(asctime)s|%(name)s|%(message)s",
            level=Config.LOGLEVEL,
        )

        # Only the configured backend module is imported. If VMMS_NAME
        # matches none of the names below, vmms stays None.
        vmms = None
        if Config.VMMS_NAME == "tashiSSH":
            from vmms.tashiSSH import TashiSSH
            vmms = TashiSSH()
        elif Config.VMMS_NAME == "ec2SSH":
            from vmms.ec2SSH import Ec2SSH
            vmms = Ec2SSH()
        elif Config.VMMS_NAME == "localDocker":
            from vmms.localDocker import LocalDocker
            vmms = LocalDocker()
        elif Config.VMMS_NAME == "distDocker":
            from vmms.distDocker import DistDocker
            vmms = DistDocker()

        self.preallocator = Preallocator({Config.VMMS_NAME: vmms})
        self.jobQueue = JobQueue(self.preallocator)
        if not Config.USE_REDIS:
            # creates a local Job Manager if there is no persistent
            # memory between processes. Otherwise, JobManager will
            # be initiated separately
            JobManager(self.jobQueue).start()

        self.start_time = time.time()
        self.log = logging.getLogger("TangoServer")
        self.log.info("Starting Tango server")

    def addJob(self, job):
        """ addJob - Add a job to the job queue

        Returns the result of jobQueue.add() when the job validates.
        Otherwise the job is handed to jobQueue.addDead() and -1 is
        returned.
        """
        Config.job_requests += 1
        self.log.debug("Received addJob request")
        ret = self.__validateJob(job, self.preallocator.vmms)
        self.log.info("Done validating job %s" % (job.name))
        if ret == 0:
            return self.jobQueue.add(job)
        else:
            self.jobQueue.addDead(job)
            return -1

    def delJob(self, id, deadjob):
        """ delJob - Delete a job
        @param id: Id of job to delete
        @param deadjob - If 0, move the job from the live queue to the
        dead queue. If non-zero, remove the job from the dead queue
        and discard. Use with caution!
        @return: whatever jobQueue.delJob() returns
        """
        self.log.debug("Received delJob(%d, %d) request" % (id, deadjob))
        return self.jobQueue.delJob(id, deadjob)

    def cancelJobWithPath(self, outFilePath):
        """ cancelJobWithPath - when this function returns, one of the
        following is true:
          1. The job with the specified output file does not exist
          2. The job with the specified output file has finished running
             normally
          3. The job with the specified output file has been cancelled
          4. The job was found, and it's running, but cancellation failed.
        In case 1, NOT_FOUND is returned.
                2, ALREADY_COMPLETED is returned.
                3, SUCCEEDED is returned.
                4, FAILED is returned.
        """
        self.log.debug("Received cancelJobWithPath(%s) request" %
                       (outFilePath))

        jobId, job, job_status = self.jobQueue.findRemovingWaiting(
            outFilePath)
        self.log.debug("cancelJobWithPath: Found a job %s with status %s" %
                       (job, job_status))

        if job_status == JobQueue.JobStatus.NOT_FOUND:
            return CancellationStatus.NOT_FOUND
        elif job_status == JobQueue.JobStatus.DEAD:
            return CancellationStatus.ALREADY_COMPLETED
        elif job_status == JobQueue.JobStatus.RUNNING:
            return self.killUntilJobComplete(jobId, job)
        else:
            assert job_status == JobQueue.JobStatus.WAITING
            # In this case, findRemovingWaiting has moved the live job to
            # the dead queue, and we have nothing to worry about.
            # Let's notify autolab that the job is done.

            if job.notifyURL:
                # get filename from path
                outputFileName = job.outputFile.split("/")[-1]
                files = {
                    'file': unicode('Job was cancelled before it started.')
                }
                hdrs = {'Filename': outputFileName}
                self.log.debug("Sending request to %s" % job.notifyURL)

                def worker():
                    # NOTE(review): verify=False turns off TLS certificate
                    # verification for the callback - confirm this is
                    # intended.
                    requests.post(job.notifyURL,
                                  files=files,
                                  headers=hdrs,
                                  data={'runningTimeSeconds': 0},
                                  verify=False)

                # The callback is posted from its own thread so this
                # request does not wait for it.
                threading.Thread(target=worker).start()
            return CancellationStatus.SUCCEEDED

    def killUntilJobComplete(self, id, job):
        """ killUntilJobComplete - Try to kill the VM running the job

        Calls kill() on the job's VMMS up to Config.CANCEL_RETRIES times
        and stops at the first call that returns 0.
        On success, returns SUCCEEDED; on failure, returns FAILED
        (compliant with cancelJobWithPath).
        """
        self.log.debug("Received killUntilJobComplete request")

        vm = job.vm
        for _ in xrange(0, Config.CANCEL_RETRIES):
            # Returns 0 on success.
            if self.preallocator.vmms[vm.vmms].kill(vm) == 0:
                return CancellationStatus.SUCCEEDED

        return CancellationStatus.FAILED

    def getJobs(self, item):
        """ getJobs - Return the list of live jobs (item == 0) or the
        list of dead jobs (item == -1).

        Any other value of item, and any failure while reading the
        queues, yields an empty list.
        """
        try:
            self.log.debug("Received getJobs(%s) request" % (item))

            if item == -1:  # return the list of dead jobs
                return self.jobQueue.deadJobs.values()

            elif item == 0:  # return the list of live jobs
                return self.jobQueue.liveJobs.values()

            else:  # invalid parameter
                return []
        except Exception as e:
            # Return an empty list (like getVMs/getPool) instead of
            # falling through and handing None to the caller.
            self.log.error("getJobs: %s" % str(e))
            return []

    def preallocVM(self, vm, num):
        """ preallocVM - Set the pool size for VMs of type vm to num

        Returns 0 on success, -2 when vm is missing or num is negative,
        -3 when the VMMS does not accept vm.image, and -1 on any other
        failure.
        """
        try:
            # Validate first: both the request log line and the vmms
            # lookup dereference vm.
            if not vm or num < 0:
                self.log.error("preallocVM: Invalid arguments (%s, %s)" %
                               (vm, num))
                return -2
            self.log.debug("Received preallocVM(%s,%d)request" %
                           (vm.name, num))
            vmms = self.preallocator.vmms[vm.vmms]
            if not vmms.isValidImage(vm.image):
                self.log.error("Invalid image name")
                return -3
            # The pool is named after the image without its extension.
            vm.name = os.path.splitext(vm.image)[0]
            self.preallocator.update(vm, num)
            return 0
        except Exception as err:
            self.log.error("preallocVM failed: %s" % err)
            return -1

    def getVMs(self, vmms_name):
        """ getVMs - return the list of VMs managed by the service vmms_name

        Returns an empty list for an unknown vmms_name or on failure.
        """
        self.log.debug("Received getVMs request(%s)" % vmms_name)
        try:
            if vmms_name in self.preallocator.vmms:
                vmms_inst = self.preallocator.vmms[vmms_name]
                return vmms_inst.getVMs()
            else:
                return []
        except Exception as err:
            self.log.error("getVMs request failed: %s" % err)
            return []

    def delVM(self, vmName, id):
        """ delVM - delete a specific VM instance from a pool

        Returns the result of preallocator.destroyVM(), or -1 when an
        argument is empty/zero or the call fails.
        """
        self.log.debug("Received delVM request(%s, %d)" % (vmName, id))
        try:
            if not vmName or not id:
                return -1
            return self.preallocator.destroyVM(vmName, id)
        except Exception as err:
            self.log.error("delVM request failed: %s" % err)
            return -1

    def getPool(self, vmName):
        """ getPool - Return the current members of a pool and its free list

        The result is a list of four "key=value" strings (pool_size,
        free_size, pool, free), or an empty list when vmName is empty or
        the lookup fails.
        """
        self.log.debug("Received getPool request(%s)" % (vmName))
        try:
            if not vmName:
                return []
            result = self.preallocator.getPool(vmName)
            return [
                "pool_size=%d" % len(result["pool"]),
                "free_size=%d" % len(result["free"]),
                "pool=%s" % result["pool"],
                "free=%s" % result["free"],
            ]
        except Exception as err:
            self.log.error("getPool request failed: %s" % err)
            return []

    def getInfo(self):
        """ getInfo - return various statistics about the Tango daemon

        Returns a dict of the counters kept on Config plus the uptime in
        seconds and the number of live threads.
        """
        stats = {}
        stats['elapsed_secs'] = time.time() - self.start_time
        stats['job_requests'] = Config.job_requests
        stats['job_retries'] = Config.job_retries
        stats['waitvm_timeouts'] = Config.waitvm_timeouts
        stats['runjob_timeouts'] = Config.runjob_timeouts
        stats['copyin_errors'] = Config.copyin_errors
        stats['runjob_errors'] = Config.runjob_errors
        stats['copyout_errors'] = Config.copyout_errors
        # active_count() is the supported spelling; activeCount() is a
        # deprecated alias.
        stats['num_threads'] = threading.active_count()
        return stats

    def setScaleParams(self, low_water_mark, max_pool_size):
        """ setScaleParams - Store the low water mark on the preallocator
        and the maximum pool size on the job queue. Always returns 0.
        """
        self.preallocator.low_water_mark.set(low_water_mark)
        self.jobQueue.max_pool_size.set(max_pool_size)
        return 0

    def runningTimeForOutputFile(self, outputFile):
        """ runningTimeForOutputFile - Return runningTime() of the live job
        found for outputFile, or None when liveJobs has no such entry
        """
        self.log.debug("Received runningTimeForOutputFile(%s)" % outputFile)
        liveJobTuple = self.jobQueue.liveJobs.getWrapped(outputFile)
        if liveJobTuple:
            (_, liveJob) = liveJobTuple
            self.log.debug(str(liveJob.startTime))
            return liveJob.runningTime()
        return None

    #
    # Helper functions
    #

    # NOTE: This function should be called by ONLY jobManager.  The rest
    # servers shouldn't call this function.
    def resetTango(self, vmms):
        """ resetTango - resets Tango to a clean predictable state and
        ensures that it has a working virtualization environment. A side
        effect is that also checks that each supported VMMS is actually
        running.

        @param vmms: mapping of VMMS name to VMMS object
        Any failure is logged and terminates the process via os._exit(1).
        """

        # There are two cases this function is called:
        # 1. Tango has a fresh start. Then we want to destroy all
        #    instances in Tango's name space.
        # 2. Job Manager is restarted after a previous crash. Then we want
        #    to destroy the "busy" instances prior to the crash and leave
        #    the "free" ones intact.

        self.log.debug("Received resetTango request.")

        # Bound up front so the except handler can always format its
        # message, even when the failure happens before the first
        # iteration of the loop.
        vmms_name = None
        try:
            # For each supported VMM system, get the instances it knows
            # about in the current Tango name space and kill those not in
            # free pools.
            for vmms_name in vmms:
                vobj = vmms[vmms_name]

                # Round up all instances in the free pools.
                allFreeVMs = []
                for key in self.preallocator.machines.keys():
                    freePool = self.preallocator.getPool(key)["free"]
                    for vmId in freePool:
                        vmName = vobj.instanceName(vmId, key)
                        allFreeVMs.append(vmName)
                self.log.info("vms in all free pools: %s" % allFreeVMs)

                # For each VM in Tango's name space, destroy the ones that
                # are NOT in a free pool AND remove them from Tango's
                # internal bookkeeping.
                vms = vobj.getVMs()
                self.log.debug("Pre-existing VMs: %s" %
                               [vm.name for vm in vms])
                destroyedList = []
                removedList = []
                for vm in vms:
                    if re.match("%s-" % Config.PREFIX, vm.name):

                        # Todo: should have an one-call interface to
                        # destroy the machine AND to keep the internal
                        # data consistent.
                        if vm.name not in allFreeVMs:
                            destroyedList.append(vm.name)
                            vobj.destroyVM(vm)

                            # also remove it from "total" set of the pool
                            # NOTE(review): requires exactly two "-" in the
                            # instance name; anything else raises and ends
                            # in os._exit(1) below - confirm that prefix
                            # and pool names can never contain "-".
                            (prefix, vmId, poolName) = vm.name.split("-")
                            machine = self.preallocator.machines.get(
                                poolName)
                            if not machine:  # the pool may not exist
                                continue

                            if int(vmId) in machine[0]:
                                removedList.append(vm.name)
                                machine[0].remove(int(vmId))
                                self.preallocator.machines.set(
                                    poolName, machine)

                if destroyedList:
                    self.log.warning("Killed these %s VMs on restart: %s" %
                                     (vmms_name, destroyedList))
                if removedList:
                    self.log.warning("Removed these %s VMs from their pools" %
                                     (removedList))

            # Every live job goes back to the unassigned state.
            for _, job in self.jobQueue.liveJobs.iteritems():
                if not job.isNotAssigned():
                    job.makeUnassigned()
                self.log.debug("job: %s, assigned: %s" %
                               (str(job.name), str(job.assigned)))
        except Exception as err:
            self.log.error("resetTango: Call to VMMS %s failed: %s" %
                           (vmms_name, err))
            os._exit(1)

    def __validateJob(self, job, vmms):
        """ validateJob - validate the input arguments in an addJob request.

        Every problem is logged and appended to the job trace. Missing
        maxOutputFileSize/timeout values are replaced with the Config
        defaults.

        @param job: the job to validate
        @param vmms: mapping of VMMS name to VMMS object
        @return: 0 when the job is acceptable, -1 otherwise
        """
        problems = []

        def fail(msg):
            # Record one validation error in the log and the job trace.
            self.log.error(msg)
            job.appendTrace(msg)
            problems.append(msg)

        # If this isn't a Tango job then bail with an error
        if (not isinstance(job, TangoJob)):
            return -1

        # Every job must have a name
        if not job.name:
            fail("validateJob: Missing job.name")

        # Check the virtual machine field
        if not job.vm:
            fail("validateJob: Missing job.vm")
        else:
            if not job.vm.image:
                fail("validateJob: Missing job.vm.image")
            else:
                vobj = vmms[Config.VMMS_NAME]
                if not vobj.isValidImage(job.vm.image):
                    fail("validateJob: Image not found: %s" % job.vm.image)
                else:
                    job.vm.name = os.path.splitext(job.vm.image)[0]

            if not job.vm.vmms:
                fail("validateJob: Missing job.vm.vmms")
            else:
                if job.vm.vmms not in vmms:
                    fail("validateJob: Invalid vmms name: %s" % job.vm.vmms)

        # Check the output file
        if not job.outputFile:
            fail("validateJob: Missing job.outputFile")
        else:
            if not os.path.exists(os.path.dirname(job.outputFile)):
                fail("validateJob: Bad output path: %s" % job.outputFile)

        # Check for max output file size parameter
        if not job.maxOutputFileSize:
            self.log.debug(
                "validateJob: Setting job.maxOutputFileSize "
                "to default value: %d bytes", Config.MAX_OUTPUT_FILE_SIZE)
            job.maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE

        # Check the list of input files
        # NOTE(review): this loop used to re-check the *output* path once
        # per input file, which duplicated the error above and raised when
        # job.outputFile was missing. Whether inputFile.localFile should
        # be checked for existence here instead needs confirming.
        hasMakefile = False
        for inputFile in job.input:
            if not inputFile.localFile:
                fail("validateJob: Missing inputFile.localFile")

            if inputFile.destFile == 'Makefile':
                hasMakefile = True

        # Check if input files include a Makefile
        if not hasMakefile:
            fail("validateJob: Missing Makefile in input files.")

        # Check if job timeout has been set; If not set timeout to default
        if not job.timeout or job.timeout <= 0:
            self.log.debug(
                "validateJob: Setting job.timeout to"
                " default config value: %d secs", Config.RUNJOB_TIMEOUT)
            job.timeout = Config.RUNJOB_TIMEOUT

        # Any problems, return an error status
        if problems:
            summary = "validateJob: Job rejected: %d errors" % len(problems)
            self.log.error(summary)
            job.appendTrace(summary)
            return -1
        else:
            return 0
class TangoREST:
    """ TangoREST - Implements the RESTful API on top of a TangoServer.

    Each API method first validates the client key, then works on the
    directory "<COURSELABS>/<key>-<courselab>" and/or forwards the
    request to the TangoServer instance held in self.tango.
    """

    COURSELABS = Config.COURSELABS
    OUTPUT_FOLDER = "output"
    LOGFILE = Config.LOGFILE

    # Replace with choice of key store and override validateKey.
    # This key is just for testing.
    keys = Config.KEYS

    def __init__(self):
        # Configure logging once; further basicConfig calls are no-ops
        # as soon as the root logger has a handler.
        logging.basicConfig(
            filename=self.LOGFILE,
            format="%(levelname)s|%(asctime)s|%(name)s|%(message)s",
            level=Config.LOGLEVEL
        )

        # Instantiate the configured VMMS backend. The import is done
        # lazily so only the selected backend module is loaded.
        vmms = None
        if Config.VMMS_NAME == "localSSH":
            from vmms.localSSH import LocalSSH
            vmms = LocalSSH()
        elif Config.VMMS_NAME == "tashiSSH":
            from vmms.tashiSSH import TashiSSH
            vmms = TashiSSH()
        elif Config.VMMS_NAME == "ec2SSH":
            from vmms.ec2SSH import Ec2SSH
            vmms = Ec2SSH()
        elif Config.VMMS_NAME == "localDocker":
            from vmms.localDocker import LocalDocker
            vmms = LocalDocker()

        self.vmms = {Config.VMMS_NAME: vmms}
        self.preallocator = Preallocator(self.vmms)
        self.queue = JobQueue(self.preallocator)

        if not Config.USE_REDIS:
            # creates a local Job Manager if there is no persistent
            # memory between processes. Otherwise, JobManager will
            # be initiated separately
            # NOTE(review): the JobManager is constructed but not started
            # here, whereas TangoServer.__init__ earlier in this file
            # calls .start() on its JobManager - confirm this is intended.
            JobManager(self.queue, self.vmms, self.preallocator)

        # NOTE(review): TangoServer.__init__ as defined earlier in this
        # file takes no arguments - confirm this call matches the
        # TangoServer version actually in use.
        self.tango = TangoServer(self.queue, self.preallocator, self.vmms)

        logging.getLogger('boto').setLevel(logging.INFO)
        self.log = logging.getLogger("TangoREST")
        self.log.info("Starting RESTful Tango server")
        self.status = Status()

    def validateKey(self, key):
        """ validateKey - Validates key provided by client

        @return True if key equals one of the entries of self.keys
        """
        return any(el == key for el in self.keys)

    def getDirName(self, key, courselab):
        """ getDirName - Computes directory name
        """
        return "%s-%s" % (key, courselab)

    def getDirPath(self, key, courselab):
        """ getDirPath - Computes directory path
        """
        labName = self.getDirName(key, courselab)
        return "%s/%s" % (self.COURSELABS, labName)

    def getOutPath(self, key, courselab):
        """ getOutPath - Computes output directory path
        """
        labPath = self.getDirPath(key, courselab)
        return "%s/%s" % (labPath, self.OUTPUT_FOLDER)

    def computeMD5(self, directory):
        """ computeMD5 - Computes the MD5 hash of given files in the
        given directory

        Entries that cannot be opened or read (IOError) are skipped.

        @return list of {'md5': <hex digest>, 'localFile': <entry name>}
        """
        result = []
        for elem in os.listdir(directory):
            try:
                # Binary mode: hash the exact bytes on disk, and close
                # the handle even if the read fails.
                with open("%s/%s" % (directory, elem), "rb") as fh:
                    body = fh.read()
            except IOError:
                continue
            md5hash = hashlib.md5(body).hexdigest()
            result.append({'md5': md5hash, 'localFile': elem})
        return result

    def createTangoMachine(self, image, vmms=Config.VMMS_NAME, vmObj=None):
        """ createTangoMachine - Creates a tango machine object from image

        @param image: image name, used both as machine name and image
        @param vmms: name of the VMMS that manages the machine
        @param vmObj: optional dictionary with "cores" and "memory";
        missing entries default to 1 core and 512 (memory)
        """
        if vmObj is None:
            vmObj = {}
        return TangoMachine(
            name=image,
            vmms=vmms,
            image="%s" % (image),
            cores=vmObj.get("cores", 1),
            memory=vmObj.get("memory", 512),
            disk=None,
            network=None)

    def convertJobObj(self, dirName, jobObj):
        """ convertJobObj - Converts a dictionary into a TangoJob object

        @param dirName: name of the key-courselab directory
        @param jobObj: dictionary with the keys 'jobName', 'output_file',
        'timeout', 'files' and 'image', and optionally 'callback_url'
        """
        name = jobObj['jobName']
        outputFile = "%s/%s/%s/%s" % (self.COURSELABS,
                                      dirName,
                                      self.OUTPUT_FOLDER,
                                      jobObj['output_file'])
        timeout = jobObj['timeout']
        notifyURL = jobObj.get('callback_url')
        maxOutputFileSize = Config.MAX_OUTPUT_FILE_SIZE

        # List of input files
        inputFiles = []
        for fileObj in jobObj['files']:
            inputFiles.append(InputFile(
                localFile="%s/%s/%s" % (self.COURSELABS, dirName,
                                        fileObj['localFile']),
                destFile=fileObj['destFile']))

        # VM object
        vm = self.createTangoMachine(jobObj["image"])

        job = TangoJob(
            name=name,
            vm=vm,
            outputFile=outputFile,
            input=inputFiles,
            timeout=timeout,
            notifyURL=notifyURL,
            maxOutputFileSize=maxOutputFileSize)
        self.log.debug("inputFiles: %s" %
                       [inFile.localFile for inFile in inputFiles])
        self.log.debug("outputFile: %s" % outputFile)
        return job

    def convertTangoMachineObj(self, tangoMachine):
        """ convertVMObj - Converts a TangoMachine object into a dictionary
        """
        # May need to convert instance_id
        return {
            'network': tangoMachine.network,
            'resume': tangoMachine.resume,
            'image': tangoMachine.image,
            'memory': tangoMachine.memory,
            'vmms': tangoMachine.vmms,
            'cores': tangoMachine.cores,
            'disk': tangoMachine.disk,
            'id': tangoMachine.id,
            'name': tangoMachine.name,
        }

    def convertInputFileObj(self, inputFile):
        """ convertInputFileObj - Converts an InputFile object into a dictionary
        """
        return {
            'destFile': inputFile.destFile,
            'localFile': inputFile.localFile,
        }

    def convertTangoJobObj(self, tangoJobObj):
        """ convertTangoJobObj - Converts a TangoJob object into a dictionary
        """
        return {
            # Scalar attributes
            'retries': tangoJobObj.retries,
            'outputFile': tangoJobObj.outputFile,
            'name': tangoJobObj.name,
            'notifyURL': tangoJobObj.notifyURL,
            'maxOutputFileSize': tangoJobObj.maxOutputFileSize,
            'assigned': tangoJobObj.assigned,
            'timeout': tangoJobObj.timeout,
            'id': tangoJobObj.id,
            'trace': tangoJobObj.trace,
            # VM object
            'vm': self.convertTangoMachineObj(tangoJobObj.vm),
            # InputFile objects
            'input': [self.convertInputFileObj(inputFile)
                      for inputFile in tangoJobObj.input],
        }

    ##
    # Tango RESTful API
    ##

    def open(self, key, courselab):
        """ open - Return a list of md5 hashes for each input file in the
        key-courselab directory and make one if the directory doesn't exist
        """
        self.log.debug("Received open request(%s, %s)" % (key, courselab))
        if not self.validateKey(key):
            self.log.info("Key not recognized: %s" % key)
            return self.status.wrong_key

        labPath = self.getDirPath(key, courselab)
        try:
            if os.path.exists(labPath):
                self.log.info(
                    "Found directory for (%s, %s)" % (key, courselab))
                statusObj = self.status.found_dir
                statusObj['files'] = self.computeMD5(labPath)
                return statusObj

            # Creating the output folder also creates the lab directory
            outputPath = self.getOutPath(key, courselab)
            os.makedirs(outputPath)
            self.log.info(
                "Created directory for (%s, %s)" % (key, courselab))
            statusObj = self.status.made_dir
            statusObj["files"] = []
            return statusObj
        except Exception as e:
            self.log.error("open request failed: %s" % str(e))
            return self.status.create(-1, str(e))

    def upload(self, key, courselab, file, body):
        """ upload - Upload file as an input file in key-courselab

        The write is skipped (file_exists) only when the file already on
        disk under this name has the same MD5 as body.
        """
        self.log.debug("Received upload request(%s, %s, %s)" %
                       (key, courselab, file))
        if not self.validateKey(key):
            self.log.info("Key not recognized: %s" % key)
            return self.status.wrong_key

        labPath = self.getDirPath(key, courselab)
        try:
            if not os.path.exists(labPath):
                self.log.info(
                    "Courselab for (%s, %s) not found" % (key, courselab))
                return self.status.wrong_courselab

            # NOTE(review): file comes from the client and is joined to
            # the lab path unchecked - consider rejecting names that
            # contain path separators or "..".
            absPath = "%s/%s" % (labPath, file)
            if os.path.exists(absPath):
                # Compare against the target file only. Matching some
                # other file in the directory must not suppress the
                # upload, or the target would be left stale.
                with open(absPath, "rb") as fh:
                    existingMD5 = hashlib.md5(fh.read()).hexdigest()
                if existingMD5 == hashlib.md5(body).hexdigest():
                    return self.status.file_exists

            # Binary mode keeps the bytes on disk identical to the
            # bytes that were hashed.
            with open(absPath, "wb") as fh:
                fh.write(body)
            self.log.info(
                "Uploaded file to (%s, %s, %s)" %
                (key, courselab, file))
            return self.status.file_uploaded
        except Exception as e:
            self.log.error("upload request failed: %s" % str(e))
            return self.status.create(-1, str(e))

    def addJob(self, key, courselab, jobStr):
        """ addJob - Add the job to be processed by Tango

        @param jobStr: JSON string describing the job (see convertJobObj)
        """
        self.log.debug("Received addJob request(%s, %s, %s)" %
                       (key, courselab, jobStr))
        if not self.validateKey(key):
            self.log.info("Key not recognized: %s" % key)
            return self.status.wrong_key

        labName = self.getDirName(key, courselab)
        try:
            jobObj = json.loads(jobStr)
            job = self.convertJobObj(labName, jobObj)
            jobId = self.tango.addJob(job)
            self.log.debug("Done adding job")
            if jobId == -1:
                self.log.info("Failed to add job to tango")
                return self.status.create(-1, job.trace)
            self.log.info("Successfully added job to tango")
            result = self.status.job_added
            result['jobId'] = jobId
            return result
        except Exception as e:
            # Send the failure location to the log instead of stdout
            exc_type, exc_obj, exc_tb = sys.exc_info()
            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
            self.log.error("addJob request failed: %s" % str(e))
            self.log.debug("addJob failure: %s at %s:%s" %
                           (exc_type, fname, exc_tb.tb_lineno))
            return self.status.create(-1, str(e))

    def poll(self, key, courselab, outputFile):
        """ poll - Poll for the output file in key-courselab

        @return the content of the output file if it exists, otherwise
        the out_not_found status
        """
        self.log.debug("Received poll request(%s, %s, %s)" %
                       (key, courselab, outputFile))
        if not self.validateKey(key):
            self.log.info("Key not recognized: %s" % key)
            return self.status.wrong_key

        outputPath = self.getOutPath(key, courselab)
        outfilePath = "%s/%s" % (outputPath, outputFile)
        if os.path.exists(outfilePath):
            self.log.info("Output file (%s, %s, %s) found" %
                          (key, courselab, outputFile))
            with open(outfilePath) as output:
                return output.read()
        self.log.info("Output file (%s, %s, %s) not found" %
                      (key, courselab, outputFile))
        return self.status.out_not_found

    def info(self, key):
        """ info - Returns basic status for the Tango service such as uptime, number of jobs etc
        """
        self.log.debug("Received info request (%s)" % (key))
        if not self.validateKey(key):
            self.log.info("Key not recognized: %s" % key)
            return self.status.wrong_key

        info = self.tango.getInfo()
        result = self.status.obtained_info
        result['info'] = info
        return result

    def jobs(self, key, deadJobs):
        """ jobs - Returns the list of live jobs (deadJobs == 0) or the list of dead jobs (deadJobs == 1)

        Any other value of deadJobs yields an empty job list.
        """
        self.log.debug("Received jobs request (%s, %s)" % (key, deadJobs))
        if not self.validateKey(key):
            self.log.info("Key not recognized: %s" % key)
            return self.status.wrong_key

        jobs = list()
        result = self.status.obtained_jobs
        selector = int(deadJobs)
        if selector == 0:
            jobs = self.tango.getJobs(0)
            self.log.debug(
                "Retrieved live jobs (deadJobs = %s)" % deadJobs)
        elif selector == 1:
            jobs = self.tango.getJobs(-1)
            self.log.debug(
                "Retrieved dead jobs (deadJobs = %s)" % deadJobs)
        # TangoServer.getJobs returns None when it caught an exception
        result['jobs'] = [self.convertTangoJobObj(job)
                          for job in (jobs or [])]
        return result

    def pool(self, key, image):
        """ pool - Get information about a pool of VMs spawned from image
        """
        self.log.debug("Received pool request(%s, %s)" % (key, image))
        if not self.validateKey(key):
            self.log.info("Key not recognized: %s" % key)
            return self.status.wrong_key

        if not image or image == "" or not image.endswith(".img"):
            self.log.info("Invalid image name")
            return self.status.invalid_image
        # Pools are keyed by the image name without the ".img" suffix
        image = image[:-4]
        info = self.preallocator.getPool(image)
        if len(info["pool"]) == 0:
            self.log.info("Pool image not found: %s" % image)
            return self.status.pool_not_found
        self.log.info("Pool image found: %s" % image)
        result = self.status.obtained_pool
        result["total"] = info["pool"]
        result["free"] = info["free"]
        return result

    def prealloc(self, key, image, num, vmStr):
        """ prealloc - Create a pool of num instances spawned from image

        @param vmStr: JSON string with VM parameters, or "" for defaults
        """
        self.log.debug("Received prealloc request(%s, %s, %s)" %
                       (key, image, num))
        if not self.validateKey(key):
            self.log.info("Key not recognized: %s" % key)
            return self.status.wrong_key

        if not image or image == "" or not image.endswith(".img"):
            self.log.info("Invalid image name")
            return self.status.invalid_image
        if vmStr != "":
            vmObj = json.loads(vmStr)
            vm = self.createTangoMachine(image, vmObj=vmObj)
        else:
            vm = self.createTangoMachine(image)
        success = self.tango.preallocVM(vm, int(num))
        # TangoServer.preallocVM returns 0 on success and -1, -2 or -3
        # on failure, so every non-zero code must be reported.
        if success != 0:
            self.log.info("Failed to preallocated VMs")
            return self.status.prealloc_failed
        self.log.info("Successfully preallocated VMs")
        return self.status.preallocated

    def resetTango(self):
        """ Destroys VMs associated with this namespace. Used for admin
            purposes only.
        """
        self.log.debug("Received resetTango request.")
        self.tango.resetTango(self.vmms)