def onStatusUpdate(self, cmd):
    cmd.updateTime = time.time()
    # append the command's status to the rendernode's history
    if isFinalStatus(cmd.status):
        # only if we don't already have a command for this task
        if hasattr(cmd.renderNode, 'tasksHistory') and cmd.task.id not in cmd.renderNode.tasksHistory:
            cmd.renderNode.tasksHistory.append(cmd.task.id)
            cmd.renderNode.history.append(cmd.status)
    if cmd.status is CMD_DONE:
        # TOFIX: handle CANCEL status and update end time when cancelling a
        # job so that it can be properly cleaned
        cmd.endTime = cmd.updateTime
        cmd.computeAvgTimeByFrame()
        cmd.attempt += 1
    # autoretry
    elif cmd.status is CMD_ERROR:
        cmd.attempt += 1
        if cmd.attempt < cmd.task.maxAttempt:
            LOGGER.debug("Mark command %d for auto retry in %ds (%d/%d)" % (
                cmd.id, singletonconfig.get('CORE', 'DELAY_BEFORE_AUTORETRY'),
                cmd.attempt, cmd.task.maxAttempt))
            t = Timer(singletonconfig.get('CORE', 'DELAY_BEFORE_AUTORETRY'), self.autoretry, [cmd])
            t.start()
    elif cmd.status is CMD_ASSIGNED:
        cmd.startTime = cmd.updateTime
    elif cmd.status < CMD_ASSIGNED:
        cmd.startTime = None
def onStatusUpdate(self, cmd):
    cmd.updateTime = time.time()
    # append the command's status to the rendernode's history
    if isFinalStatus(cmd.status):
        # only if we don't already have a command for this task
        if hasattr(cmd.renderNode, 'tasksHistory') and cmd.task.id not in cmd.renderNode.tasksHistory:
            cmd.renderNode.tasksHistory.append(cmd.task.id)
            cmd.renderNode.history.append(cmd.status)
    if cmd.status is CMD_DONE:
        # TOFIX: handle CANCEL status and update end time when cancelling a
        # job so that it can be properly cleaned
        cmd.endTime = cmd.updateTime
        cmd.computeAvgTimeByFrame()
        cmd.attempt += 1
    # autoretry
    elif cmd.status is CMD_ERROR:
        cmd.attempt += 1
        LOGGER.debug("Mark command %d for auto retry in %ds (%d/%d)" % (
            cmd.id, singletonconfig.get('CORE', 'DELAY_BEFORE_AUTORETRY'),
            cmd.attempt, cmd.task.maxAttempt))
        if cmd.attempt < cmd.task.maxAttempt:
            t = Timer(singletonconfig.get('CORE', 'DELAY_BEFORE_AUTORETRY'), self.autoretry, [cmd])
            t.start()
    elif cmd.status is CMD_ASSIGNED:
        cmd.startTime = cmd.updateTime
    elif cmd.status < CMD_ASSIGNED:
        cmd.startTime = None
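# A minimal, self-contained sketch of the Timer-based auto-retry pattern used above.
# The helper names (requeue, RETRY_DELAY, MAX_ATTEMPTS) are illustrative assumptions,
# not the dispatcher's real API; the point is only that threading.Timer defers the
# requeue without blocking the status-update path.
import threading
import logging


RETRY_DELAY = 10   # seconds, stands in for CORE.DELAY_BEFORE_AUTORETRY
MAX_ATTEMPTS = 3   # stands in for task.maxAttempt


def requeue(cmd):
    # hypothetical helper: put the command back in the ready state
    logging.getLogger('main').info("re-queueing command %s", cmd['id'])
    cmd['status'] = 'READY'


def schedule_autoretry(cmd):
    cmd['attempt'] += 1
    if cmd['attempt'] < MAX_ATTEMPTS:
        t = threading.Timer(RETRY_DELAY, requeue, [cmd])
        t.daemon = True  # a pending retry should not keep the process alive
        t.start()


if __name__ == '__main__':
    schedule_autoretry({'id': 42, 'attempt': 0, 'status': 'ERROR'})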
def main():
    options = process_args()
    setup_logging(options)

    logging.getLogger('daemon').info("")
    logging.getLogger('daemon').info("-----------------------------------------------")
    logging.getLogger('daemon').info("Starting PULI server on port:%d.", settings.PORT)
    logging.getLogger('daemon').info("-----------------------------------------------")
    logging.getLogger('daemon').info(" version = %s" % settings.VERSION)
    logging.getLogger('daemon').info(" command = %s" % " ".join(sys.argv))
    logging.getLogger('daemon').info(" daemon = %r" % options.DAEMONIZE)
    logging.getLogger('daemon').info(" console = %r" % options.CONSOLE)
    logging.getLogger('daemon').info(" port = %s" % settings.PORT)

    if options.DAEMONIZE:
        daemonize(settings.RUN_AS)

    dispatcherApplication = make_dispatcher()

    periodic = tornado.ioloop.PeriodicCallback(dispatcherApplication.loop,
                                               singletonconfig.get('CORE', 'MASTER_UPDATE_INTERVAL'))
    periodic.start()

    try:
        tornado.ioloop.IOLoop.instance().start()
    except (KeyboardInterrupt, SystemExit):
        logging.getLogger('dispatcher').info("Exit event caught: closing dispatcher...")
def setup_logging(options):
    if not os.path.exists(settings.LOGDIR):
        os.makedirs(settings.LOGDIR, 0755)

    mainLog = os.path.join(settings.LOGDIR, "dispatcher.log")
    assignLog = os.path.join(settings.LOGDIR, "assign.log")

    fileHandler = logging.handlers.RotatingFileHandler(mainLog,
                                                       maxBytes=singletonconfig.get('CORE', 'LOG_SIZE'),
                                                       backupCount=singletonconfig.get('CORE', 'LOG_BACKUPS'),
                                                       encoding="UTF-8")
    assignHandler = logging.handlers.RotatingFileHandler(assignLog,
                                                         maxBytes=singletonconfig.get('CORE', 'LOG_SIZE'),
                                                         backupCount=singletonconfig.get('CORE', 'LOG_BACKUPS'),
                                                         encoding="UTF-8")

    fileHandler.setFormatter(logging.Formatter("%(asctime)s %(name)10s %(levelname)s %(message)s"))
    assignHandler.setFormatter(logging.Formatter("%(asctime)s %(name)10s %(levelname)s %(message)s"))

    logLevel = logging.DEBUG if options.DEBUG else singletonconfig.get('CORE', 'LOG_LEVEL')

    # Must be set, otherwise it would receive the statsLog data; but not higher than DEBUG,
    # otherwise we might lose some info if the config is reloaded with a higher level
    fileHandler.setLevel(logging.DEBUG)

    # Create main logger
    logging.getLogger().addHandler(fileHandler)
    logging.getLogger().setLevel(logLevel)

    # Create a specific logger for assignment information (force level to DEBUG)
    logging.getLogger('assign').addHandler(assignHandler)
    logging.getLogger('assign').setLevel(logging.DEBUG)
    logging.getLogger('assign').propagate = False  # cut the event to avoid it reaching the root log

    if options.CONSOLE and not options.DAEMONIZE:
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(logging.Formatter("%(asctime)s %(name)10s %(levelname)6s %(message)s", '%Y-%m-%d %H:%M:%S'))
        consoleHandler.setLevel(logLevel)
        logging.getLogger().addHandler(consoleHandler)

    logging.getLogger('main.dispatcher').setLevel(logLevel)
    logging.getLogger('main.webservice').setLevel(logging.ERROR)
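# A minimal illustration of why the 'assign' logger sets propagate = False above:
# records sent to the dedicated assignment logger stay with its own handler and never
# reach the root handlers (dispatcher.log / console). Handler targets here are
# illustrative only.
import logging

root = logging.getLogger()
root.addHandler(logging.StreamHandler())        # stands in for dispatcher.log
root.setLevel(logging.INFO)

assign = logging.getLogger('assign')
assign.addHandler(logging.FileHandler('assign.log'))
assign.setLevel(logging.DEBUG)
assign.propagate = False                        # cut propagation to the root logger

assign.info("written to assign.log only")
root.info("written to the root handlers only")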
def post(self):
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleCounts['add_graphs'] += 1
    try:
        nodes = self.dispatcher.handleNewGraphRequestApply(self.getBodyAsJSON())
    except Exception, e:
        logger.exception("Graph submission failed")
        raise Http500("Failed. %s" % str(e))
def __init__(self, id, name, coresNumber, speed, ip, port, ramSize,
             caracteristics=None, performance=0.0, puliversion="undefined", createDate=None):
    '''Constructs a new Rendernode.

    :parameters:
    - `name`: the name of the rendernode
    - `coresNumber`: the number of processors
    - `speed`: the speed of the processor
    '''
    self.id = int(id) if id else None
    self.name = str(name)

    self.coresNumber = int(coresNumber)
    self.ramSize = int(ramSize)
    self.licenseManager = None
    self.freeCoresNumber = int(coresNumber)
    self.usedCoresNumber = {}

    # ramSize - usedRam, i.e. the amount of RAM left when several commands run concurrently
    self.freeRam = int(ramSize)
    # the RAM available on the system (updated each ping)
    self.systemFreeRam = int(ramSize)
    self.systemSwapPercentage = 0
    self.usedRam = {}

    self.speed = speed
    self.commands = {}
    self.status = RN_UNKNOWN
    self.responseId = None
    self.host = str(ip)
    self.port = int(port)
    self.pools = []
    self.idInformed = False
    self.isRegistered = False
    self.lastAliveTime = 0
    self.httpConnection = None
    self.caracteristics = caracteristics if caracteristics else {}
    self.currentpoolshare = None
    self.performance = float(performance)
    self.history = deque(maxlen=singletonconfig.get('CORE', 'RN_NB_ERRORS_TOLERANCE'))
    self.tasksHistory = deque(maxlen=15)
    self.excluded = False

    # Init new data
    self.puliversion = puliversion
    if createDate is None:
        self.createDate = 0
    else:
        self.createDate = createDate
    self.registerDate = time.time()

    # Flag linked to the worker flag "isPaused". Handles the case when a worker is set paused
    # but a command is still running (finishing): the RN on the dispatcher must be flagged
    # not to be assigned (i.e. in the isAvailable property)
    # self.canBeAssigned = True

    if not "softs" in self.caracteristics:
        self.caracteristics["softs"] = []
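# Example instantiation, based only on the constructor signature above. The values are
# made up, and CORE.RN_NB_ERRORS_TOLERANCE must be present in the loaded config for the
# history deque to be sized.
rn = RenderNode(id=None,                 # not persisted yet, so id stays None
                name="vfxnode01:8000",
                coresNumber=16,
                speed=2.6,
                ip="vfxnode01",
                port=8000,
                ramSize=32768,           # MB
                caracteristics={"os": "linux", "softs": []},
                puliversion="1.7.0")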
def setup_logging(options):
    if not os.path.exists(settings.LOGDIR):
        os.makedirs(settings.LOGDIR, 0755)

    logFile = os.path.join(settings.LOGDIR, "dispatcher.log")

    fileHandler = logging.handlers.RotatingFileHandler(logFile,
                                                       maxBytes=singletonconfig.get('CORE', 'LOG_SIZE'),
                                                       backupCount=singletonconfig.get('CORE', 'LOG_BACKUPS'),
                                                       encoding="UTF-8")
    fileHandler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s"))

    logger = logging.getLogger()
    # logLevel = logging.DEBUG if options.DEBUG else logging.WARNING
    logLevel = logging.DEBUG if options.DEBUG else singletonconfig.get('CORE', 'LOG_LEVEL')
    logger.setLevel(logLevel)

    # Must be set, otherwise it would receive the statsLog data; but not higher than DEBUG,
    # otherwise we might lose some info if the config is reloaded with a higher level
    fileHandler.setLevel(logging.DEBUG)
    logger.addHandler(fileHandler)

    if options.CONSOLE and not options.DAEMONIZE:
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(logging.Formatter("%(asctime)s %(name)10s %(levelname)6s %(message)s", '%Y-%m-%d %H:%M:%S'))
        consoleHandler.setLevel(logLevel)
        logger.addHandler(consoleHandler)

    # Create a specific handler at DEBUG level for logging stats infos
    # if singletonconfig.get('CORE','GET_STATS'):
    #     statsFile = os.path.join(settings.LOGDIR, "stats.log")
    #     statsLogger = logging.getLogger('stats')
    #     statsLogger.setLevel(logging.DEBUG)
    #     statsLogger.addHandler(logging.handlers.RotatingFileHandler(statsFile,
    #                                                                 maxBytes=singletonconfig.get('CORE','LOG_SIZE'),
    #                                                                 backupCount=singletonconfig.get('CORE','LOG_BACKUPS')))

    logging.getLogger('dispatcher').setLevel(logLevel)
    logging.getLogger('webservice').setLevel(logging.ERROR)
def post(self):
    '''
    Reload the main config file (using singletonconfig) and update every main logger.
    '''
    try:
        singletonconfig.reload()

        logLevel = singletonconfig.get('CORE', 'LOG_LEVEL')
        logging.getLogger().setLevel(logLevel)
        logging.getLogger('main').setLevel(logLevel)
        logging.getLogger("worker").setLevel(logLevel)
    except Exception, e:
        raise Http500("Error during server reconfig: %r" % e)
def main():
    options = process_args()
    setup_logging(options)

    logging.getLogger('main').warning("")
    logging.getLogger('main').warning("-----------------------------------------------")
    logging.getLogger('main').warning("Starting PULI server on port:%d.", settings.PORT)
    logging.getLogger('main').warning("-----------------------------------------------")
    logging.getLogger('main').warning(" version = %s" % settings.VERSION)
    logging.getLogger('main').warning(" command = %s" % " ".join(sys.argv))
    logging.getLogger('main').warning(" daemon = %r" % options.DAEMONIZE)
    logging.getLogger('main').warning(" console = %r" % options.CONSOLE)
    logging.getLogger('main').warning(" port = %s" % settings.PORT)
    logging.getLogger('main').warning("--")

    if options.DAEMONIZE:
        logging.getLogger('main').warning("make current process a daemon and redirecting stdout/stderr to logfile")
        daemonize(settings.RUN_AS)
        try:
            # Redirect stdout and stderr to log file (using the first handler set in logging)
            sys.stdout = logging.getLogger().handlers[0].stream
            sys.stderr = logging.getLogger().handlers[0].stream
        except Exception:
            logging.getLogger('main').error("Unexpected error occurred when redirecting stdout/stderr to logfile")

    logging.getLogger('main').warning("creating dispatcher main application")
    server = make_dispatcher()

    # Define a periodic callback to process DB/COMPLETION/ASSIGNMENT updates
    periodic = tornado.ioloop.PeriodicCallback(server.loop, singletonconfig.get('CORE', 'MASTER_UPDATE_INTERVAL'))
    periodic.start()

    try:
        logging.getLogger('main').warning("starting tornado main loop")
        tornado.ioloop.IOLoop.instance().start()
    except (KeyboardInterrupt, SystemExit):
        server.application.shutdown()

    # If the restart flag is set (via the /restart webservice)
    if server.application.restartService:
        logging.getLogger('main').warning("Restarting service...")
        try:
            # Restart the server using a specific command
            subprocess.check_call(settings.RESTART_COMMAND.split())
        except subprocess.CalledProcessError, e:
            logging.getLogger('main').warning("Impossible to restart systemd unit (error: %s)" % e)
        except AttributeError, e:
            logging.getLogger('main').warning("Dispatcher settings do not define: RESTART_COMMAND")
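# The restart branch above only works when the settings module defines RESTART_COMMAND;
# otherwise the AttributeError handler logs a warning. A hypothetical settings entry for
# a systemd-managed install (the unit name is an assumption, not the project's):
RESTART_COMMAND = "sudo systemctl restart puliserver.service"
# subprocess.check_call(RESTART_COMMAND.split()) then executes
# ['sudo', 'systemctl', 'restart', 'puliserver.service'] and raises CalledProcessError
# on a non-zero exit code, which is what the warning branch above reports.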
def prepare(self):
    """ For each request, update stats if needed """
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleCounts['incoming_requests'] += 1
        if self.request.method == 'GET':
            singletonstats.theStats.cycleCounts['incoming_get'] += 1
        elif self.request.method == 'POST':
            singletonstats.theStats.cycleCounts['incoming_post'] += 1
        elif self.request.method == 'PUT':
            singletonstats.theStats.cycleCounts['incoming_put'] += 1
        elif self.request.method == 'DELETE':
            singletonstats.theStats.cycleCounts['incoming_delete'] += 1
def onStatusUpdate(self, cmd):
    cmd.updateTime = time.time()
    # append the command's status to the rendernode's history
    if isFinalStatus(cmd.status):
        # only if we don't already have a command for this task
        if hasattr(cmd.renderNode, 'tasksHistory') and cmd.task.id not in cmd.renderNode.tasksHistory:
            cmd.renderNode.tasksHistory.append(cmd.task.id)
            cmd.renderNode.history.append(cmd.status)
    if cmd.status is CMD_DONE:
        # TOFIX: handle CANCEL status and update end time when cancelling a
        # job so that it can be properly cleaned
        cmd.endTime = cmd.updateTime
        cmd.computeAvgTimeByFrame()
    # autoretry
    elif cmd.status is CMD_ERROR:
        if cmd.retryCount == singletonconfig.get('CORE', 'MAX_RETRY_CMD_COUNT'):
            cmd.retryRnList.append(cmd.renderNode.name)
        elif cmd.retryCount < singletonconfig.get('CORE', 'MAX_RETRY_CMD_COUNT'):
            t = Timer(singletonconfig.get('CORE', 'DELAY_BEFORE_AUTORETRY'), self.autoretry, [cmd])
            t.start()
    elif cmd.status is CMD_ASSIGNED:
        cmd.startTime = cmd.updateTime
    elif cmd.status < CMD_ASSIGNED:
        cmd.startTime = None
def prepare(self):
    """ For each request, update stats if needed """
    self.startTime = time.time()

    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleCounts['incoming_requests'] += 1
        if self.request.method == 'GET':
            singletonstats.theStats.cycleCounts['incoming_get'] += 1
        elif self.request.method == 'POST':
            singletonstats.theStats.cycleCounts['incoming_post'] += 1
        elif self.request.method == 'PUT':
            singletonstats.theStats.cycleCounts['incoming_put'] += 1
        elif self.request.method == 'DELETE':
            singletonstats.theStats.cycleCounts['incoming_delete'] += 1
def post(self):
    try:
        singletonconfig.reload()

        # FIXME: we are forced to change the log level of every logger of the project...
        # It should be possible to set them all at once
        logLevel = singletonconfig.get('CORE', 'LOG_LEVEL')
        logging.getLogger().setLevel(logLevel)
        logging.getLogger("cmdwatcher").setLevel(logLevel)
        logging.getLogger("command").setLevel(logLevel)
        logging.getLogger("dispatcher").setLevel(logLevel)
        logging.getLogger("framework").setLevel(logLevel)
        logging.getLogger("model").setLevel(logLevel)
        logging.getLogger("poolshares").setLevel(logLevel)
        logging.getLogger("process").setLevel(logLevel)
        logging.getLogger("worker").setLevel(logLevel)
        logging.getLogger("workerws").setLevel(logLevel)

        # All the loggers of the application:
        #   root
        #   cmdwatcher
        #   command
        #   dispatcher
        #   dispatcher.dispatchtree
        #   dispatcher.webservice
        #   dispatcher.webservice.editController
        #   dispatcher.webservice.NodeController
        #   dispatcher.webservice.PoolController
        #   dispatcher.webservice.queryController
        #   dispatcher.webservice.TaskController
        #   framework
        #   framework.application
        #   framework.webservice
        #   model
        #   model.task
        #   poolshares
        #   process
        #   userview
        #   worker
        #   worker.CmdThreader
        #   workerws
    except Exception, e:
        raise Http500("Error during server reconfig: %r" % e)
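# The FIXME above asks for a way to change every logger of the project in one pass.
# One possible sketch, using the logging module's logger registry; this is not what
# the project does, just a way the FIXME could be addressed:
import logging


def set_global_log_level(logLevel):
    logging.getLogger().setLevel(logLevel)
    for name, item in logging.Logger.manager.loggerDict.items():
        # skip PlaceHolder entries, which are not actual loggers
        if isinstance(item, logging.Logger):
            item.setLevel(logLevel)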
def aggregate(self):
    """
    | Called each cycle to store data in a buffer array
    | Once every BUFFER_SIZE cycles, the data is dumped in stats.log for later use
    """
    cycleData = [
        self.cycleDate,
        copy(self.cycleTimers),
        copy(self.cycleCounts),
        copy(self.assignmentTimers)
    ]
    self.accumulationBuffer.append(cycleData)

    # Clean data for next cycle (only counts need to be cleaned, timers are overwritten)
    self._resetCounts()

    # Dump to file
    if singletonconfig.get('CORE', 'STATS_BUFFER_SIZE') <= len(self.accumulationBuffer):
        self._flush()

    return True
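# _flush() is not shown in this excerpt. A minimal sketch of what it plausibly does,
# assuming each buffered cycle is serialised as one JSON line on the dedicated stats
# logger (the real implementation may differ):
import json
import logging


def _flush(self):
    statsLog = logging.getLogger('server_stats')
    for cycleData in self.accumulationBuffer:
        # cycleData = [cycleDate, cycleTimers, cycleCounts, assignmentTimers]
        statsLog.log(1, json.dumps(cycleData))
    self.accumulationBuffer = []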
def put(self, computerName, commandId):
    '''Update command `commandId` running on rendernode `renderNodeId`.

    Returns "200 OK" on success, or "404 Not Found" if the provided json data is not valid.
    '''
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleCounts['update_commands'] += 1

    computerName = computerName.lower()

    # try:
    #     updateDict = self.sanitizeUpdateDict(self.getBodyAsJSON())
    # except TypeError, e:
    #     return Http400(repr(e.args))
    updateDict = self.getBodyAsJSON()
    updateDict['renderNodeName'] = computerName

    try:
        self.framework.application.updateCommandApply(updateDict)
    except KeyError, e:
        return Http404(str(e))
def request(self, method, url, body=None, headers={}):
    """
    """
    # from octopus.dispatcher import settings
    LOGGER.debug("Send request to RN: http://%s:%s%s %s (%s)" % (self.host, self.port, url, method, headers))

    err = None
    conn = self.getHTTPConnection()
    # try to process the request at most RENDERNODE_REQUEST_MAX_RETRY_COUNT times.
    for i in xrange(singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_MAX_RETRY_COUNT')):
        try:
            conn.request(method, url, body, headers)
            response = conn.getresponse()
            if response.length:
                data = response.read(response.length)
            else:
                data = None
            # request succeeded
            conn.close()
            return (response, data)
        except http.socket.error, e:
            err = e
            LOGGER.debug("socket error %r" % e)
            try:
                conn.close()
            except:
                pass
            if e in (errno.ECONNREFUSED, errno.ENETUNREACH):
                raise self.RequestFailed(cause=e)
        except http.HTTPException, e:
            err = e
            LOGGER.debug("HTTPException %r" % e)
            try:
                conn.close()
            except:
                pass
    LOGGER.exception("rendernode.request failed")
def request(self, method, url, body=None, headers={}):
    """
    """
    # from octopus.dispatcher import settings
    # LOGGER.debug("Send request to RN: http://%s:%s%s %s (%s)" % (self.host, self.port, url, method, headers))

    err = None
    conn = self.getHTTPConnection()
    # try to process the request at most RENDERNODE_REQUEST_MAX_RETRY_COUNT times.
    for i in xrange(singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_MAX_RETRY_COUNT')):
        try:
            conn.request(method, url, body, headers)
            response = conn.getresponse()
            if response.length:
                data = response.read(response.length)
            else:
                data = None
            # request succeeded
            conn.close()
            return (response, data)
        except http.socket.error, e:
            err = e
            # LOGGER.debug("socket error %r" % e)
            try:
                conn.close()
            except:
                pass
            if e in (errno.ECONNREFUSED, errno.ENETUNREACH):
                raise self.RequestFailed(cause=e)
        except http.HTTPException, e:
            err = e
            # LOGGER.debug("HTTPException %r" % e)
            try:
                conn.close()
            except:
                pass
    LOGGER.exception("rendernode.request failed")
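# Example use of request() above to query a worker; the '/commands/' URL and the
# Accept header are illustrative, not taken from the worker's documented API, and
# `rn` is assumed to be an existing RenderNode instance.
try:
    response, data = rn.request('GET', '/commands/', headers={'Accept': 'application/json'})
    if response.status == 200 and data:
        print "worker answered: %s" % data
except rn.RequestFailed, err:
    print "worker unreachable: %r" % err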
def put(self, computerName, commandId):
    '''Update command `commandId` running on rendernode `renderNodeId`.

    Returns "200 OK" on success, or "404 Not Found" if the provided json data is not valid.
    '''
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleCounts['update_commands'] += 1

    computerName = computerName.lower()

    # try:
    #     updateDict = self.sanitizeUpdateDict(self.getBodyAsJSON())
    # except TypeError, e:
    #     return Http400(repr(e.args))
    updateDict = self.getBodyAsJSON()
    updateDict['renderNodeName'] = computerName

    try:
        self.framework.application.updateCommandApply(updateDict)
    except (KeyError, IndexError) as e:
        raise Http404(str(e))
    except Exception, e:
        raise Http500("Exception during command update")
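# A plausible client-side call for the PUT handler above. The dispatcher host, port,
# URL pattern and body fields ('status', 'completion', 'message') are assumptions for
# illustration; only 'renderNodeName' is known to be added server-side.
import json
import httplib

body = json.dumps({"status": 3, "completion": 0.5, "message": "rendering frame 12"})
conn = httplib.HTTPConnection("pulimaster", 8004)
conn.request("PUT", "/rendernodes/vfxnode01:8000/commands/1234/", body,
             {"Content-Type": "application/json"})
print conn.getresponse().status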
def computeAssignments(self):
    '''Computes and returns a list of (rendernode, command) assignments.'''
    LOGGER = logging.getLogger('main')

    from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask

    # if no rendernodes available, return
    if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()):
        return []

    assignments = []

    # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused
    # and that have at least one command ready
    # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely...
    entryPoints = set([poolShare.node for poolShare in self.dispatchTree.poolShares.values()
                       if poolShare.node.status not in [NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED]
                       and poolShare.node.readyCommandCount > 0
                       and poolShare.node.name != 'graphs'])

    # don't proceed to the calculation if no RNs are available in the requested pools
    rnsBool = False
    for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool):
        rnsAvailables = set([rn for rn in pool.renderNodes
                             if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]])
        if len(rnsAvailables):
            rnsBool = True
    if not rnsBool:
        return []

    # Log time updating max rn
    prevTimer = time.time()

    # sort by pool for the groupby
    entryPoints = sorted(entryPoints, key=lambda node: node.poolShares.values()[0].pool)

    # update the value of the maxrn for the poolshares (parallel dispatching)
    for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool):
        # we are treating every active node of the pool
        nodesList = [node for node in nodesiterator]

        # the new maxRN value is calculated based on the number of active jobs of the pool,
        # and the number of online rendernodes of the pool
        rnsNotOffline = set([rn for rn in pool.renderNodes
                             if rn.status not in [RN_UNKNOWN, RN_PAUSED]])
        rnsSize = len(rnsNotOffline)
        # LOGGER.debug("@ - nb rns awake:%r" % (rnsSize))

        # if we have a userdefined maxRN for some nodes, remove them from the list
        # and subtract their maxRN from the pool's size
        l = nodesList[:]  # duplicate the list to be safe when removing elements
        for node in l:
            # LOGGER.debug("@ - checking userDefMaxRN: %s -> %r maxRN=%d" % (node.name, node.poolShares.values()[0].userDefinedMaxRN, node.poolShares.values()[0].maxRN))
            if node.poolShares.values()[0].userDefinedMaxRN and node.poolShares.values()[0].maxRN not in [-1, 0]:
                # LOGGER.debug("@ removing: %s -> maxRN=%d" % (node.name, node.poolShares.values()[0].maxRN))
                nodesList.remove(node)
                rnsSize -= node.poolShares.values()[0].maxRN
        # LOGGER.debug("@ - nb rns awake after maxRN:%d" % (rnsSize))

        if len(nodesList) == 0:
            continue

        # Prepare updatedMaxRN with dispatch key proportions
        dkList = []          # list of dks (integer only)
        dkPositiveList = []  # normalized list of dks (the min value of dk becomes 1, higher elements of dkList get a proportional value)
        nbJobs = len(nodesList)  # number of jobs in the current pool
        nbRNAssigned = 0         # number of render nodes assigned for this pool

        for node in nodesList:
            dkList.append(node.dispatchKey)

        dkMin = min(dkList)
        dkPositiveList = map(lambda x: x - dkMin + 1, dkList)
        dkSum = sum(dkPositiveList)

        # sort by id (fifo)
        nodesList = sorted(nodesList, key=lambda x: x.id)
        # then sort by dispatchKey (priority)
        nodesList = sorted(nodesList, key=lambda x: x.dispatchKey, reverse=True)

        for dk, nodeIterator in groupby(nodesList, lambda x: x.dispatchKey):
            nodes = [node for node in nodeIterator]
            dkPos = dkPositiveList[dkList.index(dk)]

            if dkSum > 0:
                updatedmaxRN = int(round(rnsSize * (dkPos / float(dkSum))))
            else:
                updatedmaxRN = int(round(rnsSize / float(nbJobs)))

            for node in nodes:
                node.poolShares.values()[0].maxRN = updatedmaxRN
                nbRNAssigned += updatedmaxRN

        # Add remaining RNs to most important jobs
        unassignedRN = rnsSize - nbRNAssigned
        while unassignedRN > 0:
            for node in nodesList:
                if unassignedRN > 0:
                    node.poolShares.values()[0].maxRN += 1
                    unassignedRN -= 1
                else:
                    break

    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.assignmentTimers['update_max_rn'] = time.time() - prevTimer
    LOGGER.info("%8.2f ms --> .... updating max RN values", (time.time() - prevTimer) * 1000)

    # now, we are treating every node
    # sort by id (fifo)
    entryPoints = sorted(entryPoints, key=lambda node: node.id)
    # then sort by dispatchKey (priority)
    entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True)

    # Put nodes with a userDefinedMaxRN first
    userDefEntryPoints = ifilter(lambda node: node.poolShares.values()[0].userDefinedMaxRN, entryPoints)
    standardEntryPoints = ifilter(lambda node: not node.poolShares.values()[0].userDefinedMaxRN, entryPoints)
    scoredEntryPoints = chain(userDefEntryPoints, standardEntryPoints)

    # Log time dispatching RNs
    prevTimer = time.time()

    # Iterate over each entryPoint to get an assignment
    for entryPoint in scoredEntryPoints:
        if any([poolShare.hasRenderNodesAvailable() for poolShare in entryPoint.poolShares.values()]):
            try:
                for (rn, com) in entryPoint.dispatchIterator(lambda: self.queue.qsize() > 0):
                    assignments.append((rn, com))
                    # increment the allocatedRN for the poolshare
                    poolShare.allocatedRN += 1
                    # save the active poolshare of the rendernode
                    rn.currentpoolshare = poolShare
            except NoRenderNodeAvailable:
                pass
            except NoLicenseAvailableForTask:
                LOGGER.info("Missing license for node \"%s\" (other commands can start anyway)." % entryPoint.name)
                pass

    assignmentDict = collections.defaultdict(list)
    for (rn, com) in assignments:
        assignmentDict[rn].append(com)

    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.assignmentTimers['dispatch_command'] = time.time() - prevTimer
    LOGGER.info("%8.2f ms --> .... dispatching commands", (time.time() - prevTimer) * 1000)

    #
    # Check replacements
    #
    # - do a pass over the jobs that did not get their fair share
    # - identify the killable jobs in their pool
    # - for each resource, if it matches: drop the running job AND disable its killable attribute

    #
    # Backfill
    #
    # TODO: do another pass for jobs having a "killable" attribute and at least one additional pool

    return assignmentDict.items()
def mainLoop(self):
    '''
    | Dispatcher main loop iteration.
    | Periodically called with tornado's internal callback mechanism, the frequency is defined by config: CORE.MASTER_UPDATE_INTERVAL
    | During this process, the dispatcher will:
    |   - update completion and status for all jobs in dispatchTree
    |   - update status of renderNodes
    |   - validate inter tasks dependencies
    |   - update the DB with recorded changes in the model
    |   - compute new assignments and send them to the proper rendernodes
    |   - release all finished jobs/rns
    '''
    log = logging.getLogger('main')
    loopStartTime = time.time()
    prevTimer = loopStartTime

    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleDate = loopStartTime

    log.info("-----------------------------------------------------")
    log.info(" Start dispatcher process cycle (old version).")

    try:
        self.threadPool.poll()
    except NoResultsPending:
        pass
    else:
        log.info("finished some network requests")
        pass

    self.cycle += 1

    # Update of allocation is done when parsing the tree for completion and status update
    # (done partially for invalidated nodes only, i.e. when needed)
    self.dispatchTree.updateCompletionAndStatus()
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleTimers['update_tree'] = time.time() - prevTimer
    log.info("%8.2f ms --> update completion status" % ((time.time() - prevTimer) * 1000))
    prevTimer = time.time()

    # Update render nodes
    self.updateRenderNodes()
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleTimers['update_rn'] = time.time() - prevTimer
    log.info("%8.2f ms --> update render node" % ((time.time() - prevTimer) * 1000))
    prevTimer = time.time()

    # Validate dependencies
    self.dispatchTree.validateDependencies()
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleTimers['update_dependencies'] = time.time() - prevTimer
    log.info("%8.2f ms --> validate dependencies" % ((time.time() - prevTimer) * 1000))
    prevTimer = time.time()

    # Update db
    self.updateDB()
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleTimers['update_db'] = time.time() - prevTimer
    log.info("%8.2f ms --> update DB" % ((time.time() - prevTimer) * 1000))
    prevTimer = time.time()

    # Compute and send command assignments to rendernodes
    assignments = self.computeAssignments()
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleTimers['compute_assignment'] = time.time() - prevTimer
    log.info("%8.2f ms --> compute assignments." % ((time.time() - prevTimer) * 1000))
    prevTimer = time.time()

    self.sendAssignments(assignments)
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleTimers['send_assignment'] = time.time() - prevTimer
        singletonstats.theStats.cycleCounts['num_assignments'] = len(assignments)
    log.info("%8.2f ms --> send %r assignments." % ((time.time() - prevTimer) * 1000, len(assignments)))
    prevTimer = time.time()

    # Call the release finishing status on all rendernodes
    for renderNode in self.dispatchTree.renderNodes.values():
        renderNode.releaseFinishingStatus()
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleTimers['release_finishing'] = time.time() - prevTimer
    log.info("%8.2f ms --> releaseFinishingStatus" % ((time.time() - prevTimer) * 1000))
    prevTimer = time.time()

    loopDuration = (time.time() - loopStartTime) * 1000
    log.info("%8.2f ms --> cycle ended." % loopDuration)

    #
    # Send stat data to disk
    #
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleTimers['time_elapsed'] = time.time() - loopStartTime
        singletonstats.theStats.aggregate()
# _request = "http://%s:%s/stats" % (options.hostname, options.port)
_logPath = os.path.join(options.outputFile)

# fileHandler = logging.handlers.RotatingFileHandler(_logPath,
#                                                    maxBytes=20000000,
#                                                    backupCount=1,
#                                                    encoding="UTF-8")
fileHandler = logging.FileHandler(_logPath, encoding="UTF-8")
fileHandler.setFormatter(logging.Formatter('%(message)s'))

statsLogger = logging.getLogger('stats')
statsLogger.addHandler(fileHandler)
statsLogger.setLevel(singletonconfig.get('CORE', 'LOG_LEVEL'))

http_client = HTTPClient()
try:
    response = http_client.fetch(_request)
    if response.error:
        print "Error: %s" % response.error
        print " %s" % response.body
    else:
        if response.body == "":
            print "Error: No stats retrieved"
        else:
            tmp = json.loads(response.body)
            del tmp["jobs"]
            del tmp["commands"]
def computeAssignments(self):
    '''Computes and returns a list of (rendernode, command) assignments.'''
    LOGGER = logging.getLogger('main')

    from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask

    # if no rendernodes available, return
    if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()):
        return []

    # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused
    # and that have at least one command ready
    # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely...
    entryPoints = set([poolShare.node for poolShare in self.dispatchTree.poolShares.values()
                       if poolShare.node.status not in (NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED)
                       and poolShare.node.readyCommandCount > 0
                       and poolShare.node.name != 'graphs'])

    # don't proceed to the calculation if no render nodes are available in the requested pools
    isRenderNodesAvailable = False
    for pool, jobsIterator in groupby(entryPoints, lambda x: x.mainPoolShare().pool):
        renderNodesAvailable = set([rn for rn in pool.renderNodes
                                    if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]])
        if len(renderNodesAvailable):
            isRenderNodesAvailable = True
            break
    if not isRenderNodesAvailable:
        return []

    # Log time updating max rn
    prevTimer = time.time()

    # sort by pool for the groupby
    entryPoints = sorted(entryPoints, key=lambda node: node.mainPoolShare().pool)

    # update the value of the maxrn for the poolshares (parallel dispatching)
    for pool, jobsIterator in groupby(entryPoints, lambda x: x.mainPoolShare().pool):
        # we are treating every active job of the pool
        jobsList = [job for job in jobsIterator]

        # the new maxRN value is calculated based on the number of active jobs of the pool,
        # and the number of online rendernodes of the pool
        onlineRenderNodes = set([rn for rn in pool.renderNodes
                                 if rn.status not in [RN_UNKNOWN, RN_PAUSED]])
        nbOnlineRenderNodes = len(onlineRenderNodes)
        # LOGGER.debug("@ - nb rns awake:%r" % (nbOnlineRenderNodes))

        # if we have a userdefined maxRN for some nodes, remove them from the list
        # and subtract their maxRN from the pool's size
        l = jobsList[:]  # duplicate the list to be safe when removing elements
        for job in l:
            # LOGGER.debug("@ - checking userDefMaxRN: %s -> %r maxRN=%d" % (job.name, job.mainPoolShare().userDefinedMaxRN, job.mainPoolShare().maxRN))
            if job.mainPoolShare().userDefinedMaxRN and job.mainPoolShare().maxRN not in [-1, 0]:
                # LOGGER.debug("@ removing: %s -> maxRN=%d" % (job.name, job.mainPoolShare().maxRN))
                jobsList.remove(job)
                nbOnlineRenderNodes -= job.mainPoolShare().maxRN
        # LOGGER.debug("@ - nb rns awake after maxRN:%d" % (nbOnlineRenderNodes))

        if len(jobsList) == 0:
            continue

        # Prepare updatedMaxRN with dispatch key proportions
        # list of dks (integer only)
        dkList = [job.dispatchKey for job in jobsList]
        nbJobs = len(jobsList)  # number of jobs in the current pool
        nbRNAssigned = 0        # number of render nodes assigned for this pool

        dkMin = min(dkList)
        # dkPositiveList: shift all dk values so that the min value of dk becomes 1
        dkPositiveList = map(lambda x: x - dkMin + 1, dkList)  # dk values start at 1
        dkSum = sum(dkPositiveList)

        # sort by id (fifo)
        jobsList = sorted(jobsList, key=lambda x: x.id)
        # then sort by dispatchKey (priority)
        jobsList = sorted(jobsList, key=lambda x: x.dispatchKey, reverse=True)

        for dk, jobIterator in groupby(jobsList, lambda x: x.dispatchKey):
            jobs = [job for job in jobIterator]

            # dkPositive: shift all dk values so that the min value of dk becomes 1
            dkPositive = dk - dkMin + 1

            # Proportion of render nodes for this group of jobs
            updatedmaxRN = int(round(nbOnlineRenderNodes * (dkPositive / float(dkSum))))

            for job in jobs:
                job.mainPoolShare().maxRN = updatedmaxRN
                nbRNAssigned += updatedmaxRN

        # PRA: Here is the main choice!
        # Add remaining RNs to most important jobs (to fix rounding errors)
        unassignedRN = nbOnlineRenderNodes - nbRNAssigned
        while unassignedRN > 0:
            for job in jobsList:
                if unassignedRN <= 0:
                    break
                job.mainPoolShare().maxRN += 1
                unassignedRN -= 1

    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.assignmentTimers['update_max_rn'] = time.time() - prevTimer
    LOGGER.info("%8.2f ms --> .... updating max RN values", (time.time() - prevTimer) * 1000)

    # now, we are treating every node
    # sort by id (fifo)
    entryPoints = sorted(entryPoints, key=lambda node: node.id)
    # then sort by dispatchKey (priority)
    entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True)

    # Put nodes with a userDefinedMaxRN first
    userDefEntryPoints = ifilter(lambda node: node.mainPoolShare().userDefinedMaxRN, entryPoints)
    standardEntryPoints = ifilter(lambda node: not node.mainPoolShare().userDefinedMaxRN, entryPoints)
    scoredEntryPoints = chain(userDefEntryPoints, standardEntryPoints)

    # Log time dispatching RNs
    prevTimer = time.time()

    # Iterate over each entryPoint to get an assignment
    assignments = []  # list of (renderNode, Command)
    for entryPoint in scoredEntryPoints:
        # If we have dedicated render nodes for this poolShare
        if not any([poolShare.hasRenderNodesAvailable() for poolShare in entryPoint.poolShares.values()]):
            continue

        for (rn, com) in entryPoint.dispatchIterator(lambda: self.queue.qsize() > 0):
            assignments.append((rn, com))
            # increment the allocatedRN for the poolshare
            entryPoint.mainPoolShare().allocatedRN += 1
            # save the active poolshare of the rendernode
            rn.currentpoolshare = entryPoint.mainPoolShare()

    assignmentDict = collections.defaultdict(list)
    for (rn, com) in assignments:
        assignmentDict[rn].append(com)

    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.assignmentTimers['dispatch_command'] = time.time() - prevTimer
    LOGGER.info("%8.2f ms --> .... dispatching commands", (time.time() - prevTimer) * 1000)

    #
    # Check replacements
    #
    # - do a pass over the jobs that did not get their fair share
    # - identify the killable jobs in their pool
    # - for each resource, if it matches: drop the running job AND disable its killable attribute

    #
    # Backfill
    #
    # TODO: do another pass for jobs having a "killable" attribute and at least one additional pool

    return assignmentDict.items()
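# Standalone illustration of the dispatch-key arithmetic above: keys are shifted so
# the smallest becomes 1, each job gets a share of the online render nodes
# proportional to its shifted key, and rounding leftovers are handed to the highest
# keys first. Values are made up.
def share_render_nodes(dispatch_keys, nb_online_rns):
    dk_min = min(dispatch_keys)
    positive = [dk - dk_min + 1 for dk in dispatch_keys]
    dk_sum = float(sum(positive))
    shares = [int(round(nb_online_rns * (dk / dk_sum))) for dk in positive]

    # hand out rounding leftovers, highest dispatch key first (negative leftovers,
    # i.e. over-allocation from rounding, are left as-is, as in the code above)
    leftover = nb_online_rns - sum(shares)
    order = sorted(range(len(dispatch_keys)), key=lambda i: dispatch_keys[i], reverse=True)
    i = 0
    while leftover > 0:
        shares[order[i % len(order)]] += 1
        leftover -= 1
        i += 1
    return shares


print share_render_nodes([0, 0, 10], 26)  # -> [2, 2, 22]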
def post(self, computerName):
    """
    A worker sends a request to get registered on the server.
    """
    if singletonconfig.get('CORE', 'GET_STATS'):
        singletonstats.theStats.cycleCounts['add_rns'] += 1

    computerName = computerName.lower()

    if computerName.startswith(('1', '2')):
        return Http403(message="Cannot register a RenderNode without a name",
                       content="Cannot register a RenderNode without a name")

    dct = self.getBodyAsJSON()

    if computerName in self.getDispatchTree().renderNodes:
        # When the registering worker is already listed in the RN list
        logger.warning("RenderNode already registered: %s" % computerName)
        existingRN = self.getDispatchTree().renderNodes[computerName]

        if 'commands' not in dct:
            # No commands in current RN, reset commands that might still be assigned to this RN
            existingRN.reset()
        else:
            logger.warning("Reset commands that are assigned to this RN: %r" % dct.get('commands', '-'))
            for cmdId in dct['commands']:
                existingRN.commands[cmdId] = self.getDispatchTree().commands[cmdId]

        if 'status' in dct:
            existingRN.status = int(dct['status'])

        return HttpResponse(304, "RenderNode already registered.")
    else:
        # Add a new worker (and set infos given in request body)
        for key in ('name', 'port', 'status', 'cores', 'speed', 'ram', 'pools', 'caracteristics'):
            if not key in dct:
                return Http400("Missing key %r" % key, content="Missing key %r" % key)

        port = int(dct['port'])
        status = int(dct['status'])
        if status not in (RN_UNKNOWN, RN_PAUSED, RN_IDLE, RN_BOOTING):
            # FIXME: CONFLICT is not a good value maybe
            return HttpConflict("Unallowed status for RenderNode registration")

        cores = int(dct['cores'])
        speed = float(dct['speed'])
        ram = int(dct['ram'])
        pools = dct['pools']
        caracteristics = dct['caracteristics']

        name, port = computerName.split(":", 1)
        puliversion = dct.get('puliversion', "unknown")
        createDate = dct.get('createDate', time.time())

        renderNode = RenderNode(None, computerName, cores, speed, name, port, ram,
                                caracteristics, puliversion=puliversion, createDate=createDate)
        renderNode.status = status

        poolList = []
        # check the existence of the pools
        for poolName in pools:
            try:
                pool = self.getDispatchTree().pools[poolName]
                poolList.append(pool)
            except KeyError:
                return HttpConflict("Pool %s is not a registered pool", poolName)

        # add the rendernode to the pools
        for pool in poolList:
            pool.addRenderNode(renderNode)

        # add the rendernode to the list of rendernodes
        renderNode.pools = poolList
        self.getDispatchTree().renderNodes[renderNode.name] = renderNode
        self.writeCallback(json.dumps(renderNode.to_json()))
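# Example body for the registration request handled above; it contains every key the
# handler requires, plus the optional 'puliversion'. Values are illustrative, and the
# numeric RN_* status constants are not shown in this excerpt, so the symbolic name is
# used here.
registration = {
    "name": "vfxnode01:8000",
    "port": 8000,
    "status": RN_IDLE,        # must be RN_UNKNOWN, RN_PAUSED, RN_IDLE or RN_BOOTING
    "cores": 16,
    "speed": 2.6,
    "ram": 32768,
    "pools": ["default"],
    "caracteristics": {"os": "linux", "softs": []},
    "puliversion": "1.7.0",   # optional, defaults to "unknown"
}
# The dict is sent as the JSON body of a POST to the rendernode registration URL
# (the exact route is defined elsewhere in the webservice mapping).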
from octopus.dispatcher import settings

#
# Load a specific logger for collecting stats.
#
# Custom level to avoid flooding the main loggers.
# We use a logger and a handler with a very low level to ensure they always receive
# messages, even if the log level is changed via the config file and reloaded.
statsLog = logging.getLogger('server_stats')
statsLog.setLevel(1)
statsLog.propagate = False

try:
    hd = logging.handlers.RotatingFileHandler(os.path.join(settings.LOGDIR, "stats.log"),
                                              maxBytes=singletonconfig.get('CORE', 'STATS_SIZE'),
                                              backupCount=0)
    hd.setFormatter(logging.Formatter('%(message)s'))
    hd.setLevel(1)
    statsLog.addHandler(hd)
except IOError as err:
    print "Warning: invalid path specified for log."


class DispatcherStats():
    """
    | Class holding custom infos on the dispatcher.
    | This data can be periodically flushed in a specific log file for later use
    """
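# Why level 1: the handler above accepts any record with a level >= 1, so stats
# records keep flowing even after a runtime reconfiguration raises every other
# logger's level. Records are emitted with the same very low level; the field names
# below are illustrative only.
import json

statsLog.log(1, json.dumps({"cycleDate": 1400000000.0, "num_assignments": 12}))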
print "Command arguments: %s" % args print "Query: %s"+_request # fileHandler = logging.handlers.RotatingFileHandler( _logPath, # maxBytes=20000000, # backupCount=1, # encoding="UTF-8") fileHandler = logging.FileHandler( _logPath, encoding="UTF-8") fileHandler.setFormatter( logging.Formatter('%(message)s') ) statsLogger = logging.getLogger('stats') statsLogger.addHandler( fileHandler ) statsLogger.setLevel( singletonconfig.get('CORE','LOG_LEVEL') ) http_client = HTTPClient() try: response = http_client.fetch( _request ) if response.error: print "Error: %s" % response.error print " %s" % response.body else: if response.body == "": print "Error: No stats retrieved" else: data = json.loads(response.body)
def getHTTPConnection(self):
    timeout = singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_TIMEOUT', 5)
    return http.HTTPConnection(self.host, self.port, timeout=timeout)
def computeAssignments(self): '''Computes and returns a list of (rendernode, command) assignments.''' from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask # if no rendernodes available, return if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()): return [] assignments = [] # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely... entryPoints = set([poolShare.node for poolShare in self.dispatchTree.poolShares.values() if poolShare.node.status not in [NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED] and poolShare.node.readyCommandCount > 0 and poolShare.node.name != 'graphs']) # don't proceed to the calculation if no rns availables in the requested pools rnsBool = False for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool): rnsAvailables = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]]) if len(rnsAvailables): rnsBool = True if not rnsBool: return [] # Log time updating max rn prevTimer = time.time() # sort by pool for the groupby entryPoints = sorted(entryPoints, key=lambda node: node.poolShares.values()[0].pool) # update the value of the maxrn for the poolshares (parallel dispatching) for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool): # we are treating every active node of the pool nodesList = [node for node in nodesiterator] # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool rnsNotOffline = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED]]) rnsSize = len(rnsNotOffline) # LOGGER.debug("@ - nb rns awake:%r" % (rnsSize) ) # if we have a userdefined maxRN for some nodes, remove them from the list and substracts their maxRN from the pool's size l = nodesList[:] # duplicate the list to be safe when removing elements for node in l: # LOGGER.debug("@ - checking userDefMaxRN: %s -> %r maxRN=%d" % (node.name, node.poolShares.values()[0].userDefinedMaxRN, node.poolShares.values()[0].maxRN ) ) if node.poolShares.values()[0].userDefinedMaxRN and node.poolShares.values()[0].maxRN not in [-1, 0]: # LOGGER.debug("@ removing: %s -> maxRN=%d" % (node.name, node.poolShares.values()[0].maxRN ) ) nodesList.remove(node) rnsSize -= node.poolShares.values()[0].maxRN # LOGGER.debug("@ - nb rns awake after maxRN:%d" % (rnsSize) ) if len(nodesList) == 0: continue # Prepare updatedMaxRN with dispatch key proportions dkList = [] # list of dks (integer only) dkPositiveList = [] # Normalized list of dks (each min value of dk becomes 1, other higher elems of dkList gets proportionnal value) nbJobs = len(nodesList) # number of jobs in the current pool nbRNAssigned = 0 # number of render nodes assigned for this pool for node in nodesList: dkList.append(node.dispatchKey) dkMin = min(dkList) dkPositiveList = map(lambda x: x-dkMin+1, dkList) dkSum = sum(dkPositiveList) # sort by id (fifo) nodesList = sorted(nodesList, key=lambda x: x.id) # then sort by dispatchKey (priority) nodesList = sorted(nodesList, key=lambda x: x.dispatchKey, reverse=True) for dk, nodeIterator in groupby(nodesList, lambda x: x.dispatchKey): nodes = [node for node in nodeIterator] dkPos = dkPositiveList[ dkList.index(dk) ] if dkSum > 0: updatedmaxRN = int( round( rnsSize * (dkPos / float(dkSum) ))) 
else: updatedmaxRN = int(round( rnsSize / float(nbJobs) )) for node in nodes: node.poolShares.values()[0].maxRN = updatedmaxRN nbRNAssigned += updatedmaxRN # Add remaining RNs to most important jobs unassignedRN = rnsSize - nbRNAssigned while unassignedRN > 0: for node in nodesList: if unassignedRN > 0: node.poolShares.values()[0].maxRN += 1 unassignedRN -= 1 else: break if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.assignmentTimers['update_max_rn'] = time.time() - prevTimer LOGGER.info( "%8.2f ms --> .... updating max RN values", (time.time() - prevTimer)*1000 ) # now, we are treating every nodes # sort by id (fifo) entryPoints = sorted(entryPoints, key=lambda node: node.id) # then sort by dispatchKey (priority) entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True) # Put nodes with a userDefinedMaxRN first userDefEntryPoints = ifilter( lambda node: node.poolShares.values()[0].userDefinedMaxRN, entryPoints ) standardEntryPoints = ifilter( lambda node: not node.poolShares.values()[0].userDefinedMaxRN, entryPoints ) scoredEntryPoints = chain( userDefEntryPoints, standardEntryPoints) # Log time dispatching RNs prevTimer = time.time() # # HACK update license info for katana with rlmutils # This helps having the real number of used licenses before finishing assignment # This is done because katana rlm management sometime reserves 2 token (cf BUGLIST v1.4) try: import subprocess strRlmKatanaUsed='' strRlmKatanaUsed = subprocess.Popen(["/s/apps/lin/farm/tools/rlm_katana_used.sh"], stdout=subprocess.PIPE).communicate()[0] katanaUsed = int(strRlmKatanaUsed) LOGGER.debug("HACK update katana license: used = %d (+buffer in config:%d)" % (katanaUsed,singletonconfig.get('HACK','KATANA_BUFFER'))) # Sets used license number try: self.licenseManager.licenses["katana"].used = katanaUsed + singletonconfig.get('HACK','KATANA_BUFFER') except KeyError: LOGGER.warning("License katana not found... Impossible to set 'used' value: %d" % katanaUsed) except Exception, e: LOGGER.warning("Error getting number of katana license used via rlmutil (e: %r, rlmoutput=%r)" % (e,strRlmKatanaUsed))
class RenderNode(models.Model): '''This class represents the state of a RenderNode.''' # Sys infos name = models.StringField() speed = models.FloatField() coresNumber = models.IntegerField() ramSize = models.IntegerField() # Dynamic sys infos freeCoresNumber = models.IntegerField() usedCoresNumber = models.DictField(as_item_list=True) freeRam = models.IntegerField() systemFreeRam = models.IntegerField() systemSwapPercentage = models.FloatField() usedRam = models.DictField(as_item_list=True) # Worker state puliversion = models.StringField() commands = models.ModelDictField() status = models.IntegerField() host = models.StringField() port = models.IntegerField() pools = models.ModelListField(indexField='name') caracteristics = models.DictField() isRegistered = models.BooleanField() performance = models.FloatField() excluded = models.BooleanField() # Timers createDate = models.FloatField() registerDate = models.FloatField() lastAliveTime = models.FloatField() def __init__(self, id, name, coresNumber, speed, ip, port, ramSize, caracteristics=None, performance=0.0, puliversion="undefined", createDate=None): '''Constructs a new Rendernode. :parameters: - `name`: the name of the rendernode - `coresNumber`: the number of processors - `speed`: the speed of the processor ''' self.id = int(id) if id else None self.name = str(name) self.coresNumber = int(coresNumber) self.ramSize = int(ramSize) self.licenseManager = None self.freeCoresNumber = int(coresNumber) self.usedCoresNumber = {} self.freeRam = int(ramSize) # ramSize-usedRam i.e. the amount of RAM used if several commands running concurrently self.systemFreeRam = int(ramSize) # the RAM available on the system (updated each ping) self.systemSwapPercentage = 0 self.usedRam = {} self.speed = speed self.commands = {} self.status = RN_UNKNOWN self.responseId = None self.host = str(ip) self.port = int(port) self.pools = [] self.idInformed = False self.isRegistered = False self.lastAliveTime = 0 self.httpConnection = None self.caracteristics = caracteristics if caracteristics else {} self.currentpoolshare = None self.performance = float(performance) self.history = deque(maxlen=singletonconfig.get('CORE', 'RN_NB_ERRORS_TOLERANCE')) self.tasksHistory = deque(maxlen=15) self.excluded = False # Init new data self.puliversion = puliversion if createDate is None: self.createDate = 0 else: self.createDate = createDate self.registerDate = time.time() # Flag linked to the worker flag "isPaused". Handles the case when a worker is set paused but a command is still running (finishing) # the RN on the dispatcher must be flag not to be assigned (i.e. in isAvailable property) # self.canBeAssigned = True if not "softs" in self.caracteristics: self.caracteristics["softs"] = [] ## Returns True if this render node is available for command assignment. # def isAvailable(self): # Need to avoid nodes that have flag isPaused set (i.e. nodes paused by user but still running a command) return (self.isRegistered and self.status == RN_IDLE and not self.commands and not self.excluded) def reset(self, paused=False): # if paused, set the status to RN_PAUSED, else set it to Finishing, it will be set to IDLE in the next iteration of the dispatcher main loop if paused: self.status = RN_PAUSED else: self.status = RN_FINISHING # reset the commands left on this RN, if any for cmd in self.commands.values(): cmd.status = CMD_READY cmd.completion = 0. 
cmd.renderNode = None self.clearAssignment(cmd) self.commands = {} # reset the associated poolshare, if any if self.currentpoolshare: self.currentpoolshare.allocatedRN -= 1 self.currentpoolshare = None # reset the values for cores and ram self.freeCoresNumber = int(self.coresNumber) self.usedCoresNumber = {} self.freeRam = int(self.ramSize) self.usedRam = {} ## Returns a human readable representation of this RenderNode. # def __repr__(self): return u'RenderNode(id=%s, name=%s, host=%s, port=%s)' % (repr(self.id), repr(self.name), repr(self.host), repr(self.port)) ## Clears all of this rendernode's fields related to the specified assignment. # def clearAssignment(self, command): '''Removes command from the list of commands assigned to this rendernode.''' # in case of failed assignment, decrement the allocatedRN value if self.currentpoolshare: self.currentpoolshare.allocatedRN -= 1 self.currentpoolshare = None try: del self.commands[command.id] except KeyError: pass #LOGGER.debug('attempt to clear assignment of not assigned command %d on worker %s', command.id, self.name) else: self.releaseRessources(command) self.releaseLicense(command) ## Add a command assignment # def addAssignment(self, command): if not command.id in self.commands: self.commands[command.id] = command self.reserveRessources(command) # FIXME the assignment of the cmd should be done here and not in the dispatchIterator func command.assign(self) self.updateStatus() ## Reserve license # def reserveLicense(self, command, licenseManager): self.licenseManager = licenseManager lic = command.task.lic if not lic: return True return licenseManager.reserveLicenseForRenderNode(lic, self) ## Release licence # def releaseLicense(self, command): lic = command.task.lic if lic and self.licenseManager: self.licenseManager.releaseLicenseForRenderNode(lic, self) ## Reserve ressource # def reserveRessources(self, command): res = min(self.freeCoresNumber, command.task.maxNbCores) or self.freeCoresNumber self.usedCoresNumber[command.id] = res self.freeCoresNumber -= res res = min(self.freeRam, command.task.ramUse) or self.freeRam self.usedRam[command.id] = res self.freeRam -= res ## Release ressource # def releaseRessources(self, command): #res = self.usedCoresNumber[command.id] self.freeCoresNumber = self.coresNumber if command.id in self.usedCoresNumber: del self.usedCoresNumber[command.id] #res = self.usedRam[command.id] self.freeRam = self.ramSize if command.id in self.usedRam: del self.usedRam[command.id] ## Unassign a finished command # def unassign(self, command): if not isFinalStatus(command.status): raise ValueError("cannot unassign unfinished command %s" % repr(command)) self.clearAssignment(command) self.updateStatus() def remove(self): self.fireDestructionEvent(self) def updateStatus(self): """ Update rendernode status according to its states: having commands or not, commands status, time etc Status is not changed if no info is brought by the commands. 
""" # self.status is not RN_PAUSED and time elapsed is enough if time.time() > (self.lastAliveTime + singletonconfig.conf["COMMUNICATION"]["RN_TIMEOUT"]): # set the status of a render node to RN_UNKNOWN after TIMEOUT seconds have elapsed since last update # timeout the commands running on this node if RN_UNKNOWN != self.status: LOGGER.warning("rendernode %s is not responding", self.name) self.status = RN_UNKNOWN if self.commands: for cmd in self.commands.values(): cmd.status = CMD_TIMEOUT self.clearAssignment(cmd) return # This is necessary in case of a cancel command or a mylawn -k if not self.commands: # if self.status is RN_WORKING: # # cancel the command that is running on this RN because it's no longer registered in the model # LOGGER.warning("rendernode %s is reported as working but has no registered command" % self.name) if self.status not in (RN_IDLE, RN_PAUSED, RN_BOOTING): #LOGGER.warning("rendernode %s was %d and is now IDLE." % (self.name, self.status)) self.status = RN_IDLE if self.currentpoolshare: self.currentpoolshare.allocatedRN -= 1 self.currentpoolshare = None return commandStatus = [command.status for command in self.commands.values()] if CMD_RUNNING in commandStatus: self.status = RN_WORKING elif CMD_ASSIGNED in commandStatus: self.status = RN_ASSIGNED elif CMD_ERROR in commandStatus: self.status = RN_FINISHING elif CMD_FINISHING in commandStatus: self.status = RN_FINISHING elif CMD_DONE in commandStatus: self.status = RN_FINISHING # do not set the status to IDLE immediately, to ensure that the order of affectation will be respected elif CMD_TIMEOUT in commandStatus: self.status = RN_FINISHING elif CMD_CANCELED in commandStatus: for cmd in self.commands.values(): # this should not happened, but if it does, ensure the command is no more registered to the rn if cmd.status is CMD_CANCELED: self.clearAssignment(cmd) elif self.status not in (RN_IDLE, RN_BOOTING, RN_UNKNOWN, RN_PAUSED): LOGGER.error("Unable to compute new status for rendernode %r (status %r, commands %r)", self, self.status, self.commands) ## releases the finishing status of the rendernodes # def releaseFinishingStatus(self): if self.status is RN_FINISHING: # remove the commands that are in a final status for cmd in self.commands.values(): if isFinalStatus(cmd.status): self.unassign(cmd) if CMD_DONE == cmd.status: cmd.completion = 1.0 cmd.finish() self.status = RN_IDLE ## # # @warning The returned HTTPConnection is not safe to use from multiple threads # def getHTTPConnection(self): timeout = singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_TIMEOUT', 5) return http.HTTPConnection(self.host, self.port, timeout=timeout) ## An exception class to report a render node http request failure. # class RequestFailed(Exception): pass ## Sends a HTTP request to the render node and returns a (HTTPResponse, data) tuple on success. # # This method tries to send the request at most RENDERNODE_REQUEST_MAX_RETRY_COUNT times, # waiting RENDERNODE_REQUEST_DELAY_AFTER_REQUEST_FAILURE seconds between each try. It # then raises a RenderNode.RequestFailed exception. # # @param method the HTTP method for this request # @param url the requested URL # @param headers a dictionary with string-keys and string-values (empty by default) # @param body the string body for this request (None by default) # @raise RenderNode.RequestFailed if the request fails. # @note it is a good idea to specify a Content-Length header when giving a non-empty body. 
# @see the RENDERNODE_REQUEST_MAX_RETRY_COUNT and # RENDERNODE_REQUEST_DELAY_AFTER_REQUEST_FAILURE params affect the execution of this method. # def request(self, method, url, body=None, headers={}): """ """ # from octopus.dispatcher import settings # LOGGER.debug("Send request to RN: http://%s:%s%s %s (%s)" % (self.host, self.port, url, method, headers)) err = None conn = self.getHTTPConnection() # try to process the request at most RENDERNODE_REQUEST_MAX_RETRY_COUNT times. for i in xrange(singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_MAX_RETRY_COUNT')): try: conn.request(method, url, body, headers) response = conn.getresponse() if response.length: data = response.read(response.length) else: data = None # request succeeded conn.close() return (response, data) except http.socket.error, e: err = e # LOGGER.debug("socket error %r" % e) try: conn.close() except: pass if e.errno in (errno.ECONNREFUSED, errno.ENETUNREACH): raise self.RequestFailed(e) # RequestFailed is a plain Exception subclass, so pass the cause positionally except http.HTTPException, e: err = e # LOGGER.debug("HTTPException %r" % e) try: conn.close() except: pass LOGGER.exception("rendernode.request failed") LOGGER.warning("request failed (%d/%d), reason: %s" % (i + 1, singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_MAX_RETRY_COUNT'), err)) # request failed so let's sleep for a while time.sleep(singletonconfig.get('COMMUNICATION', 'RENDERNODE_REQUEST_DELAY_AFTER_REQUEST_FAILURE'))
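# Illustrative usage sketch (not from the source): how a caller might use RenderNode.request()
# to query a worker. The "/commands/" URL is an assumption for the example; retry count and
# delay between attempts are handled inside request() itself via the COMMUNICATION config values.
def fetchWorkerCommands(rn):
    try:
        result = rn.request("GET", "/commands/")
    except rn.RequestFailed:
        # raised when the worker actively refuses the connection or is unreachable
        return None
    if result is None:
        # request() gives up silently once RENDERNODE_REQUEST_MAX_RETRY_COUNT attempts have failed
        return None
    response, data = result
    return data if response.status == 200 else None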
def mainLoop(self): ''' | Dispatcher main loop iteration. | Periodically called via Tornado's internal callback mechanism; the frequency is defined by the config value CORE.MASTER_UPDATE_INTERVAL | During this process, the dispatcher will: | - update completion and status for all jobs in dispatchTree | - update status of renderNodes | - validate inter-task dependencies | - update the DB with recorded changes in the model | - compute new assignments and send them to the proper rendernodes | - release all finished jobs/rns ''' log = logging.getLogger('main') loopStartTime = time.time() prevTimer = loopStartTime if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleDate = loopStartTime log.info("-----------------------------------------------------") log.info(" Start dispatcher process cycle (old version).") try: self.threadPool.poll() except NoResultsPending: pass else: log.info("finished some network requests") pass self.cycle += 1 # Update of allocation is done when parsing the tree for completion and status update (done partially, for invalidated nodes only, i.e. when needed) self.dispatchTree.updateCompletionAndStatus() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_tree'] = time.time() - prevTimer log.info("%8.2f ms --> update completion status" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # Update render nodes self.updateRenderNodes() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_rn'] = time.time() - prevTimer log.info("%8.2f ms --> update render node" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # Validate dependencies self.dispatchTree.validateDependencies() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_dependencies'] = time.time() - prevTimer log.info("%8.2f ms --> validate dependencies" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # update db self.updateDB() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_db'] = time.time() - prevTimer log.info("%8.2f ms --> update DB" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # compute and send command assignments to rendernodes assignments = self.computeAssignments() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['compute_assignment'] = time.time() - prevTimer log.info("%8.2f ms --> compute assignments." % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() self.sendAssignments(assignments) if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['send_assignment'] = time.time() - prevTimer singletonstats.theStats.cycleCounts['num_assignments'] = len(assignments) log.info("%8.2f ms --> send %r assignments." % ((time.time() - prevTimer) * 1000, len(assignments))) prevTimer = time.time() # call the release finishing status on all rendernodes for renderNode in self.dispatchTree.renderNodes.values(): renderNode.releaseFinishingStatus() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['release_finishing'] = time.time() - prevTimer log.info("%8.2f ms --> releaseFinishingStatus" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() loopDuration = (time.time() - loopStartTime)*1000 log.info("%8.2f ms --> cycle ended. 
" % loopDuration) # # Send stat data to disk # if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['time_elapsed'] = time.time() - loopStartTime singletonstats.theStats.aggregate()
if options.verbose: print "Command options: %s" % options print "Command arguments: %s" % args print "Query: %s" % _request # fileHandler = logging.handlers.RotatingFileHandler( _logPath, # maxBytes=20000000, # backupCount=1, # encoding="UTF-8") fileHandler = logging.FileHandler(_logPath, encoding="UTF-8") fileHandler.setFormatter(logging.Formatter("%(message)s")) statsLogger = logging.getLogger("stats") statsLogger.addHandler(fileHandler) statsLogger.setLevel(singletonconfig.get("CORE", "LOG_LEVEL")) http_client = HTTPClient() try: response = http_client.fetch(_request) if response.error: print "Error: %s" % response.error print " %s" % response.body else: if response.body == "": print "Error: No stats retrieved" else: data = json.loads(response.body) aggregatedData = formatData(data)
try: import simplejson as json except ImportError: import json import os import logging import logging.handlers from octopus.core import singletonconfig from octopus.dispatcher import settings # # Load specific logger for collecting stats. # # A custom low level is used to avoid flooding the main loggers. # The logger and handler use this low level to ensure they always receive messages, even if the log level # is changed via the config file and reloaded. hd = logging.handlers.RotatingFileHandler( os.path.join(settings.LOGDIR, "stats.log"), maxBytes=singletonconfig.get('CORE','STATS_SIZE'), backupCount=0 ) hd.setFormatter( logging.Formatter('%(message)s') ) hd.setLevel( 1 ) statsLog = logging.getLogger('server_stats') statsLog.addHandler( hd ) statsLog.setLevel( 1 ) class DispatcherStats(): """ | Class holding custom info on the dispatcher. | This data can be periodically flushed to a specific log file for later use """ cycleDate = 0.0
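# Illustrative sketch (assumption): one way the counters gathered during a dispatcher cycle
# could be written to the 'server_stats' logger defined above, one JSON line per cycle.
# The real entry point is DispatcherStats.aggregate(), whose body is not shown here;
# the field names below are only an example.
def flushCycleStats(stats):
    record = {
        'date': stats.cycleDate,
        'timers': getattr(stats, 'cycleTimers', {}),
        'counts': getattr(stats, 'cycleCounts', {}),
    }
    # level 1 matches the level forced on the handler and logger above,
    # so the record is written whatever the global log level is
    statsLog.log(1, json.dumps(record))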
def computeAssignments(self): '''Computes and returns a list of (rendernode, command) assignments.''' LOGGER = logging.getLogger('main') from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask # if no rendernodes available, return if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()): return [] assignments = [] # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely... entryPoints = set([poolShare.node for poolShare in self.dispatchTree.poolShares.values() if poolShare.node.status not in [NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED] and poolShare.node.readyCommandCount > 0 and poolShare.node.name != 'graphs']) # don't proceed with the calculation if no RNs are available in the requested pools rnsBool = False for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool): rnsAvailables = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]]) if len(rnsAvailables): rnsBool = True if not rnsBool: return [] # Log time updating max rn prevTimer = time.time() # sort by pool for the groupby entryPoints = sorted(entryPoints, key=lambda node: node.poolShares.values()[0].pool) # update the value of the maxrn for the poolshares (parallel dispatching) for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool): # we are treating every active node of the pool nodesList = [node for node in nodesiterator] # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool rnsNotOffline = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED]]) rnsSize = len(rnsNotOffline) # log.debug("@ - nb rns awake:%r" % (rnsSize) ) # if we have a user-defined maxRN for some nodes, remove them from the list and subtract their maxRN from the pool's size l = nodesList[:] # duplicate the list to be safe when removing elements for node in l: # log.debug("@ - checking userDefMaxRN: %s -> %r maxRN=%d" % (node.name, node.poolShares.values()[0].userDefinedMaxRN, node.poolShares.values()[0].maxRN ) ) if node.poolShares.values()[0].userDefinedMaxRN and node.poolShares.values()[0].maxRN not in [-1, 0]: # log.debug("@ removing: %s -> maxRN=%d" % (node.name, node.poolShares.values()[0].maxRN ) ) nodesList.remove(node) rnsSize -= node.poolShares.values()[0].maxRN # log.debug("@ - nb rns awake after maxRN:%d" % (rnsSize) ) if len(nodesList) == 0: continue # Prepare updatedMaxRN with dispatch key proportions dkList = [] # list of dks (integer only) dkPositiveList = [] # Normalized list of dks (the min dk value becomes 1, higher elements of dkList get a proportional value) nbJobs = len(nodesList) # number of jobs in the current pool nbRNAssigned = 0 # number of render nodes assigned for this pool for node in nodesList: dkList.append(node.dispatchKey) dkMin = min(dkList) dkPositiveList = map(lambda x: x-dkMin+1, dkList) dkSum = sum(dkPositiveList) # sort by id (fifo) nodesList = sorted(nodesList, key=lambda x: x.id) # then sort by dispatchKey (priority) nodesList = sorted(nodesList, key=lambda x: x.dispatchKey, reverse=True) for dk, nodeIterator in groupby(nodesList, lambda x: x.dispatchKey): nodes = [node for node in nodeIterator] dkPos = dkPositiveList[ dkList.index(dk) ] if dkSum > 0: updatedmaxRN = int( round( rnsSize * 
(dkPos / float(dkSum) ))) else: updatedmaxRN = int(round( rnsSize / float(nbJobs) )) for node in nodes: node.poolShares.values()[0].maxRN = updatedmaxRN nbRNAssigned += updatedmaxRN # Add remaining RNs to most important jobs unassignedRN = rnsSize - nbRNAssigned while unassignedRN > 0: for node in nodesList: if unassignedRN > 0: node.poolShares.values()[0].maxRN += 1 unassignedRN -= 1 else: break if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.assignmentTimers['update_max_rn'] = time.time() - prevTimer LOGGER.info( "%8.2f ms --> .... updating max RN values", (time.time() - prevTimer)*1000 ) # now we are treating every node # sort by id (fifo) entryPoints = sorted(entryPoints, key=lambda node: node.id) # then sort by dispatchKey (priority) entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True) # Put nodes with a userDefinedMaxRN first userDefEntryPoints = ifilter( lambda node: node.poolShares.values()[0].userDefinedMaxRN, entryPoints ) standardEntryPoints = ifilter( lambda node: not node.poolShares.values()[0].userDefinedMaxRN, entryPoints ) scoredEntryPoints = chain( userDefEntryPoints, standardEntryPoints) # Log time dispatching RNs prevTimer = time.time() # Iterate over each entryPoint to get an assignment for entryPoint in scoredEntryPoints: if any([poolShare.hasRenderNodesAvailable() for poolShare in entryPoint.poolShares.values()]): try: for (rn, com) in entryPoint.dispatchIterator(lambda: self.queue.qsize() > 0): assignments.append((rn, com)) # increment the allocatedRN for the poolshare ('poolShare' here is the variable leaked by the list comprehension above, a Python 2 scoping quirk) poolShare.allocatedRN += 1 # save the active poolshare of the rendernode rn.currentpoolshare = poolShare except NoRenderNodeAvailable: pass except NoLicenseAvailableForTask: LOGGER.info("Missing license for node \"%s\" (other commands can start anyway)." % entryPoint.name) pass assignmentDict = collections.defaultdict(list) for (rn, com) in assignments: assignmentDict[rn].append(com) if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.assignmentTimers['dispatch_command'] = time.time() - prevTimer LOGGER.info( "%8.2f ms --> .... dispatching commands", (time.time() - prevTimer)*1000 ) # # Check replacements # # - do a pass over the jobs that did not get their fair share # - identify the killable jobs in their pool # - for each resource, if it matches: kill the running job AND clear its killable attribute # # Backfill # # TODO: do another pass over the jobs that have a "killable" attribute and at least one additional pool return assignmentDict.items()
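# Worked example (illustration only, the numbers are made up) of the dispatch-key split above:
# 10 awake render nodes (rnsSize) shared between 3 jobs that all have the same dispatchKey.
dkList = [0, 0, 0]
dkMin = min(dkList)                                    # 0
dkPositiveList = map(lambda x: x - dkMin + 1, dkList)  # [1, 1, 1]
dkSum = sum(dkPositiveList)                            # 3
# each job gets maxRN = int(round(10 * 1 / 3.0)) = 3, i.e. 9 of the 10 RNs;
# the remaining RN is then handed to the first job in priority/fifo order
# by the "Add remaining RNs to most important jobs" loop.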
def post(self, computerName): """ A worker sends a request to get registered on the server. """ if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleCounts['add_rns'] += 1 computerName = computerName.lower() # a worker name starting with '1' or '2' is assumed to be a bare IP address, not a hostname if computerName.startswith(('1', '2')): return Http403(message="Cannot register a RenderNode without a name", content="Cannot register a RenderNode without a name") dct = self.getBodyAsJSON() if computerName in self.getDispatchTree().renderNodes: # The registering worker is already listed in the RN list logger.warning("RenderNode already registered: %s" % computerName) existingRN = self.getDispatchTree().renderNodes[computerName] if 'commands' not in dct: # No commands reported by the worker: reset commands that might still be assigned to this RN on the server existingRN.reset() else: logger.warning("Reset commands that are assigned to this RN: %r" % dct.get('commands', '-')) for cmdId in dct['commands']: existingRN.commands[cmdId] = self.getDispatchTree().commands[cmdId] if 'status' in dct: existingRN.status = int(dct['status']) return HttpResponse(304, "RenderNode already registered.") else: # Add a new worker (and set infos given in request body) for key in ('name', 'port', 'status', 'cores', 'speed', 'ram', 'pools', 'caracteristics'): if not key in dct: return Http400("Missing key %r" % key, content="Missing key %r" % key) port = int(dct['port']) status = int(dct['status']) if status not in (RN_UNKNOWN, RN_PAUSED, RN_IDLE, RN_BOOTING): # FIXME: CONFLICT is not a good value maybe return HttpConflict("Unallowed status for RenderNode registration") cores = int(dct['cores']) speed = float(dct['speed']) ram = int(dct['ram']) pools = dct['pools'] caracteristics = dct['caracteristics'] name, port = computerName.split(":", 1) puliversion = dct.get('puliversion', "unknown") createDate = dct.get('createDate', time.time()) renderNode = RenderNode(None, computerName, cores, speed, name, port, ram, caracteristics, puliversion=puliversion, createDate=createDate) renderNode.status = status poolList = [] # check the existence of the pools for poolName in pools: try: pool = self.getDispatchTree().pools[poolName] poolList.append(pool) except KeyError: return HttpConflict("Pool %s is not a registered pool" % poolName) # add the rendernode to the pools for pool in poolList: pool.addRenderNode(renderNode) # add the rendernode to the list of rendernodes renderNode.pools = poolList self.getDispatchTree().renderNodes[renderNode.name] = renderNode self.writeCallback(json.dumps(renderNode.to_json()))
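# Illustrative example (values made up) of the JSON body a worker could POST to this handler;
# every key checked above must be present. The worker name itself comes from the URL and is
# expected in "host:port" form, since it is split on ':' before building the RenderNode.
exampleRegistrationBody = {
    "name": "vfxnode01:8000",
    "port": 8000,
    "status": RN_IDLE,          # must be one of RN_UNKNOWN, RN_PAUSED, RN_IDLE, RN_BOOTING
    "cores": 16,
    "speed": 3.2,
    "ram": 32768,
    "pools": ["default"],       # each pool must already exist in the dispatch tree
    "caracteristics": {"os": "linux", "softs": []},
    "puliversion": "x.y.z",     # optional, defaults to "unknown"
    "createDate": time.time(),  # optional, defaults to the registration time
}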
def canRun(self, command): # check if this rendernode has made too many errors in its last commands cpt = 0 for i in self.history: if i == CMD_ERROR: cpt += 1 if cpt == singletonconfig.get('CORE', 'RN_NB_ERRORS_TOLERANCE'): LOGGER.warning("RenderNode %s had only errors in its commands history, excluding..." % self.name) self.excluded = True return False if self.excluded: return False for (requirement, value) in command.task.requirements.items(): if requirement.lower() == "softs": # todo for soft in value: if not soft in self.caracteristics['softs']: return False else: if not requirement in self.caracteristics: return False else: caracteristic = self.caracteristics[requirement] if type(caracteristic) != type(value) and not isinstance(value, list): return False if isinstance(value, list) and len(value) == 2: a, b = value if type(a) != type(b) or type(a) != type(caracteristic): return False try: if not (a < caracteristic < b): return False except ValueError: return False else: if isinstance(caracteristic, bool) and caracteristic != value: return False if isinstance(caracteristic, basestring) and caracteristic != value: return False if isinstance(caracteristic, int) and caracteristic < value: return False if command.task.minNbCores: if self.freeCoresNumber < command.task.minNbCores: return False else: if self.freeCoresNumber != self.coresNumber: return False # # RAM requirement: we check task requirement with the amount of free RAM reported at last ping (systemFreeRam) # if command.task.ramUse != 0: if self.systemFreeRam < command.task.ramUse: LOGGER.info("Not enough ram on %s for command %d. %d needed, %d avail." % (self.name, command.id, int(command.task.ramUse), self.systemFreeRam)) return False # # timer requirements: a timer is on the task and is the same for all commands # if command.task.timer is not None: # LOGGER.debug("Current command %r has a timer : %s" % (command.id, datetime.datetime.fromtimestamp(command.task.timer) ) ) if time.time() < command.task.timer: LOGGER.info("Prevented execution of command %d because of timer present (%s)" % (command.id, datetime.datetime.fromtimestamp(command.task.timer))) return False return True
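# Illustrative example (names and values made up) of how canRun() matches task requirements
# against a node's caracteristics, following the rules implemented above:
# task.requirements  = {"softs": ["maya2014"], "mem": [8000, 64000], "isLinux": True}
# node.caracteristics = {"softs": ["maya2014", "nuke8"], "mem": 32768, "isLinux": True}
# - "softs":   every requested soft must appear in caracteristics["softs"]            -> ok
# - "mem":     a two-element list [a, b] is a strict range check, a < value < b       -> ok
# - "isLinux": a bool or string caracteristic must match exactly; an int
#              caracteristic must be greater than or equal to the required value      -> ok
# On top of requirements, canRun() also checks minNbCores, the free RAM reported at the
# last ping (task.ramUse) and an optional task timer.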