def __init__(self, framework): if self.init: return self.init = True self.nextCycle = time.time() MainLoopApplication.__init__(self, framework) self.threadPool = ThreadPool(16, 0, 0, None) # # Class holding custom infos on the dispatcher. # This data can be periodically flushed in a specific log file for later use # self.cycle = 1 self.dispatchTree = DispatchTree() self.licenseManager = LicenseManager() self.enablePuliDB = settings.DB_ENABLE self.cleanDB = settings.DB_CLEAN_DATA self.pulidb = None if self.enablePuliDB: self.pulidb = PuliDB(self.cleanDB, self.licenseManager) self.dispatchTree.registerModelListeners() rnsAlreadyInitialized = self.initPoolsDataFromBackend() if self.enablePuliDB and not self.cleanDB: LOGGER.warning("reloading jobs from database") beginTime = time.time() self.pulidb.restoreStateFromDb(self.dispatchTree, rnsAlreadyInitialized) LOGGER.warning("reloading took %.2fs" % (time.time() - beginTime)) LOGGER.warning("done reloading jobs from database") LOGGER.warning("reloaded %d tasks" % len(self.dispatchTree.tasks)) LOGGER.warning("checking dispatcher state") self.dispatchTree.updateCompletionAndStatus() self.updateRenderNodes() self.dispatchTree.validateDependencies() if self.enablePuliDB and not self.cleanDB: self.dispatchTree.toModifyElements = [] self.defaultPool = self.dispatchTree.pools['default'] LOGGER.warning("loading dispatch rules") self.loadRules() # it should be better to have a maxsize self.queue = Queue(maxsize=10000)
def __init__(self, framework): if self.init: return self.init = True self.nextCycle = time.time() MainLoopApplication.__init__(self, framework) self.threadPool = ThreadPool(16, 0, 0, None) LOGGER.info('settings.DEBUG = %s', settings.DEBUG) LOGGER.info('settings.ADDRESS = %s', settings.ADDRESS) LOGGER.info('settings.PORT = %s', settings.PORT) self.cycle = 1 self.dispatchTree = DispatchTree() self.licenseManager = LicenseManager() self.enablePuliDB = settings.DB_ENABLE self.cleanDB = settings.DB_CLEAN_DATA self.pulidb = None if self.enablePuliDB: self.pulidb = PuliDB(self.cleanDB, self.licenseManager) self.dispatchTree.registerModelListeners() rnsAlreadyInitialized = self.initPoolsDataFromBackend() if self.enablePuliDB and not self.cleanDB: LOGGER.info("reloading jobs from database") beginTime = time.time() self.pulidb.restoreStateFromDb(self.dispatchTree, rnsAlreadyInitialized) LOGGER.info("reloading took %s" % str(time.time() - beginTime)) LOGGER.info("done reloading jobs from database") LOGGER.info("reloaded %d tasks" % len(self.dispatchTree.tasks)) LOGGER.info("checking dispatcher state") self.dispatchTree.updateCompletionAndStatus() self.updateRenderNodes() self.dispatchTree.validateDependencies() if self.enablePuliDB and not self.cleanDB: self.dispatchTree.toModifyElements = [] self.defaultPool = self.dispatchTree.pools['default'] LOGGER.info("loading dispatch rules") self.loadRules() # it should be better to have a maxsize self.queue = Queue(maxsize=10000)
def __init__(self, framework): LOGGER = logging.getLogger('main.dispatcher') if self.init: return self.init = True self.nextCycle = time.time() MainLoopApplication.__init__(self, framework) self.threadPool = ThreadPool(16, 0, 0, None) # # Class holding custom infos on the dispatcher. # This data can be periodically flushed in a specific log file for # later use # self.cycle = 1 self.dispatchTree = DispatchTree() self.licenseManager = LicenseManager() self.enablePuliDB = settings.DB_ENABLE self.cleanDB = settings.DB_CLEAN_DATA self.restartService = False self.pulidb = None if self.enablePuliDB: self.pulidb = PuliDB(self.cleanDB, self.licenseManager) self.dispatchTree.registerModelListeners() rnsAlreadyInitialized = self.initPoolsDataFromBackend() if self.enablePuliDB and not self.cleanDB: LOGGER.warning("--- Reloading database (9 steps) ---") prevTimer = time.time() self.pulidb.restoreStateFromDb(self.dispatchTree, rnsAlreadyInitialized) LOGGER.warning("%d jobs reloaded from database" % len(self.dispatchTree.tasks)) LOGGER.warning("Total time elapsed %s" % elapsedTimeToString(prevTimer)) LOGGER.warning("") LOGGER.warning("--- Checking dispatcher state (3 steps) ---") startTimer = time.time() LOGGER.warning("1/3 Update completion and status") self.dispatchTree.updateCompletionAndStatus() LOGGER.warning(" Elapsed time %s" % elapsedTimeToString(startTimer)) prevTimer = time.time() LOGGER.warning("2/3 Update rendernodes") self.updateRenderNodes() LOGGER.warning(" Elapsed time %s" % elapsedTimeToString(prevTimer)) prevTimer = time.time() LOGGER.warning("3/3 Validate dependencies") self.dispatchTree.validateDependencies() LOGGER.warning(" Elapsed time %s" % elapsedTimeToString(prevTimer)) LOGGER.warning("Total time elapsed %s" % elapsedTimeToString(startTimer)) LOGGER.warning("") if self.enablePuliDB and not self.cleanDB: self.dispatchTree.toModifyElements = [] # If no 'default' pool exists, create default pool # When creating a pool with id=None, it is automatically appended in "toCreateElement" list in dispatcher and in the dispatcher's "pools" attribute if 'default' not in self.dispatchTree.pools: pool = Pool(None, name='default') LOGGER.warning( "Default pool was not loaded from DB, create a new default pool: %s" % pool) self.defaultPool = self.dispatchTree.pools['default'] LOGGER.warning("--- Loading dispatch rules ---") startTimer = time.time() self.loadRules() LOGGER.warning("Total time elapsed %s" % elapsedTimeToString(startTimer)) LOGGER.warning("") # it should be better to have a maxsize self.queue = Queue(maxsize=10000)
class Dispatcher(MainLoopApplication): '''The Dispatcher class is the core of the dispatcher application. It computes the assignments of commands to workers according to a DispatchTree and handles all the communications with the workers and clients. ''' instance = None init = False def __new__(cls, framework): if cls.instance is None: # Disable passing framework to super__new__ call. # It is automatically avaible via super class hierarchy # This removes a deprecation warning when launching dispatcher cls.instance = super(Dispatcher, cls).__new__(cls) return cls.instance def __init__(self, framework): LOGGER = logging.getLogger('main.dispatcher') if self.init: return self.init = True self.nextCycle = time.time() MainLoopApplication.__init__(self, framework) self.threadPool = ThreadPool(16, 0, 0, None) # # Class holding custom infos on the dispatcher. # This data can be periodically flushed in a specific log file for # later use # self.cycle = 1 self.dispatchTree = DispatchTree() self.licenseManager = LicenseManager() self.enablePuliDB = settings.DB_ENABLE self.cleanDB = settings.DB_CLEAN_DATA self.restartService = False self.pulidb = None if self.enablePuliDB: self.pulidb = PuliDB(self.cleanDB, self.licenseManager) self.dispatchTree.registerModelListeners() rnsAlreadyInitialized = self.initPoolsDataFromBackend() if self.enablePuliDB and not self.cleanDB: LOGGER.warning("--- Reloading database (9 steps) ---") prevTimer = time.time() self.pulidb.restoreStateFromDb(self.dispatchTree, rnsAlreadyInitialized) LOGGER.warning("%d jobs reloaded from database" % len(self.dispatchTree.tasks)) LOGGER.warning("Total time elapsed %s" % elapsedTimeToString(prevTimer)) LOGGER.warning("") LOGGER.warning("--- Checking dispatcher state (3 steps) ---") startTimer = time.time() LOGGER.warning("1/3 Update completion and status") self.dispatchTree.updateCompletionAndStatus() LOGGER.warning(" Elapsed time %s" % elapsedTimeToString(startTimer)) prevTimer = time.time() LOGGER.warning("2/3 Update rendernodes") self.updateRenderNodes() LOGGER.warning(" Elapsed time %s" % elapsedTimeToString(prevTimer)) prevTimer = time.time() LOGGER.warning("3/3 Validate dependencies") self.dispatchTree.validateDependencies() LOGGER.warning(" Elapsed time %s" % elapsedTimeToString(prevTimer)) LOGGER.warning("Total time elapsed %s" % elapsedTimeToString(startTimer)) LOGGER.warning("") if self.enablePuliDB and not self.cleanDB: self.dispatchTree.toModifyElements = [] # If no 'default' pool exists, create default pool # When creating a pool with id=None, it is automatically appended in "toCreateElement" list in dispatcher and in the dispatcher's "pools" attribute if 'default' not in self.dispatchTree.pools: pool = Pool(None, name='default') LOGGER.warning( "Default pool was not loaded from DB, create a new default pool: %s" % pool) self.defaultPool = self.dispatchTree.pools['default'] LOGGER.warning("--- Loading dispatch rules ---") startTimer = time.time() self.loadRules() LOGGER.warning("Total time elapsed %s" % elapsedTimeToString(startTimer)) LOGGER.warning("") # it should be better to have a maxsize self.queue = Queue(maxsize=10000) def initPoolsDataFromBackend(self): ''' Loads pools and workers from appropriate backend. ''' try: if settings.POOLS_BACKEND_TYPE == "file": manager = FilePoolManager() elif settings.POOLS_BACKEND_TYPE == "ws": manager = WebServicePoolManager() elif settings.POOLS_BACKEND_TYPE == "db": return False except Exception: return False computers = manager.listComputers() ### recreate the pools poolsList = manager.listPools() poolsById = {} for poolDesc in poolsList: pool = Pool(id=int(poolDesc.id), name=str(poolDesc.name)) self.dispatchTree.toCreateElements.append(pool) poolsById[pool.id] = pool ### recreate the rendernodes rnById = {} for computerDesc in computers: try: computerDesc.name = socket.getfqdn(computerDesc.name) ip = socket.gethostbyname(computerDesc.name) except socket.gaierror: continue renderNode = RenderNode( computerDesc.id, computerDesc.name + ":" + str(computerDesc.port), computerDesc.cpucount * computerDesc.cpucores, computerDesc.cpufreq, ip, computerDesc.port, computerDesc.ramsize, json.loads(computerDesc.properties)) self.dispatchTree.toCreateElements.append(renderNode) ## add the rendernodes to the pools for pool in computerDesc.pools: poolsById[pool.id].renderNodes.append(renderNode) renderNode.pools.append(poolsById[pool.id]) self.dispatchTree.renderNodes[str(renderNode.name)] = renderNode rnById[renderNode.id] = renderNode # add the pools to the dispatch tree for pool in poolsById.values(): self.dispatchTree.pools[pool.name] = pool if self.cleanDB or not self.enablePuliDB: graphs = FolderNode(1, "graphs", self.dispatchTree.root, "root", 0, 0, 0, FifoStrategy()) self.dispatchTree.toCreateElements.append(graphs) self.dispatchTree.nodes[graphs.id] = graphs ps = PoolShare(1, self.dispatchTree.pools["default"], graphs, PoolShare.UNBOUND) self.dispatchTree.toCreateElements.append(ps) if self.enablePuliDB: # clean the tables pools and rendernodes (overwrite) self.pulidb.dropPoolsAndRnsTables() self.pulidb.createElements(self.dispatchTree.toCreateElements) self.dispatchTree.resetDbElements() return True def shutdown(self): ''' Clean procedure before shutting done puli server. ''' logging.getLogger('main').warning( "-----------------------------------------------") logging.getLogger('main').warning( "Exit event caught: closing dispatcher...") try: self.dispatchTree.updateCompletionAndStatus() logging.getLogger('main').warning( "[OK] update completion and status") except Exception: logging.getLogger('main').warning( "[HS] update completion and status") try: self.updateRenderNodes() logging.getLogger('main').warning("[OK] update render nodes") except Exception: logging.getLogger('main').warning("[HS] update render nodes") try: self.dispatchTree.validateDependencies() logging.getLogger('main').warning("[OK] validate dependencies") except Exception: logging.getLogger('main').warning("[HS] validate dependencies") try: self.updateDB() logging.getLogger('main').warning("[OK] update DB") except Exception: logging.getLogger('main').warning("[HS] update DB") def loadRules(self): from .rules.graphview import GraphViewBuilder graphs = self.dispatchTree.findNodeByPath("/graphs", None) if graphs is None: logging.getLogger('main.dispatcher').fatal( "No '/graphs' node, impossible to load rule for /graphs.") self.stop() self.dispatchTree.rules.append( GraphViewBuilder(self.dispatchTree, graphs)) def prepare(self): pass def stop(self): '''Stops the application part of the dispatcher.''' #self.httpRequester.stopAll() pass @property def modified(self): return bool(self.dispatchTree.toArchiveElements or self.dispatchTree.toCreateElements or self.dispatchTree.toModifyElements) def mainLoop(self): ''' | Dispatcher main loop iteration. | Periodically called with tornado'sinternal callback mecanism, the frequency is defined by config: CORE.MASTER_UPDATE_INTERVAL | During this process, the dispatcher will: | - update completion and status for all jobs in dispatchTree | - update status of renderNodes | - validate inter tasks dependencies | - update the DB with recorded changes in the model | - compute new assignments and send them to the proper rendernodes | - release all finished jobs/rns ''' log = logging.getLogger('main') loopStartTime = time.time() prevTimer = loopStartTime if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleDate = loopStartTime log.info("-----------------------------------------------------") log.info(" Start dispatcher process cycle (old version).") try: self.threadPool.poll() except NoResultsPending: pass else: log.info("finished some network requests") pass self.cycle += 1 # Update of allocation is done when parsing the tree for completion and status update (done partially for invalidated node only i.e. when needed) self.dispatchTree.updateCompletionAndStatus() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_tree'] = time.time( ) - prevTimer log.info("%8.2f ms --> update completion status" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # Update render nodes self.updateRenderNodes() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_rn'] = time.time( ) - prevTimer log.info("%8.2f ms --> update render node" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # Validate dependencies self.dispatchTree.validateDependencies() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers[ 'update_dependencies'] = time.time() - prevTimer log.info("%8.2f ms --> validate dependencies" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # update db self.updateDB() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_db'] = time.time( ) - prevTimer log.info("%8.2f ms --> update DB" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # compute and send command assignments to rendernodes assignments = self.computeAssignments() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers[ 'compute_assignment'] = time.time() - prevTimer log.info("%8.2f ms --> compute assignments." % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() self.sendAssignments(assignments) if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['send_assignment'] = time.time( ) - prevTimer singletonstats.theStats.cycleCounts['num_assignments'] = len( assignments) log.info("%8.2f ms --> send %r assignments." % ((time.time() - prevTimer) * 1000, len(assignments))) prevTimer = time.time() # call the release finishing status on all rendernodes for renderNode in self.dispatchTree.renderNodes.values(): renderNode.releaseFinishingStatus() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers[ 'release_finishing'] = time.time() - prevTimer log.info("%8.2f ms --> releaseFinishingStatus" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() loopDuration = (time.time() - loopStartTime) * 1000 log.info("%8.2f ms --> cycle ended. " % loopDuration) # # Send stat data to disk # if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['time_elapsed'] = time.time( ) - loopStartTime singletonstats.theStats.aggregate() def updateDB(self): if settings.DB_ENABLE: self.pulidb.createElements(self.dispatchTree.toCreateElements) self.pulidb.updateElements(self.dispatchTree.toModifyElements) self.pulidb.archiveElements(self.dispatchTree.toArchiveElements) # logging.getLogger('main.dispatcher').info(" UpdateDB: create=%d update=%d delete=%d" % (len(self.dispatchTree.toCreateElements), len(self.dispatchTree.toModifyElements), len(self.dispatchTree.toArchiveElements)) ) self.dispatchTree.resetDbElements() def computeAssignments(self): '''Computes and returns a list of (rendernode, command) assignments.''' LOGGER = logging.getLogger('main') from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask # if no rendernodes available, return if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()): return [] assignments = [] # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely... entryPoints = set([ poolShare.node for poolShare in self.dispatchTree.poolShares.values() if poolShare.node.status not in [NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED] and poolShare.node.readyCommandCount > 0 and poolShare.node.name != 'graphs' ]) # don't proceed to the calculation if no rns availables in the requested pools rnsBool = False for pool, nodesiterator in groupby( entryPoints, lambda x: x.poolShares.values()[0].pool): rnsAvailables = set([ rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING] ]) if len(rnsAvailables): rnsBool = True if not rnsBool: return [] # Log time updating max rn prevTimer = time.time() # sort by pool for the groupby entryPoints = sorted(entryPoints, key=lambda node: node.poolShares.values()[0].pool) # update the value of the maxrn for the poolshares (parallel dispatching) for pool, nodesiterator in groupby( entryPoints, lambda x: x.poolShares.values()[0].pool): # we are treating every active node of the pool nodesList = [node for node in nodesiterator] # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool rnsNotOffline = set([ rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED] ]) rnsSize = len(rnsNotOffline) # LOGGER.debug("@ - nb rns awake:%r" % (rnsSize) ) # if we have a userdefined maxRN for some nodes, remove them from the list and substracts their maxRN from the pool's size l = nodesList[:] # duplicate the list to be safe when removing elements for node in l: # LOGGER.debug("@ - checking userDefMaxRN: %s -> %r maxRN=%d" % (node.name, node.poolShares.values()[0].userDefinedMaxRN, node.poolShares.values()[0].maxRN ) ) if node.poolShares.values( )[0].userDefinedMaxRN and node.poolShares.values( )[0].maxRN not in [-1, 0]: # LOGGER.debug("@ removing: %s -> maxRN=%d" % (node.name, node.poolShares.values()[0].maxRN ) ) nodesList.remove(node) rnsSize -= node.poolShares.values()[0].maxRN # LOGGER.debug("@ - nb rns awake after maxRN:%d" % (rnsSize) ) if len(nodesList) == 0: continue # Prepare updatedMaxRN with dispatch key proportions dkList = [] # list of dks (integer only) dkPositiveList = [ ] # Normalized list of dks (each min value of dk becomes 1, other higher elems of dkList gets proportionnal value) nbJobs = len(nodesList) # number of jobs in the current pool nbRNAssigned = 0 # number of render nodes assigned for this pool for node in nodesList: dkList.append(node.dispatchKey) dkMin = min(dkList) dkPositiveList = map(lambda x: x - dkMin + 1, dkList) dkSum = sum(dkPositiveList) # sort by id (fifo) nodesList = sorted(nodesList, key=lambda x: x.id) # then sort by dispatchKey (priority) nodesList = sorted(nodesList, key=lambda x: x.dispatchKey, reverse=True) for dk, nodeIterator in groupby(nodesList, lambda x: x.dispatchKey): nodes = [node for node in nodeIterator] dkPos = dkPositiveList[dkList.index(dk)] if dkSum > 0: updatedmaxRN = int(round(rnsSize * (dkPos / float(dkSum)))) else: updatedmaxRN = int(round(rnsSize / float(nbJobs))) for node in nodes: node.poolShares.values()[0].maxRN = updatedmaxRN nbRNAssigned += updatedmaxRN # Add remaining RNs to most important jobs unassignedRN = rnsSize - nbRNAssigned while unassignedRN > 0: for node in nodesList: if unassignedRN > 0: node.poolShares.values()[0].maxRN += 1 unassignedRN -= 1 else: break if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.assignmentTimers[ 'update_max_rn'] = time.time() - prevTimer LOGGER.info("%8.2f ms --> .... updating max RN values", (time.time() - prevTimer) * 1000) # now, we are treating every nodes # sort by id (fifo) entryPoints = sorted(entryPoints, key=lambda node: node.id) # then sort by dispatchKey (priority) entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True) # Put nodes with a userDefinedMaxRN first userDefEntryPoints = ifilter( lambda node: node.poolShares.values()[0].userDefinedMaxRN, entryPoints) standardEntryPoints = ifilter( lambda node: not node.poolShares.values()[0].userDefinedMaxRN, entryPoints) scoredEntryPoints = chain(userDefEntryPoints, standardEntryPoints) # Log time dispatching RNs prevTimer = time.time() # Iterate over each entryPoint to get an assignment for entryPoint in scoredEntryPoints: if any([ poolShare.hasRenderNodesAvailable() for poolShare in entryPoint.poolShares.values() ]): try: for (rn, com) in entryPoint.dispatchIterator( lambda: self.queue.qsize() > 0): assignments.append((rn, com)) # increment the allocatedRN for the poolshare poolShare.allocatedRN += 1 # save the active poolshare of the rendernode rn.currentpoolshare = poolShare except NoRenderNodeAvailable: pass except NoLicenseAvailableForTask: LOGGER.info( "Missing license for node \"%s\" (other commands can start anyway)." % entryPoint.name) pass assignmentDict = collections.defaultdict(list) for (rn, com) in assignments: assignmentDict[rn].append(com) if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.assignmentTimers[ 'dispatch_command'] = time.time() - prevTimer LOGGER.info("%8.2f ms --> .... dispatching commands", (time.time() - prevTimer) * 1000) # # Check replacements # # - faire une passe pour les jobs n'ayant pas leur part de gateau # - identifier dans leur pool les jobs killable # - pour chaque ressource, si match : on jette le job en cours ET on desactive son attribut killable # # Backfill # # TODO refaire une passe pour les jobs ayant un attribut "killable" et au moins une pool additionnelle return assignmentDict.items() def updateRenderNodes(self): for rendernode in self.dispatchTree.renderNodes.values(): rendernode.updateStatus() def sendAssignments(self, assignmentList): '''Processes a list of (rendernode, command) assignments.''' def sendAssignment(args): rendernode, commands = args failures = [] for command in commands: headers = {} if not rendernode.idInformed: headers["rnId"] = rendernode.id root = command.task ancestors = [root] while root.parent: root = root.parent ancestors.append(root) arguments = {} environment = { 'PULI_USER': command.task.user, 'PULI_ALLOCATED_MEMORY': unicode(rendernode.usedRam[command.id]), 'PULI_ALLOCATED_CORES': unicode(rendernode.usedCoresNumber[command.id]), } for ancestor in ancestors: arguments.update(ancestor.arguments) environment.update(ancestor.environment) arguments.update(command.arguments) log = logging.getLogger('assign') log.info("Sending command: %d from task %s to %s" % (command.id, command.task.name, rendernode)) commandDict = { "id": command.id, "runner": str(command.task.runner), "arguments": arguments, "validationExpression": command.task.validationExpression, "taskName": command.task.name, "relativePathToLogDir": "%d" % command.task.id, "environment": environment, "runnerPackages": command.runnerPackages, "watcherPackages": command.watcherPackages } body = json.dumps(commandDict) headers["Content-Length"] = len(body) headers["Content-Type"] = "application/json" try: resp, data = rendernode.request("POST", "/commands/", body, headers) if not resp.status == 202: logging.getLogger('main.dispatcher').error( "Assignment request failed: command %d on worker %s", command.id, rendernode.name) failures.append((rendernode, command)) else: logging.getLogger('main.dispatcher').info( "Sent assignment of command %d to worker %s", command.id, rendernode.name) except rendernode.RequestFailed, e: logging.getLogger('main.dispatcher').error( "Assignment of command %d to worker %s failed. Worker is likely dead (%r)", command.id, rendernode.name, e) failures.append((rendernode, command)) return failures requests = makeRequests(sendAssignment, [[a, b] for (a, b) in assignmentList], self._assignmentFailed) for request in requests: self.threadPool.putRequest(request)
class Dispatcher(MainLoopApplication): '''The Dispatcher class is the core of the dispatcher application. It computes the assignments of commands to workers according to a DispatchTree and handles all the communications with the workers and clients. ''' instance = None init = False def __new__(cls, framework): if cls.instance is None: cls.instance = super(Dispatcher, cls).__new__(cls, framework) return cls.instance def __init__(self, framework): if self.init: return self.init = True self.nextCycle = time.time() MainLoopApplication.__init__(self, framework) self.threadPool = ThreadPool(16, 0, 0, None) LOGGER.info('settings.DEBUG = %s', settings.DEBUG) LOGGER.info('settings.ADDRESS = %s', settings.ADDRESS) LOGGER.info('settings.PORT = %s', settings.PORT) self.cycle = 1 self.dispatchTree = DispatchTree() self.licenseManager = LicenseManager() self.enablePuliDB = settings.DB_ENABLE self.cleanDB = settings.DB_CLEAN_DATA self.pulidb = None if self.enablePuliDB: self.pulidb = PuliDB(self.cleanDB, self.licenseManager) self.dispatchTree.registerModelListeners() rnsAlreadyInitialized = self.initPoolsDataFromBackend() if self.enablePuliDB and not self.cleanDB: LOGGER.info("reloading jobs from database") beginTime = time.time() self.pulidb.restoreStateFromDb(self.dispatchTree, rnsAlreadyInitialized) LOGGER.info("reloading took %s" % str(time.time() - beginTime)) LOGGER.info("done reloading jobs from database") LOGGER.info("reloaded %d tasks" % len(self.dispatchTree.tasks)) LOGGER.info("checking dispatcher state") self.dispatchTree.updateCompletionAndStatus() self.updateRenderNodes() self.dispatchTree.validateDependencies() if self.enablePuliDB and not self.cleanDB: self.dispatchTree.toModifyElements = [] self.defaultPool = self.dispatchTree.pools['default'] LOGGER.info("loading dispatch rules") self.loadRules() # it should be better to have a maxsize self.queue = Queue(maxsize=10000) def initPoolsDataFromBackend(self): '''Loads pools and workers from appropriate backend. ''' try: if settings.POOLS_BACKEND_TYPE == "file": manager = FilePoolManager() elif settings.POOLS_BACKEND_TYPE == "ws": manager = WebServicePoolManager() elif settings.POOLS_BACKEND_TYPE == "db": return False except Exception: return False computers = manager.listComputers() ### recreate the pools poolsList = manager.listPools() poolsById = {} for poolDesc in poolsList: pool = Pool(id=int(poolDesc.id), name=str(poolDesc.name)) self.dispatchTree.toCreateElements.append(pool) poolsById[pool.id] = pool ### recreate the rendernodes rnById = {} for computerDesc in computers: try: computerDesc.name = socket.getfqdn(computerDesc.name) ip = socket.gethostbyname(computerDesc.name) except socket.gaierror: continue renderNode = RenderNode(computerDesc.id, computerDesc.name + ":" + str(computerDesc.port), computerDesc.cpucount * computerDesc.cpucores, computerDesc.cpufreq, ip, computerDesc.port, computerDesc.ramsize, json.loads(computerDesc.properties)) self.dispatchTree.toCreateElements.append(renderNode) ## add the rendernodes to the pools for pool in computerDesc.pools: poolsById[pool.id].renderNodes.append(renderNode) renderNode.pools.append(poolsById[pool.id]) self.dispatchTree.renderNodes[str(renderNode.name)] = renderNode rnById[renderNode.id] = renderNode # add the pools to the dispatch tree for pool in poolsById.values(): self.dispatchTree.pools[pool.name] = pool if self.cleanDB or not self.enablePuliDB: graphs = FolderNode(1, "graphs", self.dispatchTree.root, "root", 0, 0, 0, FifoStrategy()) self.dispatchTree.toCreateElements.append(graphs) self.dispatchTree.nodes[graphs.id] = graphs ps = PoolShare(1, self.dispatchTree.pools["default"], graphs, PoolShare.UNBOUND) self.dispatchTree.toCreateElements.append(ps) if self.enablePuliDB: # clean the tables pools and rendernodes (overwrite) self.pulidb.dropPoolsAndRnsTables() self.pulidb.createElements(self.dispatchTree.toCreateElements) self.dispatchTree.resetDbElements() return True def loadRules(self): from .rules.graphview import GraphViewBuilder graphs = self.dispatchTree.findNodeByPath("/graphs", None) if graphs is None: LOGGER.fatal("No /graphs node, impossible to load rule for /graphs.") self.stop() self.dispatchTree.rules.append(GraphViewBuilder(self.dispatchTree, graphs)) from .rules.userview import UserView if self.cleanDB or not self.enablePuliDB: userview = UserView.register(self.dispatchTree, "root", "users") # self.dispatchTree.toCreateElements.append(userview.root) self.dispatchTree.nodes[userview.root.id] = userview.root else: for node in self.dispatchTree.root.children: if node.name == "users": root = node break else: raise RuntimeError("missing root node for UserView") userview = UserView(self.dispatchTree, root) def prepare(self): pass def stop(self): '''Stops the application part of the dispatcher.''' #self.httpRequester.stopAll() pass @property def modified(self): return bool(self.dispatchTree.toArchiveElements or self.dispatchTree.toCreateElements or self.dispatchTree.toModifyElements) def mainLoop(self): '''Dispatcher main loop iteration.''' try: self.threadPool.poll() except NoResultsPending: pass else: LOGGER.info("finished some network requests") self.cycle += 1 self.dispatchTree.updateCompletionAndStatus() self.updateRenderNodes() self.dispatchTree.validateDependencies() executedRequests = [] first = True while first or not self.queue.empty(): workload = self.queue.get() workload() executedRequests.append(workload) first = False # update db self.updateDB() # compute and send command assignments to rendernodes assignments = self.computeAssignments() self.sendAssignments(assignments) # call the release finishing status on all rendernodes for renderNode in self.dispatchTree.renderNodes.values(): renderNode.releaseFinishingStatus() for workload in executedRequests: workload.submit() def updateDB(self): if settings.DB_ENABLE: self.pulidb.createElements(self.dispatchTree.toCreateElements) self.pulidb.updateElements(self.dispatchTree.toModifyElements) self.pulidb.archiveElements(self.dispatchTree.toArchiveElements) self.dispatchTree.resetDbElements() def computeAssignments(self): '''Computes and returns a list of (rendernode, command) assignments.''' from .model.node import NoRenderNodeAvailable # if no rendernodes available, return if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()): return [] assignments = [] # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready entryPoints = set([poolShare.node for poolShare in self.dispatchTree.poolShares.values() if poolShare.node.status not in [NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED] and poolShare.node.readyCommandCount > 0]) # sort by pool for the groupby entryPoints = sorted(entryPoints, key=lambda node: node.poolShares.values()[0].pool) # don't proceed to the calculation if no rns availables in the requested pools rnsBool = False for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool): rnsAvailables = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]]) if len(rnsAvailables): rnsBool = True if not rnsBool: return [] # update the value of the maxrn for the poolshares (parallel dispatching) for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool): # we are treating every active node of the pool nodesList = [node for node in nodesiterator] # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool rnsNotOffline = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED]]) rnsSize = len(rnsNotOffline) # if we have a userdefined maxRN for some nodes, remove them from the list and substracts their maxRN from the pool's size l = nodesList[:] # duplicate the list to be safe when removing elements for node in l: if node.poolShares.values()[0].userDefinedMaxRN: nodesList.remove(node) rnsSize -= node.poolShares.values()[0].maxRN #LOGGER.warning("Pool %s has a size of %s rns and %s nodes" % (pool.name, str(rnsSize), str(len(nodesList)))) if len(nodesList) == 0: break updatedmaxRN = rnsSize // len(nodesList) remainingRN = rnsSize % len(nodesList) # sort by id (fifo) nodesList = sorted(nodesList, key=lambda x: x.id) # then sort by dispatchKey (priority) nodesList = sorted(nodesList, key=lambda x: x.dispatchKey, reverse=True) for node in nodesList: if node.dispatchKey != 0: node.poolShares.values()[0].maxRN = -1 continue node.poolShares.values()[0].maxRN = updatedmaxRN if remainingRN > 0: node.poolShares.values()[0].maxRN += 1 remainingRN -= 1 #LOGGER.warning(" Node %s has a maxrn of %s" % (node.name, str(node.poolShares.values()[0].maxRN))) # now, we are treating every nodes # sort by id (fifo) entryPoints = sorted(entryPoints, key=lambda node: node.id) # then sort by dispatchKey (priority) entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True) ### for entryPoint in entryPoints: if any([poolShare.hasRenderNodesAvailable() for poolShare in entryPoint.poolShares.values()]): try: for (rn, com) in entryPoint.dispatchIterator(lambda: self.queue.qsize() > 0): assignments.append((rn, com)) # increment the allocatedRN for the poolshare poolShare.allocatedRN += 1 # save the active poolshare of the rendernode rn.currentpoolshare = poolShare except NoRenderNodeAvailable: pass assignmentDict = collections.defaultdict(list) for (rn, com) in assignments: assignmentDict[rn].append(com) return assignmentDict.items() def updateRenderNodes(self): for rendernode in self.dispatchTree.renderNodes.values(): rendernode.updateStatus() def sendAssignments(self, assignmentList): '''Processes a list of (rendernode, command) assignments.''' def sendAssignment(args): rendernode, commands = args failures = [] for command in commands: headers = {} if not rendernode.idInformed: headers["rnId"] = rendernode.id root = command.task ancestors = [root] while root.parent: root = root.parent ancestors.append(root) arguments = {} environment = { 'PULI_USER': command.task.user, 'PULI_ALLOCATED_MEMORY': unicode(rendernode.usedRam[command.id]), 'PULI_ALLOCATED_CORES': unicode(rendernode.usedCoresNumber[command.id]), } for ancestor in ancestors: arguments.update(ancestor.arguments) environment.update(ancestor.environment) arguments.update(command.arguments) commandDict = { "id": command.id, "runner": str(command.task.runner), "arguments": arguments, "validationExpression": command.task.validationExpression, "taskName": command.task.name, "relativePathToLogDir": "%d" % command.task.id, "environment": environment, } body = json.dumps(commandDict) headers["Content-Length"] = len(body) headers["Content-Type"] = "application/json" try: resp, data = rendernode.request("POST", "/commands/", body, headers) if not resp.status == 202: LOGGER.error("Assignment request failed: command %d on worker %s", command.id, rendernode.name) failures.append((rendernode, command)) else: LOGGER.info("Sent assignment of command %d to worker %s", command.id, rendernode.name) except rendernode.RequestFailed, e: LOGGER.exception("Assignment of command %d to worker %s failed: %r", command.id, rendernode.name, e) failures.append((rendernode, command)) return failures requests = makeRequests(sendAssignment, [[a, b] for (a, b) in assignmentList], self._assignmentFailed) for request in requests: self.threadPool.putRequest(request)
def __init__(self, framework): if self.init: return self.init = True self.nextCycle = time.time() MainLoopApplication.__init__(self, framework) self.threadPool = ThreadPool(16, 0, 0, None) # # Class holding custom infos on the dispatcher. # This data can be periodically flushed in a specific log file for # later use # self.cycle = 1 self.dispatchTree = DispatchTree() self.licenseManager = LicenseManager() self.enablePuliDB = settings.DB_ENABLE self.cleanDB = settings.DB_CLEAN_DATA self.restartService = False self.pulidb = None if self.enablePuliDB: self.pulidb = PuliDB(self.cleanDB, self.licenseManager) self.dispatchTree.registerModelListeners() rnsAlreadyInitialized = self.initPoolsDataFromBackend() if self.enablePuliDB and not self.cleanDB: log.warning("--- Reloading database (9 steps) ---") prevTimer = time.time() self.pulidb.restoreStateFromDb(self.dispatchTree, rnsAlreadyInitialized) log.warning("%d jobs reloaded from database" % len(self.dispatchTree.tasks)) log.warning("Total time elapsed %s" % elapsedTimeToString(prevTimer)) log.warning("") log.warning("--- Checking dispatcher state (3 steps) ---") startTimer = time.time() log.warning("1/3 Update completion and status") self.dispatchTree.updateCompletionAndStatus() log.warning(" Elapsed time %s" % elapsedTimeToString(startTimer)) prevTimer = time.time() log.warning("2/3 Update rendernodes") self.updateRenderNodes() log.warning(" Elapsed time %s" % elapsedTimeToString(prevTimer)) prevTimer = time.time() log.warning("3/3 Validate dependencies") self.dispatchTree.validateDependencies() log.warning(" Elapsed time %s" % elapsedTimeToString(prevTimer)) log.warning("Total time elapsed %s" % elapsedTimeToString(startTimer)) log.warning("") if self.enablePuliDB and not self.cleanDB: self.dispatchTree.toModifyElements = [] # If no 'default' pool exists, create default pool # When creating a pool with id=None, it is automatically appended in "toCreateElement" list in dispatcher and in the dispatcher's "pools" attribute if 'default' not in self.dispatchTree.pools: pool = Pool(None, name='default') log.warning("Default pool was not loaded from DB, create a new default pool: %s" % pool) self.defaultPool = self.dispatchTree.pools['default'] log.warning("--- Loading dispatch rules ---") startTimer = time.time() self.loadRules() log.warning("Total time elapsed %s" % elapsedTimeToString(startTimer)) log.warning("") # it should be better to have a maxsize self.queue = Queue(maxsize=10000)
class Dispatcher(MainLoopApplication): '''The Dispatcher class is the core of the dispatcher application. It computes the assignments of commands to workers according to a DispatchTree and handles all the communications with the workers and clients. ''' instance = None init = False def __new__(cls, framework): if cls.instance is None: # Disable passing framework to super__new__ call. # It is automatically avaible via super class hierarchy # This removes a deprecation warning when launching dispatcher cls.instance = super(Dispatcher, cls).__new__(cls) return cls.instance def __init__(self, framework): if self.init: return self.init = True self.nextCycle = time.time() MainLoopApplication.__init__(self, framework) self.threadPool = ThreadPool(16, 0, 0, None) # # Class holding custom infos on the dispatcher. # This data can be periodically flushed in a specific log file for # later use # self.cycle = 1 self.dispatchTree = DispatchTree() self.licenseManager = LicenseManager() self.enablePuliDB = settings.DB_ENABLE self.cleanDB = settings.DB_CLEAN_DATA self.restartService = False self.pulidb = None if self.enablePuliDB: self.pulidb = PuliDB(self.cleanDB, self.licenseManager) self.dispatchTree.registerModelListeners() rnsAlreadyInitialized = self.initPoolsDataFromBackend() if self.enablePuliDB and not self.cleanDB: log.warning("--- Reloading database (9 steps) ---") prevTimer = time.time() self.pulidb.restoreStateFromDb(self.dispatchTree, rnsAlreadyInitialized) log.warning("%d jobs reloaded from database" % len(self.dispatchTree.tasks)) log.warning("Total time elapsed %s" % elapsedTimeToString(prevTimer)) log.warning("") log.warning("--- Checking dispatcher state (3 steps) ---") startTimer = time.time() log.warning("1/3 Update completion and status") self.dispatchTree.updateCompletionAndStatus() log.warning(" Elapsed time %s" % elapsedTimeToString(startTimer)) prevTimer = time.time() log.warning("2/3 Update rendernodes") self.updateRenderNodes() log.warning(" Elapsed time %s" % elapsedTimeToString(prevTimer)) prevTimer = time.time() log.warning("3/3 Validate dependencies") self.dispatchTree.validateDependencies() log.warning(" Elapsed time %s" % elapsedTimeToString(prevTimer)) log.warning("Total time elapsed %s" % elapsedTimeToString(startTimer)) log.warning("") if self.enablePuliDB and not self.cleanDB: self.dispatchTree.toModifyElements = [] # If no 'default' pool exists, create default pool # When creating a pool with id=None, it is automatically appended in "toCreateElement" list in dispatcher and in the dispatcher's "pools" attribute if 'default' not in self.dispatchTree.pools: pool = Pool(None, name='default') log.warning("Default pool was not loaded from DB, create a new default pool: %s" % pool) self.defaultPool = self.dispatchTree.pools['default'] log.warning("--- Loading dispatch rules ---") startTimer = time.time() self.loadRules() log.warning("Total time elapsed %s" % elapsedTimeToString(startTimer)) log.warning("") # it should be better to have a maxsize self.queue = Queue(maxsize=10000) def initPoolsDataFromBackend(self): ''' Loads pools and workers from appropriate backend. ''' try: if settings.POOLS_BACKEND_TYPE == "file": manager = FilePoolManager() elif settings.POOLS_BACKEND_TYPE == "ws": manager = WebServicePoolManager() elif settings.POOLS_BACKEND_TYPE == "db": return False except Exception: return False computers = manager.listComputers() ### recreate the pools poolsList = manager.listPools() poolsById = {} for poolDesc in poolsList: pool = Pool(id=int(poolDesc.id), name=str(poolDesc.name)) self.dispatchTree.toCreateElements.append(pool) poolsById[pool.id] = pool ### recreate the rendernodes rnById = {} for computerDesc in computers: try: computerDesc.name = socket.getfqdn(computerDesc.name) ip = socket.gethostbyname(computerDesc.name) except socket.gaierror: continue renderNode = RenderNode(computerDesc.id, computerDesc.name + ":" + str(computerDesc.port), computerDesc.cpucount * computerDesc.cpucores, computerDesc.cpufreq, ip, computerDesc.port, computerDesc.ramsize, json.loads(computerDesc.properties)) self.dispatchTree.toCreateElements.append(renderNode) ## add the rendernodes to the pools for pool in computerDesc.pools: poolsById[pool.id].renderNodes.append(renderNode) renderNode.pools.append(poolsById[pool.id]) self.dispatchTree.renderNodes[str(renderNode.name)] = renderNode rnById[renderNode.id] = renderNode # add the pools to the dispatch tree for pool in poolsById.values(): self.dispatchTree.pools[pool.name] = pool if self.cleanDB or not self.enablePuliDB: graphs = FolderNode(1, "graphs", self.dispatchTree.root, "root", 0, 0, 0, FifoStrategy()) self.dispatchTree.toCreateElements.append(graphs) self.dispatchTree.nodes[graphs.id] = graphs ps = PoolShare(1, self.dispatchTree.pools["default"], graphs, PoolShare.UNBOUND) self.dispatchTree.toCreateElements.append(ps) if self.enablePuliDB: # clean the tables pools and rendernodes (overwrite) self.pulidb.dropPoolsAndRnsTables() self.pulidb.createElements(self.dispatchTree.toCreateElements) self.dispatchTree.resetDbElements() return True def shutdown(self): ''' Clean procedure before shutting done puli server. ''' logging.getLogger('main').warning("-----------------------------------------------") logging.getLogger('main').warning("Exit event caught: closing dispatcher...") try: self.dispatchTree.updateCompletionAndStatus() logging.getLogger('main').warning("[OK] update completion and status") except Exception: logging.getLogger('main').warning("[HS] update completion and status") try: self.updateRenderNodes() logging.getLogger('main').warning("[OK] update render nodes") except Exception: logging.getLogger('main').warning("[HS] update render nodes") try: self.dispatchTree.validateDependencies() logging.getLogger('main').warning("[OK] validate dependencies") except Exception: logging.getLogger('main').warning("[HS] validate dependencies") try: self.updateDB() logging.getLogger('main').warning("[OK] update DB") except Exception: logging.getLogger('main').warning("[HS] update DB") def loadRules(self): from .rules.graphview import GraphViewBuilder graphs = self.dispatchTree.findNodeByPath("/graphs", None) if graphs is None: log.fatal("No '/graphs' node, impossible to load rule for /graphs.") self.stop() self.dispatchTree.rules.append(GraphViewBuilder(self.dispatchTree, graphs)) def prepare(self): pass def stop(self): '''Stops the application part of the dispatcher.''' #self.httpRequester.stopAll() pass @property def modified(self): return bool(self.dispatchTree.toArchiveElements or self.dispatchTree.toCreateElements or self.dispatchTree.toModifyElements) def mainLoop(self): ''' | Dispatcher main loop iteration. | Periodically called with tornado'sinternal callback mecanism, the frequency is defined by config: CORE.MASTER_UPDATE_INTERVAL | During this process, the dispatcher will: | - update completion and status for all jobs in dispatchTree | - update status of renderNodes | - validate inter tasks dependencies | - update the DB with recorded changes in the model | - compute new assignments and send them to the proper rendernodes | - release all finished jobs/rns ''' log = logging.getLogger('main') loopStartTime = time.time() prevTimer = loopStartTime if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleDate = loopStartTime log.info("-----------------------------------------------------") log.info(" Start dispatcher process cycle (old version).") try: self.threadPool.poll() except NoResultsPending: pass else: log.info("finished some network requests") pass self.cycle += 1 # Update of allocation is done when parsing the tree for completion and status update (done partially for invalidated node only i.e. when needed) self.dispatchTree.updateCompletionAndStatus() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_tree'] = time.time() - prevTimer log.info("%8.2f ms --> update completion status" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # Update render nodes self.updateRenderNodes() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_rn'] = time.time() - prevTimer log.info("%8.2f ms --> update render node" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # Validate dependencies self.dispatchTree.validateDependencies() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_dependencies'] = time.time() - prevTimer log.info("%8.2f ms --> validate dependencies" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # update db self.updateDB() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['update_db'] = time.time() - prevTimer log.info("%8.2f ms --> update DB" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() # compute and send command assignments to rendernodes assignments = self.computeAssignments() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['compute_assignment'] = time.time() - prevTimer log.info("%8.2f ms --> compute assignments." % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() self.sendAssignments(assignments) if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['send_assignment'] = time.time() - prevTimer singletonstats.theStats.cycleCounts['num_assignments'] = len(assignments) log.info("%8.2f ms --> send %r assignments." % ((time.time() - prevTimer) * 1000, len(assignments))) prevTimer = time.time() # call the release finishing status on all rendernodes for renderNode in self.dispatchTree.renderNodes.values(): renderNode.releaseFinishingStatus() if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['release_finishing'] = time.time() - prevTimer log.info("%8.2f ms --> releaseFinishingStatus" % ((time.time() - prevTimer) * 1000)) prevTimer = time.time() loopDuration = (time.time() - loopStartTime)*1000 log.info("%8.2f ms --> cycle ended. " % loopDuration) # # Send stat data to disk # if singletonconfig.get('CORE', 'GET_STATS'): singletonstats.theStats.cycleTimers['time_elapsed'] = time.time() - loopStartTime singletonstats.theStats.aggregate() def updateDB(self): if settings.DB_ENABLE: self.pulidb.createElements(self.dispatchTree.toCreateElements) self.pulidb.updateElements(self.dispatchTree.toModifyElements) self.pulidb.archiveElements(self.dispatchTree.toArchiveElements) # log.info(" UpdateDB: create=%d update=%d delete=%d" % (len(self.dispatchTree.toCreateElements), len(self.dispatchTree.toModifyElements), len(self.dispatchTree.toArchiveElements)) ) self.dispatchTree.resetDbElements() def computeAssignments(self): '''Computes and returns a list of (rendernode, command) assignments.''' LOGGER = logging.getLogger('main') from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask # if no rendernodes available, return if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()): return [] assignments = [] # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely... entryPoints = set([poolShare.node for poolShare in self.dispatchTree.poolShares.values() if poolShare.node.status not in [NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED] and poolShare.node.readyCommandCount > 0 and poolShare.node.name != 'graphs']) # don't proceed to the calculation if no rns availables in the requested pools rnsBool = False for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool): rnsAvailables = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]]) if len(rnsAvailables): rnsBool = True if not rnsBool: return [] # Log time updating max rn prevTimer = time.time() # sort by pool for the groupby entryPoints = sorted(entryPoints, key=lambda node: node.poolShares.values()[0].pool) # update the value of the maxrn for the poolshares (parallel dispatching) for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool): # we are treating every active node of the pool nodesList = [node for node in nodesiterator] # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool rnsNotOffline = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED]]) rnsSize = len(rnsNotOffline) # log.debug("@ - nb rns awake:%r" % (rnsSize) ) # if we have a userdefined maxRN for some nodes, remove them from the list and substracts their maxRN from the pool's size l = nodesList[:] # duplicate the list to be safe when removing elements for node in l: # log.debug("@ - checking userDefMaxRN: %s -> %r maxRN=%d" % (node.name, node.poolShares.values()[0].userDefinedMaxRN, node.poolShares.values()[0].maxRN ) ) if node.poolShares.values()[0].userDefinedMaxRN and node.poolShares.values()[0].maxRN not in [-1, 0]: # log.debug("@ removing: %s -> maxRN=%d" % (node.name, node.poolShares.values()[0].maxRN ) ) nodesList.remove(node) rnsSize -= node.poolShares.values()[0].maxRN # log.debug("@ - nb rns awake after maxRN:%d" % (rnsSize) ) if len(nodesList) == 0: continue # Prepare updatedMaxRN with dispatch key proportions dkList = [] # list of dks (integer only) dkPositiveList = [] # Normalized list of dks (each min value of dk becomes 1, other higher elems of dkList gets proportionnal value) nbJobs = len(nodesList) # number of jobs in the current pool nbRNAssigned = 0 # number of render nodes assigned for this pool for node in nodesList: dkList.append(node.dispatchKey) dkMin = min(dkList) dkPositiveList = map(lambda x: x-dkMin+1, dkList) dkSum = sum(dkPositiveList) # sort by id (fifo) nodesList = sorted(nodesList, key=lambda x: x.id) # then sort by dispatchKey (priority) nodesList = sorted(nodesList, key=lambda x: x.dispatchKey, reverse=True) for dk, nodeIterator in groupby(nodesList, lambda x: x.dispatchKey): nodes = [node for node in nodeIterator] dkPos = dkPositiveList[ dkList.index(dk) ] if dkSum > 0: updatedmaxRN = int( round( rnsSize * (dkPos / float(dkSum) ))) else: updatedmaxRN = int(round( rnsSize / float(nbJobs) )) for node in nodes: node.poolShares.values()[0].maxRN = updatedmaxRN nbRNAssigned += updatedmaxRN # Add remaining RNs to most important jobs unassignedRN = rnsSize - nbRNAssigned while unassignedRN > 0: for node in nodesList: if unassignedRN > 0: node.poolShares.values()[0].maxRN += 1 unassignedRN -= 1 else: break if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.assignmentTimers['update_max_rn'] = time.time() - prevTimer log.info( "%8.2f ms --> .... updating max RN values", (time.time() - prevTimer)*1000 ) # now, we are treating every nodes # sort by id (fifo) entryPoints = sorted(entryPoints, key=lambda node: node.id) # then sort by dispatchKey (priority) entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True) # Put nodes with a userDefinedMaxRN first userDefEntryPoints = ifilter( lambda node: node.poolShares.values()[0].userDefinedMaxRN, entryPoints ) standardEntryPoints = ifilter( lambda node: not node.poolShares.values()[0].userDefinedMaxRN, entryPoints ) scoredEntryPoints = chain( userDefEntryPoints, standardEntryPoints) # Log time dispatching RNs prevTimer = time.time() # Iterate over each entryPoint to get an assignment for entryPoint in scoredEntryPoints: if any([poolShare.hasRenderNodesAvailable() for poolShare in entryPoint.poolShares.values()]): try: for (rn, com) in entryPoint.dispatchIterator(lambda: self.queue.qsize() > 0): assignments.append((rn, com)) # increment the allocatedRN for the poolshare poolShare.allocatedRN += 1 # save the active poolshare of the rendernode rn.currentpoolshare = poolShare except NoRenderNodeAvailable: pass except NoLicenseAvailableForTask: log.info("Missing license for node \"%s\" (other commands can start anyway)." % entryPoint.name) pass assignmentDict = collections.defaultdict(list) for (rn, com) in assignments: assignmentDict[rn].append(com) if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.assignmentTimers['dispatch_command'] = time.time() - prevTimer log.info( "%8.2f ms --> .... dispatching commands", (time.time() - prevTimer)*1000 ) # # Check replacements # # - faire une passe pour les jobs n'ayant pas leur part de gateau # - identifier dans leur pool les jobs killable # - pour chaque ressource, si match : on jette le job en cours ET on desactive son attribut killable # # Backfill # # TODO refaire une passe pour les jobs ayant un attribut "killable" et au moins une pool additionnelle return assignmentDict.items() def updateRenderNodes(self): for rendernode in self.dispatchTree.renderNodes.values(): rendernode.updateStatus() def sendAssignments(self, assignmentList): '''Processes a list of (rendernode, command) assignments.''' def sendAssignment(args): rendernode, commands = args failures = [] for command in commands: headers = {} if not rendernode.idInformed: headers["rnId"] = rendernode.id root = command.task ancestors = [root] while root.parent: root = root.parent ancestors.append(root) arguments = {} environment = { 'PULI_USER': command.task.user, 'PULI_ALLOCATED_MEMORY': unicode(rendernode.usedRam[command.id]), 'PULI_ALLOCATED_CORES': unicode(rendernode.usedCoresNumber[command.id]), } for ancestor in ancestors: arguments.update(ancestor.arguments) environment.update(ancestor.environment) arguments.update(command.arguments) log = logging.getLogger('assign') log.info("Sending command: %d from task %s to %s" % (command.id, command.task.name, rendernode)) commandDict = { "id": command.id, "runner": str(command.task.runner), "arguments": arguments, "validationExpression": command.task.validationExpression, "taskName": command.task.name, "relativePathToLogDir": "%d" % command.task.id, "environment": environment, "runnerPackages": command.runnerPackages, "watcherPackages": command.watcherPackages } body = json.dumps(commandDict) headers["Content-Length"] = len(body) headers["Content-Type"] = "application/json" try: resp, data = rendernode.request("POST", "/commands/", body, headers) if not resp.status == 202: log.error("Assignment request failed: command %d on worker %s", command.id, rendernode.name) failures.append((rendernode, command)) else: log.info("Sent assignment of command %d to worker %s", command.id, rendernode.name) except rendernode.RequestFailed, e: log.error("Assignment of command %d to worker %s failed. Worker is likely dead (%r)", command.id, rendernode.name, e) failures.append((rendernode, command)) return failures requests = makeRequests(sendAssignment, [[a, b] for (a, b) in assignmentList], self._assignmentFailed) for request in requests: self.threadPool.putRequest(request)
class Dispatcher(MainLoopApplication): '''The Dispatcher class is the core of the dispatcher application. It computes the assignments of commands to workers according to a DispatchTree and handles all the communications with the workers and clients. ''' instance = None init = False def __new__(cls, framework): if cls.instance is None: # Disable passing framework to super__new__ call. It is automatically avaible via super class hierarchy # This removes a deprecation warning when launching dispatcher cls.instance = super(Dispatcher, cls).__new__(cls) return cls.instance def __init__(self, framework): if self.init: return self.init = True self.nextCycle = time.time() MainLoopApplication.__init__(self, framework) self.threadPool = ThreadPool(16, 0, 0, None) # # Class holding custom infos on the dispatcher. # This data can be periodically flushed in a specific log file for later use # self.cycle = 1 self.dispatchTree = DispatchTree() self.licenseManager = LicenseManager() self.enablePuliDB = settings.DB_ENABLE self.cleanDB = settings.DB_CLEAN_DATA self.pulidb = None if self.enablePuliDB: self.pulidb = PuliDB(self.cleanDB, self.licenseManager) self.dispatchTree.registerModelListeners() rnsAlreadyInitialized = self.initPoolsDataFromBackend() if self.enablePuliDB and not self.cleanDB: LOGGER.warning("reloading jobs from database") beginTime = time.time() self.pulidb.restoreStateFromDb(self.dispatchTree, rnsAlreadyInitialized) LOGGER.warning("reloading took %.2fs" % (time.time() - beginTime)) LOGGER.warning("done reloading jobs from database") LOGGER.warning("reloaded %d tasks" % len(self.dispatchTree.tasks)) LOGGER.warning("checking dispatcher state") self.dispatchTree.updateCompletionAndStatus() self.updateRenderNodes() self.dispatchTree.validateDependencies() if self.enablePuliDB and not self.cleanDB: self.dispatchTree.toModifyElements = [] self.defaultPool = self.dispatchTree.pools['default'] LOGGER.warning("loading dispatch rules") self.loadRules() # it should be better to have a maxsize self.queue = Queue(maxsize=10000) def initPoolsDataFromBackend(self): ''' Loads pools and workers from appropriate backend. ''' try: if settings.POOLS_BACKEND_TYPE == "file": manager = FilePoolManager() elif settings.POOLS_BACKEND_TYPE == "ws": manager = WebServicePoolManager() elif settings.POOLS_BACKEND_TYPE == "db": return False except Exception: return False computers = manager.listComputers() ### recreate the pools poolsList = manager.listPools() poolsById = {} for poolDesc in poolsList: pool = Pool(id=int(poolDesc.id), name=str(poolDesc.name)) self.dispatchTree.toCreateElements.append(pool) poolsById[pool.id] = pool ### recreate the rendernodes rnById = {} for computerDesc in computers: try: computerDesc.name = socket.getfqdn(computerDesc.name) ip = socket.gethostbyname(computerDesc.name) except socket.gaierror: continue renderNode = RenderNode(computerDesc.id, computerDesc.name + ":" + str(computerDesc.port), computerDesc.cpucount * computerDesc.cpucores, computerDesc.cpufreq, ip, computerDesc.port, computerDesc.ramsize, json.loads(computerDesc.properties)) self.dispatchTree.toCreateElements.append(renderNode) ## add the rendernodes to the pools for pool in computerDesc.pools: poolsById[pool.id].renderNodes.append(renderNode) renderNode.pools.append(poolsById[pool.id]) self.dispatchTree.renderNodes[str(renderNode.name)] = renderNode rnById[renderNode.id] = renderNode # add the pools to the dispatch tree for pool in poolsById.values(): self.dispatchTree.pools[pool.name] = pool if self.cleanDB or not self.enablePuliDB: graphs = FolderNode(1, "graphs", self.dispatchTree.root, "root", 0, 0, 0, FifoStrategy()) self.dispatchTree.toCreateElements.append(graphs) self.dispatchTree.nodes[graphs.id] = graphs ps = PoolShare(1, self.dispatchTree.pools["default"], graphs, PoolShare.UNBOUND) self.dispatchTree.toCreateElements.append(ps) if self.enablePuliDB: # clean the tables pools and rendernodes (overwrite) self.pulidb.dropPoolsAndRnsTables() self.pulidb.createElements(self.dispatchTree.toCreateElements) self.dispatchTree.resetDbElements() return True def loadRules(self): from .rules.graphview import GraphViewBuilder graphs = self.dispatchTree.findNodeByPath("/graphs", None) if graphs is None: LOGGER.fatal("No /graphs node, impossible to load rule for /graphs.") self.stop() self.dispatchTree.rules.append(GraphViewBuilder(self.dispatchTree, graphs)) def prepare(self): pass def stop(self): '''Stops the application part of the dispatcher.''' #self.httpRequester.stopAll() pass @property def modified(self): return bool(self.dispatchTree.toArchiveElements or self.dispatchTree.toCreateElements or self.dispatchTree.toModifyElements) def mainLoop(self): ''' | Dispatcher main loop iteration. | Periodically called with tornado'sinternal callback mecanism, the frequency is defined by config: CORE.MASTER_UPDATE_INTERVAL | During this process, the dispatcher will: | - update completion and status for all jobs in dispatchTree | - update status of renderNodes | - validate inter tasks dependencies | - update the DB with recorded changes in the model | - compute new assignments and send them to the proper rendernodes | - release all finished jobs/rns ''' # JSA DEBUG: timer pour profiler les etapes loopStartTime = time.time() prevTimer = loopStartTime if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.cycleDate = loopStartTime LOGGER.info("") LOGGER.info("-----------------------------------------------------") LOGGER.info(" Start dispatcher process cycle.") LOGGER.info("-----------------------------------------------------") # JSA: Check if requests are finished (necessaire ?) try: self.threadPool.poll() except NoResultsPending: pass else: LOGGER.info("finished some network requests") self.cycle += 1 # Update of allocation is done when parsing the tree for completion and status update (done partially for invalidated node only i.e. when needed) self.dispatchTree.updateCompletionAndStatus() if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.cycleTimers['update_tree'] = time.time() - prevTimer LOGGER.info("%8.2f ms --> update completion status" % ( (time.time() - prevTimer)*1000 ) ) prevTimer = time.time() # Update render nodes self.updateRenderNodes() if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.cycleTimers['update_rn'] = time.time() - prevTimer LOGGER.info("%8.2f ms --> update render node" % ( (time.time() - prevTimer)*1000 ) ) prevTimer = time.time() # Validate dependencies self.dispatchTree.validateDependencies() if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.cycleTimers['update_dependencies'] = time.time() - prevTimer LOGGER.info("%8.2f ms --> validate dependencies" % ( (time.time() - prevTimer)*1000 ) ) prevTimer = time.time() # update db self.updateDB() if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.cycleTimers['update_db'] = time.time() - prevTimer LOGGER.info("%8.2f ms --> update DB" % ( (time.time() - prevTimer)*1000 ) ) prevTimer = time.time() # compute and send command assignments to rendernodes assignments = self.computeAssignments() if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.cycleTimers['compute_assignment'] = time.time() - prevTimer LOGGER.info("%8.2f ms --> compute assignments." % ( (time.time() - prevTimer)*1000) ) prevTimer = time.time() self.sendAssignments(assignments) if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.cycleTimers['send_assignment'] = time.time() - prevTimer singletonstats.theStats.cycleCounts['num_assignments'] = len(assignments) LOGGER.info("%8.2f ms --> send %r assignments." % ( (time.time() - prevTimer)*1000, len(assignments) ) ) prevTimer = time.time() # call the release finishing status on all rendernodes for renderNode in self.dispatchTree.renderNodes.values(): renderNode.releaseFinishingStatus() if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.cycleTimers['release_finishing'] = time.time() - prevTimer LOGGER.info("%8.2f ms --> releaseFinishingStatus" % ( (time.time() - prevTimer)*1000 ) ) prevTimer = time.time() loopDuration = (time.time() - loopStartTime)*1000 LOGGER.info( "%8.2f ms --> cycle ended. " % loopDuration ) LOGGER.info("-----------------------------------------------------") # TODO: process average and sums of datas in stats, if flush time, send it to disk if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.cycleTimers['time_elapsed'] = time.time() - loopStartTime singletonstats.theStats.aggregate() def updateDB(self): # TODO: Study how to change the DB subsystem to a simple file dump (json or pickle) # data1 = {'a': [1, 2.0, 3, 4], # 'b': ('string', u'Unicode string'), # 'c': None} # with open('/datas/puli/Puli/data.json', 'wb') as fp: # json.dump(self.dispatchTree, fp) # import shelve # d = shelve.open('/datas/puli/Puli/data.pkl') # d['test'] = self.dispatchTree # d.close() if settings.DB_ENABLE: self.pulidb.createElements(self.dispatchTree.toCreateElements) self.pulidb.updateElements(self.dispatchTree.toModifyElements) self.pulidb.archiveElements(self.dispatchTree.toArchiveElements) # LOGGER.info(" UpdateDB: create=%d update=%d delete=%d" % (len(self.dispatchTree.toCreateElements), len(self.dispatchTree.toModifyElements), len(self.dispatchTree.toArchiveElements)) ) self.dispatchTree.resetDbElements() def computeAssignments(self): '''Computes and returns a list of (rendernode, command) assignments.''' from .model.node import NoRenderNodeAvailable, NoLicenseAvailableForTask # if no rendernodes available, return if not any(rn.isAvailable() for rn in self.dispatchTree.renderNodes.values()): return [] assignments = [] # first create a set of entrypoints that are not done nor cancelled nor blocked nor paused and that have at least one command ready # FIXME: hack to avoid getting the 'graphs' poolShare node in entryPoints, need to avoid it more nicely... entryPoints = set([poolShare.node for poolShare in self.dispatchTree.poolShares.values() if poolShare.node.status not in [NODE_BLOCKED, NODE_DONE, NODE_CANCELED, NODE_PAUSED] and poolShare.node.readyCommandCount > 0 and poolShare.node.name != 'graphs']) # don't proceed to the calculation if no rns availables in the requested pools rnsBool = False for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool): rnsAvailables = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED, RN_WORKING]]) if len(rnsAvailables): rnsBool = True if not rnsBool: return [] # Log time updating max rn prevTimer = time.time() # sort by pool for the groupby entryPoints = sorted(entryPoints, key=lambda node: node.poolShares.values()[0].pool) # update the value of the maxrn for the poolshares (parallel dispatching) for pool, nodesiterator in groupby(entryPoints, lambda x: x.poolShares.values()[0].pool): # we are treating every active node of the pool nodesList = [node for node in nodesiterator] # the new maxRN value is calculated based on the number of active jobs of the pool, and the number of online rendernodes of the pool rnsNotOffline = set([rn for rn in pool.renderNodes if rn.status not in [RN_UNKNOWN, RN_PAUSED]]) rnsSize = len(rnsNotOffline) # LOGGER.debug("@ - nb rns awake:%r" % (rnsSize) ) # if we have a userdefined maxRN for some nodes, remove them from the list and substracts their maxRN from the pool's size l = nodesList[:] # duplicate the list to be safe when removing elements for node in l: # LOGGER.debug("@ - checking userDefMaxRN: %s -> %r maxRN=%d" % (node.name, node.poolShares.values()[0].userDefinedMaxRN, node.poolShares.values()[0].maxRN ) ) if node.poolShares.values()[0].userDefinedMaxRN and node.poolShares.values()[0].maxRN not in [-1, 0]: # LOGGER.debug("@ removing: %s -> maxRN=%d" % (node.name, node.poolShares.values()[0].maxRN ) ) nodesList.remove(node) rnsSize -= node.poolShares.values()[0].maxRN # LOGGER.debug("@ - nb rns awake after maxRN:%d" % (rnsSize) ) if len(nodesList) == 0: continue # Prepare updatedMaxRN with dispatch key proportions dkList = [] # list of dks (integer only) dkPositiveList = [] # Normalized list of dks (each min value of dk becomes 1, other higher elems of dkList gets proportionnal value) nbJobs = len(nodesList) # number of jobs in the current pool nbRNAssigned = 0 # number of render nodes assigned for this pool for node in nodesList: dkList.append(node.dispatchKey) dkMin = min(dkList) dkPositiveList = map(lambda x: x-dkMin+1, dkList) dkSum = sum(dkPositiveList) # sort by id (fifo) nodesList = sorted(nodesList, key=lambda x: x.id) # then sort by dispatchKey (priority) nodesList = sorted(nodesList, key=lambda x: x.dispatchKey, reverse=True) for dk, nodeIterator in groupby(nodesList, lambda x: x.dispatchKey): nodes = [node for node in nodeIterator] dkPos = dkPositiveList[ dkList.index(dk) ] if dkSum > 0: updatedmaxRN = int( round( rnsSize * (dkPos / float(dkSum) ))) else: updatedmaxRN = int(round( rnsSize / float(nbJobs) )) for node in nodes: node.poolShares.values()[0].maxRN = updatedmaxRN nbRNAssigned += updatedmaxRN # Add remaining RNs to most important jobs unassignedRN = rnsSize - nbRNAssigned while unassignedRN > 0: for node in nodesList: if unassignedRN > 0: node.poolShares.values()[0].maxRN += 1 unassignedRN -= 1 else: break if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.assignmentTimers['update_max_rn'] = time.time() - prevTimer LOGGER.info( "%8.2f ms --> .... updating max RN values", (time.time() - prevTimer)*1000 ) # now, we are treating every nodes # sort by id (fifo) entryPoints = sorted(entryPoints, key=lambda node: node.id) # then sort by dispatchKey (priority) entryPoints = sorted(entryPoints, key=lambda node: node.dispatchKey, reverse=True) # Put nodes with a userDefinedMaxRN first userDefEntryPoints = ifilter( lambda node: node.poolShares.values()[0].userDefinedMaxRN, entryPoints ) standardEntryPoints = ifilter( lambda node: not node.poolShares.values()[0].userDefinedMaxRN, entryPoints ) scoredEntryPoints = chain( userDefEntryPoints, standardEntryPoints) # Log time dispatching RNs prevTimer = time.time() # # HACK update license info for katana with rlmutils # This helps having the real number of used licenses before finishing assignment # This is done because katana rlm management sometime reserves 2 token (cf BUGLIST v1.4) try: import subprocess strRlmKatanaUsed='' strRlmKatanaUsed = subprocess.Popen(["/s/apps/lin/farm/tools/rlm_katana_used.sh"], stdout=subprocess.PIPE).communicate()[0] katanaUsed = int(strRlmKatanaUsed) LOGGER.debug("HACK update katana license: used = %d (+buffer in config:%d)" % (katanaUsed,singletonconfig.get('HACK','KATANA_BUFFER'))) # Sets used license number try: self.licenseManager.licenses["katana"].used = katanaUsed + singletonconfig.get('HACK','KATANA_BUFFER') except KeyError: LOGGER.warning("License katana not found... Impossible to set 'used' value: %d" % katanaUsed) except Exception, e: LOGGER.warning("Error getting number of katana license used via rlmutil (e: %r, rlmoutput=%r)" % (e,strRlmKatanaUsed)) # ENDHACK # # Iterate over each entryPoint to get an assignment for entryPoint in scoredEntryPoints: if any([poolShare.hasRenderNodesAvailable() for poolShare in entryPoint.poolShares.values()]): try: for (rn, com) in entryPoint.dispatchIterator(lambda: self.queue.qsize() > 0): assignments.append((rn, com)) # increment the allocatedRN for the poolshare poolShare.allocatedRN += 1 # save the active poolshare of the rendernode rn.currentpoolshare = poolShare except NoRenderNodeAvailable: pass except NoLicenseAvailableForTask: LOGGER.info("Missing license for node \"%s\" (other commands can start anyway)." % entryPoint.name) pass assignmentDict = collections.defaultdict(list) for (rn, com) in assignments: assignmentDict[rn].append(com) if singletonconfig.get('CORE','GET_STATS'): singletonstats.theStats.assignmentTimers['dispatch_command'] = time.time() - prevTimer LOGGER.info( "%8.2f ms --> .... dispatching commands", (time.time() - prevTimer)*1000 ) # # Check replacements # # - faire une passe pour les jobs n'ayant pas leur part de gateau # - identifier dans leur pool les jobs killable # - pour chaque ressource, si match : on jette le job en cours ET on desactive son attribut killable # # Backfill # # TODO refaire une passe pour les jobs ayant un attribut "killable" et au moins une pool additionnelle return assignmentDict.items()