def _getInfo(allMesosNodes, ip):
    """
    Fetch the executor info for the node at ``ip``, faking it where necessary.

    ``self`` is not a parameter here; it is resolved from the enclosing scope.

    :param allMesosNodes: mapping from node IP to the info object for that node.
    :param ip: IP of the node whose info is wanted.
    :return: the entry from ``allMesosNodes`` (with ``workers`` forced to 0 when
             the batch system reports the node as not in use), or a placeholder
             NodeInfo with 0 workers when ``ip`` has no entry.
    """
    try:
        nodeInfo = allMesosNodes[ip]
    except KeyError:
        # Mesos has never seen this node. That leaves 3 possibilities:
        #   1) the node is still launching mesos & will come online soon
        #   2) no jobs have been assigned to this worker, so the executor was
        #      never launched and we never got an executorInfo reporting
        #      0 workers running
        #   3) mesos crashed before launching, so the worker will never come
        #      online
        # None of these has any workers running, so faking executor info with
        # 0 workers is safe. Cases 1/2 don't waste money either, because the
        # actual termination still waits for the end of the node's billing
        # cycle.
        return NodeInfo(coresTotal=1, coresUsed=0, requestedCores=0,
                        memoryTotal=1, memoryUsed=0, requestedMemory=0,
                        workers=0)

    # The node was tracked, but we haven't seen it in the last 10 minutes.
    stillInUse = self.scaler.leader.batchSystem.nodeInUse(ip)
    if not stillInUse:
        # No report in the last 10 minutes and, last we knew, no tasks were
        # running. Fake an executorInfo without workers; otherwise this node
        # would never be considered for termination.
        nodeInfo.workers = 0
    # When the node is still in use, jobs may be running even though it isn't
    # reporting to mesos, so the info is returned untouched and the node can't
    # be terminated.
    return nodeInfo
def _getInfo(allMesosNodes, ip):
    """
    Return executor info for the node at ``ip``, substituting fake info when
    mesos has nothing current to say about it.

    ``self`` is not a parameter here; it is resolved from the enclosing scope.

    :param allMesosNodes: mapping from node IP to the info object for that node.
    :param ip: IP of the node whose info is wanted.
    :return: the entry from ``allMesosNodes`` (with ``workers`` forced to 0 when
             the batch system reports the node as not in use), or a placeholder
             NodeInfo with 0 workers when ``ip`` has no entry.
    """
    try:
        knownInfo = allMesosNodes[ip]
    except KeyError:
        # Mesos has never seen this node. That leaves 3 possibilities:
        #   1) the node is still launching mesos & will come online soon
        #   2) no jobs have been assigned to this worker, so the executor was
        #      never launched and we never got an executorInfo reporting
        #      0 workers running
        #   3) mesos crashed before launching, so the worker will never come
        #      online
        # None of these has any workers running, so faking executor info with
        # 0 workers is safe.
        return NodeInfo(coresTotal=1, coresUsed=0, requestedCores=0,
                        memoryTotal=1, memoryUsed=0, requestedMemory=0,
                        workers=0)

    # The node was tracked, but we haven't seen it in the last 10 minutes.
    if not self.scaler.leader.batchSystem.nodeInUse(ip):
        # No report in the last 10 minutes and, last we knew, no tasks were
        # running. Fake an executorInfo without workers; otherwise this node
        # would never be considered for termination.
        knownInfo.workers = 0
    # When the node is still in use, jobs may be running even though it isn't
    # reporting to mesos, so the info is returned untouched and the node can't
    # be terminated.
    return knownInfo
def frameworkMessage(self, driver, executorId, agentId, message):
    """
    Invoked when an executor sends a message.

    The payload is decoded, parsed as a Python literal dict, and used to
    register the sending node. The only optional field understood is
    ``nodeInfo``, which is combined with the cores and memory of this
    executor's entries in ``self.runningJobMap`` to build a NodeInfo.

    :param driver: not used by this method.
    :param executorId: object whose ``value`` identifies the sending executor.
    :param agentId: object whose ``value`` identifies the agent it runs on.
    :param message: encoded message payload.
    :raises RuntimeError: if the message carries a field other than
            ``address`` or ``nodeInfo``.
    """
    # Undo the base 64 encoding applied by Protobuf
    decoded = decode_data(message).decode()
    log.debug('Got framework message from executor %s running on agent %s: %s',
              executorId.value, agentId.value, decoded)
    fields = ast.literal_eval(decoded)
    assert isinstance(fields, dict)

    # Mandatory part of every message: the address of the sending node
    nodeAddress = fields.pop('address')
    executor = self._registerNode(nodeAddress, agentId.value)

    # Whatever is left over is optional
    for fieldName, fieldValue in fields.items():
        if fieldName != 'nodeInfo':
            raise RuntimeError("Unknown message field '%s'." % fieldName)
        assert isinstance(fieldValue, dict)
        # Total up what the tasks assigned to this executor asked for
        executorTasks = [task for task in self.runningJobMap.values()
                         if task.executorID == executorId.value]
        coresRequested = sum(task.cores for task in executorTasks)
        memoryRequested = sum(task.memory for task in executorTasks)
        executor.nodeInfo = NodeInfo(requestedCores=coresRequested,
                                     requestedMemory=memoryRequested,
                                     **fieldValue)
        self.executors[nodeAddress] = executor
def getNodes(self):
    """
    Build a NodeInfo for every entry in ``self.workers``.

    :return: dict keyed by each worker's position in ``self.workers``. Every
             value reports 0 cores and 0 memory, and 1 worker if that worker's
             ``busyEvent`` is set, 0 otherwise.
    """
    nodes = {}
    for index, worker in enumerate(self.workers):
        if worker.busyEvent.is_set():
            activeWorkers = 1
        else:
            activeWorkers = 0
        nodes[index] = NodeInfo(cores=0, memory=0, workers=activeWorkers)
    return nodes
def getNodes(self):
    """
    Build a NodeInfo for every worker in ``self.nodesToWorker``.

    :return: dict keyed by ``(i, self.preemptable)``, where ``i`` is the
             worker's position among ``self.nodesToWorker.values()``. Every
             value reports zero totals and usage, 1 requested core, 1 requested
             memory, and 1 worker if that worker's ``busyEvent`` is set,
             0 otherwise.
    """
    return {
        (position, self.preemptable): NodeInfo(
            coresTotal=0, coresUsed=0, requestedCores=1,
            memoryTotal=0, memoryUsed=0, requestedMemory=1,
            workers=1 if nodeWorker.busyEvent.is_set() else 0)
        for position, nodeWorker in enumerate(self.nodesToWorker.values())
    }
def getNodes(self, preemptable=False, timeout=None):
    """
    Build a NodeInfo for every node of the requested preemptability.

    :param preemptable: only nodes whose ``preemptable`` attribute equals this
           value are included.
    :param timeout: accepted but not used by this implementation.
    :return: dict keyed by each matching node's ``privateIP``. Every value
             reports zero totals and usage, 1 requested core, 1 requested
             memory, and 1 worker if the node's worker has its ``busyEvent``
             set, 0 otherwise.
    """
    return {
        candidate.privateIP: NodeInfo(
            coresTotal=0, coresUsed=0, requestedCores=1,
            memoryTotal=0, memoryUsed=0, requestedMemory=1,
            workers=1 if nodeWorker.busyEvent.is_set() else 0)
        for candidate, nodeWorker in self.nodesToWorker.items()
        if candidate.preemptable == preemptable
    }
def getNodes(self):
    """
    Build a NodeInfo for every entry in ``self.workers``.

    :return: dict keyed by each worker's position in ``self.workers``. Every
             value reports zero totals and usage, 1 requested core, 1 requested
             memory, and 1 worker if that worker's ``busyEvent`` is set,
             0 otherwise.
    """
    nodes = {}
    for index, worker in enumerate(self.workers):
        if worker.busyEvent.is_set():
            activeWorkers = 1
        else:
            activeWorkers = 0
        nodes[index] = NodeInfo(coresTotal=0, coresUsed=0, requestedCores=1,
                                memoryTotal=0, memoryUsed=0, requestedMemory=1,
                                workers=activeWorkers)
    return nodes
def frameworkMessage(self, driver, executorId, slaveId, message):
    """
    Invoked when an executor sends a message.

    The message is logged, parsed as a Python literal dict, and used to
    register the sending node. The only optional field understood is
    ``nodeInfo``, whose dict value is expanded into a NodeInfo and stored on
    the executor record.

    :param driver: not used by this method.
    :param executorId: object whose ``value`` identifies the sending executor.
    :param slaveId: object whose ``value`` identifies the slave it runs on.
    :param message: string holding the literal representation of a dict with a
           mandatory ``address`` key.
    :raises RuntimeError: if the message carries a field other than
            ``address`` or ``nodeInfo``.
    """
    log.debug('Got framework message from executor %s running on slave %s: %s',
              executorId.value, slaveId.value, message)
    message = ast.literal_eval(message)
    assert isinstance(message, dict)
    # Handle the mandatory fields of a message
    nodeAddress = message.pop('address')
    executor = self._registerNode(nodeAddress, slaveId.value)
    # Handle optional message fields. items() rather than iteritems(): the
    # latter does not exist on Python 3 dicts, while items() works on both.
    for k, v in message.items():
        if k == 'nodeInfo':
            assert isinstance(v, dict)
            executor.nodeInfo = NodeInfo(**v)
        else:
            raise RuntimeError("Unknown message field '%s'." % k)
def frameworkMessage(self, driver, executorId, slaveId, message):
    """
    Invoked when an executor sends a message.

    The message is parsed as a Python literal dict. The executor record for
    the sending address is created (or replaced, if it was recorded under a
    different slave ID) and its ``lastSeen`` timestamp is refreshed. The only
    optional field understood is ``nodeInfo``, whose dict value is expanded
    into a NodeInfo and stored on the executor record.

    :param driver: not used by this method.
    :param executorId: not used by this method.
    :param slaveId: ID of the slave the executor runs on; compared against the
           ``slaveId`` stored on any existing record for the address.
    :param message: string holding the literal representation of a dict with a
           mandatory ``address`` key.
    :raises RuntimeError: if the message carries a field other than
            ``address`` or ``nodeInfo``.
    """
    message = ast.literal_eval(message)
    assert isinstance(message, dict)
    # Handle the mandatory fields of a message
    nodeAddress = message.pop('address')
    executor = self.executors.get(nodeAddress)
    if executor is None or executor.slaveId != slaveId:
        # First message from this address, or the address now belongs to a
        # different slave: start a fresh record.
        executor = Expando(nodeAddress=nodeAddress,
                           slaveId=slaveId,
                           nodeInfo=None)
        self.executors[nodeAddress] = executor
    executor.lastSeen = time.time()
    # Handle optional message fields. items() rather than iteritems(): the
    # latter does not exist on Python 3 dicts, while items() works on both.
    for k, v in message.items():
        if k == 'nodeInfo':
            assert isinstance(v, dict)
            executor.nodeInfo = NodeInfo(**v)
        else:
            raise RuntimeError("Unknown message field '%s'." % k)