def test_zk_master_detector_creation(self):
    """Exercise master detection and failover through a wrapped detector.

    A subclass of ``ZookeeperMasterDetector`` sets a ``changed`` event on
    every ``on_change`` call so the test can wait for membership updates.
    The test then checks two things:

    * after the first master is registered, the pending detection reports it;
    * after a second master is registered, a detection started with the first
      master as ``previous`` reports nothing until the first master is
      unregistered, and then reports the second master.
    """

    class WrappedZookeeperMasterDetector(ZookeeperMasterDetector):
        """Detector that sets ``changed`` each time ``on_change`` is called."""

        def __init__(self, *args, **kw):
            super(WrappedZookeeperMasterDetector, self).__init__(*args, **kw)
            self.changed = threading.Event()

        def on_change(self, membership):
            # Set the event first, then delegate to the real implementation.
            self.changed.set()
            super(WrappedZookeeperMasterDetector, self).on_change(membership)

    event = threading.Event()
    leader_queue = []

    def appointed_callback(future):
        # ``leader_queue`` is resolved from the enclosing scope when the
        # callback runs, so it appends to the list bound at that moment.
        leader_queue.append(future.result())
        event.set()

    self.create_root()

    # construct master detector and detect master
    detector = WrappedZookeeperMasterDetector.from_uri(self.uri)
    # The return value of add_done_callback is not needed: completion is
    # observed through ``event`` and ``leader_queue``.
    detector.detect().add_done_callback(appointed_callback)

    # trigger detection by registering master
    master_pid = PID('10.1.2.3', 12345, 'master(1)')
    self.register_master(master_pid)
    detector.changed.wait(timeout=10)
    assert detector.changed.is_set()
    event.wait(timeout=10)
    assert event.is_set()
    assert leader_queue == [master_pid]

    # Reset the observation state before the second round.
    leader_queue = []
    event.clear()

    # start new detection loop when existing master changes
    detector.detect(master_pid).add_done_callback(appointed_callback)
    detector.changed.clear()

    # register new master (won't trigger detection until original master is gone.)
    new_master_pid = PID('10.2.3.4', 12345, 'master(1)')
    self.register_master(new_master_pid)
    detector.changed.wait(timeout=10)
    assert detector.changed.is_set()
    detector.changed.clear()
    assert leader_queue == []
    assert not event.is_set()

    # failover existing master
    assert self.unregister_master(master_pid)

    # make sure new master is detected.
    detector.changed.wait(timeout=10)
    assert detector.changed.is_set()
    event.wait(timeout=10)
    assert event.is_set()
    assert leader_queue == [new_master_pid]
def test_standalone_change_detection():
    """Check that a pending detection completes once a new leader is appointed.

    The detector is built from the URI of the initial master and detection is
    started with that same master as ``previous``.  The future is expected to
    still be running until ``appoint`` is called with a different PID.
    """
    initial_uri = 'master(1)@192.168.33.2:12345'
    initial_pid = PID.from_string(initial_uri)
    detector = StandaloneMasterDetector.from_uri(initial_uri)

    fired = threading.Event()

    def mark_fired(_future):
        # The future itself is not inspected; only completion matters here.
        fired.set()

    pending = detector.detect(previous=initial_pid)
    pending.add_done_callback(mark_fired)

    # Nothing has changed yet, so the detection must still be outstanding.
    assert pending.running()
    assert not fired.is_set()

    replacement_pid = PID.from_string('master(2)@192.168.33.2:12345')
    detector.appoint(replacement_pid)

    fired.wait(timeout=1.0)
    assert fired.is_set()
def setUp(self):
    """Launch the ``pong`` peer through ``vagrant ssh`` and start a local Context.

    Sets ``self.ping_pid`` to the PID string of the remote peer and
    ``self.context`` to a started ``Context`` bound to 192.168.33.1.
    """
    # An argument list (no shell=True) runs ``vagrant`` directly, so the
    # stored Popen handle refers to the vagrant process itself rather than to
    # an intermediate shell.  Everything after ``--`` is passed to the remote
    # command line unchanged.
    self.__pong = subprocess.Popen([
        'vagrant', 'ssh', '--',
        'LIBPROCESS_IP=192.168.33.2',
        'LIBPROCESS_PORT=31337',
        'GLOG_v=5',
        './pong',
    ])
    self.ping_pid = PID.from_string('(1)@192.168.33.2:31337')
    self.context = Context(ip='192.168.33.1')
    self.context.start()
def start(self):
    """Create and spawn the executor process, then mark the driver running.

    Settings are read through ``self.get_or_else`` / ``self.get_bool``.  The
    recovery timeout is 15 minutes unless checkpointing is enabled, in which
    case ``MESOS_RECOVERY_TIMEOUT`` is parsed as an integer number of seconds.

    Returns:
        The value stored in ``self.status`` (``mesos.DRIVER_RUNNING``).

    Raises:
        RuntimeError: if ``MESOS_RECOVERY_TIMEOUT`` is not a valid integer.
    """
    log.info('MesosExecutorDriver.start called')

    agent_pid = PID.from_string(self.get_or_else('MESOS_SLAVE_PID'))
    agent_id = self.get_or_else('MESOS_SLAVE_ID')
    fw_id = self.get_or_else('MESOS_FRAMEWORK_ID')
    exec_id = self.get_or_else('MESOS_EXECUTOR_ID')
    work_dir = self.get_or_else('MESOS_DIRECTORY')
    checkpointing = self.get_bool('MESOS_CHECKPOINT')

    if checkpointing:
        # TODO(wickman) Implement Duration. Instead take seconds for now
        try:
            timeout_secs = int(self.get_or_else('MESOS_RECOVERY_TIMEOUT'))
        except ValueError:
            raise RuntimeError('MESOS_RECOVERY_TIMEOUT must be in seconds.')
    else:
        timeout_secs = 15 * 60  # 15 minutes

    # start() must not be invoked while an executor process already exists.
    assert self.executor_process is None
    self.executor_process = ExecutorProcess(
        agent_pid,
        self,
        self.executor,
        agent_id,
        fw_id,
        exec_id,
        work_dir,
        checkpointing,
        timeout_secs,
    )

    self.context.spawn(self.executor_process)

    self.status = mesos.DRIVER_RUNNING
    return self.status
def start(self):
    """Create and spawn the executor process, then mark the driver started.

    Settings are read through ``self.get_env`` / ``self.get_bool``.  The
    recovery timeout comes from the ``MESOS_RECOVERY_TIMEOUT`` environment
    variable (``'15mins'`` when unset), converted by ``duration_to_seconds``.

    Returns:
        The value stored in ``self.status`` (``mesos_pb2.DRIVER_RUNNING``).
    """
    log.info('MesosExecutorDriver.start called')

    agent_pid = PID.from_string(self.get_env('MESOS_SLAVE_PID'))
    agent_id = self.get_env('MESOS_SLAVE_ID')
    fw_id = self.get_env('MESOS_FRAMEWORK_ID')
    exec_id = self.get_env('MESOS_EXECUTOR_ID')
    work_dir = self.get_env('MESOS_DIRECTORY')
    checkpointing = self.get_bool('MESOS_CHECKPOINT')

    raw_timeout = os.environ.get('MESOS_RECOVERY_TIMEOUT', '15mins')
    timeout_secs = duration_to_seconds(raw_timeout)

    # start() must not be invoked while an executor process already exists.
    assert self.executor_process is None
    self.executor_process = ExecutorProcess(
        agent_pid,
        self,
        self.executor,
        agent_id,
        fw_id,
        exec_id,
        work_dir,
        checkpointing,
        timeout_secs,
    )

    self.context.spawn(self.executor_process)
    log.info("Started driver")

    self.status = mesos_pb2.DRIVER_RUNNING
    # Signal ``started`` only after the process is spawned and status is set.
    self.started.set()
    return self.status
def detected(self, master_future):
    """Handle a completed master detection and start the next one.

    Steps, in order:
      1. Read the detection result; on any exception, log it and exit with
         status 1.
      2. If ``self.connected`` is set, clear it and invoke the scheduler's
         ``disconnected`` callback.
      3. Point ``self.master`` at the new master (and link to it), or set it
         to ``None`` when the result is falsy.
      4. Call ``self.__maybe_register()``.
      5. Start another detection with the result as ``previous`` and this
         method as its completion callback.
    """
    try:
        detected_uri = master_future.result()
    except Exception as err:
        log.fatal('Failed to detect master: %s' % err)
        # TODO(wickman) Are we on MainThread?  If not, this might not actually terminate anything
        # but this thread.
        sys.exit(1)

    was_connected = self.connected.is_set()
    if was_connected:
        self.connected.clear()
        with timed(log.debug, 'scheduler::disconnected'):
            camel_call(self.scheduler, 'disconnected', self.driver)

    # TODO(wickman) Implement authentication.
    if not detected_uri:
        self.master = None
    else:
        log.info('New master detected: %s' % detected_uri)
        self.master = PID.from_string("master@%s" % detected_uri)
        self.link(self.master)

    self.__maybe_register()

    # TODO(wickman) Detectors should likely operate on PIDs and not URIs.
    next_detection = self.detector.detect(previous=detected_uri)
    next_detection.add_done_callback(self.detected)
def __init__(self):
    """Initialise the process from the executor environment variables.

    Reads, in this order, ``MESOS_EXECUTOR_ID``, ``MESOS_FRAMEWORK_ID``,
    ``MESOS_SLAVE_PID`` and ``MESOS_SLAVE_ID``.  If one of them is not set,
    prints ``"<NAME> must be defined"`` and exits with status 1.  The slave
    PID is parsed with ``PID.from_string``; the other values are stored as
    read.  The executor ID is passed to the parent constructor.
    """

    def require_env(name):
        # Single lookup per variable; a missing variable stops the process.
        value = os.environ.get(name)
        if value is None:
            print("%s must be defined" % name)
            sys.exit(1)
        return value

    # get the executor ID and slave PID from environment variables
    self.executorID = require_env("MESOS_EXECUTOR_ID")
    self.frameworkID = require_env("MESOS_FRAMEWORK_ID")
    self.slavePID = PID.from_string(require_env("MESOS_SLAVE_PID"))
    self.slaveID = require_env("MESOS_SLAVE_ID")

    self.registered = False
    super(ExecutorProcess, self).__init__(self.executorID)
def detected(self, master_future):
    """React to a finished master detection and re-arm the detector.

    The result of ``master_future`` is treated as the master URI.  A failure
    to obtain it is logged and ends the process via ``sys.exit(1)``.  If the
    driver was marked connected, the flag is cleared and the scheduler's
    ``disconnected`` callback is invoked.  ``self.master`` is then updated,
    ``self.__maybe_register()`` is called, and a new detection is started
    with this method registered as its callback.
    """
    try:
        uri = master_future.result()
    except Exception as failure:
        log.fatal('Failed to detect master: %s' % failure)
        # TODO(wickman) Are we on MainThread?  If not, this might not actually terminate anything
        # but this thread.
        sys.exit(1)

    if self.connected.is_set():
        self.connected.clear()
        with timed(log.debug, 'scheduler::disconnected'):
            camel_call(self.scheduler, 'disconnected', self.driver)

    # TODO(wickman) Implement authentication.
    if uri:
        log.info('New master detected: %s' % uri)
        master_address = "master@%s" % uri
        self.master = PID.from_string(master_address)
        self.link(self.master)
    else:
        self.master = None

    self.__maybe_register()

    # TODO(wickman) Detectors should likely operate on PIDs and not URIs.
    followup = self.detector.detect(previous=uri)
    followup.add_done_callback(self.detected)
def start(self):
    """Build the executor process from its settings, spawn it and return status.

    All settings are obtained through ``self.get_or_else`` and
    ``self.get_bool``.  ``MESOS_RECOVERY_TIMEOUT`` is consulted only when
    checkpointing is enabled; otherwise the timeout stays at 15 minutes.

    Returns:
        ``self.status`` after it has been set to ``mesos.DRIVER_RUNNING``.

    Raises:
        RuntimeError: if ``MESOS_RECOVERY_TIMEOUT`` cannot be parsed by ``int``.
    """
    log.info('MesosExecutorDriver.start called')

    pid_of_slave = PID.from_string(self.get_or_else('MESOS_SLAVE_PID'))
    id_of_slave = self.get_or_else('MESOS_SLAVE_ID')
    id_of_framework = self.get_or_else('MESOS_FRAMEWORK_ID')
    id_of_executor = self.get_or_else('MESOS_EXECUTOR_ID')
    sandbox_dir = self.get_or_else('MESOS_DIRECTORY')
    use_checkpoint = self.get_bool('MESOS_CHECKPOINT')

    recovery_secs = 15 * 60  # 15 minutes
    if use_checkpoint:
        # TODO(wickman) Implement Duration. Instead take seconds for now
        try:
            raw_timeout = self.get_or_else('MESOS_RECOVERY_TIMEOUT')
            recovery_secs = int(raw_timeout)
        except ValueError:
            raise RuntimeError('MESOS_RECOVERY_TIMEOUT must be in seconds.')

    # Guard against a second start() while a process is already recorded.
    assert self.executor_process is None

    process_args = (
        pid_of_slave,
        self,
        self.executor,
        id_of_slave,
        id_of_framework,
        id_of_executor,
        sandbox_dir,
        use_checkpoint,
        recovery_secs,
    )
    self.executor_process = ExecutorProcess(*process_args)
    self.context.spawn(self.executor_process)

    self.status = mesos.DRIVER_RUNNING
    return self.status
def resource_offers(self, from_pid, message):
    """Record the PID attached to each offer and forward offers to the scheduler.

    Requires ``self.master`` to be set.  Messages for which
    ``self.valid_origin(from_pid)`` is false are ignored.  For every
    ``(offer, pid)`` pair, the parsed PID is stored in ``self.saved_offers``
    under ``[offer.id][offer.slave_id]`` before the scheduler's
    ``resource_offers`` is invoked with the driver and the offers.
    """
    assert self.master is not None

    if self.valid_origin(from_pid):
        # message.offers and message.pids are paired positionally.
        for current_offer, pid_text in zip(message.offers, message.pids):
            parsed_pid = PID.from_string(pid_text)
            self.saved_offers[current_offer.id][current_offer.slave_id] = parsed_pid

        with timed(log.debug, 'scheduler::resource_offers'):
            self.scheduler.resource_offers(self.driver, message.offers)
def status_update(self, from_pid, message):
    """Acknowledge a status update if it names a sender, then notify the scheduler.

    Messages for which ``self.valid_origin(from_pid)`` is false are ignored.
    When ``message.pid`` is truthy, it is parsed and passed together with
    ``message.update`` to ``self.status_update_acknowledgement``.  The
    scheduler's ``status_update`` callback then receives the driver and
    ``message.update.status``.
    """
    if self.valid_origin(from_pid):
        if message.pid:
            ack_target = PID.from_string(message.pid)
            self.status_update_acknowledgement(message.update, ack_target)

        with timed(log.debug, 'scheduler::status_update'):
            camel_call(
                self.scheduler,
                'status_update',
                self.driver,
                message.update.status,
            )
def test_standalone_immediate_detection():
    """Check that detection with no previous master resolves to the configured leader.

    The detector is constructed with a leader already set, so a detection
    started with ``previous=None`` is expected to complete within the timeout
    and yield that leader.
    """
    leader = PID.from_string('master(1)@192.168.33.2:12345')
    detector = StandaloneMasterDetector(leader=leader)

    completed = threading.Event()

    def mark_completed(_future):
        # Only completion is signalled; the result is checked below.
        completed.set()

    pending = detector.detect(previous=None)
    pending.add_done_callback(mark_completed)

    completed.wait(timeout=1.0)
    assert completed.is_set()
    assert pending.result() == leader
def send_run_task(self, to, framework_id, task):
    """Send a ``RunTaskMessage`` for ``task`` to ``to``.

    The framework entry is looked up in ``self._framework_map`` by
    ``framework_id.value``.  The message's ``pid`` field is filled with a
    fixed placeholder PID string.
    """
    framework = self._framework_map[framework_id.value]

    # this appears to be no longer used though it is a required field.
    placeholder_pid = str(PID('127.0.0.1', 31337, 'not_used(123)'))

    run_task = internal.RunTaskMessage(
        framework_id=framework_id,
        framework=framework,
        task=task,
        pid=placeholder_pid,
    )
    self.send(to, run_task)
def master_info_to_pid(master_info):
    """Return the PID parsed from ``master_info.pid``.

    Raises:
        ValueError: if ``master_info.HasField('pid')`` is false.
    """
    # Require master_info.pid to be set, instead of assuming specific master id.
    if master_info.HasField('pid'):
        return PID.from_string(master_info.pid)
    raise ValueError('master_info missing pid!')
def from_uri(cls, uri):
    """Build an instance whose leader is the PID parsed from ``uri``.

    Raises:
        cls.InvalidUri: if ``PID.from_string`` rejects ``uri`` with ValueError.
    """
    try:
        parsed = PID.from_string(uri)
    except ValueError:
        raise cls.InvalidUri('Not a PID: %r' % uri)
    else:
        return cls(leader=parsed)
def detected(self, master):
    """Update ``self.master`` from a detection result and link to it.

    ``self.master`` is always reset to ``None`` first.  When ``master`` is
    truthy, it is replaced by the PID parsed from ``"master@<master>"`` and
    ``self.link`` is called with it.
    """
    # Reset up front so a falsy result leaves no master recorded.
    self.master = None
    if not master:
        return

    self.master = PID.from_string("master@%s" % master)
    self.link(self.master)
resource.name = "mem" resource.type = mesos.Value.SCALAR resource.scalar.value = (psutil.virtual_memory().available) / 1000000 resources.append(resource) resource = mesos.Resource() resource.name = "disk" resource.type = mesos.Value.SCALAR resource.scalar.value = (psutil.disk_usage('/').free) / 1000000 resources.append(resource) return resources if __name__ == '__main__': print("Starting agent context") agentContext = Context(port=args.port) agentContext.start() masterPID = PID.from_string('master@' + args.master) agentProcess = AgentProcess('slave(1)', masterPID) print("Spawning agent process") agentPID = agentContext.spawn(agentProcess) print("Sending slave registration message") agentProcess.register() agentContext.join()