Exemple #1
0
    def test_zk_master_detector_creation(self):
        """Check that the ZooKeeper detector reports a master and then its successor.

        The test registers a first master and expects the pending detection
        to resolve to it.  It then starts a second detection with that master
        as the previous leader, registers a second master (which must not
        resolve the detection yet), unregisters the first one and finally
        expects the detection to resolve to the second master.
        """
        class WrappedZookeeperMasterDetector(ZookeeperMasterDetector):
            """Detector that sets `changed` every time on_change is invoked."""

            def __init__(self, *args, **kw):
                super(WrappedZookeeperMasterDetector,
                      self).__init__(*args, **kw)
                self.changed = threading.Event()

            def on_change(self, membership):
                self.changed.set()
                super(WrappedZookeeperMasterDetector,
                      self).on_change(membership)

        event = threading.Event()
        leader_queue = []

        def appointed_callback(future):
            # Record the detected leader, then wake up the waiting test.
            leader_queue.append(future.result())
            event.set()

        self.create_root()

        # construct master detector and detect master
        detector = WrappedZookeeperMasterDetector.from_uri(self.uri)
        # The return value of add_done_callback() is not a future (None for
        # concurrent.futures-style futures), so it is deliberately not kept.
        detector.detect().add_done_callback(appointed_callback)

        # trigger detection by registering master
        master_pid = PID('10.1.2.3', 12345, 'master(1)')
        self.register_master(master_pid)
        detector.changed.wait(timeout=10)
        assert detector.changed.is_set()
        event.wait(timeout=10)
        assert event.is_set()
        assert leader_queue == [master_pid]
        # Clear in place so the callback and the assertions below keep
        # looking at the very same list object.
        del leader_queue[:]
        event.clear()

        # start new detection loop when existing master changes
        detector.detect(master_pid).add_done_callback(appointed_callback)
        detector.changed.clear()

        # register new master (won't trigger detection until original master is gone.)
        new_master_pid = PID('10.2.3.4', 12345, 'master(1)')
        self.register_master(new_master_pid)
        detector.changed.wait(timeout=10)
        assert detector.changed.is_set()
        detector.changed.clear()
        assert leader_queue == []
        assert not event.is_set()

        # failover existing master
        assert self.unregister_master(master_pid)

        # make sure new master is detected.
        detector.changed.wait(timeout=10)
        assert detector.changed.is_set()
        event.wait(timeout=10)
        assert event.is_set()
        assert leader_queue == [new_master_pid]
Exemple #2
0
def test_standalone_change_detection():
    """A pending detect(previous=leader) resolves once a new leader is appointed."""
    master_pid_str = 'master(1)@192.168.33.2:12345'
    current_leader = PID.from_string(master_pid_str)
    detector = StandaloneMasterDetector.from_uri(master_pid_str)
    fired = threading.Event()

    def on_done(_future):
        fired.set()

    pending = detector.detect(previous=current_leader)
    pending.add_done_callback(on_done)

    # Nothing has changed yet, so the detection must still be outstanding.
    assert pending.running()
    assert not fired.is_set()

    replacement = PID.from_string('master(2)@192.168.33.2:12345')
    detector.appoint(replacement)
    fired.wait(timeout=1.0)
    assert fired.is_set()
Exemple #3
0
def test_standalone_change_detection():
  """Detection against the current leader completes only after appoint()."""
  leader_uri = 'master(1)@192.168.33.2:12345'
  leader = PID.from_string(leader_uri)
  detector = StandaloneMasterDetector.from_uri(leader_uri)
  changed = threading.Event()

  def signal_change(_completed):
    changed.set()

  detection = detector.detect(previous=leader)
  detection.add_done_callback(signal_change)

  # The leader is unchanged, so the future has not completed yet.
  assert detection.running()
  assert not changed.is_set()

  successor = PID.from_string('master(2)@192.168.33.2:12345')
  detector.appoint(successor)
  changed.wait(timeout=1.0)
  assert changed.is_set()
 def setUp(self):
   """Launch ./pong through `vagrant ssh` and start a local Context."""
   pong_command = (
       'vagrant ssh -- LIBPROCESS_IP=192.168.33.2 LIBPROCESS_PORT=31337 GLOG_v=5 ./pong')
   self.__pong = subprocess.Popen(pong_command, shell=True)

   # PID string uses the same address/port that the pong command is given.
   self.ping_pid = PID.from_string('(1)@192.168.33.2:31337')

   self.context = Context(ip='192.168.33.1')
   self.context.start()
Exemple #5
0
  def start(self):
    """Create and spawn the ExecutorProcess, then mark the driver running.

    The MESOS_* settings are read through get_or_else/get_bool.  The recovery
    timeout is 15 minutes unless checkpointing is enabled, in which case
    MESOS_RECOVERY_TIMEOUT is parsed as an integer number of seconds.

    Returns the new driver status (mesos.DRIVER_RUNNING).
    Raises RuntimeError when MESOS_RECOVERY_TIMEOUT is not an integer.
    """
    log.info('MesosExecutorDriver.start called')

    lookup = self.get_or_else
    agent_pid = PID.from_string(lookup('MESOS_SLAVE_PID'))
    agent_id = lookup('MESOS_SLAVE_ID')
    framework = lookup('MESOS_FRAMEWORK_ID')
    executor = lookup('MESOS_EXECUTOR_ID')
    work_dir = lookup('MESOS_DIRECTORY')
    checkpointing = self.get_bool('MESOS_CHECKPOINT')

    if checkpointing:
      # TODO(wickman) Implement Duration.  Instead take seconds for now
      try:
        timeout_secs = int(lookup('MESOS_RECOVERY_TIMEOUT'))
      except ValueError:
        raise RuntimeError('MESOS_RECOVERY_TIMEOUT must be in seconds.')
    else:
      timeout_secs = 15 * 60  # 15 minutes

    # A process must not already have been created for this driver.
    assert self.executor_process is None
    self.executor_process = ExecutorProcess(
        agent_pid, self, self.executor, agent_id, framework, executor,
        work_dir, checkpointing, timeout_secs)
    self.context.spawn(self.executor_process)

    self.status = mesos.DRIVER_RUNNING
    return self.status
Exemple #6
0
  def start(self):
    """Create and spawn the ExecutorProcess, then mark the driver started.

    The MESOS_* settings are read through get_env/get_bool; the recovery
    timeout comes from the MESOS_RECOVERY_TIMEOUT environment variable
    (default '15mins') converted by duration_to_seconds.

    Returns the new driver status (mesos_pb2.DRIVER_RUNNING).
    """
    log.info('MesosExecutorDriver.start called')

    read = self.get_env
    agent_pid = PID.from_string(read('MESOS_SLAVE_PID'))
    agent_id = read('MESOS_SLAVE_ID')
    framework = read('MESOS_FRAMEWORK_ID')
    executor = read('MESOS_EXECUTOR_ID')
    work_dir = read('MESOS_DIRECTORY')
    checkpointing = self.get_bool('MESOS_CHECKPOINT')
    raw_timeout = os.environ.get('MESOS_RECOVERY_TIMEOUT', '15mins')
    timeout_secs = duration_to_seconds(raw_timeout)

    # A process must not already have been created for this driver.
    assert self.executor_process is None
    self.executor_process = ExecutorProcess(
        agent_pid, self, self.executor, agent_id, framework, executor,
        work_dir, checkpointing, timeout_secs)
    self.context.spawn(self.executor_process)
    log.info("Started driver")

    self.status = mesos_pb2.DRIVER_RUNNING
    self.started.set()
    return self.status
Exemple #7
0
    def detected(self, master_future):
        """Handle a completed master detection and schedule the next one.

        Reads the detected master URI from `master_future`; on failure logs
        and calls sys.exit(1).  If currently connected, clears the flag and
        notifies the scheduler of the disconnect.  Then updates self.master
        (linking to it when a URI is present), attempts registration and
        re-arms detection with this method as the callback.
        """
        try:
            master_uri = master_future.result()
        except Exception as e:
            log.fatal('Failed to detect master: %s' % e)
            # TODO(wickman) Are we on MainThread?  If not, this might not
            # actually terminate anything but this thread.
            sys.exit(1)

        was_connected = self.connected.is_set()
        if was_connected:
            self.connected.clear()
            with timed(log.debug, 'scheduler::disconnected'):
                camel_call(self.scheduler, 'disconnected', self.driver)

        # TODO(wickman) Implement authentication.
        if not master_uri:
            self.master = None
        else:
            log.info('New master detected: %s' % master_uri)
            self.master = PID.from_string("master@%s" % master_uri)
            self.link(self.master)

        self.__maybe_register()

        # TODO(wickman) Detectors should likely operate on PIDs and not URIs.
        next_detection = self.detector.detect(previous=master_uri)
        next_detection.add_done_callback(self.detected)
Exemple #8
0
    def __init__(self):
        """Initialise the process from the MESOS_* environment variables.

        MESOS_EXECUTOR_ID, MESOS_FRAMEWORK_ID, MESOS_SLAVE_PID and
        MESOS_SLAVE_ID are checked in that order; the first one that is
        missing is reported on stdout and the interpreter exits with
        status 1.
        """
        def required(variable):
            # Return the variable's value, or report it and exit if unset.
            value = os.environ.get(variable)
            if value is None:
                print(variable + " must be defined")
                sys.exit(1)
            return value

        # get the executor ID and slave PID from environment variables
        self.executorID = required("MESOS_EXECUTOR_ID")
        self.frameworkID = required("MESOS_FRAMEWORK_ID")
        self.slavePID = PID.from_string(required("MESOS_SLAVE_PID"))
        self.slaveID = required("MESOS_SLAVE_ID")

        self.registered = False

        super(ExecutorProcess, self).__init__(self.executorID)
 def setUp(self):
     """Start ./pong via `vagrant ssh` and bring up a local Context."""
     remote_pong = (
         'vagrant ssh -- LIBPROCESS_IP=192.168.33.2 LIBPROCESS_PORT=31337 GLOG_v=5 ./pong')
     self.__pong = subprocess.Popen(remote_pong, shell=True)

     # PID string uses the same address/port that the pong command is given.
     self.ping_pid = PID.from_string('(1)@192.168.33.2:31337')

     self.context = Context(ip='192.168.33.1')
     self.context.start()
Exemple #10
0
  def detected(self, master_future):
    """Handle a completed master detection and schedule the next one.

    Reads the detected master URI from `master_future`; on failure logs and
    calls sys.exit(1).  If currently connected, clears the flag and notifies
    the scheduler of the disconnect.  Then updates self.master (linking to it
    when a URI is present), attempts registration and re-arms detection with
    this method as the callback.
    """
    try:
      master_uri = master_future.result()
    except Exception as e:
      log.fatal('Failed to detect master: %s' % e)
      # TODO(wickman) Are we on MainThread?  If not, this might not actually
      # terminate anything but this thread.
      sys.exit(1)

    currently_connected = self.connected.is_set()
    if currently_connected:
      self.connected.clear()
      with timed(log.debug, 'scheduler::disconnected'):
        camel_call(self.scheduler, 'disconnected', self.driver)

    # TODO(wickman) Implement authentication.
    if not master_uri:
      self.master = None
    else:
      log.info('New master detected: %s' % master_uri)
      self.master = PID.from_string("master@%s" % master_uri)
      self.link(self.master)

    self.__maybe_register()

    # TODO(wickman) Detectors should likely operate on PIDs and not URIs.
    upcoming = self.detector.detect(previous=master_uri)
    upcoming.add_done_callback(self.detected)
Exemple #11
0
    def start(self):
        """Create and spawn the ExecutorProcess, then mark the driver started.

        The MESOS_* settings are read through get_env/get_bool; the recovery
        timeout comes from the MESOS_RECOVERY_TIMEOUT environment variable
        (default '15mins') converted by duration_to_seconds.

        Returns the new driver status (mesos_pb2.DRIVER_RUNNING).
        """
        log.info('MesosExecutorDriver.start called')

        setting = self.get_env
        agent_pid = PID.from_string(setting('MESOS_SLAVE_PID'))
        agent_id = setting('MESOS_SLAVE_ID')
        framework = setting('MESOS_FRAMEWORK_ID')
        executor = setting('MESOS_EXECUTOR_ID')
        work_dir = setting('MESOS_DIRECTORY')
        checkpointing = self.get_bool('MESOS_CHECKPOINT')
        timeout_text = os.environ.get('MESOS_RECOVERY_TIMEOUT', '15mins')
        timeout_secs = duration_to_seconds(timeout_text)

        # A process must not already have been created for this driver.
        assert self.executor_process is None
        self.executor_process = ExecutorProcess(
            agent_pid, self, self.executor, agent_id, framework, executor,
            work_dir, checkpointing, timeout_secs)
        self.context.spawn(self.executor_process)
        log.info("Started driver")

        self.status = mesos_pb2.DRIVER_RUNNING
        self.started.set()
        return self.status
Exemple #12
0
    def start(self):
        """Create and spawn the ExecutorProcess, then mark the driver running.

        The MESOS_* settings are read through get_or_else/get_bool.  The
        recovery timeout is 15 minutes unless checkpointing is enabled, in
        which case MESOS_RECOVERY_TIMEOUT is parsed as an integer number of
        seconds.

        Returns the new driver status (mesos.DRIVER_RUNNING).
        Raises RuntimeError when MESOS_RECOVERY_TIMEOUT is not an integer.
        """
        log.info('MesosExecutorDriver.start called')

        setting = self.get_or_else
        agent_pid = PID.from_string(setting('MESOS_SLAVE_PID'))
        agent_id = setting('MESOS_SLAVE_ID')
        framework = setting('MESOS_FRAMEWORK_ID')
        executor = setting('MESOS_EXECUTOR_ID')
        work_dir = setting('MESOS_DIRECTORY')
        checkpointing = self.get_bool('MESOS_CHECKPOINT')

        if checkpointing:
            # TODO(wickman) Implement Duration.  Instead take seconds for now
            try:
                timeout_secs = int(setting('MESOS_RECOVERY_TIMEOUT'))
            except ValueError:
                raise RuntimeError(
                    'MESOS_RECOVERY_TIMEOUT must be in seconds.')
        else:
            timeout_secs = 15 * 60  # 15 minutes

        # A process must not already have been created for this driver.
        assert self.executor_process is None
        self.executor_process = ExecutorProcess(
            agent_pid, self, self.executor, agent_id, framework, executor,
            work_dir, checkpointing, timeout_secs)
        self.context.spawn(self.executor_process)

        self.status = mesos.DRIVER_RUNNING
        return self.status
Exemple #13
0
 def resource_offers(self, from_pid, message):
   """Record the PID sent with each offer and pass the offers to the scheduler.

   Messages for which valid_origin(from_pid) is false are ignored.
   """
   assert self.master is not None
   if self.valid_origin(from_pid):
     # Offers and PIDs arrive as parallel sequences.
     for offer, raw_pid in zip(message.offers, message.pids):
       self.saved_offers[offer.id][offer.slave_id] = PID.from_string(raw_pid)
     with timed(log.debug, 'scheduler::resource_offers'):
       self.scheduler.resource_offers(self.driver, message.offers)
Exemple #14
0
 def resource_offers(self, from_pid, message):
     """Store the PID paired with each offer, then notify the scheduler.

     Nothing happens when valid_origin(from_pid) rejects the sender.
     """
     assert self.master is not None
     if self.valid_origin(from_pid):
         offers = message.offers
         # message.pids is iterated in lockstep with the offers.
         for offer, pid_text in zip(offers, message.pids):
             self.saved_offers[offer.id][offer.slave_id] = PID.from_string(
                 pid_text)
         with timed(log.debug, 'scheduler::resource_offers'):
             self.scheduler.resource_offers(self.driver, message.offers)
Exemple #15
0
 def status_update(self, from_pid, message):
   """Acknowledge a status update (if it names a PID) and relay it to the scheduler.

   Messages for which valid_origin(from_pid) is false are ignored.
   """
   if self.valid_origin(from_pid):
     if message.pid:
       # Only updates that carry a PID are acknowledged.
       ack_target = PID.from_string(message.pid)
       self.status_update_acknowledgement(message.update, ack_target)
     with timed(log.debug, 'scheduler::status_update'):
       camel_call(self.scheduler, 'status_update', self.driver, message.update.status)
Exemple #16
0
def test_standalone_immediate_detection():
  """detect(previous=None) on a detector with a leader resolves to that leader."""
  leader = PID.from_string('master(1)@192.168.33.2:12345')
  detector = StandaloneMasterDetector(leader=leader)
  finished = threading.Event()

  def mark_finished(_future):
    finished.set()

  detection = detector.detect(previous=None)
  detection.add_done_callback(mark_finished)
  finished.wait(timeout=1.0)

  assert finished.is_set()
  assert detection.result() == leader
Exemple #17
0
 def send_run_task(self, to, framework_id, task):
     """Build a RunTaskMessage for `task` and send it to `to`.

     The framework is looked up in self._framework_map by framework_id.value.
     """
     framework = self._framework_map[framework_id.value]
     # this appears to be no longer used though it is a required field.
     placeholder_pid = str(PID('127.0.0.1', 31337, 'not_used(123)'))
     run_task = internal.RunTaskMessage(
         framework_id=framework_id,
         framework=framework,
         task=task,
         pid=placeholder_pid,
     )
     self.send(to, run_task)
Exemple #18
0
 def status_update(self, from_pid, message):
     """Relay a status update to the scheduler, acknowledging it when it has a PID.

     Nothing happens when valid_origin(from_pid) rejects the sender.
     """
     if self.valid_origin(from_pid):
         if message.pid:
             # Only updates that carry a PID are acknowledged.
             reply_to = PID.from_string(message.pid)
             self.status_update_acknowledgement(message.update, reply_to)
         with timed(log.debug, 'scheduler::status_update'):
             camel_call(
                 self.scheduler,
                 'status_update',
                 self.driver,
                 message.update.status,
             )
Exemple #19
0
def test_standalone_immediate_detection():
    """A detector built with a leader answers detect(previous=None) with it."""
    expected_leader = PID.from_string('master(1)@192.168.33.2:12345')
    detector = StandaloneMasterDetector(leader=expected_leader)
    completed = threading.Event()

    def on_complete(_future):
        completed.set()

    pending = detector.detect(previous=None)
    pending.add_done_callback(on_complete)
    completed.wait(timeout=1.0)

    assert completed.is_set()
    assert pending.result() == expected_leader
Exemple #20
0
def master_info_to_pid(master_info):
  """Return the PID parsed from master_info.pid.

  Raises ValueError when master_info has no 'pid' field set.
  """
  # The pid field is mandatory here; no default master id is assumed.
  if master_info.HasField('pid'):
    return PID.from_string(master_info.pid)
  raise ValueError('master_info missing pid!')
Exemple #21
0
 def from_uri(cls, uri):
   """Build an instance whose leader is the PID parsed from `uri`.

   Raises cls.InvalidUri when `uri` cannot be parsed as a PID.
   """
   try:
     parsed = PID.from_string(uri)
   except ValueError:
     raise cls.InvalidUri('Not a PID: %r' % uri)
   else:
     return cls(leader=parsed)
Exemple #22
0
def master_info_to_pid(master_info):
  """Convert a master_info message into a PID using its `pid` field.

  Raises ValueError when the `pid` field is not set.
  """
  # Insist on an explicit pid rather than guessing a master id.
  pid_present = master_info.HasField('pid')
  if pid_present:
    return PID.from_string(master_info.pid)
  raise ValueError('master_info missing pid!')
Exemple #23
0
 def detected(self, master):
   """Reset self.master and, when `master` is truthy, point it at the new master and link to it."""
   self.master = None
   if not master:
     return
   self.master = PID.from_string("master@%s" % master)
   self.link(self.master)
Exemple #24
0
        resource.name = "mem"
        resource.type = mesos.Value.SCALAR
        resource.scalar.value = (psutil.virtual_memory().available) / 1000000
        resources.append(resource)

        resource = mesos.Resource()
        resource.name = "disk"
        resource.type = mesos.Value.SCALAR
        resource.scalar.value = (psutil.disk_usage('/').free) / 1000000
        resources.append(resource)

        return resources


# Script entry point: start a Context, spawn an AgentProcess on it, register
# the agent and then wait on the context.
if __name__ == '__main__':

    # NOTE(review): `args` is not defined in this block; presumably it is
    # parsed at module level earlier in the file -- confirm.
    print("Starting agent context")
    agentContext = Context(port=args.port)
    agentContext.start()

    # The master PID string is 'master@' followed by args.master.
    masterPID = PID.from_string('master@' + args.master)
    agentProcess = AgentProcess('slave(1)', masterPID)

    print("Spawning agent process")
    # NOTE(review): agentPID is not read again within this block.
    agentPID = agentContext.spawn(agentProcess)

    print("Sending slave registration message")
    agentProcess.register()

    # Presumably blocks until the context stops -- TODO confirm.
    agentContext.join()
Exemple #25
0
 def from_uri(cls, uri):
   """Create an instance led by the PID that `uri` describes.

   Raises cls.InvalidUri when PID.from_string rejects `uri` with ValueError.
   """
   try:
     leader = PID.from_string(uri)
   except ValueError:
     raise cls.InvalidUri('Not a PID: %r' % uri)
   else:
     return cls(leader=leader)