Esempio n. 1
0
  def test_launch_cluster_all_nodes_successful(self):
    """End-to-end launch: all cluster nodes start and a master is elected in ZK."""
    for i in range(self._cluster.num_nodes):
      # Each launch consumes part of the offer; feed the reported remaining
      # resources back into the offer for the next node.
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # Wait up to 5 seconds for the launcher to publish a master in ZooKeeper.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2
  def _shutdown(self, status_result):
    """Stop the runner and health checkers, send a final status update to
    Mesos, then arrange for the driver to stop.

    :param status_result: result whose status/reason are reported when the
        runner has no terminal status of its own; its reason is always used.
    """
    # Capture the runner's status before stopping it, so we can prefer a
    # terminal state the runner already reached over 'status_result' below.
    runner_status = self._runner.status

    try:
      deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
    except Timeout:
      log.error('Failed to stop runner within deadline.')

    try:
      deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
    except Timeout:
      log.error('Failed to stop all checkers within deadline.')

    # If the runner was alive when _shutdown was called, defer to the status_result,
    # otherwise the runner's terminal state is the preferred state.
    exit_status = runner_status or status_result

    self.send_update(
        self._driver,
        self._task_id,
        self.translate_exit_state_to_mesos(exit_status.status),
        status_result.reason)

    self.terminated.set()
    # Delay driver.stop so the final status update has time to be persisted.
    defer(self._driver.stop, delay=self.PERSISTENCE_WAIT)
Esempio n. 3
0
  def test_launch_cluster_all_nodes_successful(self):
    """End-to-end launch: all cluster nodes start and a master is elected in ZK."""
    for i in range(self._cluster.num_nodes):
      # Each launch consumes part of the offer; feed the reported remaining
      # resources back into the offer for the next node.
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # Wait up to 5 seconds for the launcher to publish a master in ZooKeeper.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2
  def _shutdown(self, status_result):
    """Stop the runner and health checkers, send a final status update to
    Mesos, then arrange for the driver to stop.

    :param status_result: result whose status/reason are reported when the
        runner has no terminal status of its own; its reason is always used.
    """
    # Capture the runner's status before stopping it, so we can prefer a
    # terminal state the runner already reached over 'status_result' below.
    runner_status = self._runner.status

    try:
      deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
    except Timeout:
      log.error('Failed to stop runner within deadline.')

    try:
      deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
    except Timeout:
      log.error('Failed to stop all checkers within deadline.')

    # If the runner was alive when _shutdown was called, defer to the status_result,
    # otherwise the runner's terminal state is the preferred state.
    exit_status = runner_status or status_result

    self.send_update(
        self._driver,
        self._task_id,
        self.translate_exit_state_to_mesos(exit_status.status),
        status_result.reason)

    self.terminated.set()
    # Delay driver.stop so the final status update has time to be persisted.
    defer(self._driver.stop, delay=self.PERSISTENCE_WAIT)
Esempio n. 5
0
def test_scheduler_runs():
  """
    Verifies that the scheduler successfully launches 3 "no-op" MySQL tasks.
    NOTE: Due to the limitation of zake the scheduler's ZK operations are not propagated to
    executors in separate processes but they are unit-tested separately.
  """
  import mesos.native

  # Make sure fake_mysos_executor.pex is available to be fetched by Mesos slave.
  assert os.path.isfile('dist/fake_mysos_executor.pex')

  # In-memory (zake) ZooKeeper stand-ins; no real ZK server is involved.
  storage = FakeStorage(SequentialThreadingHandler())
  zk_client = FakeClient(storage=storage)
  zk_client.start()

  zk_url = "zk://fake_host/home/mysos/clusters"
  cluster_name = "test_cluster"
  num_nodes = 3

  state_provider = LocalStateProvider(safe_mkdtemp())

  framework_info = FrameworkInfo(
      user=getpass.getuser(),
      name="mysos",
      checkpoint=False)

  state = Scheduler(framework_info)

  scheduler = MysosScheduler(
      state,
      state_provider,
      getpass.getuser(),
      os.path.abspath("dist/fake_mysos_executor.pex"),
      "./fake_mysos_executor.pex",
      zk_client,
      zk_url,
      Amount(40, Time.SECONDS),
      "/fakepath",
      gen_encryption_key())

  # "local" runs an in-process Mesos master/slave pair.
  scheduler_driver = mesos.native.MesosSchedulerDriver(
      scheduler,
      framework_info,
      "local")
  scheduler_driver.start()

  # Wait until the scheduler is connected and becomes available.
  assert scheduler.connected.wait(30)

  scheduler.create_cluster(cluster_name, "mysql_user", num_nodes)

  # A slave is promoted to be the master.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name),
          zk_client),
      Amount(40, Time.SECONDS))

  assert scheduler_driver.stop() == DRIVER_STOPPED
def wait_until_not(thing, clock=time, timeout=1.0):
  """Block until the callable `thing` evaluates falsy, or `timeout` elapses."""
  def _spin():
    while True:
      if not thing():
        return
      clock.sleep(1.0)

  try:
    deadline(_spin, timeout=timeout, daemon=True)
  except Timeout:
    pass
Esempio n. 7
0
def wait_until_not(thing, clock=time, timeout=1.0):
    """Block until `thing()` evaluates falsy, giving up after `timeout` seconds."""

    def _spin():
        while True:
            if not thing():
                return
            clock.sleep(1.0)

    try:
        deadline(_spin, timeout=timeout, daemon=True)
    except Timeout:
        pass
Esempio n. 8
0
def wait_until_not(thing, timeout=EVENT_WAIT_TIMEOUT_SECS):
  """Poll until `thing()` is falsy; return True on success, False on timeout."""
  def _spin():
    while True:
      if not thing():
        return
      time.sleep(0.1)

  try:
    deadline(_spin, timeout=timeout, daemon=True)
  except Timeout:
    return False
  return True
Esempio n. 9
0
    def test_launcher_kill(self):
        """Launch a full cluster, then kill it: a wrong password is rejected
        and the correct one removes ServerSets and persisted state."""
        for i in range(self._cluster.num_nodes):
            # Feed the remaining resources of the shrinking offer back in.
            task_id, remaining = self._launcher.launch(self._offer)
            del self._offer.resources[:]
            self._offer.resources.extend(remaining)
            assert task_id == "mysos-cluster0-%s" % i

        tasks = self._driver.method_calls["launchTasks"]
        assert len(tasks) == self._cluster.num_nodes

        # No new tasks are launched.
        assert self._launcher.launch(self._offer)[0] is None
        assert len(self._driver.method_calls["launchTasks"]
                   ) == self._cluster.num_nodes

        # All 3 nodes have successfully started.
        status = mesos_pb2.TaskStatus()
        status.state = mesos_pb2.TASK_RUNNING  # Valid state.
        status.slave_id.value = self._offer.slave_id.value
        for i in range(self._cluster.num_nodes):
            status.task_id.value = "mysos-cluster0-%s" % i
            self._launcher.status_update(status)

        # Wait up to 5 seconds for a master to be published in ZooKeeper.
        deadline(
            lambda: wait_for_master(
                get_cluster_path(self._zk_url, self._cluster.name), self.
                _zk_client), Amount(5, Time.SECONDS))

        # The first slave is elected.
        assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
        # Two slaves.
        assert len([
            x for x in self._storage.paths.keys()
            if x.startswith("/mysos/test/cluster0/slaves/member_")
        ]) == 2

        # Kill the cluster.
        with pytest.raises(MySQLClusterLauncher.PermissionError):
            self._launcher.kill("wrong_password")

        self._launcher.kill(self._cluster.password)  # Correct password.

        # All 3 nodes are successfully killed.
        status = mesos_pb2.TaskStatus()
        status.state = mesos_pb2.TASK_KILLED
        status.slave_id.value = self._offer.slave_id.value
        for i in range(self._cluster.num_nodes):
            status.task_id.value = "mysos-cluster0-%s" % i
            self._launcher.status_update(status)

        assert "/mysos/test/cluster0" not in self._storage.paths  # ServerSets removed.
        assert not self._state_provider.load_cluster_state(
            "cluster0")  # State removed.
Esempio n. 10
0
  def test_launcher_kill(self):
    """Launch a full cluster, then kill it: a wrong password is rejected and
    the decrypted cluster password removes ServerSets and persisted state."""
    for i in range(self._cluster.num_nodes):
      # Feed the remaining resources of the shrinking offer back in.
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # Wait up to 5 seconds for a master to be published in ZooKeeper.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # Kill the cluster.
    with pytest.raises(MySQLClusterLauncher.PermissionError):
      self._launcher.kill("wrong_password")

    # Correct password.
    self._launcher.kill(self._password_box.decrypt(self._cluster.encrypted_password))

    # All 3 nodes are successfully killed.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_KILLED
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    assert "/mysos/test/cluster0" not in self._storage.paths  # ServerSets removed.
    assert not self._state_provider.load_cluster_state("cluster0")  # State removed.
Esempio n. 11
0
 def _initialize_sandbox(self, driver, assigned_task):
   """Create the task sandbox; kill the task and return None on failure.

   Returns True on success so callers can short-circuit on a falsy result.
   """
   self._sandbox = self._sandbox_provider.from_assigned_task(assigned_task)
   self.sandbox_initialized.set()
   try:
     # Bounded, daemonized creation; propagate=True re-raises sandbox errors
     # here instead of swallowing them in the deadline thread.
     deadline(self._sandbox.create, timeout=self.SANDBOX_INITIALIZATION_TIMEOUT,
              daemon=True, propagate=True)
   except Timeout:
     self._die(driver, mesos_pb.TASK_FAILED, 'Timed out waiting for sandbox to initialize!')
     return
   except self._sandbox.Error as e:
     self._die(driver, mesos_pb.TASK_FAILED, 'Failed to initialize sandbox: %s' % e)
     return
   self.sandbox_created.set()
   return True
 def _initialize_sandbox(self, driver, assigned_task):
   """Create the task sandbox; kill the task and return None on failure.

   Returns True on success so callers can short-circuit on a falsy result.
   """
   self._sandbox = self._sandbox_provider.from_assigned_task(assigned_task)
   self.sandbox_initialized.set()
   try:
     # Bounded, daemonized creation; propagate=True re-raises sandbox errors
     # here instead of swallowing them in the deadline thread.
     deadline(self._sandbox.create, timeout=self.SANDBOX_INITIALIZATION_TIMEOUT,
              daemon=True, propagate=True)
   except Timeout:
     self._die(driver, mesos_pb.TASK_FAILED, 'Timed out waiting for sandbox to initialize!')
     return
   except self._sandbox.Error as e:
     self._die(driver, mesos_pb.TASK_FAILED, 'Failed to initialize sandbox: %s' % e)
     return
   self.sandbox_created.set()
   return True
Esempio n. 13
0
def test_scheduler_runs():
    """
    Verifies that the scheduler successfully launches 3 "no-op" MySQL tasks.

    NOTE: Due to the limitation of zake the scheduler's ZK operations are not
    propagated to executors in separate processes but they are unit-tested
    separately.
    """
    import mesos.native

    # Make sure fake_mysos_executor.pex is available to be fetched by Mesos slave.
    assert os.path.isfile('dist/fake_mysos_executor.pex')

    # In-memory (zake) ZooKeeper stand-ins; no real ZK server is involved.
    storage = FakeStorage(SequentialThreadingHandler())
    zk_client = FakeClient(storage=storage)
    zk_client.start()

    zk_url = "zk://fake_host/home/mysos/clusters"
    cluster_name = "test_cluster"
    num_nodes = 3

    state_provider = LocalStateProvider(safe_mkdtemp())

    framework_info = FrameworkInfo(user=getpass.getuser(),
                                   name="mysos",
                                   checkpoint=False)

    state = Scheduler(framework_info)

    scheduler = MysosScheduler(state, state_provider, getpass.getuser(),
                               os.path.abspath("dist/fake_mysos_executor.pex"),
                               "./fake_mysos_executor.pex", zk_client, zk_url,
                               Amount(40, Time.SECONDS), "/fakepath",
                               gen_encryption_key())

    # "local" runs an in-process Mesos master/slave pair.
    scheduler_driver = mesos.native.MesosSchedulerDriver(
        scheduler, framework_info, "local")
    scheduler_driver.start()

    # Wait until the scheduler is connected and becomes available.
    assert scheduler.connected.wait(30)

    scheduler.create_cluster(cluster_name, "mysql_user", num_nodes)

    # A slave is promoted to be the master.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name),
            zk_client), Amount(40, Time.SECONDS))

    assert scheduler_driver.stop() == DRIVER_STOPPED
Esempio n. 14
0
  def fetch(self, uri, directory):
    """Copy a package from HDFS at `uri` into the local `directory`.

    Uses HADOOP_CONF_DIR from the environment (falling back to the module
    default) and bounds the copy by self._timeout.

    :raises self.Error: if the HDFS copy fails or does not finish in time.
    """
    log.info("Fetching %s from HDFS" % uri)

    if "JAVA_HOME" in os.environ:
      log.info("Using JAVA_HOME '%s' for HDFS commands" % os.environ["JAVA_HOME"])

    config = os.environ.get("HADOOP_CONF_DIR", HADOOP_CONF_DIR)
    h = HDFSHelper(config, heap_limit=Amount(256, Data.MB))
    try:
      f = lambda: h.copy_to_local(uri, directory)
      deadline(f, timeout=self._timeout, propagate=True, daemon=True)
    except HDFSHelper.InternalError as e:
      raise self.Error('Unable to fetch HDFS package: %s' % e)
    except Timeout as e:
      # Fixed: the format string previously had one %s for a two-element
      # tuple, which raised TypeError instead of the intended self.Error.
      raise self.Error("Failed to fetch package from HDFS within %s: %s" % (self._timeout, e))
  def test_demote(self):
    """Demoting the instance (by promoting another member) stops the runner."""
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    manager = ClusterManager(self._client, "/home/test/my_cluster")
    runner.start()

    self_member = manager.add_member(self._self_instance)

    # 'self_instance' becomes the master.
    manager.promote_member(self_member)

    runner.promoted.wait(1)

    another_member = manager.add_member(ServiceInstance(Endpoint("another_host", 10000)))

    # This demotes 'self_instance', which should cause runner to stop.
    manager.promote_member(another_member)

    # The runner must terminate within a second of being demoted.
    assert deadline(runner.join, Amount(1, Time.SECONDS))
  def _start_runner(self, driver, assigned_task, mesos_task, portmap):
    """Start the task runner; on any failure kill the task and return False.

    Returns True once the runner has started and runner_started is set.
    """
    if self.runner_aborted.is_set():
      self._die(driver, mesos_pb.TASK_KILLED, 'Task killed during initialization.')
      # Fixed: previously fell through and started the runner even after
      # reporting the task as killed; bail out like the other failure paths.
      return False

    try:
      deadline(self._runner.start, timeout=self.START_TIMEOUT, propagate=True)
    except TaskError as e:
      self._die(driver, mesos_pb.TASK_FAILED, 'Task initialization failed: %s' % e)
      return False
    except Timeout:
      self._die(driver, mesos_pb.TASK_LOST, 'Timed out waiting for task to start!')
      return False

    self.runner_started.set()
    log.debug('Task started.')

    return True
Esempio n. 17
0
  def _start_runner(self, driver, assigned_task, mesos_task, portmap):
    """Start the task runner; on any failure kill the task and return False.

    Returns True once the runner has started and runner_started is set.
    """
    if self.runner_aborted.is_set():
      self._die(driver, mesos_pb.TASK_KILLED, 'Task killed during initialization.')
      # Fixed: previously fell through and started the runner even after
      # reporting the task as killed; bail out like the other failure paths.
      return False

    try:
      deadline(self._runner.start, timeout=self.START_TIMEOUT, propagate=True)
    except TaskError as e:
      self._die(driver, mesos_pb.TASK_FAILED, 'Task initialization failed: %s' % e)
      return False
    except Timeout:
      self._die(driver, mesos_pb.TASK_LOST, 'Timed out waiting for task to start!')
      return False

    self.runner_started.set()
    log.debug('Task started.')

    return True
Esempio n. 18
0
  def test_launch_cluster_insufficient_resources(self):
    """All but one node in the cluster are launched successfully.

    The offer is one disk-MB short of what the full cluster needs, so the
    last node cannot be placed; election still succeeds among the rest.
    """
    del self._offer.resources[:]
    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS * 3,
        mem=DEFAULT_TASK_MEM * 3,
        disk=DEFAULT_TASK_DISK * 3 - Amount(1, Data.MB),  # 1mb less than required disk space.
        ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    # There is one fewer port than required to launch the entire cluster.
    for i in range(self._cluster.num_nodes - 1):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes - 1

    # The final task cannot get launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes - 1

    # The two nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes - 1):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # Wait up to 5 seconds for a master to be published in ZooKeeper.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # One slave.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
      "/mysos/test/cluster0/slaves/member_")]) == 1
  def test_mysqld_error(self):
    """A mysqld process exiting with code 123 is reported through runner.join()."""
    task_control = FakeTaskControl(mysqld="exit 123")
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    runner.start()
    # join() surfaces the fake mysqld's exit code.
    assert deadline(runner.join, Amount(1, Time.SECONDS)) == 123
  def test_stop(self):
    """stop() terminates mysqld with SIGTERM and join() reports the signal."""
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)
    runner.start()
    assert runner.stop()

    # Killed by SIGTERM.
    assert deadline(runner.join, Amount(1, Time.SECONDS)) == -signal.SIGTERM
  def test_stop_interminable(self):
    """A mysqld that traps SIGTERM and loops forever is SIGKILLed by stop()."""
    # Shell command that ignores SIGTERM and never exits on its own.
    cmd = """trap "echo Trapped SIGTERM!" TERM
while :
do
  sleep 60
done
"""
    task_control = FakeTaskControl(mysqld=cmd)
    runner = MysosTaskRunner(
      self._self_instance,
      self._client,
      "/home/test/my_cluster",
      NoopPackageInstaller(),
      task_control,
      self._state_manager)

    # NOTE(review): this assignment looks redundant — 'mysqld=cmd' was already
    # passed to the FakeTaskControl constructor above; confirm and remove.
    task_control._mysqld = cmd
    runner.start()
    assert runner.stop(timeout=1)
    # After the 1s grace period the runner escalates to SIGKILL.
    assert deadline(runner.join, Amount(1, Time.SECONDS)) == -signal.SIGKILL
  def test_reparent(self):
    """When another member is promoted, the runner observes the new master."""
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        "/home/test/my_cluster",
        NoopPackageInstaller(),
        task_control,
        self._state_manager)

    manager = ClusterManager(self._client, "/home/test/my_cluster")
    runner.start()

    # Promote another instance.
    master = ServiceInstance(Endpoint("another_host", 10000))
    another_member = manager.add_member(master)
    manager.promote_member(another_member)

    # The runner publishes the newly promoted master within a second.
    assert runner.master.get(True, 1) == master

    assert runner.stop()
    assert deadline(runner.join, Amount(1, Time.SECONDS))
Esempio n. 23
0
def test_deadline_no_timeout():
  """deadline() passes through the value of a callable that returns promptly."""
  result = deadline(lambda: 'success')
  assert result == 'success'
Esempio n. 24
0
def propagate_deadline(*args, **kw):
  """Call deadline() with daemon=True and propagate=True forced on, so that
  exceptions raised by the timed callable surface to the caller."""
  return deadline(*args, daemon=True, propagate=True, **kw)
Esempio n. 25
0
def test_deadline_no_timeout():
    """A fast callable's return value is propagated unchanged by deadline()."""
    outcome = deadline(lambda: "success")
    assert outcome == "success"
Esempio n. 26
0
  def test_master_failover(self):
    """After the elected master task fails, the slave with the highest log
    position wins re-election and a replacement task is launched."""
    for i in range(self._cluster.num_nodes):
      # Feed the remaining resources of the shrinking offer back in.
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value

    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # No log positions queries are sent for the first epoch.
    assert "sendFrameworkMessage" not in self._driver.method_calls

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths

    # Now fail the master task.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    assert len(self._launcher._cluster.running_tasks) == 2

    # Log positions queries are sent.
    self._launcher._elector._elect()
    assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

    # Each surviving node reports its log position; node i reports position i.
    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=1, position=str(i))))

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The slave with the highest position is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

    assert len(self._launcher._cluster.running_tasks) == 2

    # When a new offer comes in, a new task is launched.
    del self._offer.resources[:]
    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS,
        mem=DEFAULT_TASK_MEM,
        disk=DEFAULT_TASK_DISK,
        ports=set([10000]))
    self._offer.resources.extend(resources)
    task_id, _ = self._launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-3"

    launched = self._driver.method_calls["launchTasks"]
    # One task is relaunched to make up for the failed one.
    assert len(launched) == self._cluster.num_nodes + 1
Esempio n. 27
0
def propagate_deadline(*args, **kw):
    """Call deadline() with daemon=True and propagate=True forced on, so that
    exceptions raised by the timed callable surface to the caller."""
    return deadline(*args, daemon=True, propagate=True, **kw)
Esempio n. 28
0
def test_deadline_default_timeout():
    """deadline() raises Timeout when the callable outlasts the default limit."""
    slow_call = partial(time.sleep, 0.5)
    with pytest.raises(Timeout):
        deadline(slow_call)
Esempio n. 29
0
def test_deadline_no_timeout():
    """deadline() returns the value of a callable that finishes immediately."""
    value = deadline(lambda: 'success')
    assert value == 'success'
Esempio n. 30
0
def test_deadline_custom_timeout():
    """An explicit 0.1s deadline fires for a callable sleeping 0.2s."""
    slow_call = partial(time.sleep, 0.2)
    with pytest.raises(Timeout):
        deadline(slow_call, 0.1)
Esempio n. 31
0
def test_deadline_default_timeout():
  """Sleeping past the default deadline must raise Timeout."""
  sleeper = partial(time.sleep, 0.5)
  with pytest.raises(Timeout):
    deadline(sleeper)
Esempio n. 32
0
  def test_launcher_recovery_before_election_completed(self):
    """A launcher recovered from persisted state in the middle of a
    re-election completes it using the forwarded log positions."""
    # 1. Launch a cluster on the running launcher.
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # Wait up to 5 seconds for a master to be published in ZooKeeper.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # Now fail the master task which leads to re-election.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))

    # The surviving nodes report their log positions to the new launcher.
    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=2, position=str(i))))

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The second slave has the larger position and is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths
Esempio n. 33
0
  def test_launcher_recovery_before_election_completed(self):
    """A launcher recovered from persisted state in the middle of a
    re-election completes it using the forwarded log positions."""
    # 1. Launch a cluster on the running launcher.
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # Wait up to 5 seconds for a master to be published in ZooKeeper.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # Now fail the master task which leads to re-election.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))

    # The surviving nodes report their log positions to the new launcher.
    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=2, position=str(i))))

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The second slave has the larger position and is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths
Esempio n. 34
0
  def test_master_failover(self):
    """After the elected master task fails, the slave with the highest log
    position wins re-election and a replacement task is launched."""
    for i in range(self._cluster.num_nodes):
      # Feed the remaining resources of the shrinking offer back in.
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value

    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # No log positions queries are sent for the first epoch.
    assert "sendFrameworkMessage" not in self._driver.method_calls

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths

    # Now fail the master task.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    assert len(self._launcher._cluster.running_tasks) == 2

    # Log positions queries are sent.
    self._launcher._elector._elect()
    assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

    # Each surviving node reports its log position; node i reports position i.
    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=1, position=str(i))))

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The slave with the highest position is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

    assert len(self._launcher._cluster.running_tasks) == 2

    # When a new offer comes in, a new task is launched.
    del self._offer.resources[:]
    resources = create_resources(cpus=1, mem=512, ports=set([10000]))
    self._offer.resources.extend(resources)
    task_id, _ = self._launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-3"

    launched = self._driver.method_calls["launchTasks"]
    # One task is relaunched to make up for the failed one.
    assert len(launched) == self._cluster.num_nodes + 1
Esempio n. 35
0
def test_deadline_custom_timeout():
  """A 0.2s sleep must trip an explicit 0.1s deadline."""
  sleeper = partial(time.sleep, 0.2)
  with pytest.raises(Timeout):
    deadline(sleeper, 0.1)