Ejemplo n.º 1
0
  def test_launch_cluster_all_nodes_successful(self):
    """Launch every node from the offer and verify that a master and two slaves are registered."""
    node_count = self._cluster.num_nodes

    # After each launch the offer is rewritten to hold only the resources reported as remaining.
    for index in range(node_count):
      launched_id, leftover = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(leftover)
      assert launched_id == "mysos-cluster0-%s" % index

    launch_calls = self._driver.method_calls["launchTasks"]
    assert len(launch_calls) == node_count

    # One more launch attempt returns no task id and triggers no further driver call.
    extra_id = self._launcher.launch(self._offer)[0]
    assert extra_id is None
    assert len(self._driver.method_calls["launchTasks"]) == node_count

    # Report every task as running, reusing one status message.
    update = mesos_pb2.TaskStatus()
    update.state = mesos_pb2.TASK_RUNNING  # Valid state.
    update.slave_id.value = self._offer.slave_id.value
    for index in range(node_count):
      update.task_id.value = "mysos-cluster0-%s" % index
      self._launcher.status_update(update)

    def master_elected():
      cluster_path = get_cluster_path(self._zk_url, self._cluster.name)
      return wait_for_master(cluster_path, self._zk_client)

    deadline(master_elected, Amount(5, Time.SECONDS))

    # Member 0 is expected under the master path.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Exactly two members are expected under the slaves path.
    slave_prefix = "/mysos/test/cluster0/slaves/member_"
    slave_paths = [path for path in self._storage.paths.keys() if path.startswith(slave_prefix)]
    assert len(slave_paths) == 2
Ejemplo n.º 2
0
  def test_launch_cluster_all_nodes_successful(self):
    """Launch all cluster nodes, mark them running and check the elected master and the slaves."""
    expected_tasks = self._cluster.num_nodes

    # Keep launching from the same offer, shrinking it to the remaining resources each time.
    for node in range(expected_tasks):
      new_task_id, unused = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(unused)
      assert new_task_id == "mysos-cluster0-%s" % node

    recorded = self._driver.method_calls["launchTasks"]
    assert len(recorded) == expected_tasks

    # With all nodes launched, another attempt must not launch anything.
    surplus_id = self._launcher.launch(self._offer)[0]
    assert surplus_id is None
    assert len(self._driver.method_calls["launchTasks"]) == expected_tasks

    # Send a TASK_RUNNING update for each launched task.
    running = mesos_pb2.TaskStatus()
    running.state = mesos_pb2.TASK_RUNNING  # Valid state.
    running.slave_id.value = self._offer.slave_id.value
    for node in range(expected_tasks):
      running.task_id.value = "mysos-cluster0-%s" % node
      self._launcher.status_update(running)

    def await_master():
      return wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client)

    deadline(await_master, Amount(5, Time.SECONDS))

    # The first member is the master.
    master_path = "/mysos/test/cluster0/master/member_0000000000"
    assert master_path in self._storage.paths
    # The remaining two members are slaves.
    slaves = [
        key for key in self._storage.paths.keys()
        if key.startswith("/mysos/test/cluster0/slaves/member_")]
    assert len(slaves) == 2
Ejemplo n.º 3
0
  def create(args, options):
    """
    Creates a MySQL cluster through the Mysos HTTP API and checks that its master answers queries.

    The cluster parameters are POSTed to the API, the returned cluster password is written to
    'options.password_file', and once a master has been found "SELECT 1" is run against it. The
    query is attempted up to 5 times, one second apart, while it raises OperationalError.

    :param args: Command arguments; not used here.
    :param options: Parsed options; reads api_host, api_port, cluster_name, num_nodes,
                    cluster_user, size, backup_id and password_file.
    :raises OperationalError: If the query still fails on the last attempt.
    """
    validate_common_options(options)

    if not options.num_nodes:
      app.error("--num_nodes is required")

    if not options.cluster_user:
      app.error("--cluster_user is required")

    url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
    # Optional values that are not set are submitted as empty strings.
    values = dict(
        num_nodes=int(options.num_nodes),
        cluster_user=options.cluster_user,
        size=options.size if options.size else '',
        backup_id=options.backup_id if options.backup_id else '')

    req = urllib2.Request(url, urllib.urlencode(values))
    try:
      response = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
      log.error("POST request failed: %s, %s, %s" % (
          e.code, BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code], e.read()))
      app.quit(1)  # NOTE(review): the code below relies on this call not returning.

    try:
      result = json.loads(response)
      # Both fields are required below; report a response lacking them as invalid rather than
      # failing later with a KeyError.
      if (not isinstance(result, dict) or
          "cluster_password" not in result or
          "cluster_url" not in result):
        raise ValueError()
    except ValueError:
      log.error("Invalid response: %s" % response)
      app.quit(1)

    # NOTE(review): the cluster info logged here includes the cluster password in clear text.
    log.info("Cluster created. Cluster info: %s" % str(result))
    with open(options.password_file, 'w') as f:
      f.write(result["cluster_password"])

    log.info("Waiting for the master for this cluster to be elected...")
    master_endpoint = wait_for_master(result['cluster_url']).service_endpoint

    # Percent-encode the credentials: characters such as '@', ':' or '/' in the user name or the
    # password would otherwise be read as URL delimiters when the connection URL is parsed.
    connection_str = "mysql://%s:%s@%s:%d/" % (
        urllib.quote(options.cluster_user, safe=''),
        urllib.quote(result["cluster_password"], safe=''),
        master_endpoint.host,
        master_endpoint.port)
    # NOTE(review): this log line also contains the (encoded) password.
    log.info("Connecting to the MySQL cluster master: %s" % connection_str)
    engine = create_engine(connection_str)

    # Try once per second to give the master time to be promoted.
    max_attempts = 5
    for attempt in range(max_attempts):
      try:
        # TODO(jyx): Test writing to the master and reading from the slave.
        value = engine.execute("SELECT 1;").scalar()
        assert 1 == int(value), "Expecting result to be 1 but got %s" % value
        break
      except OperationalError:
        if attempt == max_attempts - 1:
          raise  # Out of attempts: let the last connection error propagate.
        log.debug("MySQL master not ready yet. Sleep for 1 second...")
        time.sleep(1)

    log.info("Cluster successfully started")
Ejemplo n.º 4
0
def test_scheduler_runs():
  """
  Starts a MysosScheduler on a "local" Mesos scheduler driver, asks it to create a cluster of
  3 "no-op" MySQL tasks and waits for a master of that cluster to appear.

  NOTE: Due to the limitation of zake the scheduler's ZK operations are not propagated to
  executors in separate processes but they are unit-tested separately.
  """
  import mesos.native

  # The Mesos slave fetches fake_mysos_executor.pex, so it has to be built before this test runs.
  executor_pex = 'dist/fake_mysos_executor.pex'
  assert os.path.isfile(executor_pex)

  fake_zk = FakeClient(storage=FakeStorage(SequentialThreadingHandler()))
  fake_zk.start()

  zk_url = "zk://fake_host/home/mysos/clusters"
  cluster_name = "test_cluster"
  num_nodes = 3

  state_provider = LocalStateProvider(safe_mkdtemp())

  framework_info = FrameworkInfo(user=getpass.getuser(), name="mysos", checkpoint=False)
  state = Scheduler(framework_info)

  scheduler = MysosScheduler(
      state,
      state_provider,
      getpass.getuser(),
      os.path.abspath(executor_pex),
      "./fake_mysos_executor.pex",
      fake_zk,
      zk_url,
      Amount(40, Time.SECONDS),
      "/fakepath",
      gen_encryption_key())

  driver = mesos.native.MesosSchedulerDriver(scheduler, framework_info, "local")
  driver.start()

  # Give the scheduler up to 30 seconds to report that it is connected.
  assert scheduler.connected.wait(30)

  scheduler.create_cluster(cluster_name, "mysql_user", num_nodes)

  def master_elected():
    discover_url = posixpath.join(zk_url, 'discover')
    return wait_for_master(get_cluster_path(discover_url, cluster_name), fake_zk)

  # One of the launched nodes has to become the master within the deadline.
  deadline(master_elected, Amount(40, Time.SECONDS))

  assert driver.stop() == DRIVER_STOPPED
Ejemplo n.º 5
0
    def test_launcher_kill(self):
        """Launch the whole cluster, kill it and verify its storage paths and state are removed."""
        node_count = self._cluster.num_nodes

        # Launch node after node, each time from whatever the previous launch left in the offer.
        for index in range(node_count):
            launched_id, leftover = self._launcher.launch(self._offer)
            del self._offer.resources[:]
            self._offer.resources.extend(leftover)
            assert launched_id == "mysos-cluster0-%s" % index

        launch_calls = self._driver.method_calls["launchTasks"]
        assert len(launch_calls) == node_count

        # A further launch attempt produces neither a task id nor a driver call.
        extra_id = self._launcher.launch(self._offer)[0]
        assert extra_id is None
        assert len(self._driver.method_calls["launchTasks"]) == node_count

        # Mark every task as running.
        running = mesos_pb2.TaskStatus()
        running.state = mesos_pb2.TASK_RUNNING  # Valid state.
        running.slave_id.value = self._offer.slave_id.value
        for index in range(node_count):
            running.task_id.value = "mysos-cluster0-%s" % index
            self._launcher.status_update(running)

        def master_elected():
            cluster_path = get_cluster_path(self._zk_url, self._cluster.name)
            return wait_for_master(cluster_path, self._zk_client)

        deadline(master_elected, Amount(5, Time.SECONDS))

        # Member 0 is the master, the other two members are slaves.
        assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
        slave_prefix = "/mysos/test/cluster0/slaves/member_"
        slave_paths = [
            path for path in self._storage.paths.keys()
            if path.startswith(slave_prefix)
        ]
        assert len(slave_paths) == 2

        # Killing with a wrong password is refused.
        with pytest.raises(MySQLClusterLauncher.PermissionError):
            self._launcher.kill("wrong_password")

        # Killing with the cluster's own password goes through.
        cluster_password = self._cluster.password
        self._launcher.kill(cluster_password)

        # Report every task as killed.
        killed = mesos_pb2.TaskStatus()
        killed.state = mesos_pb2.TASK_KILLED
        killed.slave_id.value = self._offer.slave_id.value
        for index in range(self._cluster.num_nodes):
            killed.task_id.value = "mysos-cluster0-%s" % index
            self._launcher.status_update(killed)

        # Neither the ServerSet paths nor the saved cluster state may remain.
        assert "/mysos/test/cluster0" not in self._storage.paths
        leftover_state = self._state_provider.load_cluster_state("cluster0")
        assert not leftover_state
Ejemplo n.º 6
0
  def test_launcher_kill(self):
    """Bring up the full cluster, then kill it with the decrypted password and check cleanup."""
    node_count = self._cluster.num_nodes

    # Every launch is made from the resources the previous launch reported as remaining.
    for index in range(node_count):
      launched_id, leftover = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(leftover)
      assert launched_id == "mysos-cluster0-%s" % index

    launch_calls = self._driver.method_calls["launchTasks"]
    assert len(launch_calls) == node_count

    # Nothing more is launched once all nodes have a task.
    extra_id = self._launcher.launch(self._offer)[0]
    assert extra_id is None
    assert len(self._driver.method_calls["launchTasks"]) == node_count

    # Deliver a TASK_RUNNING update for each task.
    running = mesos_pb2.TaskStatus()
    running.state = mesos_pb2.TASK_RUNNING  # Valid state.
    running.slave_id.value = self._offer.slave_id.value
    for index in range(node_count):
      running.task_id.value = "mysos-cluster0-%s" % index
      self._launcher.status_update(running)

    def master_elected():
      cluster_path = get_cluster_path(self._zk_url, self._cluster.name)
      return wait_for_master(cluster_path, self._zk_client)

    deadline(master_elected, Amount(5, Time.SECONDS))

    # Member 0 is registered as master and two members as slaves.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    slave_prefix = "/mysos/test/cluster0/slaves/member_"
    slave_paths = [path for path in self._storage.paths.keys() if path.startswith(slave_prefix)]
    assert len(slave_paths) == 2

    # A wrong password must raise PermissionError.
    with pytest.raises(MySQLClusterLauncher.PermissionError):
      self._launcher.kill("wrong_password")

    # The password obtained by decrypting the cluster's encrypted password is accepted.
    plain_password = self._password_box.decrypt(self._cluster.encrypted_password)
    self._launcher.kill(plain_password)

    # Deliver a TASK_KILLED update for each task.
    killed = mesos_pb2.TaskStatus()
    killed.state = mesos_pb2.TASK_KILLED
    killed.slave_id.value = self._offer.slave_id.value
    for index in range(self._cluster.num_nodes):
      killed.task_id.value = "mysos-cluster0-%s" % index
      self._launcher.status_update(killed)

    # ServerSets and the saved cluster state are both gone.
    assert "/mysos/test/cluster0" not in self._storage.paths
    leftover_state = self._state_provider.load_cluster_state("cluster0")
    assert not leftover_state
Ejemplo n.º 7
0
def test_scheduler_runs():
    """
    Runs the scheduler against a "local" Mesos driver: it must connect, accept the creation of
    a cluster of 3 "no-op" MySQL tasks and have a master for that cluster before the deadline.

    NOTE: Due to the limitation of zake the scheduler's ZK operations are not propagated to
    executors in separate processes but they are unit-tested separately.
    """
    import mesos.native

    # The executor pex is fetched by the Mesos slave and therefore must already exist.
    pex_path = 'dist/fake_mysos_executor.pex'
    assert os.path.isfile(pex_path)

    zk_storage = FakeStorage(SequentialThreadingHandler())
    zk = FakeClient(storage=zk_storage)
    zk.start()

    zk_url = "zk://fake_host/home/mysos/clusters"
    cluster_name = "test_cluster"
    num_nodes = 3

    state_provider = LocalStateProvider(safe_mkdtemp())

    framework_info = FrameworkInfo(
        user=getpass.getuser(),
        name="mysos",
        checkpoint=False)

    scheduler_state = Scheduler(framework_info)

    scheduler = MysosScheduler(
        scheduler_state,
        state_provider,
        getpass.getuser(),
        os.path.abspath(pex_path),
        "./fake_mysos_executor.pex",
        zk,
        zk_url,
        Amount(40, Time.SECONDS),
        "/fakepath",
        gen_encryption_key())

    mesos_driver = mesos.native.MesosSchedulerDriver(
        scheduler,
        framework_info,
        "local")
    mesos_driver.start()

    # Block for at most 30 seconds until the scheduler signals that it is connected.
    connected = scheduler.connected.wait(30)
    assert connected

    scheduler.create_cluster(cluster_name, "mysql_user", num_nodes)

    def await_master():
        cluster_path = get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name)
        return wait_for_master(cluster_path, zk)

    # A master for the new cluster has to show up within 40 seconds.
    deadline(await_master, Amount(40, Time.SECONDS))

    stop_status = mesos_driver.stop()
    assert stop_status == DRIVER_STOPPED
Ejemplo n.º 8
0
  def test_launch_cluster_insufficient_resources(self):
    """All but one node of the cluster are launched successfully."""
    # Offer three times the default task cpus and memory plus three ports, but 1 MB less than
    # three times the default task disk.
    del self._offer.resources[:]
    short_resources = create_resources(
        cpus=DEFAULT_TASK_CPUS * 3,
        mem=DEFAULT_TASK_MEM * 3,
        disk=DEFAULT_TASK_DISK * 3 - Amount(1, Data.MB),
        ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(short_resources)

    launchable = self._cluster.num_nodes - 1

    # Only num_nodes - 1 tasks are expected to launch from this offer.
    for index in range(launchable):
      launched_id, leftover = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(leftover)
      assert launched_id == "mysos-cluster0-%s" % index

    launch_calls = self._driver.method_calls["launchTasks"]
    assert len(launch_calls) == launchable

    # The last node does not get a task from what is left of the offer.
    last_id = self._launcher.launch(self._offer)[0]
    assert last_id is None
    assert len(self._driver.method_calls["launchTasks"]) == launchable

    # Report the launched tasks as running.
    running = mesos_pb2.TaskStatus()
    running.state = mesos_pb2.TASK_RUNNING  # Valid state.
    running.slave_id.value = self._offer.slave_id.value
    for index in range(launchable):
      running.task_id.value = "mysos-cluster0-%s" % index
      self._launcher.status_update(running)

    def master_elected():
      cluster_path = get_cluster_path(self._zk_url, self._cluster.name)
      return wait_for_master(cluster_path, self._zk_client)

    deadline(master_elected, Amount(5, Time.SECONDS))

    # Member 0 is the master.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # That leaves a single slave.
    slave_prefix = "/mysos/test/cluster0/slaves/member_"
    slave_paths = [path for path in self._storage.paths.keys() if path.startswith(slave_prefix)]
    assert len(slave_paths) == 1
Ejemplo n.º 9
0
    def create(args, options):
        """
        Creates a MySQL cluster through the Mysos HTTP API and checks that its master answers
        queries.

        The cluster parameters are POSTed to the API, the returned cluster password is written
        to 'options.password_file', and once a master has been found "SELECT 1" is run against
        it. The query is attempted up to 5 times, one second apart, while it raises
        OperationalError.

        :param args: Command arguments; not used here.
        :param options: Parsed options; reads api_host, api_port, cluster_name, num_nodes,
                        cluster_user, size, backup_id, cluster_password and password_file.
        :raises OperationalError: If the query still fails on the last attempt.
        """
        validate_common_options(options)

        if not options.num_nodes:
            app.error("--num_nodes is required")

        if not options.cluster_user:
            app.error("--cluster_user is required")

        url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port,
                                            options.cluster_name)
        # 'urlencode()' doesn't accept None, so unset optional values are sent as ''.
        values = dict(
            num_nodes=int(options.num_nodes),
            cluster_user=options.cluster_user,
            size=options.size if options.size else '',
            backup_id=options.backup_id if options.backup_id else '',
            cluster_password=options.cluster_password
            if options.cluster_password else '')

        req = urllib2.Request(url, urllib.urlencode(values))
        try:
            response = urllib2.urlopen(req).read()
        except urllib2.HTTPError as e:
            log.error("POST request failed: %s, %s, %s" %
                      (e.code,
                       BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code],
                       e.read()))
            # NOTE(review): the code below relies on this call not returning.
            app.quit(1)

        try:
            result = json.loads(response)
            # Both fields are required below; report a response lacking them as invalid
            # rather than failing later with a KeyError.
            if (not isinstance(result, dict) or
                    "cluster_password" not in result or
                    "cluster_url" not in result):
                raise ValueError()
        except ValueError:
            log.error("Invalid response: %s" % response)
            app.quit(1)

        # NOTE(review): the cluster info logged here includes the cluster password in clear
        # text.
        log.info("Cluster created. Cluster info: %s" % str(result))
        with open(options.password_file, 'w') as f:
            f.write(result["cluster_password"])

        log.info("Waiting for the master for this cluster to be elected...")
        master_endpoint = wait_for_master(
            result['cluster_url']).service_endpoint

        # Percent-encode the credentials: characters such as '@', ':' or '/' in the user name
        # or the password would otherwise be read as URL delimiters when the connection URL
        # is parsed.
        connection_str = "mysql://%s:%s@%s:%d/" % (
            urllib.quote(options.cluster_user, safe=''),
            urllib.quote(result["cluster_password"], safe=''),
            master_endpoint.host, master_endpoint.port)
        # NOTE(review): this log line also contains the (encoded) password.
        log.info("Connecting to the MySQL cluster master: %s" % connection_str)
        engine = create_engine(connection_str)

        # Try once per second to give the master time to be promoted.
        max_attempts = 5
        for attempt in range(max_attempts):
            try:
                # TODO(jyx): Test writing to the master and reading from the slave.
                value = engine.execute("SELECT 1;").scalar()
                assert 1 == int(
                    value), "Expecting result to be 1 but got %s" % value
                break
            except OperationalError:
                if attempt == max_attempts - 1:
                    raise  # Out of attempts: let the last connection error propagate.
                log.debug("MySQL master not ready yet. Sleep for 1 second...")
                time.sleep(1)

        log.info("Cluster successfully started")
Ejemplo n.º 10
0
  def test_launcher_recovery_before_election_completed(self):
    """Rebuild the launcher from saved state after the master failed and let the election end."""
    # 1. Launch a cluster on the running launcher.
    node_count = self._cluster.num_nodes
    for index in range(node_count):
      launched_id, leftover = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(leftover)
      assert launched_id == "mysos-cluster0-%s" % index

    launch_calls = self._driver.method_calls["launchTasks"]
    assert len(launch_calls) == node_count

    # A further launch attempt launches nothing.
    extra_id = self._launcher.launch(self._offer)[0]
    assert extra_id is None
    assert len(self._driver.method_calls["launchTasks"]) == node_count

    # Report every task as running.
    update = mesos_pb2.TaskStatus()
    update.state = mesos_pb2.TASK_RUNNING
    update.slave_id.value = self._offer.slave_id.value
    for index in range(node_count):
      update.task_id.value = "mysos-cluster0-%s" % index
      self._launcher.status_update(update)

    def master_elected():
      # Reads self._cluster when called, so it also works after the cluster is reloaded below.
      cluster_path = get_cluster_path(self._zk_url, self._cluster.name)
      return wait_for_master(cluster_path, self._zk_client)

    deadline(master_elected, Amount(5, Time.SECONDS))

    # Member 0 is the master, the other two members are slaves.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    slave_prefix = "/mysos/test/cluster0/slaves/member_"
    slave_paths = [path for path in self._storage.paths.keys() if path.startswith(slave_prefix)]
    assert len(slave_paths) == 2

    # Fail the master task, which leads to a re-election.
    update.task_id.value = "mysos-cluster0-0"
    update.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(update)

    # 2. Recover the launcher from the cluster state held by the state provider.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))

    # Hand the recovered launcher an epoch/position message for each task other than task 0.
    for index in range(1, self._cluster.num_nodes):
      report = json.dumps(dict(epoch=2, position=str(index)))
      self._launcher.framework_message(
          "mysos-cluster0-%s" % index,
          self._offer.slave_id.value,
          report)

    deadline(master_elected, Amount(5, Time.SECONDS))

    # Member 2 reported the larger position and is the new master.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths
Ejemplo n.º 11
0
  def test_master_failover(self):
    """Fail the master task, check that member 2 takes over and that a new task is launched."""
    node_count = self._cluster.num_nodes
    for index in range(node_count):
      launched_id, leftover = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(leftover)
      assert launched_id == "mysos-cluster0-%s" % index

    launch_calls = self._driver.method_calls["launchTasks"]
    assert len(launch_calls) == node_count

    # Report every task as running.
    update = mesos_pb2.TaskStatus()
    update.state = mesos_pb2.TASK_RUNNING
    update.slave_id.value = self._offer.slave_id.value

    for index in range(node_count):
      update.task_id.value = "mysos-cluster0-%s" % index
      self._launcher.status_update(update)

    # The first epoch must not involve any log position queries.
    assert "sendFrameworkMessage" not in self._driver.method_calls

    def master_elected():
      cluster_path = get_cluster_path(self._zk_url, self._cluster.name)
      return wait_for_master(cluster_path, self._zk_client)

    # Wait for the first election to finish.
    deadline(master_elected, Amount(5, Time.SECONDS))

    # Member 0 is the master.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths

    # Fail the master task.
    update.task_id.value = "mysos-cluster0-0"
    update.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(update)

    running_now = self._launcher._cluster.running_tasks
    assert len(running_now) == 2

    # Running an election round by hand must send out log position queries.
    self._launcher._elector._elect()
    assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

    # Answer with an epoch/position message for each task other than task 0.
    for index in range(1, self._cluster.num_nodes):
      report = json.dumps(dict(epoch=1, position=str(index)))
      self._launcher.framework_message(
          "mysos-cluster0-%s" % index,
          self._offer.slave_id.value,
          report)

    # Wait for the re-election to finish.
    deadline(master_elected, Amount(5, Time.SECONDS))

    # Member 2 reported the highest position and is the new master.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

    assert len(self._launcher._cluster.running_tasks) == 2

    # A fresh offer gets a replacement task launched.
    del self._offer.resources[:]
    fresh_resources = create_resources(cpus=1, mem=512, ports=set([10000]))
    self._offer.resources.extend(fresh_resources)
    replacement_id, _ = self._launcher.launch(self._offer)
    assert replacement_id == "mysos-cluster0-3"

    # One launch more than the cluster size: the extra one makes up for the failed task.
    all_launches = self._driver.method_calls["launchTasks"]
    assert len(all_launches) == self._cluster.num_nodes + 1
Ejemplo n.º 12
0
  def test_launcher_recovery_before_election_completed(self):
    """Replace the launcher with one built from saved state while a re-election is pending."""
    # 1. Launch a cluster on the running launcher.
    expected_tasks = self._cluster.num_nodes
    for node in range(expected_tasks):
      new_task_id, unused = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(unused)
      assert new_task_id == "mysos-cluster0-%s" % node

    recorded = self._driver.method_calls["launchTasks"]
    assert len(recorded) == expected_tasks

    # With all nodes launched, another attempt must not launch anything.
    surplus_id = self._launcher.launch(self._offer)[0]
    assert surplus_id is None
    assert len(self._driver.method_calls["launchTasks"]) == expected_tasks

    # Send a TASK_RUNNING update for each launched task.
    task_status = mesos_pb2.TaskStatus()
    task_status.state = mesos_pb2.TASK_RUNNING
    task_status.slave_id.value = self._offer.slave_id.value
    for node in range(expected_tasks):
      task_status.task_id.value = "mysos-cluster0-%s" % node
      self._launcher.status_update(task_status)

    def await_master():
      # self._cluster is looked up on every call; it is replaced further down.
      return wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client)

    deadline(await_master, Amount(5, Time.SECONDS))

    # The first member is the master.
    master_path = "/mysos/test/cluster0/master/member_0000000000"
    assert master_path in self._storage.paths
    # The remaining two members are slaves.
    slaves = [
        key for key in self._storage.paths.keys()
        if key.startswith("/mysos/test/cluster0/slaves/member_")]
    assert len(slaves) == 2

    # Failing the master task leads to a re-election.
    task_status.task_id.value = "mysos-cluster0-0"
    task_status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(task_status)

    # 2. Recover the launcher using the cluster state loaded from the state provider.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))

    # Pass an epoch/position message for every task except task 0 to the new launcher.
    for node in range(1, self._cluster.num_nodes):
      position_message = json.dumps(dict(epoch=2, position=str(node)))
      self._launcher.framework_message(
          "mysos-cluster0-%s" % node,
          self._offer.slave_id.value,
          position_message)

    deadline(await_master, Amount(5, Time.SECONDS))

    # Member 2 has the larger position and ends up as master.
    new_master_path = "/mysos/test/cluster0/master/member_0000000002"
    assert new_master_path in self._storage.paths
Ejemplo n.º 13
0
  def test_master_failover(self):
    """Fail the master, verify the re-election result and the launch of a replacement task."""
    expected_tasks = self._cluster.num_nodes
    for node in range(expected_tasks):
      new_task_id, unused = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(unused)
      assert new_task_id == "mysos-cluster0-%s" % node

    recorded = self._driver.method_calls["launchTasks"]
    assert len(recorded) == expected_tasks

    # Send a TASK_RUNNING update for each launched task.
    task_status = mesos_pb2.TaskStatus()
    task_status.state = mesos_pb2.TASK_RUNNING
    task_status.slave_id.value = self._offer.slave_id.value

    for node in range(expected_tasks):
      task_status.task_id.value = "mysos-cluster0-%s" % node
      self._launcher.status_update(task_status)

    # No log position queries are expected during the first epoch.
    assert "sendFrameworkMessage" not in self._driver.method_calls

    def await_master():
      return wait_for_master(
          get_cluster_path(self._zk_url, self._cluster.name),
          self._zk_client)

    # Let the first election complete.
    deadline(await_master, Amount(5, Time.SECONDS))

    # The first member is the master.
    first_master_path = "/mysos/test/cluster0/master/member_0000000000"
    assert first_master_path in self._storage.paths

    # Report the master task as failed.
    task_status.task_id.value = "mysos-cluster0-0"
    task_status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(task_status)

    still_running = self._launcher._cluster.running_tasks
    assert len(still_running) == 2

    # Trigger an election round directly; it has to send log position queries.
    self._launcher._elector._elect()
    assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

    # Reply with an epoch/position message for every task except task 0.
    for node in range(1, self._cluster.num_nodes):
      position_message = json.dumps(dict(epoch=1, position=str(node)))
      self._launcher.framework_message(
          "mysos-cluster0-%s" % node,
          self._offer.slave_id.value,
          position_message)

    # Let the re-election complete.
    deadline(await_master, Amount(5, Time.SECONDS))

    # The member with the highest reported position is the new master.
    new_master_path = "/mysos/test/cluster0/master/member_0000000002"
    assert new_master_path in self._storage.paths

    assert len(self._launcher._cluster.running_tasks) == 2

    # Offer the default resources for one task; a replacement task is launched from them.
    del self._offer.resources[:]
    one_task_resources = create_resources(
        cpus=DEFAULT_TASK_CPUS,
        mem=DEFAULT_TASK_MEM,
        disk=DEFAULT_TASK_DISK,
        ports=set([10000]))
    self._offer.resources.extend(one_task_resources)
    replacement_id, _ = self._launcher.launch(self._offer)
    assert replacement_id == "mysos-cluster0-3"

    # The launch count is the cluster size plus the one replacement for the failed task.
    all_launches = self._driver.method_calls["launchTasks"]
    assert len(all_launches) == self._cluster.num_nodes + 1