Example #1
  def delete_cluster(self, cluster_name, password):
    """
      :return: ZooKeeper URL for this Mysos cluster that can be used to wait for the termination of
               the cluster.
    """
    with self._lock:
      if not self._driver:
        raise self.ServiceUnavailable("Service unavailable. Try again later")

      if cluster_name not in self._state.clusters:
        raise self.ClusterNotFound("Cluster '%s' not found" % cluster_name)

      launcher = self._launchers[cluster_name]
      launcher.kill(password)
      log.info("Attempted to kill cluster %s" % cluster_name)

      self._metrics.cluster_count.decrement()
      cluster_info = launcher.cluster_info
      self._metrics.total_requested_cpus.write(
          self._metrics.total_requested_cpus.read() - cluster_info.total_cpus)
      self._metrics.total_requested_mem_mb.write(
          self._metrics.total_requested_mem_mb.read() - cluster_info.total_mem_mb)
      self._metrics.total_requested_disk_mb.write(
          self._metrics.total_requested_disk_mb.read() - cluster_info.total_disk_mb)

      if launcher.terminated:
        log.info("Deleting the launcher for cluster %s directly because the cluster has already "
                 "terminated" % launcher.cluster_name)
        self._delete_launcher(launcher)

      return get_cluster_path(self._discover_zk_url, cluster_name)
  def from_task(self, task, sandbox):
    data = json.loads(task.data)
    cluster_name, host, port, zk_url = data['cluster'], data['host'], data['port'], data['zk_url']
    _, servers, path = parse(zk_url)
    kazoo = KazooClient(servers)
    kazoo.start()
    self_instance = ServiceInstance(Endpoint(host, port))

    try:
      task_control = self._task_control_provider.from_task(task, sandbox)
      installer = self._installer_provider.from_task(task, sandbox)
      backup_store = self._backup_store_provider.from_task(task, sandbox)
    except (TaskControl.Error, PackageInstaller.Error) as e:
      kazoo.stop()  # Kazoo needs to be cleaned up. See kazoo/issues/217.
      raise TaskError(e.message)

    state_manager = StateManager(sandbox, backup_store)

    return MysosTaskRunner(
        self_instance,
        kazoo,
        get_cluster_path(path, cluster_name),
        installer,
        task_control,
        state_manager)
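
A hedged usage sketch for the delete_cluster() API above: the ZooKeeper path it returns can be polled until the cluster's group disappears. The scheduler, zk_client, deadline and wait_for_termination names are assumed to be set up as in the scheduler test in Example #17, and cluster_password is assumed to be the password returned by create_cluster; this illustrates the intended call pattern, not code from the project.

# Sketch only: assumes a registered MysosScheduler ('scheduler') and a started Kazoo/zake
# client ('zk_client'), as in the integration test further down.
zk_path = scheduler.delete_cluster("test_cluster", password=cluster_password)
# Block until the cluster's ZooKeeper group is removed, i.e. the cluster has terminated.
deadline(
    lambda: wait_for_termination(zk_path, zk_client),
    Amount(40, Time.SECONDS))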
Example #3
  def test_launch_cluster_all_nodes_successful(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2
Example #4
def test_scheduler_runs():
  """
    Verifies that the scheduler successfully launches 3 "no-op" MySQL tasks.
    NOTE: Due to a limitation of zake, the scheduler's ZK operations are not propagated to
    executors in separate processes; those are unit-tested separately.
  """
  import mesos.native

  # Make sure fake_mysos_executor.pex is available to be fetched by Mesos slave.
  assert os.path.isfile('dist/fake_mysos_executor.pex')

  storage = FakeStorage(SequentialThreadingHandler())
  zk_client = FakeClient(storage=storage)
  zk_client.start()

  zk_url = "zk://fake_host/home/mysos/clusters"
  cluster_name = "test_cluster"
  num_nodes = 3

  state_provider = LocalStateProvider(safe_mkdtemp())

  framework_info = FrameworkInfo(
      user=getpass.getuser(),
      name="mysos",
      checkpoint=False)

  state = Scheduler(framework_info)

  scheduler = MysosScheduler(
      state,
      state_provider,
      getpass.getuser(),
      os.path.abspath("dist/fake_mysos_executor.pex"),
      "./fake_mysos_executor.pex",
      zk_client,
      zk_url,
      Amount(40, Time.SECONDS),
      "/fakepath",
      gen_encryption_key())

  scheduler_driver = mesos.native.MesosSchedulerDriver(
      scheduler,
      framework_info,
      "local")
  scheduler_driver.start()

  # Wait until the scheduler is connected and becomes available.
  assert scheduler.connected.wait(30)

  scheduler.create_cluster(cluster_name, "mysql_user", num_nodes)

  # A slave is promoted to be the master.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name),
          zk_client),
      Amount(40, Time.SECONDS))

  assert scheduler_driver.stop() == DRIVER_STOPPED
Example #5
  def test_launcher_kill(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # Kill the cluster.
    with pytest.raises(MySQLClusterLauncher.PermissionError):
      self._launcher.kill("wrong_password")

    # Correct password.
    self._launcher.kill(self._password_box.decrypt(self._cluster.encrypted_password))

    # All 3 nodes are successfully killed.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_KILLED
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    assert "/mysos/test/cluster0" not in self._storage.paths  # ServerSets removed.
    assert not self._state_provider.load_cluster_state("cluster0")  # State removed.
Example #6
  def delete_cluster(self, cluster_name, password):
    """
      :return: ZooKeeper URL for this Mysos cluster that can be used to wait for the termination of
               the cluster.
    """
    with self._lock:
      if not self._driver:
        raise self.ServiceUnavailable("Service unavailable. Try again later")

      if cluster_name not in self._state.clusters:
        raise self.ClusterNotFound("Cluster '%s' not found" % cluster_name)

      launcher = self._launchers[cluster_name]
      launcher.kill(password)
      log.info("Attempted to kill cluster %s" % cluster_name)

      return get_cluster_path(self._discover_zk_url, cluster_name)
Example #7
  def test_launch_cluster_insufficient_resources(self):
    """All but one node in the cluster are launched successfully."""
    del self._offer.resources[:]
    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS * 3,
        mem=DEFAULT_TASK_MEM * 3,
        disk=DEFAULT_TASK_DISK * 3 - Amount(1, Data.MB),  # 1mb less than required disk space.
        ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    # The offer is 1mb short on disk, so only num_nodes - 1 tasks can be launched.
    for i in range(self._cluster.num_nodes - 1):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes - 1

    # The final task cannot get launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes - 1

    # The two nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes - 1):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # One slave.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
      "/mysos/test/cluster0/slaves/member_")]) == 1
Example #8
    def delete_cluster(self, cluster_name, password):
        """
      :return: ZooKeeper URL for this Mysos cluster that can be used to wait for the termination of
               the cluster.
    """
        with self._lock:
            if not self._driver:
                raise self.ServiceUnavailable(
                    "Service unavailable. Try again later")

            if cluster_name not in self._state.clusters:
                raise self.ClusterNotFound("Cluster '%s' not found" %
                                           cluster_name)

            launcher = self._launchers[cluster_name]
            launcher.kill(password)
            log.info("Attempted to kill cluster %s" % cluster_name)

            self._metrics.cluster_count.decrement()
            cluster_info = launcher.cluster_info
            self._metrics.total_requested_cpus.write(
                self._metrics.total_requested_cpus.read() -
                cluster_info.total_cpus)
            self._metrics.total_requested_mem_mb.write(
                self._metrics.total_requested_mem_mb.read() -
                cluster_info.total_mem_mb)
            self._metrics.total_requested_disk_mb.write(
                self._metrics.total_requested_disk_mb.read() -
                cluster_info.total_disk_mb)

            if launcher.terminated:
                log.info(
                    "Deleting the launcher for cluster %s directly because the cluster has already "
                    "terminated" % launcher.cluster_name)
                self._delete_launcher(launcher)

            return get_cluster_path(self._discover_zk_url, cluster_name)
Example #9
  def create_cluster(
        self,
        cluster_name,
        cluster_user,
        num_nodes,
        size=None,
        backup_id=None,
        cluster_password=None):
    """
      :param cluster_name: Name of the cluster.
      :param cluster_user: The user account on MySQL server.
      :param num_nodes: Number of nodes in the cluster.
      :param size: The size of instances in the cluster as a JSON dictionary of 'cpus', 'mem',
                   'disk'. 'mem' and 'disk' are specified with data size units: kb, mb, gb, etc. If
                   given 'None' then app defaults are used.
      :param backup_id: The 'backup_id' of the backup to restore from. If None then Mysos starts an
                        empty instance.
      :param cluster_password: The password used for accessing MySQL instances in the cluster as
                               well as deleting the cluster from Mysos. If None then Mysos generates
                               one for the cluster. In either case the password is sent back as
                               part of the return value.

      :return: a tuple of the following:
        - ZooKeeper URL for this Mysos cluster that can be used to resolve MySQL cluster info.
        - The password for the specified user of the specified cluster.

      NOTE:
        If the scheduler fails over after the cluster state is checkpointed, the new scheduler
        instance will restore the state and continue to launch the cluster. TODO(jyx): Need to allow
        users to retrieve the cluster info (e.g. the passwords) afterwards in case the user request
        has failed but the cluster is successfully created eventually.
    """
    with self._lock:
      if not self._driver:
        # 'self._driver' is set in (re)registered() after which the scheduler becomes "available".
        # We need to get hold of the driver to recover the scheduler.
        raise self.ServiceUnavailable("Service unavailable. Try again later")

      if cluster_name in self._state.clusters:
        raise self.ClusterExists("Cluster '%s' already exists" % cluster_name)

      if not cluster_user:
        raise self.InvalidUser('Invalid user name: %s' % cluster_user)

      num_nodes = int(num_nodes)
      if num_nodes <= 0:
        raise ValueError("Invalid number of cluster nodes: %s" % num_nodes)

      resources = parse_size(size)

      if (resources['cpus'] <= EXECUTOR_CPUS_EPSILON or
          resources['mem'] <= EXECUTOR_MEM_EPSILON or
          resources['disk'] <= EXECUTOR_DISK_EPSILON):
        raise ValueError(
            "Instance 'size' too small. It should be larger than what Mysos executor consumes: "
            "(cpus, mem, disk) = (%s, %s, %s)" % (
                EXECUTOR_CPUS_EPSILON, EXECUTOR_MEM_EPSILON, EXECUTOR_DISK_EPSILON))

      log.info("Requested resources per instance for cluster %s: %s" % (cluster_name, resources))

      self._metrics.total_requested_cpus.write(
          self._metrics.total_requested_cpus.read() + resources['cpus'] * num_nodes)
      self._metrics.total_requested_mem_mb.write(
          self._metrics.total_requested_mem_mb.read() + resources['mem'].as_(Data.MB) * num_nodes)
      self._metrics.total_requested_disk_mb.write(
          self._metrics.total_requested_disk_mb.read() + resources['disk'].as_(Data.MB) * num_nodes)

      self._state.clusters.add(cluster_name)
      self._state_provider.dump_scheduler_state(self._state)

      if not cluster_password:
        log.info("Generating password for cluster %s" % cluster_name)
        cluster_password = gen_password()

      # Return the plaintext version to the client but store the encrypted version.
      cluster = MySQLCluster(
          cluster_name,
          cluster_user,
          self._password_box.encrypt(cluster_password),
          num_nodes,
          cpus=resources['cpus'],
          mem=resources['mem'],
          disk=resources['disk'],
          backup_id=backup_id)
      self._state_provider.dump_cluster_state(cluster)

      log.info("Creating launcher for cluster %s" % cluster_name)
      self._launchers[cluster_name] = MySQLClusterLauncher(
          self._driver,
          cluster,
          self._state_provider,
          self._discover_zk_url,
          self._kazoo,
          self._framework_user,
          self._executor_uri,
          self._executor_cmd,
          self._election_timeout,
          self._admin_keypath,
          self._scheduler_key,
          installer_args=self._installer_args,
          backup_store_args=self._backup_store_args,
          executor_environ=self._executor_environ,
          executor_source_prefix=self._executor_source_prefix,
          framework_role=self._framework_role)

      self._metrics.cluster_count.increment()

      return get_cluster_path(self._discover_zk_url, cluster_name), cluster_password
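
A hedged usage sketch for the create_cluster() API above. The 'size' value is parsed by parse_size as a dictionary of 'cpus', 'mem' and 'disk'; the exact encoding shown here is an assumption, and the surrounding setup (scheduler, zk_client, deadline, wait_for_master) mirrors the scheduler test in Example #4; this is an illustration, not the project's own client code.

# Sketch only: assumes a registered MysosScheduler ('scheduler') and a ZK client ('zk_client').
zk_path, cluster_password = scheduler.create_cluster(
    "test_cluster",
    "mysql_user",
    num_nodes=3,
    size={"cpus": 1.0, "mem": "512mb", "disk": "2gb"},  # Assumed shape; parse_size defines it.
    cluster_password=None)  # None lets Mysos generate a password and return it.
# The returned discovery path can be used to wait for a MySQL master to be elected.
deadline(
    lambda: wait_for_master(zk_path, zk_client),
    Amount(40, Time.SECONDS))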
Example #10
    def __init__(self,
                 driver,
                 cluster,
                 state_provider,
                 zk_url,
                 kazoo,
                 framework_user,
                 executor_uri,
                 executor_cmd,
                 election_timeout,
                 admin_keypath,
                 scheduler_key,
                 installer_args=None,
                 backup_store_args=None,
                 executor_environ=None,
                 executor_source_prefix=None,
                 framework_role='*',
                 query_interval=Amount(1, Time.SECONDS)):
        """
      :param driver: Mesos scheduler driver.
      :param cluster: The MySQLCluster state object.
      :param state_provider: For restoring and persisting the cluster state.
      :param zk_url: The ZooKeeper URL for cluster member discovery and master election.
      :param kazoo: The Kazoo client to access ZooKeeper with.
      :param executor_uri: See flags.
      :param executor_cmd: See flags.
      :param election_timeout: See flags.
      :param admin_keypath: See flags.
      :param scheduler_key: Used for encrypting cluster passwords.
      :param installer_args: See flags.
      :param backup_store_args: See flags.
      :param executor_environ: See flags.
      :param executor_source_prefix: See flags.
      :param framework_role: See flags.
      :param query_interval: See MySQLMasterElector. Use the default value for production and allow
                             tests to use a different value.
    """
        self._driver = driver

        if not isinstance(cluster, MySQLCluster):
            raise TypeError("'cluster' should be an instance of MySQLCluster")
        self._cluster = cluster

        if not isinstance(state_provider, StateProvider):
            raise TypeError(
                "'state_provider' should be an instance of StateProvider")
        self._state_provider = state_provider

        self._framework_role = framework_role

        # Passed along to executors.
        self._zk_url = zk_url
        self._framework_user = framework_user
        self._executor_uri = executor_uri
        self._executor_cmd = executor_cmd
        self._election_timeout = election_timeout
        self._admin_keypath = admin_keypath
        self._installer_args = installer_args
        self._backup_store_args = backup_store_args
        self._executor_environ = executor_environ
        self._executor_source_prefix = executor_source_prefix

        # Used by the elector.
        self._query_interval = query_interval

        zk_root = zookeeper.parse(zk_url)[2]
        self._cluster_manager = ClusterManager(
            kazoo, get_cluster_path(zk_root, cluster.name))

        self._password_box = PasswordBox(scheduler_key)
        self._password_box.decrypt(
            cluster.encrypted_password)  # Validate the password.

        self._lock = threading.Lock()

        if self._cluster.master_id:
            log.info(
                "Republish master %s for cluster %s in case it's not published"
                % (self._cluster.master_id, self.cluster_name))
            self._cluster_manager.promote_member(self._cluster.master_id)

        if len(self._cluster.tasks) > 0:
            log.info("Recovered %s tasks for cluster '%s'" %
                     (len(self._cluster.tasks), self.cluster_name))

        # A recovered launcher should continue the election if the previous one was incomplete when the
        # scheduler failed over. Mesos will deliver all missed events that affect the election to the
        # scheduler.
        if len(self._cluster.running_tasks
               ) > 0 and not self._cluster.master_id:
            log.info("Restarting election for the recovered launcher")
            self._elector = self._new_elector()
            # Add current slaves.
            for t in self._cluster.running_tasks:
                self._elector.add_slave(t.task_id, t.mesos_slave_id)
            self._elector.start()
        else:
            # New launcher, the elector is set when the election starts and reset to None when it ends.
            self._elector = None

        self._terminating = False
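
A small, hedged sketch of the ZooKeeper path derivation performed in __init__ above. It assumes get_cluster_path simply joins the ZK root (zookeeper.parse(zk_url)[2]) with the cluster name, which is consistent with the "/mysos/test/cluster0/..." paths asserted in the tests in this listing; sketch_cluster_path is a stand-in, not the project's helper.

import posixpath

def sketch_cluster_path(zk_root, cluster_name):
  # e.g. zk_url = "zk://zk_host:2181/mysos/test"  ->  zk_root = "/mysos/test"
  return posixpath.join(zk_root, cluster_name)

assert sketch_cluster_path("/mysos/test", "cluster0") == "/mysos/test/cluster0"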
Example #11
  def test_launcher_recovery_before_election_completed(self):
    # 1. Launch a cluster on the running launcher.
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys() if x.startswith(
        "/mysos/test/cluster0/slaves/member_")]) == 2

    # Now fail the master task which leads to re-election.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=2, position=str(i))))

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The second slave has the larger position and is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths
Example #13
  def test_master_failover(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value

    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # No log position queries are sent for the first epoch.
    assert "sendFrameworkMessage" not in self._driver.method_calls

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths

    # Now fail the master task.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    assert len(self._launcher._cluster.running_tasks) == 2

    # Log position queries are sent.
    self._launcher._elector._elect()
    assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=1, position=str(i))))

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The slave with the highest position is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

    assert len(self._launcher._cluster.running_tasks) == 2

    # When a new offer comes in, a new task is launched.
    del self._offer.resources[:]
    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS,
        mem=DEFAULT_TASK_MEM,
        disk=DEFAULT_TASK_DISK,
        ports=set([10000]))
    self._offer.resources.extend(resources)
    task_id, _ = self._launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-3"

    launched = self._driver.method_calls["launchTasks"]
    # One task is relaunched to make up for the failed one.
    assert len(launched) == self._cluster.num_nodes + 1
Example #14
  def create_cluster(self, cluster_name, cluster_user, num_nodes, backup_id=None):
    """
      :param cluster_name: Name of the cluster.
      :param cluster_user: The user account on MySQL server.
      :param num_nodes: Number of nodes in the cluster.
      :param backup_id: The 'backup_id' of the backup to restore from. If None then Mysos starts an
                        empty instance.

      :return: a tuple of the following:
        - ZooKeeper URL for this Mysos cluster that can be used to resolve MySQL cluster info.
        - The password for the specified user of the specified cluster.

      NOTE:
        If the scheduler fails over after the cluster state is checkpointed, the new scheduler
        instance will restore the state and continue to launch the cluster. TODO(jyx): Need to allow
        users to retrieve the cluster info (e.g. the passwords) afterwards in case the user request
        has failed but the cluster is successfully created eventually.
    """
    with self._lock:
      if not self._driver:
        # 'self._driver' is set in (re)registered() after which the scheduler becomes "available".
        # We need to get hold of the driver to recover the scheduler.
        raise self.ServiceUnavailable("Service unavailable. Try again later")

      if cluster_name in self._state.clusters:
        raise self.ClusterExists("Cluster '%s' already exists" % cluster_name)

      if not cluster_user:
        raise self.InvalidUser('Invalid user name: %s' % cluster_user)

      if int(num_nodes) <= 0:
        raise ValueError("Invalid number of cluster nodes: %s" % num_nodes)

      self._state.clusters.add(cluster_name)
      self._state_provider.dump_scheduler_state(self._state)

      cluster = MySQLCluster(
          cluster_name,
          cluster_user,
          gen_password(),
          int(num_nodes),
          backup_id=backup_id)
      self._state_provider.dump_cluster_state(cluster)

      log.info("Creating launcher for cluster %s" % cluster_name)
      self._launchers[cluster_name] = MySQLClusterLauncher(
          self._driver,
          cluster,
          self._state_provider,
          self._discover_zk_url,
          self._kazoo,
          self._framework_user,
          self._executor_uri,
          self._executor_cmd,
          self._election_timeout,
          self._admin_keypath,
          installer_args=self._installer_args,
          backup_store_args=self._backup_store_args,
          executor_environ=self._executor_environ,
          framework_role=self._framework_role)

      return get_cluster_path(self._discover_zk_url, cluster_name), cluster.password
Example #15
    def create_cluster(self,
                       cluster_name,
                       cluster_user,
                       num_nodes,
                       size=None,
                       backup_id=None,
                       cluster_password=None):
        """
      :param cluster_name: Name of the cluster.
      :param cluster_user: The user account on MySQL server.
      :param num_nodes: Number of nodes in the cluster.
      :param size: The size of instances in the cluster as a JSON dictionary of 'cpus', 'mem',
                   'disk'. 'mem' and 'disk' are specified with data size units: kb, mb, gb, etc. If
                   given 'None' then app defaults are used.
      :param backup_id: The 'backup_id' of the backup to restore from. If None then Mysos starts an
                        empty instance.
      :param cluster_password: The password used for accessing MySQL instances in the cluster as
                               well as deleting the cluster from Mysos. If None then Mysos generates
                               one for the cluster. In either case the password is sent back as
                               part of the return value.

      :return: a tuple of the following:
        - ZooKeeper URL for this Mysos cluster that can be used to resolve MySQL cluster info.
        - The password for the specified user of the specified cluster.

      NOTE:
        If the scheduler fails over after the cluster state is checkpointed, the new scheduler
        instance will restore the state and continue to launch the cluster. TODO(jyx): Need to allow
        users to retrieve the cluster info (e.g. the passwords) afterwards in case the user request
        has failed but the cluster is successfully created eventually.
    """
        with self._lock:
            if not self._driver:
                # 'self._driver' is set in (re)registered() after which the scheduler becomes "available".
                # We need to get hold of the driver to recover the scheduler.
                raise self.ServiceUnavailable(
                    "Service unavailable. Try again later")

            if cluster_name in self._state.clusters:
                raise self.ClusterExists("Cluster '%s' already exists" %
                                         cluster_name)

            if not cluster_user:
                raise self.InvalidUser('Invalid user name: %s' % cluster_user)

            num_nodes = int(num_nodes)
            if num_nodes <= 0:
                raise ValueError("Invalid number of cluster nodes: %s" %
                                 num_nodes)

            resources = parse_size(size)

            if (resources['cpus'] <= EXECUTOR_CPUS_EPSILON
                    or resources['mem'] <= EXECUTOR_MEM_EPSILON
                    or resources['disk'] <= EXECUTOR_DISK_EPSILON):
                raise ValueError(
                    "Instance 'size' too small. It should be larger than what Mysos executor consumes: "
                    "(cpus, mem, disk) = (%s, %s, %s)" %
                    (EXECUTOR_CPUS_EPSILON, EXECUTOR_MEM_EPSILON,
                     EXECUTOR_DISK_EPSILON))

            log.info("Requested resources per instance for cluster %s: %s" %
                     (cluster_name, resources))

            self._metrics.total_requested_cpus.write(
                self._metrics.total_requested_cpus.read() +
                resources['cpus'] * num_nodes)
            self._metrics.total_requested_mem_mb.write(
                self._metrics.total_requested_mem_mb.read() +
                resources['mem'].as_(Data.MB) * num_nodes)
            self._metrics.total_requested_disk_mb.write(
                self._metrics.total_requested_disk_mb.read() +
                resources['disk'].as_(Data.MB) * num_nodes)

            self._state.clusters.add(cluster_name)
            self._state_provider.dump_scheduler_state(self._state)

            if not cluster_password:
                log.info("Generating password for cluster %s" % cluster_name)
                cluster_password = gen_password()

            # Return the plaintext version to the client but store the encrypted version.
            cluster = MySQLCluster(
                cluster_name,
                cluster_user,
                self._password_box.encrypt(cluster_password),
                num_nodes,
                cpus=resources['cpus'],
                mem=resources['mem'],
                disk=resources['disk'],
                backup_id=backup_id)
            self._state_provider.dump_cluster_state(cluster)

            log.info("Creating launcher for cluster %s" % cluster_name)
            self._launchers[cluster_name] = MySQLClusterLauncher(
                self._driver,
                cluster,
                self._state_provider,
                self._discover_zk_url,
                self._kazoo,
                self._framework_user,
                self._executor_uri,
                self._executor_cmd,
                self._election_timeout,
                self._admin_keypath,
                self._scheduler_key,
                installer_args=self._installer_args,
                backup_store_args=self._backup_store_args,
                executor_environ=self._executor_environ,
                executor_source_prefix=self._executor_source_prefix,
                framework_role=self._framework_role)

            self._metrics.cluster_count.increment()

            return get_cluster_path(self._discover_zk_url,
                                    cluster_name), cluster_password
Example #16
  def test_master_failover(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value

    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # No log position queries are sent for the first epoch.
    assert "sendFrameworkMessage" not in self._driver.method_calls

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths

    # Now fail the master task.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    assert len(self._launcher._cluster.running_tasks) == 2

    # Log position queries are sent.
    self._launcher._elector._elect()
    assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=1, position=str(i))))

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The slave with the highest position is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

    assert len(self._launcher._cluster.running_tasks) == 2

    # When a new offer comes in, a new task is launched.
    del self._offer.resources[:]
    resources = create_resources(cpus=1, mem=512, ports=set([10000]))
    self._offer.resources.extend(resources)
    task_id, _ = self._launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-3"

    launched = self._driver.method_calls["launchTasks"]
    # One task is relaunched to make up for the failed one.
    assert len(launched) == self._cluster.num_nodes + 1
Example #17
  def __init__(
      self,
      driver,
      cluster,
      state_provider,
      zk_url,
      kazoo,
      framework_user,
      executor_uri,
      executor_cmd,
      election_timeout,
      admin_keypath,
      scheduler_key,
      installer_args=None,
      backup_store_args=None,
      executor_environ=None,
      framework_role='*',
      query_interval=Amount(1, Time.SECONDS)):
    """
      :param driver: Mesos scheduler driver.
      :param cluster: The MySQLCluster state object.
      :param state_provider: For restoring and persisting the cluster state.
      :param zk_url: The ZooKeeper URL for cluster member discovery and master election.
      :param kazoo: The Kazoo client to access ZooKeeper with.
      :param executor_uri: See flags.
      :param executor_cmd: See flags.
      :param election_timeout: See flags.
      :param admin_keypath: See flags.
      :param scheduler_key: Used for encrypting cluster passwords.
      :param installer_args: See flags.
      :param backup_store_args: See flags.
      :param executor_environ: See flags.
      :param framework_role: See flags.
      :param query_interval: See MySQLMasterElector. Use the default value for production and allow
                             tests to use a different value.
    """
    self._driver = driver

    if not isinstance(cluster, MySQLCluster):
      raise TypeError("'cluster' should be an instance of MySQLCluster")
    self._cluster = cluster

    if not isinstance(state_provider, StateProvider):
      raise TypeError("'state_provider' should be an instance of StateProvider")
    self._state_provider = state_provider

    self._framework_role = framework_role

    # Passed along to executors.
    self._zk_url = zk_url
    self._framework_user = framework_user
    self._executor_uri = executor_uri
    self._executor_cmd = executor_cmd
    self._election_timeout = election_timeout
    self._admin_keypath = admin_keypath
    self._installer_args = installer_args
    self._backup_store_args = backup_store_args
    self._executor_environ = executor_environ

    # Used by the elector.
    self._query_interval = query_interval

    zk_root = zookeeper.parse(zk_url)[2]
    self._cluster_manager = ClusterManager(kazoo, get_cluster_path(zk_root, cluster.name))

    self._password_box = PasswordBox(scheduler_key)
    self._password_box.decrypt(cluster.encrypted_password)  # Validate the password.

    self._lock = threading.Lock()

    if self._cluster.master_id:
      log.info("Republish master %s for cluster %s in case it's not published" % (
          self._cluster.master_id, self.cluster_name))
      self._cluster_manager.promote_member(self._cluster.master_id)

    if len(self._cluster.tasks) > 0:
      log.info("Recovered %s tasks for cluster '%s'" % (
          len(self._cluster.tasks), self.cluster_name))

    # A recovered launcher should continue the election if the previous one was incomplete when the
    # scheduler failed over. Mesos will deliver all missed events that affect the election to the
    # scheduler.
    if len(self._cluster.running_tasks) > 0 and not self._cluster.master_id:
      log.info("Restarting election for the recovered launcher")
      self._elector = self._new_elector()
      # Add current slaves.
      for t in self._cluster.running_tasks:
        self._elector.add_slave(t.task_id, t.mesos_slave_id)
      self._elector.start()
    else:
      # New launcher, the elector is set when the election starts and reset to None when it ends.
      self._elector = None

    self._terminating = False
def test_scheduler_runs():
    """
    Verifies that the scheduler successfully launches 3 "no-op" MySQL tasks.
    NOTE: Due to a limitation of zake, the scheduler's ZK operations are not propagated to
    executors in separate processes; those are unit-tested separately.
  """
    import mesos.native

    # Make sure fake_mysos_executor.pex is available to be fetched by Mesos slave.
    assert os.path.isfile('dist/fake_mysos_executor.pex')

    storage = FakeStorage(SequentialThreadingHandler())
    zk_client = FakeClient(storage=storage)
    zk_client.start()

    zk_url = "zk://fake_host/home/mysos/clusters"
    cluster_name = "test_cluster"
    num_nodes = 3

    state_provider = LocalStateProvider(safe_mkdtemp())

    framework_info = FrameworkInfo(user=getpass.getuser(),
                                   name="mysos",
                                   checkpoint=False)

    state = Scheduler(framework_info)

    scheduler = MysosScheduler(state, state_provider, getpass.getuser(),
                               os.path.abspath("dist/fake_mysos_executor.pex"),
                               "./fake_mysos_executor.pex", zk_client, zk_url,
                               Amount(40, Time.SECONDS), "/fakepath",
                               gen_encryption_key())

    RootMetrics().register_observable('scheduler', scheduler)

    scheduler_driver = mesos.native.MesosSchedulerDriver(
        scheduler, framework_info, "local")
    scheduler_driver.start()

    # Wait until the scheduler is connected and becomes available.
    assert scheduler.connected.wait(30)

    scheduler.create_cluster(cluster_name,
                             "mysql_user",
                             num_nodes,
                             cluster_password="******")

    # A slave is promoted to be the master.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name),
            zk_client), Amount(40, Time.SECONDS))

    scheduler.delete_cluster(cluster_name, password="******")

    # The cluster is deleted from ZooKeeper.
    deadline(
        lambda: wait_for_termination(
            get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name),
            zk_client), Amount(40, Time.SECONDS))

    sample = RootMetrics().sample()
    assert sample['scheduler.tasks_killed'] == 1

    assert scheduler_driver.stop() == DRIVER_STOPPED