Example #1
0
  def from_assigned_task(self, assigned_task, _):
    """Build an AnnouncerChecker for an assigned task.

    Returns None when the task carries no announce configuration.  The second
    positional argument is ignored.
    """
    mesos_task = mesos_task_instance_from_assigned_task(assigned_task)
    if not mesos_task.has_announce():
      return None

    # assigned_task.slaveHost is the --hostname argument passed into the mesos slave.
    # Using it allows overriding the hostname published into ZK when announcing.
    # If no argument was passed to the mesos-slave, the slave falls back to gethostname().
    primary_endpoint, extra_endpoints = make_endpoints(
      assigned_task.slaveHost,
      resolve_ports(mesos_task, assigned_task.assignedPorts),
      mesos_task.announce().primary_port().get())

    zk_client = self.make_zk_client()

    # A task-supplied zk_path is honoured only when the executor was told to allow it.
    if not mesos_task.announce().has_zk_path():
      serverset_path = self.make_zk_path(assigned_task)
    elif self.__allow_custom_serverset_path:
      serverset_path = mesos_task.announce().zk_path().get()
    else:
      app.error('Executor must be started with --announcer-allow-custom-serverset-path in order '
          'to use zk_path in the Announcer config')

    # Timeout = initial grace interval plus the time taken by the maximum number of
    # consecutive failed health checks.
    grace_secs = mesos_task.health_check_config().initial_interval_secs().get()
    check_interval_secs = mesos_task.health_check_config().interval_secs().get()
    max_failures = mesos_task.health_check_config().max_consecutive_failures().get()
    timeout_secs = grace_secs + (max_failures * check_interval_secs)

    return AnnouncerChecker(
      zk_client,
      serverset_path,
      timeout_secs,
      primary_endpoint,
      additional=extra_endpoints,
      shard=assigned_task.instanceId,
      name=self.name)
Example #2
0
  def delete(args, options):
    """Delete the cluster named by options.cluster_name through the HTTP API.

    Reads the cluster password from options.password_file, issues a DELETE request to
    http://<api_host>:<api_port>/clusters/<cluster_name> with the password as form data,
    and then waits for the cluster at the returned 'cluster_url' to terminate.

    The process is stopped via app.error/app.quit when the password file is empty, the
    request fails with an HTTP error, or the response is not a JSON object.
    """
    validate_common_options(options)

    with open(options.password_file, 'r') as f:
      password = f.read().strip()
      if not password:
        app.error("Empty password file")

    url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
    values = dict(password=password)

    req = urllib2.Request(url, urllib.urlencode(values))
    # urllib2 only chooses between GET and POST on its own; force the DELETE verb.
    req.get_method = lambda: 'DELETE'

    try:
      response = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
      # The stdlib table does not list every status code; a plain [] lookup would raise
      # KeyError here and hide the actual HTTP failure.
      reason = BaseHTTPServer.BaseHTTPRequestHandler.responses.get(e.code, 'Unknown status code')
      log.error("DELETE request failed: %s, %s, %s" % (e.code, reason, e.read()))
      app.quit(1)

    try:
      result = json.loads(response)
      # Only a JSON object is an acceptable response body.
      if not isinstance(result, dict):
        raise ValueError()
    except ValueError:
      log.error("Invalid response: %s" % response)
      app.quit(1)

    log.info("Cluster deletion result: %s" % result)

    log.info("Waiting for the cluster to terminate...")
    wait_for_termination(result['cluster_url'])

    log.info("Cluster terminated/deleted")
Example #3
0
def really_run(task,
               root,
               sandbox,
               task_id=None,
               user=None,
               prebound_ports=None,
               chroot=None,
               daemon=False):
    """Run a task through a TaskRunner, optionally daemonizing first.

    Every port named by task.ports() must have an entry in prebound_ports; missing
    ones are reported through app.error.  When daemon is true, stderr logging is torn
    down and the process is daemonized before the runner starts.  A keyboard interrupt
    while running closes the runner's checkpoint and kills the task.
    """
    if not prebound_ports:
        prebound_ports = {}

    unbound = set(task.ports()).difference(prebound_ports.keys())
    if unbound:
        app.error('ERROR!  Unbound ports: %s' % ' '.join(unbound))

    runner = TaskRunner(task.task,
                        root,
                        sandbox,
                        task_id=task_id,
                        user=user,
                        portmap=prebound_ports,
                        chroot=chroot)

    if daemon:
        print('Daemonizing and starting runner.')
        try:
            log.teardown_stderr_logging()
            daemonize()
        except Exception as err:
            print("Failed to daemonize: %s" % err)
            sys.exit(1)

    try:
        runner.run()
    except KeyboardInterrupt:
        print('Got keyboard interrupt, killing job!')
        runner.close_ckpt()
        runner.kill()
Example #4
0
def tail(args, options):
    """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]

    The most recent run of the selected process is tailed.  A log whose process run
    is no longer active is printed via tail_closed; an active log is followed until
    the run stops being active.
    """
    arg_count = len(args)
    if arg_count == 0:
        app.error("Expected a task to tail, got nothing!")
    if arg_count not in (1, 2):
        app.error("Expected at most two arguments (task and optional process), got %d" % arg_count)

    task_id = args[0]
    detector = TaskDetector(root=options.root)
    checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
    log_dir = checkpoint.header.log_dir

    process_runs = [(name, number) for (name, number) in detector.get_process_runs(task_id, log_dir)]
    if arg_count == 2:
        process_runs = [(name, number) for (name, number) in process_runs if name == args[1]]

    if not process_runs:
        print("ERROR: No processes found.", file=sys.stderr)
        sys.exit(1)

    processes = set(name for name, _ in process_runs)
    if len(processes) != 1:
        print("ERROR: More than one process matches query.", file=sys.stderr)
        sys.exit(1)

    process = processes.pop()
    # Only the highest-numbered run of the process is tailed.
    run = max(number for _, number in process_runs)

    task_path = TaskPath(root=options.root, task_id=args[0], process=process, run=run, log_dir=log_dir)
    logdir = task_path.getpath("process_logdir")
    stream_name = "stderr" if options.use_stderr else "stdout"
    logfile = os.path.join(logdir, stream_name)

    monitor = TaskMonitor(TaskPath(root=options.root), args[0])

    def log_is_active():
        """Return True while the selected process run is among the active processes."""
        return any(
            status.process == process and active_run == run
            for status, active_run in monitor.get_active_processes())

    if not log_is_active():
        print("Tail of terminal log %s" % logfile)
        for line in tail_closed(logfile):
            print(line.rstrip())
        return

    # Re-check liveness at most once every five seconds while following the log.
    next_check = time.time() + 5.0
    print("Tail of active log %s" % logfile)
    for line in tail_f(logfile, include_last=True, forever=False):
        print(line.rstrip())
        if time.time() <= next_check:
            continue
        if not log_is_active():
            break
        next_check = time.time() + 5.0
  def main(args, options):
    """Assemble an AuroraExecutor, attach it to a MesosExecutorDriver and run the driver."""
    runner_provider = DefaultThermosTaskRunnerProvider(
        dump_runner_pex(),
        artifact_dir=os.path.realpath('.'),
    )

    # Status providers: health checking always, announcing only when enabled.
    providers = [HealthCheckerProvider()]
    if options.announcer_enable:
      if options.announcer_ensemble is None:
        app.error('Must specify --announcer-ensemble if the announcer is enabled.')
      announcer = DefaultAnnouncerCheckerProvider(
          options.announcer_ensemble, options.announcer_serverset_path)
      providers.append(announcer)

    # Executor stub.
    executor = AuroraExecutor(
        runner_provider=runner_provider,
        status_providers=providers,
    )

    # Driver stub.
    driver = MesosExecutorDriver(executor)

    # This is an ephemeral executor -- shut down if no tasks arrive within a certain
    # time period.
    ExecutorTimeout(executor.launched, driver).start()

    # Start the executor.
    driver.run()

    log.info('MesosExecutorDriver.run() has finished.')
def get_task_from_options(args, opts, **kw):
  """Load the single config named in args and return the task selected by opts.

  opts.json chooses the JSON loader; opts.bindings and any extra keyword arguments are
  forwarded to the loader.  When opts.task is set, the task with that name is returned;
  otherwise the config must contain exactly one task.  With strict=True in kw the
  selected task must also pass its check().  Problems are reported through app.error.
  """
  if opts.json:
    loader = ThermosConfigLoader.load_json
  else:
    loader = ThermosConfigLoader.load

  if len(args) != 1:
    app.error('Should specify precisely one config, instead got: %s' % args)

  tasks = loader(args[0], bindings=opts.bindings, **kw)

  task_list = list(tasks.tasks())
  if not task_list:
    app.error("No tasks specified!")

  if opts.task is None and len(task_list) > 1:
    app.error("Multiple tasks in config but no task name specified!")

  if opts.task is None:
    task = task_list[0]
  else:
    # First task whose name matches, or None when nothing matches.
    task = next((t for t in task_list if t.task().name().get() == opts.task), None)
    if task is None:
      app.error("Could not find task %s!" % opts.task)

  # NOTE(review): the lookup above calls t.task() while this check reads task.task as an
  # attribute -- confirm which form the wrapper type actually supports.
  if kw.get('strict', False) and not task.task.check().ok():
    app.error(task.task.check().message())

  return task
Example #7
0
  def create(args, options):
    """Create a cluster through the HTTP API and verify that its master accepts queries.

    Requires options.num_nodes and options.cluster_user.  POSTs the cluster parameters to
    http://<api_host>:<api_port>/clusters/<cluster_name>, writes the returned
    'cluster_password' to options.password_file, waits for a master to be elected and
    then runs "SELECT 1;" against it, retrying for up to five attempts one second apart.

    The process is stopped via app.error/app.quit when a required option is missing, the
    request fails with an HTTP error, or the response is not a JSON object.  The last
    OperationalError is re-raised if the master never becomes ready.
    """
    validate_common_options(options)

    if not options.num_nodes:
      app.error("--num_nodes is required")

    if not options.cluster_user:
      app.error("--cluster_user is required")

    url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
    values = dict(
        num_nodes=int(options.num_nodes),
        cluster_user=options.cluster_user,
        size=options.size if options.size else '',
        backup_id=options.backup_id if options.backup_id else '')

    req = urllib2.Request(url, urllib.urlencode(values))
    try:
      response = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
      # The stdlib table does not list every status code; a plain [] lookup would raise
      # KeyError here and hide the actual HTTP failure.
      reason = BaseHTTPServer.BaseHTTPRequestHandler.responses.get(e.code, 'Unknown status code')
      log.error("POST request failed: %s, %s, %s" % (e.code, reason, e.read()))
      app.quit(1)

    try:
      result = json.loads(response)
      # Only a JSON object is an acceptable response body.
      if not isinstance(result, dict):
        raise ValueError()
    except ValueError:
      log.error("Invalid response: %s" % response)
      app.quit(1)

    log.info("Cluster created. Cluster info: %s" % str(result))
    with open(options.password_file, 'w') as f:
      f.write(result["cluster_password"])

    log.info("Waiting for the master for this cluster to be elected...")
    master_endpoint = wait_for_master(result['cluster_url']).service_endpoint

    connection_str = "mysql://%s:%s@%s:%d/" % (
        options.cluster_user,
        result["cluster_password"],
        master_endpoint.host,
        master_endpoint.port)
    log.info("Connecting to the MySQL cluster master: %s" % connection_str)
    engine = create_engine(connection_str)

    for i in range(5):  # Loop for 5 times/seconds to wait for the master to be promoted.
      try:
        # TODO(jyx): Test writing to the master and reading from the slave.
        result = engine.execute("SELECT 1;").scalar()
        assert 1 == int(result), "Expecting result to be 1 but got %s" % result
        break
      except OperationalError:
        # Give up only after the final attempt; otherwise wait a second and retry.
        if i == 4:
          raise
        log.debug("MySQL master not ready yet. Sleep for 1 second...")
        time.sleep(1)

    log.info("Cluster successfully started")
Example #8
0
 def add(file_or_dir):
     """Dispatch a path to add_file or add_dir; report anything else through app.error."""
     if os.path.isfile(file_or_dir):
         add_file(file_or_dir)
         return
     if os.path.isdir(file_or_dir):
         add_dir(file_or_dir)
         return
     app.error("Unknown or non-existent file: %s" % file_or_dir)
Example #9
0
  def from_assigned_task(self, assigned_task, _):
    """Create the AnnouncerChecker for an assigned task, or None if it does not announce.

    The second positional argument is not used.
    """
    mesos_task = mesos_task_instance_from_assigned_task(assigned_task)
    if not mesos_task.has_announce():
      return None

    port_mapping = resolve_ports(mesos_task, assigned_task.assignedPorts)

    # assigned_task.slaveHost is the --hostname argument passed into the mesos slave.
    # Using it allows overriding the hostname published into ZK when announcing.
    # If no argument was passed to the mesos-slave, the slave falls back to gethostname().
    endpoints = make_endpoints(
      assigned_task.slaveHost,
      port_mapping,
      mesos_task.announce().primary_port().get())
    main_endpoint, other_endpoints = endpoints

    zk = self.make_zk_client()

    # Custom serverset paths from the task config need explicit executor opt-in.
    if not mesos_task.announce().has_zk_path():
      zk_path = self.make_zk_path(assigned_task)
    elif self.__allow_custom_serverset_path:
      zk_path = mesos_task.announce().zk_path().get()
    else:
      app.error('Executor must be started with --announcer-allow-custom-serverset-path in order '
          'to use zk_path in the Announcer config')

    # Initial interval plus the duration of the maximum run of consecutive failures.
    first_interval = mesos_task.health_check_config().initial_interval_secs().get()
    repeat_interval = mesos_task.health_check_config().interval_secs().get()
    failure_limit = mesos_task.health_check_config().max_consecutive_failures().get()
    timeout_secs = first_interval + (failure_limit * repeat_interval)

    return AnnouncerChecker(
      zk,
      zk_path,
      timeout_secs,
      main_endpoint,
      additional=other_endpoints,
      shard=assigned_task.instanceId,
      name=self.name)
Example #10
0
def generate_token_interactive():
    """Prompt for a Subsonic password and a salt, then print the derived API token.

    The token is the hexadecimal MD5 digest of the password concatenated with the salt.
    The salt must consist only of digits and be at least six characters long; anything
    else is reported through app.error.
    """
    password = getpass('Enter your Subsonic password: ')
    # NOTE(review): the original line was mangled (the two prompts were fused into one
    # expression and `salt` was never bound); raw_input is assumed for the salt prompt.
    salt = raw_input('Enter a salt (an integer of at least six digits): ')
    if len(salt) < 6 or not salt.isdigit():
        app.error('Salt value is not an integer of at least six digits.')
    token = md5(password + salt).hexdigest()
    # Single-argument print(...) behaves the same as the print statement under Python 2.
    print('Your API token is: {}'.format(token))
    print('This must be used with the same salt value entered during this session.')
Example #11
0
def tail(args, options):
  """Tail the logs of a task process.

    Usage: thermos tail task_name [process_name]

    Picks the latest run of the matching process.  If that run is no longer active its
    log is printed via tail_closed; otherwise the log is followed until the run ends.
  """
  num_args = len(args)
  if num_args == 0:
    app.error('Expected a task to tail, got nothing!')
  if num_args not in (1, 2):
    app.error('Expected at most two arguments (task and optional process), got %d' % num_args)

  task_id = args[0]
  detector = TaskDetector(root=options.root)
  checkpoint = CheckpointDispatcher.from_file(detector.get_checkpoint(task_id))
  log_dir = checkpoint.header.log_dir

  process_runs = [(proc, num) for (proc, num) in detector.get_process_runs(task_id, log_dir)]
  if num_args == 2:
    process_runs = [(proc, num) for (proc, num) in process_runs if proc == args[1]]

  if not process_runs:
    print('ERROR: No processes found.', file=sys.stderr)
    sys.exit(1)

  processes = set(proc for proc, _ in process_runs)
  if len(processes) != 1:
    print('ERROR: More than one process matches query.', file=sys.stderr)
    sys.exit(1)

  process = processes.pop()
  # The run with the largest number is the one that gets tailed.
  run = max(num for _, num in process_runs)

  process_path = TaskPath(
     root=options.root, task_id=args[0], process=process, run=run, log_dir=log_dir)
  logdir = process_path.getpath('process_logdir')
  if options.use_stderr:
    logfile = os.path.join(logdir, 'stderr')
  else:
    logfile = os.path.join(logdir, 'stdout')

  monitor = TaskMonitor(TaskPath(root=options.root), args[0])

  def log_is_active():
    """Tell whether the chosen process run is still listed as active."""
    for status, active_run in monitor.get_active_processes():
      if status.process != process:
        continue
      if active_run == run:
        return True
    return False

  if not log_is_active():
    print('Tail of terminal log %s' % logfile)
    for line in tail_closed(logfile):
      print(line.rstrip())
    return

  # While following, poll for liveness no more often than every five seconds.
  next_check = time.time() + 5.0
  print('Tail of active log %s' % logfile)
  for line in tail_f(logfile, include_last=True, forever=False):
    print(line.rstrip())
    if time.time() > next_check:
      if log_is_active():
        next_check = time.time() + 5.0
      else:
        break
Example #12
0
def initialize(options):
  """Build and return the AuroraExecutor described by the parsed options.

  Health-check and resource-manager status providers are always installed; an announcer
  provider is added when options.announcer_enable is set.  When options.execute_as_user
  or options.nosetuid is set, the user-override runner and sandbox providers are used
  instead of the default runner provider.
  """
  cwd_path = os.path.abspath(CWD)
  checkpoint_root = os.path.join(cwd_path, MesosPathDetector.DEFAULT_SANDBOX_PATH)

  # Status providers.
  status_providers = [
      HealthCheckerProvider(),
      ResourceManagerProvider(checkpoint_root=checkpoint_root)
  ]

  if options.announcer_enable:
    if options.announcer_ensemble is None:
      app.error('Must specify --announcer-ensemble if the announcer is enabled.')
    announcer_provider = DefaultAnnouncerCheckerProvider(
      options.announcer_ensemble,
      options.announcer_serverset_path,
      options.announcer_allow_custom_serverset_path
    )
    status_providers.append(announcer_provider)

  def make_runner_provider(provider_class):
    """Instantiate a runner provider class with the arguments shared by both variants."""
    return provider_class(
      dump_runner_pex(),
      checkpoint_root,
      artifact_dir=cwd_path,
      process_logger_destination=options.runner_logger_destination,
      process_logger_mode=options.runner_logger_mode,
      rotate_log_size_mb=options.runner_rotate_log_size_mb,
      rotate_log_backups=options.runner_rotate_log_backups,
      preserve_env=options.preserve_env
    )

  # Executor stub.
  if not (options.execute_as_user or options.nosetuid):
    return AuroraExecutor(
      runner_provider=make_runner_provider(DefaultThermosTaskRunnerProvider),
      status_providers=status_providers
    )

  # If nosetuid is set, execute_as_user is also None
  runner_provider = make_runner_provider(UserOverrideThermosTaskRunnerProvider)
  runner_provider.set_role(None)
  return AuroraExecutor(
    runner_provider=runner_provider,
    status_providers=status_providers,
    sandbox_provider=UserOverrideDirectorySandboxProvider(options.execute_as_user)
  )
Example #13
0
def read(args, options):
    """Replay a thermos checkpoint.

    Usage: thermos read [options] checkpoint_filename
    Options:
      --simple    Do not replay the full task state machine.  Only print out the contents of
                  each checkpoint log message.
    """
    if len(args) != 1:
        app.error('Expected one checkpoint file, got %s' % len(args))
    if not os.path.exists(args[0]):
        app.error('Could not find %s' % args[0])

    dispatcher = CheckpointDispatcher()
    state = RunnerState(processes={})
    with open(args[0], 'r') as fp:
        try:
            for record in ThriftRecordReader(fp, RunnerCkpt):
                if options.simple:
                    print('CKPT: %s' % record)
                else:
                    dispatcher.dispatch(state, record)
        except RecordIO.Error as err:
            print("Failed to recover from %s: %s" % (fp.name, err))
            return

    # In simple mode the records were already printed; there is no state to summarise.
    if options.simple:
        return

    if state is None or state.header is None:
        print('Checkpoint stream CORRUPT or outdated format')
        return

    header = state.header
    print('Recovered Task Header:')
    print('  id:      %s' % header.task_id)
    print('  user:    %s' % header.user)
    print('  host:    %s' % header.hostname)
    print('  sandbox: %s' % header.sandbox)
    if header.ports:
        port_pairs = ('%s->%s' % (name, port) for (name, port) in header.ports.items())
        print('  ports:   %s' % ' '.join(port_pairs))

    print('Recovered Task States:')
    for task_status in state.statuses:
        # timestamp_ms is divided by 1000.0 before being handed to time.localtime.
        when = time.asctime(time.localtime(task_status.timestamp_ms / 1000.0))
        state_name = TaskState._VALUES_TO_NAMES[task_status.state]
        print('  %s [pid: %d] => %s' % (when, task_status.runner_pid, state_name))

    print('Recovered Processes:')
    for process, history in state.processes.items():
        print('  %s   runs: %s' % (process, len(history)))
        # Newest run first.
        for index in range(len(history) - 1, -1, -1):
            run = history[index]
            return_code = '' if run.return_code is None else run.return_code
            if run.stop_time:
                finished = time.asctime(time.localtime(run.stop_time))
            else:
                finished = 'None'
            run_state = ProcessState._VALUES_TO_NAMES.get(run.state, 'Unknown')
            print('    %2d: pid=%d, rc=%s, finish:%s, state:%s' %
                  (index, run.pid, return_code, finished, run_state))
def initialize(options):
    """Construct the AuroraExecutor for this process from the parsed options.

    Always installs the health-check and resource-manager status providers, plus the
    announcer provider when options.announcer_enable is set.  The runner provider class
    and the optional sandbox provider depend on options.execute_as_user /
    options.nosetuid.
    """
    cwd_path = os.path.abspath(CWD)
    checkpoint_root = os.path.join(cwd_path,
                                   MesosPathDetector.DEFAULT_SANDBOX_PATH)

    # Status providers.
    status_providers = [
        HealthCheckerProvider(),
        ResourceManagerProvider(checkpoint_root=checkpoint_root)
    ]

    if options.announcer_enable:
        if options.announcer_ensemble is None:
            app.error(
                'Must specify --announcer-ensemble if the announcer is enabled.'
            )
        announcer = DefaultAnnouncerCheckerProvider(
            options.announcer_ensemble,
            options.announcer_serverset_path,
            options.announcer_allow_custom_serverset_path)
        status_providers.append(announcer)

    # Executor stub: pick the runner provider class, then build it with shared arguments.
    user_override = options.execute_as_user or options.nosetuid
    if user_override:
        provider_class = UserOverrideThermosTaskRunnerProvider
    else:
        provider_class = DefaultThermosTaskRunnerProvider

    runner_provider = provider_class(
        dump_runner_pex(),
        checkpoint_root,
        artifact_dir=cwd_path,
        process_logger_destination=options.runner_logger_destination,
        process_logger_mode=options.runner_logger_mode,
        rotate_log_size_mb=options.runner_rotate_log_size_mb,
        rotate_log_backups=options.runner_rotate_log_backups,
        preserve_env=options.preserve_env)

    executor_kwargs = dict(
        runner_provider=runner_provider,
        status_providers=status_providers)

    if user_override:
        # If nosetuid is set, execute_as_user is also None
        runner_provider.set_role(None)
        executor_kwargs['sandbox_provider'] = UserOverrideDirectorySandboxProvider(
            options.execute_as_user)

    return AuroraExecutor(**executor_kwargs)
Example #15
0
def read(args, options):
  """Replay a thermos checkpoint.

  Usage: thermos read [options] checkpoint_filename
  Options:
    --simple    Do not replay the full task state machine.  Only print out the contents of
                each checkpoint log message.
  """
  if len(args) != 1:
    app.error('Expected one checkpoint file, got %s' % len(args))
  if not os.path.exists(args[0]):
    app.error('Could not find %s' % args[0])

  dispatcher = CheckpointDispatcher()
  state = RunnerState(processes={})
  with open(args[0], 'r') as fp:
    try:
      for record in ThriftRecordReader(fp, RunnerCkpt):
        if options.simple:
          print('CKPT: %s' % record)
        else:
          dispatcher.dispatch(state, record)
    except RecordIO.Error as err:
      print("Failed to recover from %s: %s" % (fp.name, err))
      return

  # Simple mode only echoes records; the summary below needs the replayed state.
  if options.simple:
    return

  if state is None or state.header is None:
    print('Checkpoint stream CORRUPT or outdated format')
    return

  print('Recovered Task Header:')
  for label, value in (
      ('  id:      %s', state.header.task_id),
      ('  user:    %s', state.header.user),
      ('  host:    %s', state.header.hostname),
      ('  sandbox: %s', state.header.sandbox)):
    print(label % value)
  if state.header.ports:
    mappings = ['%s->%s' % (name, port) for (name, port) in state.header.ports.items()]
    print('  ports:   %s' % ' '.join(mappings))

  print('Recovered Task States:')
  for task_status in state.statuses:
    timestamp = time.asctime(time.localtime(task_status.timestamp_ms / 1000.0))
    print('  %s [pid: %d] => %s' % (
      timestamp,
      task_status.runner_pid,
      TaskState._VALUES_TO_NAMES[task_status.state]))

  print('Recovered Processes:')
  for process, process_history in state.processes.items():
    print('  %s   runs: %s' % (process, len(process_history)))
    # Walk the runs from the most recent index down to zero.
    k = len(process_history)
    while k > 0:
      k -= 1
      run = process_history[k]
      rc_text = run.return_code if run.return_code is not None else ''
      finish_text = 'None'
      if run.stop_time:
        finish_text = time.asctime(time.localtime(run.stop_time))
      print('    %2d: pid=%d, rc=%s, finish:%s, state:%s' % (
        k,
        run.pid,
        rc_text,
        finish_text,
        ProcessState._VALUES_TO_NAMES.get(run.state, 'Unknown')))
  def main():
    """Start the GC executor and its metric writer, then run the Mesos executor driver."""
    if MesosExecutorDriver is None:
      app.error('Could not load MesosExecutorDriver!')

    components = initialize()
    gc_executor, writer, executor_driver = components

    gc_executor.start()
    writer.start()
    executor_driver.run()

    log.info('MesosExecutorDriver.run() has finished.')
Example #17
0
def pid_provider():
  """Return a zero-argument loader for the file belonging to options.pid.

  Scans list_pids() for the entry whose pid equals the pid in the application options;
  a missing pid is reported through app.error.  The returned loader reads the matching
  path in binary mode and returns its contents.
  """
  options = app.get_options()
  for pid_path, _, candidate in list_pids():
    if candidate == options.pid:
      break
  else:
    # The loop finished without a break: no entry matched.
    app.error('Could not find pid %s' % options.pid)

  def loader():
    with open(pid_path, 'rb') as handle:
      contents = handle.read()
    return contents

  return loader
  def main(args, options):
    """Build the AuroraExecutor selected by the options and run it under a Mesos driver."""
    if MesosExecutorDriver is None:
      app.error('Could not load MesosExecutorDriver!')

    # Status providers.
    providers = [
        HealthCheckerProvider(),
        ResourceManagerProvider(checkpoint_root=options.checkpoint_root)
    ]

    if options.announcer_enable:
      if options.announcer_ensemble is None:
        app.error('Must specify --announcer-ensemble if the announcer is enabled.')
      announcer = DefaultAnnouncerCheckerProvider(
        options.announcer_ensemble, options.announcer_serverset_path)
      providers.append(announcer)

    # Executor stub.
    user_override = options.execute_as_user or options.nosetuid
    if user_override:
      provider_class = UserOverrideThermosTaskRunnerProvider
    else:
      provider_class = DefaultThermosTaskRunnerProvider

    runner_provider = provider_class(
      dump_runner_pex(),
      artifact_dir=os.path.abspath(CWD)
    )
    executor_kwargs = dict(
      runner_provider=runner_provider,
      status_providers=providers
    )
    if user_override:
      # If nosetuid is set, execute_as_user is also None
      runner_provider.set_role(None)
      executor_kwargs['sandbox_provider'] = UserOverrideDirectorySandboxProvider(
        options.execute_as_user)

    executor = AuroraExecutor(**executor_kwargs)

    # Driver stub.
    driver = MesosExecutorDriver(executor)

    # This is an ephemeral executor -- shut down if no tasks arrive within a certain
    # time period.
    ExecutorTimeout(executor.launched, driver).start()

    # Start the executor.
    driver.run()

    log.info('MesosExecutorDriver.run() has finished.')
Example #19
0
def pid_provider():
    """Locate the path registered for options.pid and return a function that reads it.

    Entries come from list_pids(); if none carries the requested pid the problem is
    reported through app.error.  The returned callable opens the path in binary mode
    and returns the file contents.
    """
    options = app.get_options()
    for found_path, _, found_pid in list_pids():
        if found_pid != options.pid:
            continue
        break
    else:
        # Reached only when the scan ended without hitting the break above.
        app.error('Could not find pid %s' % options.pid)

    def loader():
        with open(found_path, 'rb') as stream:
            return stream.read()

    return loader
Example #20
0
    def main(args, options):
        """Configure an AuroraExecutor from the options and run it with a Mesos driver."""
        if MesosExecutorDriver is None:
            app.error('Could not load MesosExecutorDriver!')

        # Status providers.
        status_providers = [
            HealthCheckerProvider(),
            ResourceManagerProvider(checkpoint_root=options.checkpoint_root)
        ]

        if options.announcer_enable:
            if options.announcer_ensemble is None:
                app.error(
                    'Must specify --announcer-ensemble if the announcer is enabled.'
                )
            announcer_provider = DefaultAnnouncerCheckerProvider(
                options.announcer_ensemble,
                options.announcer_serverset_path)
            status_providers.append(announcer_provider)

        def build_runner_provider(provider_class):
            """Instantiate the given runner provider class with the shared arguments."""
            return provider_class(
                dump_runner_pex(), artifact_dir=os.path.abspath(CWD))

        # Executor stub.
        if not (options.execute_as_user or options.nosetuid):
            executor = AuroraExecutor(
                runner_provider=build_runner_provider(
                    DefaultThermosTaskRunnerProvider),
                status_providers=status_providers)
        else:
            # If nosetuid is set, execute_as_user is also None
            runner_provider = build_runner_provider(
                UserOverrideThermosTaskRunnerProvider)
            runner_provider.set_role(None)
            executor = AuroraExecutor(
                runner_provider=runner_provider,
                status_providers=status_providers,
                sandbox_provider=UserOverrideDirectorySandboxProvider(
                    options.execute_as_user))

        # Driver stub.
        mesos_driver = MesosExecutorDriver(executor)

        # This is an ephemeral executor -- shut down if no tasks arrive within a certain
        # time period.
        ExecutorTimeout(executor.launched, mesos_driver).start()

        # Start the executor.
        mesos_driver.run()

        log.info('MesosExecutorDriver.run() has finished.')
Example #21
0
def main(args):
    """Watch the serverset at args[0], printing every join and leave, and sleep forever."""
    if len(args) != 1:
        app.error('Must supply a serverset path to monitor.')

    def report(operator, endpoint):
        # One timestamped line per membership change.
        print('@ %s %s %s' % (datetime.now(), operator, endpoint))

    def on_join(endpoint):
        report('+=', endpoint)

    def on_leave(endpoint):
        report('-=', endpoint)

    # Keep a reference to the ServerSet for as long as the loop below runs.
    serverset = ServerSet(ZooKeeper(), args[0], on_join=on_join, on_leave=on_leave)

    while True:
        time.sleep(100)
Example #22
0
def to_acl(access):
  """Convert an access description into an ACL via make_acl.

  For the 'digest' scheme the credential must have the form <user>:<password> and is
  converted with make_digest_acl_credential; other schemes pass the credential through.
  """
  cred = access.credential().get()
  is_digest = access.scheme().get() == 'digest'
  if is_digest:
    cred_parts = access.credential().get().split(':')
    if len(cred_parts) != 2:
      app.error('Digest credential should be of the form <user>:<password>')
    cred = make_digest_acl_credential(cred_parts[0], cred_parts[1])

  scheme = access.scheme().get()
  permission_flags = dict(
      read=access.permissions().read().get(),
      write=access.permissions().write().get(),
      create=access.permissions().create().get(),
      delete=access.permissions().delete().get(),
      admin=access.permissions().admin().get())
  return make_acl(scheme, cred, **permission_flags)
Example #23
0
def proxy_main(args, opts):
    """Run the thermos task described by opts inside a TaskRunner.

    Asserts that the thermos JSON file, sandbox and checkpoint root options are set,
    checks that every task port has a prebound value, installs SIGUSR1/SIGUSR2 handlers
    that tear the runner down, and then runs the task.  Runner failures are reported
    through app.error; a keyboard interrupt triggers runner_teardown.
    """
    assert opts.thermos_json and os.path.exists(opts.thermos_json)
    assert opts.sandbox
    assert opts.checkpoint_root

    thermos_task = get_task_from_options(opts)
    prebound_ports = opts.prebound_ports

    unbound = set(thermos_task.ports()).difference(prebound_ports)
    if unbound:
        app.error('ERROR!  Unbound ports: %s' % ' '.join(unbound))

    runner = TaskRunner(thermos_task.task,
                        opts.checkpoint_root,
                        opts.sandbox,
                        task_id=opts.task_id,
                        user=opts.setuid,
                        portmap=prebound_ports,
                        chroot=opts.chroot,
                        planner_class=CappedTaskPlanner)

    # Both user signals tear the runner down.
    for signum in (signal.SIGUSR1, signal.SIGUSR2):
        handler = functools.partial(runner_teardown, runner)
        signal.signal(signum, handler)

    try:
        runner.run()
    except TaskRunner.InternalError as err:
        app.error('Internal error: %s' % err)
    except TaskRunner.InvalidTask as err:
        app.error(str(err))
    except TaskRunner.StateError:
        app.error('Task appears to already be in a terminal state.')
    except KeyboardInterrupt:
        runner_teardown(runner)
Example #24
0
def main(args):
    """Monitor the serverset path given in args[0] and print each membership change."""
    if len(args) != 1:
        app.error("Must supply a serverset path to monitor.")

    def show(change, endpoint):
        # Timestamped line; change is "+=" for a join and "-=" for a leave.
        print("@ %s %s %s" % (datetime.now(), change, endpoint))

    def on_join(endpoint):
        show("+=", endpoint)

    def on_leave(endpoint):
        show("-=", endpoint)

    # The ServerSet stays referenced while the loop below idles.
    watched = ServerSet(ZooKeeper(), args[0], on_join=on_join, on_leave=on_leave)

    while True:
        time.sleep(100)
Example #25
0
def to_acl(access):
    """Build an ACL (via ``make_acl``) from an access-configuration object.

    For the ``digest`` scheme the credential must look like
    ``<user>:<password>``; it is converted with
    ``make_digest_acl_credential`` before being handed to ``make_acl``.
    Credentials of every other scheme are passed through unchanged.

    Only the first ``:`` separates the user from the password, so passwords
    that themselves contain colons are accepted instead of being rejected.

    Args:
        access: Object exposing ``scheme()``, ``credential()`` and
            ``permissions()`` accessors, each of which yields values through
            ``.get()``.

    Returns:
        Whatever ``make_acl`` returns for the scheme, credential and the five
        permission flags (read, write, create, delete, admin).
    """
    scheme = access.scheme().get()
    cred = access.credential().get()
    if scheme == 'digest':
        # partition() splits on the first colon only; an empty separator
        # means the credential had no colon at all.
        user, separator, password = cred.partition(':')
        if not separator:
            app.error(
                'Digest credential should be of the form <user>:<password>')
        cred = make_digest_acl_credential(user, password)
    permissions = access.permissions()
    return make_acl(scheme,
                    cred,
                    read=permissions.read().get(),
                    write=permissions.write().get(),
                    create=permissions.create().get(),
                    delete=permissions.delete().get(),
                    admin=permissions.admin().get())
Example #26
0
  def main(args, options):
    """Build the Thermos executor from ``options`` and run it under a Mesos driver.

    Blocks in ``driver.run()`` and logs once it returns.
    """
    if MesosExecutorDriver is None:
      app.error('Could not load MesosExecutorDriver!')

    executor = initialize(options)

    # Driver stub wrapping the executor.
    driver = MesosExecutorDriver(executor)

    # The executor is ephemeral: the timeout is given the executor's `launched`
    # attribute and the driver so it can shut things down if no task arrives in time.
    launch_timeout = ExecutorTimeout(executor.launched, driver)
    launch_timeout.start()

    # Run the executor until the driver finishes.
    driver.run()

    log.info('MesosExecutorDriver.run() has finished.')
Example #27
0
    def main(args, options):
        """Build the Thermos executor from ``options`` and run it under a Mesos driver.

        Blocks in ``driver.run()`` and logs once it returns.
        """
        if MesosExecutorDriver is None:
            app.error('Could not load MesosExecutorDriver!')

        executor = initialize(options)

        # Driver stub wrapping the executor.
        driver = MesosExecutorDriver(executor)

        # The executor is ephemeral: the timeout is given the executor's
        # `launched` attribute and the driver so it can shut things down if
        # no task arrives in time.
        launch_timeout = ExecutorTimeout(executor.launched, driver)
        launch_timeout.start()

        # Run the executor until the driver finishes.
        driver.run()

        log.info('MesosExecutorDriver.run() has finished.')
Example #28
0
def main(args, options):
  if not args:
    app.error('expected at least one ServerSet endpoint')

  def changed(endpoint, old, new):
    print '%s changed:' % endpoint
    print '  old:', _format_instances(old)
    print '  new:', _format_instances(new)
    print

  print 'Watching ServerSet endpoints. Hit ^C to exit.'
  print

  endpoints = []
  for arg in args:
    endpoints.append(ServerSetClient(arg, watcher=changed))
  while True:
    raw_input()
Example #29
0
def main(args, options):
  """Distill each requested requirement found under ``options.site_dir``.

  All distributions discovered in ``options.site_dir`` are added to a
  ``WorkingSet``; every positional argument is parsed as a requirement and,
  when a matching distribution is found, passed through ``Distiller``.

  Requirements that cannot be found are reported and skipped, rather than
  being handed to ``Distiller`` as ``None``.

  Args:
    args: Requirement strings (as accepted by ``Requirement.parse``).
    options: Parsed options; ``site_dir`` is required.
  """
  from pkg_resources import WorkingSet, Requirement, find_distributions

  if not options.site_dir:
    app.error('Must supply --site')

  working_set = WorkingSet()
  for dist in find_distributions(options.site_dir):
    working_set.add(dist)

  for arg in args:
    arg_req = Requirement.parse(arg)
    found_dist = working_set.find(arg_req)
    if not found_dist:
      print('Could not find %s!' % arg_req)
      # Nothing to distill for this requirement; move on to the next one.
      continue
    out_zip = Distiller(found_dist).distill()
    print('Dumped %s => %s' % (arg_req, out_zip))
Example #30
0
def main(args, options):
    if not args:
        app.error('expected at least one ServerSet endpoint')

    def changed(endpoint, old, new):
        print '%s changed:' % endpoint
        print '  old:', _format_instances(old)
        print '  new:', _format_instances(new)
        print

    print 'Watching ServerSet endpoints. Hit ^C to exit.'
    print

    endpoints = []
    for arg in args:
        endpoints.append(ServerSetClient(arg, watcher=changed))
    while True:
        raw_input()
Example #31
0
def main(args, options):
    """Distill each requested requirement found under ``options.site_dir``.

    All distributions discovered in ``options.site_dir`` are added to a
    ``WorkingSet``; every positional argument is parsed as a requirement and,
    when a matching distribution is found, passed through ``Distiller``.

    Requirements that cannot be found are reported and skipped, rather than
    being handed to ``Distiller`` as ``None``.

    Args:
        args: Requirement strings (as accepted by ``Requirement.parse``).
        options: Parsed options; ``site_dir`` is required.
    """
    from pkg_resources import WorkingSet, Requirement, find_distributions

    if not options.site_dir:
        app.error('Must supply --site')

    working_set = WorkingSet()
    for dist in find_distributions(options.site_dir):
        working_set.add(dist)

    for arg in args:
        arg_req = Requirement.parse(arg)
        found_dist = working_set.find(arg_req)
        if not found_dist:
            print('Could not find %s!' % arg_req)
            # Nothing to distill for this requirement; move on to the next.
            continue
        out_zip = Distiller(found_dist).distill()
        print('Dumped %s => %s' % (arg_req, out_zip))
Example #32
0
  def main():
    """Start the Thermos GC executor, its metrics writer, and the Mesos driver.

    Blocks in ``driver.run()`` and logs once it returns.
    """
    if MesosExecutorDriver is None:
      app.error('Could not load MesosExecutorDriver!')

    # Executor stub rooted at the default checkpoint location.
    detector = FixedPathDetector(DEFAULT_CHECKPOINT_ROOT)
    gc_executor = ThermosGCExecutor(detector)
    gc_executor.start()

    # Metrics collection: write the executor's metrics to the vars path.
    metrics_writer = DiskMetricWriter(gc_executor.metrics, ExecutorDetector.VARS_PATH)
    metrics_writer.start()

    # Driver stub wrapping the GC executor.
    driver = MesosExecutorDriver(gc_executor)

    # Run the GC executor until the driver finishes.
    driver.run()

    log.info('MesosExecutorDriver.run() has finished.')
Example #33
0
def main(args, options):
  """Print hsperfdata counters, or list pids when ``options.list`` is set.

  The data source is chosen from ``options.filename`` (file provider) or
  ``options.pid`` (pid provider), in that order of precedence.  Positional
  arguments are not accepted.
  """
  if args:
    app.error('Must provide hsperfdata via -f/-p')

  if options.list:
    print_pids()
    return

  # Pick the provider; a file takes precedence over a pid.
  if options.filename:
    perfdata = PerfData.get(file_provider())
  elif options.pid:
    perfdata = PerfData.get(pid_provider())
  else:
    perfdata = None

  if perfdata is None:
    app.error('No hsperfdata provider specified!')

  # Take one sample, then dump every counter in sorted key order.
  perfdata.sample()
  for counter in sorted(perfdata):
    print('%s: %s' % (counter, perfdata[counter]))
Example #34
0
def main(args, options):
    """Print hsperfdata counters, or list pids when ``options.list`` is set.

    The data source is chosen from ``options.filename`` (file provider) or
    ``options.pid`` (pid provider), in that order of precedence.  Positional
    arguments are not accepted.
    """
    if args:
        app.error('Must provide hsperfdata via -f/-p')

    if options.list:
        print_pids()
        return

    # Pick the provider; a file takes precedence over a pid.
    if options.filename:
        perfdata = PerfData.get(file_provider())
    elif options.pid:
        perfdata = PerfData.get(pid_provider())
    else:
        perfdata = None

    if perfdata is None:
        app.error('No hsperfdata provider specified!')

    # Take one sample, then dump every counter in sorted key order.
    perfdata.sample()
    for counter in sorted(perfdata):
        print('%s: %s' % (counter, perfdata[counter]))
    def validate_common_options(options):
        """Check the options shared by all commands and prepare the password dir.

        Reports an error through ``app.error`` for each missing required
        option (``api_host``, ``api_port``, ``cluster_name``,
        ``password_file``), logs the password file in use, and makes sure the
        directory that holds it exists.

        A password file given as a bare filename has no directory component;
        in that case no directory is created (``os.path.dirname`` returns
        ``''``, which cannot be created).

        Args:
            options: Parsed command-line options.
        """
        if not options.api_host:
            app.error("--api_host is required")

        if not options.api_port:
            app.error("--api_port is required")

        if not options.cluster_name:
            app.error("--cluster is required")

        if not options.password_file:
            app.error("--password_file is required")
        log.info("Using --password_file=%s" % options.password_file)

        password_dir = os.path.dirname(options.password_file)
        if password_dir:
            safe_mkdir(password_dir)
Example #36
0
  def validate_common_options(options):
    """Check the options shared by all commands and prepare the password dir.

    Reports an error through ``app.error`` for each missing required option
    (``api_host``, ``api_port``, ``cluster_name``, ``password_file``), logs the
    password file in use, and makes sure the directory that holds it exists.

    A password file given as a bare filename has no directory component; in
    that case no directory is created (``os.path.dirname`` returns ``''``,
    which cannot be created).

    Args:
      options: Parsed command-line options.
    """
    if not options.api_host:
      app.error("--api_host is required")

    if not options.api_port:
      app.error("--api_port is required")

    if not options.cluster_name:
      app.error("--cluster is required")

    if not options.password_file:
      app.error("--password_file is required")
    log.info("Using --password_file=%s" % options.password_file)

    password_dir = os.path.dirname(options.password_file)
    if password_dir:
      safe_mkdir(password_dir)
Example #37
0
def _really_run(task, root, sandbox, task_id=None, user=None, prebound_ports=None, chroot=None,
                daemon=False):
  """Run ``task`` under a ``TaskRunner``, optionally daemonizing first.

  Args:
    task: Task wrapper exposing ``ports()`` and a ``task`` attribute.
    root: Checkpoint root handed to the runner.
    sandbox: Sandbox path handed to the runner.
    task_id: Optional task id forwarded to the runner.
    user: Optional user forwarded to the runner.
    prebound_ports: Optional mapping of port name to bound port; defaults to empty.
    chroot: Forwarded to the runner unchanged.
    daemon: When true, tear down stderr logging and daemonize before running.
  """
  portmap = prebound_ports or {}

  # Every port the task names has to be present in the port map.
  unbound = set(task.ports()) - set(portmap.keys())
  if unbound:
    app.error('ERROR!  Unbound ports: %s' % ' '.join(unbound))

  runner = TaskRunner(
      task.task,
      root,
      sandbox,
      task_id=task_id,
      user=user,
      portmap=portmap,
      chroot=chroot)

  if daemon:
    print('Daemonizing and starting runner.')
    try:
      log.teardown_stderr_logging()
      daemonize()
    except Exception as e:
      print("Failed to daemonize: %s" % e)
      sys.exit(1)

  try:
    runner.run()
  except KeyboardInterrupt:
    # ^C: close the checkpoint first, then kill the job.
    print('Got keyboard interrupt, killing job!')
    runner.close_ckpt()
    runner.kill()
    def delete(args, options):
        """Delete the cluster named in ``options`` and wait for it to terminate.

        Reads the cluster password from ``options.password_file``, sends it in
        a DELETE request to the cluster's API URL, checks that the response is
        a JSON object, then waits on the ``cluster_url`` it reports.

        Args:
            args: Positional command-line arguments (not used here).
            options: Parsed options; validated by ``validate_common_options``.
        """
        validate_common_options(options)

        with open(options.password_file, 'r') as f:
            password = f.read().strip()
        if not password:
            app.error("Empty password file")

        cluster_endpoint = 'http://%s:%s/clusters/%s' % (
            options.api_host, options.api_port, options.cluster_name)
        form = {'password': password}

        # urllib2 has no DELETE helper; override the method on a normal request.
        request = urllib2.Request(cluster_endpoint, urllib.urlencode(form))
        request.get_method = lambda: 'DELETE'

        try:
            response = urllib2.urlopen(request).read()
        except urllib2.HTTPError as e:
            status_text = BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code]
            log.error("DELETE request failed: %s, %s, %s" %
                      (e.code, status_text, e.read()))
            app.quit(1)

        # The API must answer with a JSON object; anything else is an error.
        try:
            result = json.loads(response)
            if not isinstance(result, dict):
                raise ValueError()
        except ValueError:
            log.error("Invalid response: %s" % response)
            app.quit(1)

        log.info("Cluster deletion result: %s" % result)

        log.info("Waiting for the cluster to terminate...")
        wait_for_termination(result['cluster_url'])

        log.info("Cluster terminated/deleted")
    def main(args, options):
        """Assemble the Aurora executor with its providers and run it under Mesos.

        A health-checker status provider is always installed; an announcer
        provider is added when ``options.announcer_enable`` is set, which
        requires ``options.announcer_ensemble``.  Blocks in ``driver.run()``
        and logs once it returns.
        """
        runner_provider = DefaultThermosTaskRunnerProvider(
            dump_runner_pex(),
            artifact_dir=os.path.realpath('.'),
        )

        # Status providers: health checking always, announcing on request.
        status_providers = [HealthCheckerProvider()]

        if options.announcer_enable:
            if options.announcer_ensemble is None:
                app.error(
                    'Must specify --announcer-ensemble if the announcer is enabled.'
                )
            announcer_provider = DefaultAnnouncerCheckerProvider(
                options.announcer_ensemble,
                options.announcer_serverset_path)
            status_providers.append(announcer_provider)

        # Executor stub.
        executor = AuroraExecutor(
            runner_provider=runner_provider,
            status_providers=status_providers,
        )

        # Driver stub wrapping the executor.
        driver = MesosExecutorDriver(executor)

        # The executor is ephemeral: the timeout is given the executor's
        # `launched` attribute and the driver so it can shut things down if
        # no task arrives in time.
        launch_timeout = ExecutorTimeout(executor.launched, driver)
        launch_timeout.start()

        # Run the executor until the driver finishes.
        driver.run()

        log.info('MesosExecutorDriver.run() has finished.')
Example #40
0
  def main(args, options):
    """Run the Thermos executor on a driver thread and wait for it to stop.

    The Mesos driver runs inside an ``ExecutorDriverThread`` while this thread
    polls it; on ``KeyboardInterrupt`` or ``SystemExit`` the driver is stopped
    and the exception is re-raised.

    Args:
      args: Positional command-line arguments (not used here).
      options: Parsed options forwarded to ``initialize``.
    """
    if MesosExecutorDriver is None:
      app.error('Could not load MesosExecutorDriver!')

    thermos_executor = initialize(options)

    # Create driver stub
    driver = MesosExecutorDriver(thermos_executor)

    # This is an ephemeral executor -- shutdown if we receive no tasks within a certain
    # time period
    ExecutorTimeout(thermos_executor.launched, driver).start()

    # Start executor and wait until it is stopped.
    driver_thread = ExecutorDriverThread(driver)
    driver_thread.start()
    try:
      # is_alive() is the supported spelling (isAlive() is gone in Python 3.9+).
      # NOTE(review): assumes ExecutorDriverThread is a threading.Thread subclass.
      # The short join timeout keeps this loop returning to Python regularly.
      while driver_thread.is_alive():
        driver_thread.join(5)
    except (KeyboardInterrupt, SystemExit):
      driver.stop()
      raise

    log.info('MesosExecutorDriver.run() has finished.')
    def main(args, options):
        """Run the Thermos executor on a driver thread and wait for it to stop.

        The Mesos driver runs inside an ``ExecutorDriverThread`` while this
        thread polls it; on ``KeyboardInterrupt`` or ``SystemExit`` the driver
        is stopped and the exception is re-raised.

        Args:
            args: Positional command-line arguments (not used here).
            options: Parsed options forwarded to ``initialize``.
        """
        if MesosExecutorDriver is None:
            app.error('Could not load MesosExecutorDriver!')

        thermos_executor = initialize(options)

        # Create driver stub
        driver = MesosExecutorDriver(thermos_executor)

        # This is an ephemeral executor -- shutdown if we receive no tasks within a certain
        # time period
        ExecutorTimeout(thermos_executor.launched, driver).start()

        # Start executor and wait until it is stopped.
        driver_thread = ExecutorDriverThread(driver)
        driver_thread.start()
        try:
            # is_alive() is the supported spelling (isAlive() is gone in
            # Python 3.9+).
            # NOTE(review): assumes ExecutorDriverThread is a
            # threading.Thread subclass.
            # The short join timeout keeps this loop returning to Python
            # regularly.
            while driver_thread.is_alive():
                driver_thread.join(5)
        except (KeyboardInterrupt, SystemExit):
            driver.stop()
            raise

        log.info('MesosExecutorDriver.run() has finished.')
Example #42
0
def get_task_from_options(opts):
    """Load the single task described by ``opts.thermos_json``.

    The config must contain exactly one task and that task must pass its own
    ``check()``; otherwise the problem is reported through ``app.error``.

    Args:
        opts: Parsed options; ``thermos_json`` is the config path.

    Returns:
        The first (and only) task of the loaded config.
    """
    config = ThermosConfigLoader.load_json(opts.thermos_json)
    candidates = config.tasks()

    if not candidates:
        app.error("No tasks specified!")
    if len(candidates) > 1:
        app.error("Multiple tasks in config but no task name specified!")

    task = candidates[0]
    verdict = task.task.check()
    if not verdict.ok():
        app.error(verdict.message())
    return task
def get_task_from_options(opts):
  """Load the single task described by ``opts.thermos_json``.

  The config must contain exactly one task and that task must pass its own
  ``check()``; otherwise the problem is reported through ``app.error``.

  Args:
    opts: Parsed options; ``thermos_json`` is the config path.

  Returns:
    The first (and only) task of the loaded config.
  """
  config = ThermosConfigLoader.load_json(opts.thermos_json)
  candidates = config.tasks()

  if not candidates:
    app.error("No tasks specified!")
  if len(candidates) > 1:
    app.error("Multiple tasks in config but no task name specified!")

  task = candidates[0]
  verdict = task.task.check()
  if not verdict.ok():
    app.error(verdict.message())
  return task
Example #44
0
def make_zk_auth(zk_auth_config):
  """Load and validate a ZK authentication config from a JSON file.

  Args:
    zk_auth_config: Path to the JSON config, or None for "no authentication".

  Returns:
    None when no path is given, otherwise the ``ZkAuth`` object parsed from
    the file.  Open, parse and validation failures are reported through
    ``app.error``.
  """
  if zk_auth_config is None:
    return None

  try:
    with open(zk_auth_config) as config_file:
      try:
        auth = ZkAuth.json_load(config_file, strict=True)
        validation = auth.check()
        if not validation.ok():
          app.error('ZK authentication config is invalid %s' % validation.message())
        return auth
      except (TypeError, ValueError, AttributeError) as parse_error:
        app.error('Problem parsing ZK authentication config %s' % parse_error)
  except IOError as io_error:
    app.error('Failed to open config file %s' % io_error)
Example #45
0
def make_zk_auth(zk_auth_config):
    """Load and validate a ZK authentication config from a JSON file.

    Args:
        zk_auth_config: Path to the JSON config, or None for "no
            authentication".

    Returns:
        None when no path is given, otherwise the ``ZkAuth`` object parsed
        from the file.  Open, parse and validation failures are reported
        through ``app.error``.
    """
    if zk_auth_config is None:
        return None

    try:
        with open(zk_auth_config) as config_file:
            try:
                auth = ZkAuth.json_load(config_file, strict=True)
                validation = auth.check()
                if not validation.ok():
                    app.error('ZK authentication config is invalid %s' %
                              validation.message())
                return auth
            except (TypeError, ValueError, AttributeError) as parse_error:
                app.error('Problem parsing ZK authentication config %s' % parse_error)
    except IOError as io_error:
        app.error('Failed to open config file %s' % io_error)
def proxy_main(args, opts):
  """Run the Thermos task described by ``opts`` inside a ``TaskRunner``.

  The task is loaded via ``get_task_from_options``; every port the task
  declares must appear in ``opts.prebound_ports``.  SIGUSR1 and SIGUSR2 are
  wired to ``runner_teardown`` for the runner before it is started.

  Args:
    args: Positional command-line arguments (not used here).
    opts: Parsed options; reads ``thermos_json``, ``sandbox``,
      ``checkpoint_root``, ``prebound_ports``, ``task_id``, ``setuid`` and
      ``chroot``.
  """
  # Preconditions on the options the runner cannot work without.
  assert opts.thermos_json and os.path.exists(opts.thermos_json)
  assert opts.sandbox
  assert opts.checkpoint_root

  thermos_task = get_task_from_options(opts)
  portmap = opts.prebound_ports

  # Ports named by the task that nobody bound for us.
  unbound = set(thermos_task.ports()) - set(portmap)
  if unbound:
    app.error('ERROR!  Unbound ports: %s' % ' '.join(unbound))

  runner = TaskRunner(thermos_task.task,
                      opts.checkpoint_root,
                      opts.sandbox,
                      task_id=opts.task_id,
                      user=opts.setuid,
                      portmap=portmap,
                      chroot=opts.chroot,
                      planner_class=CappedTaskPlanner)

  # Both user signals trigger the same teardown of this runner.
  teardown = functools.partial(runner_teardown, runner)
  for signum in (signal.SIGUSR1, signal.SIGUSR2):
    signal.signal(signum, teardown)

  try:
    runner.run()
  except TaskRunner.InternalError as err:
    app.error('Internal error: %s' % err)
  except TaskRunner.InvalidTask as err:
    app.error(str(err))
  except TaskRunner.StateError:
    app.error('Task appears to already be in a terminal state.')
  except KeyboardInterrupt:
    runner_teardown(runner)
Example #47
0
    def main(args, options):
        """Validate options, restore or create scheduler state, and run Mysos.

        Steps, in order: check the required options; parse the timeout and
        ZooKeeper URL options; unpack the web assets under
        ``<work_dir>/web``; optionally load framework credentials from a YAML
        file; start a Kazoo client; load the persisted scheduler state (or
        create and persist a fresh one); start the Mesos scheduler driver and
        the HTTP API server; then wait until the scheduler's ``stopped`` event
        is set or the process is interrupted.

        A credential file that is empty or does not contain a mapping is
        reported as an invalid key file instead of escaping as ``TypeError``.

        Args:
            args: Positional command-line arguments (not used here).
            options: Parsed command-line options.
        """
        log.info("Options in use: %s", options)

        if not options.api_port:
            app.error('Must specify --port')

        if not options.mesos_master:
            app.error('Must specify --mesos_master')

        if not options.framework_user:
            app.error('Must specify --framework_user')

        if not options.executor_uri:
            app.error('Must specify --executor_uri')

        if not options.executor_cmd:
            app.error('Must specify --executor_cmd')

        if not options.zk_url:
            app.error('Must specify --zk_url')

        if not options.admin_keypath:
            app.error('Must specify --admin_keypath')

        try:
            election_timeout = parse_time(options.election_timeout)
            framework_failover_timeout = parse_time(
                options.framework_failover_timeout)
        except InvalidTime as e:
            app.error(e.message)

        try:
            _, zk_servers, zk_root = zookeeper.parse(options.zk_url)
        except Exception as e:
            app.error("Invalid --zk_url: %s" % e.message)

        web_assets_dir = os.path.join(options.work_dir, "web")
        pkgutil.unpack_assets(web_assets_dir, MYSOS_MODULE, ASSET_RELPATH)
        log.info("Extracted web assets into %s" % options.work_dir)

        fw_principal = None
        fw_secret = None
        if options.framework_authentication_file:
            try:
                with open(options.framework_authentication_file, "r") as f:
                    # NOTE(review): yaml.load() can construct arbitrary Python
                    # objects; consider yaml.safe_load() if this file can ever
                    # come from an untrusted source.
                    cred = yaml.load(f)
                fw_principal = cred["principal"]
                fw_secret = cred["secret"]
                log.info(
                    "Loaded credential (principal=%s) for framework authentication"
                    % fw_principal)
            except IOError as e:
                app.error(
                    "Unable to read the framework authentication key file: %s"
                    % e)
            except (KeyError, TypeError, yaml.YAMLError) as e:
                # TypeError covers an empty file (None) or a non-mapping
                # document, where subscripting by key is not possible.
                app.error(
                    "Invalid framework authentication key file format %s" % e)

        log.info("Starting Mysos scheduler")

        kazoo = KazooClient(zk_servers)
        kazoo.start()

        if options.state_storage == 'zk':
            log.info("Using ZooKeeper (path: %s) for state storage" % zk_root)
            state_provider = ZooKeeperStateProvider(kazoo, zk_root)
        else:
            log.info("Using local disk for state storage")
            state_provider = LocalStateProvider(options.work_dir)

        try:
            state = state_provider.load_scheduler_state()
        except StateProvider.Error as e:
            app.error(e.message)

        if state:
            log.info("Successfully restored scheduler state")
            framework_info = state.framework_info
            if framework_info.HasField('id'):
                log.info("Recovered scheduler's FrameworkID is %s" %
                         framework_info.id.value)
        else:
            # First run (or nothing persisted): build fresh framework info and
            # persist the new state right away.
            log.info("No scheduler state to restore")
            framework_info = FrameworkInfo(
                user=options.framework_user,
                name=FRAMEWORK_NAME,
                checkpoint=True,
                failover_timeout=framework_failover_timeout.as_(Time.SECONDS),
                role=options.framework_role)
            if fw_principal:
                framework_info.principal = fw_principal
            state = Scheduler(framework_info)
            state_provider.dump_scheduler_state(state)

        scheduler = MysosScheduler(state,
                                   state_provider,
                                   options.framework_user,
                                   options.executor_uri,
                                   options.executor_cmd,
                                   kazoo,
                                   options.zk_url,
                                   election_timeout,
                                   options.admin_keypath,
                                   installer_args=options.installer_args,
                                   backup_store_args=options.backup_store_args,
                                   executor_environ=options.executor_environ,
                                   framework_role=options.framework_role)

        # Pass a credential to the driver only when both halves were loaded.
        if fw_principal and fw_secret:
            cred = Credential(principal=fw_principal, secret=fw_secret)
            scheduler_driver = mesos.native.MesosSchedulerDriver(
                scheduler, framework_info, options.mesos_master, cred)
        else:
            scheduler_driver = mesos.native.MesosSchedulerDriver(
                scheduler, framework_info, options.mesos_master)

        scheduler_driver.start()

        server = HttpServer()
        server.mount_routes(MysosServer(scheduler, web_assets_dir))

        # Serve the HTTP API from a daemon thread so it does not keep the
        # process alive on its own.
        et = ExceptionalThread(target=server.run,
                               args=('0.0.0.0', options.api_port, 'cherrypy'))
        et.daemon = True
        et.start()

        try:
            # Wait for the scheduler to stop.
            # The use of 'stopped' event instead of scheduler_driver.join() is necessary to stop the
            # process with SIGINT.
            while not scheduler.stopped.wait(timeout=0.5):
                pass
        except KeyboardInterrupt:
            log.info('Interrupted, exiting.')
        else:
            log.info('Scheduler exited.')

        app.shutdown(
            1
        )  # Mysos scheduler is supposed to be long-running thus the use of exit status 1.
    def create(args, options):
        """Create a cluster through the HTTP API and wait until it answers SQL.

        Posts the cluster parameters, stores the returned cluster password in
        ``options.password_file``, waits for a master to be elected, then
        polls the master with ``SELECT 1`` until it responds.

        Args:
            args: Positional command-line arguments (not used here).
            options: Parsed options; validated by ``validate_common_options``
                plus ``num_nodes`` and ``cluster_user``.
        """
        validate_common_options(options)

        if not options.num_nodes:
            app.error("--num_nodes is required")

        if not options.cluster_user:
            app.error("--cluster_user is required")

        cluster_endpoint = 'http://%s:%s/clusters/%s' % (
            options.api_host, options.api_port, options.cluster_name)

        # 'urlencode()' doesn't accept None, so unset optional values become ''.
        form = {
            'num_nodes': int(options.num_nodes),
            'cluster_user': options.cluster_user,
            'size': options.size or '',
            'backup_id': options.backup_id or '',
            'cluster_password': options.cluster_password or '',
        }

        request = urllib2.Request(cluster_endpoint, urllib.urlencode(form))
        try:
            response = urllib2.urlopen(request).read()
        except urllib2.HTTPError as e:
            status_text = BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code]
            log.error("POST request failed: %s, %s, %s" %
                      (e.code, status_text, e.read()))
            app.quit(1)

        # The API must answer with a JSON object; anything else is an error.
        try:
            result = json.loads(response)
            if not isinstance(result, dict):
                raise ValueError()
        except ValueError:
            log.error("Invalid response: %s" % response)
            app.quit(1)

        log.info("Cluster created. Cluster info: %s" % str(result))
        with open(options.password_file, 'w') as f:
            f.write(result["cluster_password"])

        log.info("Waiting for the master for this cluster to be elected...")
        master = wait_for_master(result['cluster_url'])
        master_endpoint = master.service_endpoint

        connection_str = "mysql://%s:%s@%s:%d/" % (
            options.cluster_user,
            result["cluster_password"],
            master_endpoint.host,
            master_endpoint.port)
        log.info("Connecting to the MySQL cluster master: %s" % connection_str)
        engine = create_engine(connection_str)

        # Loop for 5 times/seconds to wait for the master to be promoted.
        max_attempts = 5
        for attempt in range(max_attempts):
            try:
                # TODO(jyx): Test writing to the master and reading from the slave.
                probe = engine.execute("SELECT 1;").scalar()
                assert int(probe) == 1, "Expecting result to be 1 but got %s" % probe
                break
            except OperationalError:
                # Out of attempts: let the last failure propagate.
                if attempt == max_attempts - 1:
                    raise
                log.debug("MySQL master not ready yet. Sleep for 1 second...")
                time.sleep(1)

        log.info("Cluster successfully started")
Example #49
0
  def main(args, options):
    """Validate options, restore or create scheduler state, and run Mysos.

    Steps, in order: check the required options; parse the timeout and
    ZooKeeper URL options; unpack the web assets under ``<work_dir>/web``;
    optionally load framework credentials from a YAML file; start a Kazoo
    client; load the persisted scheduler state (or create and persist a fresh
    one); start the Mesos scheduler driver and the HTTP API server; then wait
    until the scheduler's ``stopped`` event is set or the process is interrupted.

    A credential file that is empty or does not contain a mapping is reported
    as an invalid key file instead of escaping as ``TypeError``.

    Args:
      args: Positional command-line arguments (not used here).
      options: Parsed command-line options.
    """
    log.info("Options in use: %s", options)

    if not options.api_port:
      app.error('Must specify --port')

    if not options.mesos_master:
      app.error('Must specify --mesos_master')

    if not options.framework_user:
      app.error('Must specify --framework_user')

    if not options.executor_uri:
      app.error('Must specify --executor_uri')

    if not options.executor_cmd:
      app.error('Must specify --executor_cmd')

    if not options.zk_url:
      app.error('Must specify --zk_url')

    if not options.admin_keypath:
      app.error('Must specify --admin_keypath')

    try:
      election_timeout = parse_time(options.election_timeout)
      framework_failover_timeout = parse_time(options.framework_failover_timeout)
    except InvalidTime as e:
      app.error(e.message)

    try:
      _, zk_servers, zk_root = zookeeper.parse(options.zk_url)
    except Exception as e:
      app.error("Invalid --zk_url: %s" % e.message)

    web_assets_dir = os.path.join(options.work_dir, "web")
    pkgutil.unpack_assets(web_assets_dir, MYSOS_MODULE, ASSET_RELPATH)
    log.info("Extracted web assets into %s" % options.work_dir)

    fw_principal = None
    fw_secret = None
    if options.framework_authentication_file:
      try:
        with open(options.framework_authentication_file, "r") as f:
          # NOTE(review): yaml.load() can construct arbitrary Python objects; consider
          # yaml.safe_load() if this file can ever come from an untrusted source.
          cred = yaml.load(f)
        fw_principal = cred["principal"]
        fw_secret = cred["secret"]
        log.info("Loaded credential (principal=%s) for framework authentication" % fw_principal)
      except IOError as e:
        app.error("Unable to read the framework authentication key file: %s" % e)
      except (KeyError, TypeError, yaml.YAMLError) as e:
        # TypeError covers an empty file (None) or a non-mapping document, where
        # subscripting by key is not possible.
        app.error("Invalid framework authentication key file format %s" % e)

    log.info("Starting Mysos scheduler")

    kazoo = KazooClient(zk_servers)
    kazoo.start()

    if options.state_storage == 'zk':
      log.info("Using ZooKeeper (path: %s) for state storage" % zk_root)
      state_provider = ZooKeeperStateProvider(kazoo, zk_root)
    else:
      log.info("Using local disk for state storage")
      state_provider = LocalStateProvider(options.work_dir)

    try:
      state = state_provider.load_scheduler_state()
    except StateProvider.Error as e:
      app.error(e.message)

    if state:
      log.info("Successfully restored scheduler state")
      framework_info = state.framework_info
      if framework_info.HasField('id'):
        log.info("Recovered scheduler's FrameworkID is %s" % framework_info.id.value)
    else:
      # First run (or nothing persisted): build fresh framework info and persist the
      # new state right away.
      log.info("No scheduler state to restore")
      framework_info = FrameworkInfo(
          user=options.framework_user,
          name=FRAMEWORK_NAME,
          checkpoint=True,
          failover_timeout=framework_failover_timeout.as_(Time.SECONDS),
          role=options.framework_role)
      if fw_principal:
        framework_info.principal = fw_principal
      state = Scheduler(framework_info)
      state_provider.dump_scheduler_state(state)

    scheduler = MysosScheduler(
        state,
        state_provider,
        options.framework_user,
        options.executor_uri,
        options.executor_cmd,
        kazoo,
        options.zk_url,
        election_timeout,
        options.admin_keypath,
        installer_args=options.installer_args,
        backup_store_args=options.backup_store_args,
        executor_environ=options.executor_environ,
        framework_role=options.framework_role)

    # Pass a credential to the driver only when both halves were loaded.
    if fw_principal and fw_secret:
      cred = Credential(principal=fw_principal, secret=fw_secret)
      scheduler_driver = mesos.native.MesosSchedulerDriver(
          scheduler,
          framework_info,
          options.mesos_master,
          cred)
    else:
      scheduler_driver = mesos.native.MesosSchedulerDriver(
          scheduler,
          framework_info,
          options.mesos_master)

    scheduler_driver.start()

    server = HttpServer()
    server.mount_routes(MysosServer(scheduler, web_assets_dir))

    # Serve the HTTP API from a daemon thread so it does not keep the process alive
    # on its own.
    et = ExceptionalThread(
        target=server.run, args=('0.0.0.0', options.api_port, 'cherrypy'))
    et.daemon = True
    et.start()

    try:
      # Wait for the scheduler to stop.
      # The use of 'stopped' event instead of scheduler_driver.join() is necessary to stop the
      # process with SIGINT.
      while not scheduler.stopped.wait(timeout=0.5):
        pass
    except KeyboardInterrupt:
      log.info('Interrupted, exiting.')
    else:
      log.info('Scheduler exited.')

    app.shutdown(1)  # Mysos scheduler is supposed to be long-running thus the use of exit status 1.