Example No. 1
def run(args, options):
    """usage: run cluster/role/env/job cmd

  Runs a shell command on all machines currently hosting shards of a single job.

  This feature supports the same command line wildcards that are used to
  populate a job's commands.

  This means anything in the {{mesos.*}} and {{thermos.*}} namespaces.
  """
    # TODO(William Farner): Add support for invoking on individual shards.
    # TODO(Kevin Sweeney): Restore the ability to run across jobs with globs (See MESOS-3010).
    if not args:
        die('job path is required')
    job_path = args.pop(0)
    try:
        cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
    except AuroraJobKey.Error as e:
        die('Invalid job path "%s": %s' % (job_path, e))

    command = ' '.join(args)
    cluster = CLUSTERS[cluster_name]
    dcr = DistributedCommandRunner(cluster, role, env, [name],
                                   options.ssh_user)
    dcr.run(command,
            parallelism=options.num_threads,
            executor_sandbox=options.executor_sandbox)
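The cluster/role/env/job path above is the four-component Aurora job key. A minimal sketch of how such a path decomposes, using a hypothetical parse_job_path helper standing in for the real AuroraJobKey.from_path:

def parse_job_path(path):
    # Hypothetical stand-in for AuroraJobKey.from_path: split a
    # "cluster/role/env/job" string into its four components.
    parts = path.split('/')
    if len(parts) != 4 or not all(parts):
        raise ValueError('expected cluster/role/env/job, got %r' % path)
    return tuple(parts)

cluster_name, role, env, name = parse_job_path('west/www-data/prod/hello')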
Example No. 2
    def disambiguate_args_or_die(cls,
                                 args,
                                 options,
                                 client_factory=AuroraClientAPI):
        """
    Returns a (AuroraClientAPI, AuroraJobKey, AuroraConfigFile:str) tuple
    if one can be found given the args, potentially querying the scheduler with the returned client.
    Calls die() with an appropriate error message otherwise.

    Arguments:
      args: args from app command invocation.
      options: options from app command invocation. Must have env and cluster attributes.
      client_factory: a callable (cluster) -> AuroraClientAPI.
    """
        if not args:
            die('job path is required')
        try:
            job_key = AuroraJobKey.from_path(args[0])
            client = client_factory(job_key.cluster)
            config_file = args[1] if len(args) > 1 else None  # the config for hooks
            return client, job_key, config_file
        except AuroraJobKey.Error:
            log.warning(
                "Failed to parse job path, falling back to compatibility mode")
            role = args[0] if len(args) > 0 else None
            name = args[1] if len(args) > 1 else None
            env = None
            config_file = None  # deprecated form does not support hooks functionality
            cluster = options.cluster
            if not cluster:
                die('cluster is required')
            client = client_factory(cluster)
            return client, cls._disambiguate_or_die(client, role, env, name), config_file
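The control flow above tries the new-style job path first and falls back to the deprecated positional form. A runnable sketch of that parse-or-fall-back pattern, with local stand-ins (JobPathError and parse_job_path are illustrative, not the Aurora API):

class JobPathError(Exception):
    pass

def parse_job_path(path):
    parts = path.split('/')
    if len(parts) != 4:
        raise JobPathError('expected cluster/role/env/job')
    return parts

def disambiguate(args):
    try:
        return parse_job_path(args[0])  # new-style job path
    except JobPathError:
        # deprecated form: positional role [name]; env is unknown
        return [None, args[0], None, args[1] if len(args) > 1 else None]

print(disambiguate(['west/www-data/prod/hello']))  # ['west', 'www-data', 'prod', 'hello']
print(disambiguate(['www-data', 'hello']))         # [None, 'www-data', None, 'hello']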
Example No. 3
def perform_maintenance_hosts(cluster):
    """usage: perform_maintenance cluster [--filename=filename]
                                        [--hosts=hosts]
                                        [--batch_size=num]
                                        [--post_drain_script=path]
                                        [--grouping=function]

  Asks the scheduler to remove any running tasks from the machine and remove it
  from service temporarily, perform some action on them, then return the machines
  to service.
  """
    options = app.get_options()
    drainable_hosts = parse_hosts(options)

    if options.post_drain_script:
        if not os.path.exists(options.post_drain_script):
            die("No such file: %s" % options.post_drain_script)
        cmd = os.path.abspath(options.post_drain_script)
        drained_callback = lambda host: subprocess.Popen([cmd, host])
    else:
        drained_callback = None

    MesosMaintenance(CLUSTERS[cluster], options.verbosity).perform_maintenance(
        drainable_hosts,
        batch_size=int(options.batch_size),
        callback=drained_callback,
        grouping_function=options.grouping,
    )
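The post-drain hook above is simply a per-host subprocess launch. A sketch of the same wiring with an illustrative script path (/usr/local/bin/notify-drain is hypothetical):

import subprocess

cmd = '/usr/local/bin/notify-drain'  # hypothetical post-drain script

def drained_callback(host):
    # Invoked once per drained host; the host name is passed as argv[1].
    return subprocess.Popen([cmd, host])

# drained_callback('db-west-17.example.com') would spawn:
#   /usr/local/bin/notify-drain db-west-17.example.com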
Example No. 4
def do_open(args, _):
  """usage: open cluster[/role[/env/job]]

  Opens the scheduler page for a cluster, role or job in the default web browser.
  """
  cluster_name = role = env = job = None
  if not args:
    die('cluster is required')
  args = args[0].split("/")
  if len(args) > 0:
    cluster_name = args[0]
    if len(args) > 1:
      role = args[1]
      if len(args) > 2:
        env = args[2]
        if len(args) > 3:
          job = args[3]
        else:
          # TODO(ksweeney): Remove this after MESOS-2945 is completed.
          die('env scheduler pages are not yet implemented, please specify job')

  if not cluster_name:
    die('cluster is required')

  api = make_client(cluster_name)

  import webbrowser
  webbrowser.open_new_tab(synthesize_url(api.scheduler.scheduler().url, role, env, job))
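The nested length checks above progressively unpack a path of one to four components. The same effect, flattened by padding the split out to four fields (a sketch, not the client's code):

parts = 'west/www-data/prod/hello'.split('/')
cluster_name, role, env, job = (parts + [None] * 4)[:4]
print(cluster_name, role, env, job)  # west www-data prod hello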
Example No. 5
  def disambiguate_args_or_die(cls, args, options, client_factory=AuroraClientAPI):
    """
    Returns a (AuroraClientAPI, AuroraJobKey, AuroraConfigFile:str) tuple
    if one can be found given the args, potentially querying the scheduler with the returned client.
    Calls die() with an appropriate error message otherwise.

    Arguments:
      args: args from app command invocation.
      options: options from app command invocation. Must have env and cluster attributes.
      client_factory: a callable (cluster) -> AuroraClientAPI.
    """
    if not args:
      die('job path is required')
    try:
      job_key = AuroraJobKey.from_path(args[0])
      client = client_factory(job_key.cluster)
      config_file = args[1] if len(args) > 1 else None  # the config for hooks
      return client, job_key, config_file
    except AuroraJobKey.Error:
      log.warning("Failed to parse job path, falling back to compatibility mode")
      role = args[0] if len(args) > 0 else None
      name = args[1] if len(args) > 1 else None
      env = None
      config_file = None  # deprecated form does not support hooks functionality
      cluster = options.cluster
      if not cluster:
        die('cluster is required')
      client = client_factory(cluster)
      return client, cls._disambiguate_or_die(client, role, env, name), config_file
Example No. 6
def list_jobs(cluster_and_role):
  """usage: list_jobs [--show_cron_schedule] cluster/role/env/job"""
  def show_job_simple(job):
    if options.show_cron_schedule:
      print(('{0}/{1.key.role}/{1.key.environment}/{1.key.name}' +
          '\t\'{1.cronSchedule}\'\t{1.cronCollisionPolicy}').format(cluster, job))
    else:
      print('{0}/{1.key.role}/{1.key.environment}/{1.key.name}'.format(cluster, job))

  def show_job_pretty(job):
    print("Job %s/%s/%s/%s:" %
        (cluster, job.key.role, job.key.environment, job.key.name))
    print('\tcron schedule: %s' % job.cronSchedule)
    print('\tcron policy:   %s' % job.cronCollisionPolicy)

  options = app.get_options()
  if options.show_cron_schedule and options.pretty:
    print_fn = show_job_pretty
  else:
    print_fn = show_job_simple
  # Take the cluster_and_role parameter, and split it into its two components.
  if cluster_and_role.count('/') != 1:
    die('list_jobs parameter must be in cluster/role format')
  (cluster, role) = cluster_and_role.split('/')
  api = make_client(cluster)
  resp = api.get_jobs(role)
  check_and_log_response(resp)
  for job in resp.result.getJobsResult.configs:
    print_fn(job)
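The format strings above rely on attribute access inside replacement fields ({1.key.role} reads job.key.role). A self-contained demonstration with a stand-in job object:

from types import SimpleNamespace

job = SimpleNamespace(key=SimpleNamespace(role='www-data', environment='prod', name='hello'))
print('{0}/{1.key.role}/{1.key.environment}/{1.key.name}'.format('west', job))
# -> west/www-data/prod/hello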
Example No. 7
def perform_maintenance_hosts(cluster):
    """usage: perform_maintenance cluster [--filename=filename]
                                        [--hosts=hosts]
                                        [--batch_size=num]
                                        [--post_drain_script=path]
                                        [--grouping=function]

  Asks the scheduler to remove any running tasks from the machine and remove it
  from service temporarily, perform some action on them, then return the machines
  to service.
  """
    options = app.get_options()
    drainable_hosts = parse_hosts(options)

    if options.post_drain_script:
        if not os.path.exists(options.post_drain_script):
            die("No such file: %s" % options.post_drain_script)
        cmd = os.path.abspath(options.post_drain_script)
        drained_callback = lambda host: subprocess.Popen([cmd, host])
    else:
        drained_callback = None

    MesosMaintenance(CLUSTERS[cluster], options.verbosity).perform_maintenance(
        drainable_hosts,
        batch_size=int(options.batch_size),
        callback=drained_callback,
        grouping_function=options.grouping)
Example No. 8
def parse_hosts(options):
    if not (options.filename or options.hosts):
        die("Please specify either --filename or --hosts")
    if options.filename:
        with open(options.filename, "r") as hosts_file:
            hosts = [hostname.strip() for hostname in hosts_file]
    elif options.hosts:
        hosts = [hostname.strip() for hostname in options.hosts.split(",")]
    hosts = [hostname for hostname in hosts if hostname]
    if not hosts:
        die("No valid hosts found.")
    return hosts
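The --hosts branch in isolation: names are comma-separated and whitespace-stripped, and with the blank-entry filtering above, empty fields are dropped before the "No valid hosts" check:

raw = ' a.example.com , b.example.com ,, '
hosts = [h.strip() for h in raw.split(',')]
hosts = [h for h in hosts if h]
assert hosts == ['a.example.com', 'b.example.com']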
Example No. 9
def diff(job_spec, config_file):
  """usage: diff cluster/role/env/job config

  Compares a job configuration against a running job.
  By default the diff will be displayed using 'diff', though you may choose an alternate
  diff program by specifying the DIFF_VIEWER environment variable."""
  options = app.get_options()
  config = get_job_config(job_spec, config_file, options)
  if options.rename_from:
    cluster, role, env, name = options.rename_from
  else:
    cluster = config.cluster()
    role = config.role()
    env = config.environment()
    name = config.name()
  api = make_client(cluster)
  resp = api.query(api.build_query(role, name, statuses=ACTIVE_STATES, env=env))
  if resp.responseCode != ResponseCode.OK:
    die('Request failed, server responded with "%s"' % resp.message)
  remote_tasks = [t.assignedTask.task for t in resp.result.scheduleStatusResult.tasks]
  resp = api.populate_job_config(config)
  if resp.responseCode != ResponseCode.OK:
    die('Request failed, server responded with "%s"' % resp.message)
  local_tasks = resp.result.populateJobResult.populated

  pp = pprint.PrettyPrinter(indent=2)
  def pretty_print_task(task):
    # The raw configuration is not interesting - we only care about what gets parsed.
    task.configuration = None
    task.executorConfig = ExecutorConfig(
        name=AURORA_EXECUTOR_NAME,
        data=json.loads(task.executorConfig.data))
    return pp.pformat(vars(task))

  def pretty_print_tasks(tasks):
    return ',\n'.join([pretty_print_task(t) for t in tasks])

  def dump_tasks(tasks, out_file):
    out_file.write(pretty_print_tasks(tasks))
    out_file.write('\n')
    out_file.flush()

  diff_program = os.environ.get('DIFF_VIEWER', 'diff')
  with NamedTemporaryFile() as local:
    dump_tasks(local_tasks, local)
    with NamedTemporaryFile() as remote:
      dump_tasks(remote_tasks, remote)
      result = subprocess.call([diff_program, remote.name, local.name])
      # diff exits 0 when the files are identical and 1 when they differ;
      # both mean the diff itself ran successfully.  Anything else is an error.
      if result not in (0, 1):
        return result
      else:
        return 0
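The viewer lookup above resolves DIFF_VIEWER first and falls back to plain diff. A standalone check of the exit-code convention (assumes a POSIX diff on the PATH):

import os
import subprocess
from tempfile import NamedTemporaryFile

diff_program = os.environ.get('DIFF_VIEWER', 'diff')
with NamedTemporaryFile(mode='w') as a, NamedTemporaryFile(mode='w') as b:
    a.write('one\n'); a.flush()
    b.write('two\n'); b.flush()
    result = subprocess.call([diff_program, a.name, b.name])
assert result == 1  # files differ, but the diff itself succeeded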
Example No. 10
def parse_hosts(options):
    if not (options.filename or options.hosts):
        die('Please specify either --filename or --hosts')
    if options.filename:
        with open(options.filename, 'r') as hosts_file:
            hosts = [hostname.strip() for hostname in hosts_file]
    elif options.hosts:
        hosts = [hostname.strip() for hostname in options.hosts.split(",")]
    hosts = [hostname for hostname in hosts if hostname]
    if not hosts:
        die('No valid hosts found.')
    return hosts
Example No. 11
def _validate_update_config(config):
  job_size = config.instances()
  max_failures = config.update_config().max_total_failures().get()

  if max_failures >= job_size:
    die(UPDATE_CONFIG_MAX_FAILURES_ERROR % (job_size, job_size - 1))

  if config.is_dedicated():
    min_failure_threshold = int(math.floor(job_size * 0.02))
    if max_failures < min_failure_threshold:
      die(UPDATE_CONFIG_DEDICATED_THRESHOLD_ERROR % (job_size, min_failure_threshold))
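A worked example of the dedicated-job floor above: the update config must tolerate at least 2% of the instances failing, so a 150-instance dedicated job needs max_total_failures of at least 3 (and, by the first check, fewer than 150):

import math

job_size = 150
min_failure_threshold = int(math.floor(job_size * 0.02))
assert min_failure_threshold == 3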
Example No. 12
def _validate_update_config(config):
    job_size = config.instances()
    max_failures = config.update_config().max_total_failures().get()

    if max_failures >= job_size:
        die(UPDATE_CONFIG_MAX_FAILURES_ERROR % (job_size, job_size - 1))

    if config.is_dedicated():
        min_failure_threshold = int(math.floor(job_size * 0.02))
        if max_failures < min_failure_threshold:
            die(UPDATE_CONFIG_DEDICATED_THRESHOLD_ERROR %
                (job_size, min_failure_threshold))
Example No. 13
def help(args):
  """usage: help [subcommand]

  Prints help for using the aurora client, or one of its specific subcommands.
  """
  if not args:
    print(generate_full_usage())
    sys.exit(0)

  if len(args) > 1:
    die('Please specify at most one subcommand.')

  subcmd = args[0]
  if subcmd in app.get_commands():
    app.command_parser(subcmd).print_help()
  else:
    print('Subcommand %s not found.' % subcmd)
    sys.exit(1)
Example No. 14
def help(args):
    """usage: help [subcommand]

  Prints help for using the aurora client, or one of its specific subcommands.
  """
    if not args:
        print(generate_full_usage())
        sys.exit(0)

    if len(args) > 1:
        die('Please specify at most one subcommand.')

    subcmd = args[0]
    if subcmd in app.get_commands():
        app.command_parser(subcmd).print_help()
    else:
        print('Subcommand %s not found.' % subcmd)
        sys.exit(1)
Example No. 15
def warn_if_dangerous_change(api, job_spec, config):
  # Get the current job status, so that we can check if there's anything
  # dangerous about this update.
  job_key = AuroraJobKey(config.cluster(), config.role(), config.environment(), config.name())
  resp = api.query(api.build_query(config.role(), config.name(),
      statuses=ACTIVE_STATES, env=config.environment()))
  if resp.responseCode != ResponseCode.OK:
    die('Could not get job status from server for comparison: %s' % resp.message)
  remote_tasks = [t.assignedTask.task for t in resp.result.scheduleStatusResult.tasks]
  resp = api.populate_job_config(config)
  if resp.responseCode != ResponseCode.OK:
    die('Server could not populate job config for comparison: %s' % resp.message)
  local_task_count = len(resp.result.populateJobResult.populated)
  remote_task_count = len(remote_tasks)
  if (local_task_count >= 4 * remote_task_count or local_task_count <= remote_task_count / 4
      or local_task_count == 0):
    print('Warning: this update is a large change. Press ^c within 5 seconds to abort')
    time.sleep(5)
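With the corrected condition, the predicate flags updates whose local task count is at least four times the remote count, at most a quarter of it, or zero. The test in isolation:

def is_large_change(local_count, remote_count):
    return (local_count >= 4 * remote_count
            or local_count <= remote_count / 4
            or local_count == 0)

assert is_large_change(40, 10)       # 4x growth
assert is_large_change(2, 10)        # shrunk to a fifth
assert not is_large_change(12, 10)   # modest change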
Example No. 16
def run(args, options):
  """usage: run cluster/role/env/job cmd

  Runs a shell command on all machines currently hosting shards of a single job.

  This feature supports the same command line wildcards that are used to
  populate a job's commands.

  This means anything in the {{mesos.*}} and {{thermos.*}} namespaces.
  """
  # TODO(William Farner): Add support for invoking on individual shards.
  # TODO(Kevin Sweeney): Restore the ability to run across jobs with globs (See MESOS-3010).
  if not args:
    die('job path is required')
  job_path = args.pop(0)
  try:
    cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
  except AuroraJobKey.Error as e:
    die('Invalid job path "%s": %s' % (job_path, e))

  command = ' '.join(args)
  cluster = CLUSTERS[cluster_name]
  dcr = DistributedCommandRunner(cluster, role, env, [name], options.ssh_user)
  dcr.run(command, parallelism=options.num_threads, executor_sandbox=options.executor_sandbox)
Example No. 17
  def _disambiguate_or_die(cls, client, role, env, name):
    # Returns a single AuroraJobKey if one can be found given the args, potentially
    # querying the scheduler. Calls die() with an appropriate error message otherwise.
    try:
      disambiguator = cls(client, role, env, name)
    except ValueError as e:
      die(e)

    if not disambiguator.ambiguous:
      return AuroraJobKey(client.cluster.name, role, env, name)

    deprecation_warning("Job ambiguously specified - querying the scheduler to disambiguate")
    matches = disambiguator.query_matches()
    if len(matches) == 1:
      (match,) = matches
      log.info("Found job %s" % match)
      return match
    elif len(matches) == 0:
      die("No jobs found")
    else:
      die("Multiple jobs match (%s) - disambiguate by using the CLUSTER/ROLE/ENV/NAME form"
          % ",".join(str(m) for m in matches))
Example No. 18
    def _disambiguate_or_die(cls, client, role, env, name):
        # Returns a single AuroraJobKey if one can be found given the args, potentially
        # querying the scheduler. Calls die() with an appropriate error message otherwise.
        try:
            disambiguator = cls(client, role, env, name)
        except ValueError as e:
            die(e)

        if not disambiguator.ambiguous:
            return AuroraJobKey(client.cluster.name, role, env, name)

        deprecation_warning(
            "Job ambiguously specified - querying the scheduler to disambiguate"
        )
        matches = disambiguator.query_matches()
        if len(matches) == 1:
            (match,) = matches
            log.info("Found job %s" % match)
            return match
        elif len(matches) == 0:
            die("No jobs found")
        else:
            die("Multiple jobs match (%s) - disambiguate by using the CLUSTER/ROLE/ENV/NAME form"
                % ",".join(str(m) for m in matches))
Example No. 19
def query(args, options):
    """usage: query [--shards=N[,N,...]]
                  [--states=State[,State,...]]
                  cluster [role [job]]

  Query Mesos about jobs and tasks.
  """
    def _convert_fmt_string(fmtstr):
        import re

        def convert(match):
            return "%%(%s)s" % match.group(1)

        return re.sub(r'%(\w+)%', convert, fmtstr)

    def flatten_task(t, d=None):
        if d is None:
            d = {}
        for key in t.__dict__.keys():
            val = getattr(t, key)
            try:
                val.__dict__.keys()
            except AttributeError:
                d[key] = val
            else:
                flatten_task(val, d)

        return d

    def map_values(d):
        default_value = lambda v: v
        mapping = {
            'status': lambda v: ScheduleStatus._VALUES_TO_NAMES[v],
        }
        return dict(
            (k, mapping.get(k, default_value)(v)) for (k, v) in d.items())

    for state in options.states.split(','):
        if state not in ScheduleStatus._NAMES_TO_VALUES:
            msg = "Unknown state '%s' specified.  Valid states are:\n" % state
            msg += ','.join(ScheduleStatus._NAMES_TO_VALUES.keys())
            die(msg)

    # Role, Job, Instances, States, and the listformat
    if len(args) == 0:
        die('Must specify at least cluster.')

    cluster = args[0]
    role = args[1] if len(args) > 1 else None
    job = args[2] if len(args) > 2 else None
    instances = set(map(int, options.shards.split(','))) if options.shards else set()

    if options.states:
        states = set(
            map(ScheduleStatus._NAMES_TO_VALUES.get,
                options.states.split(',')))
    else:
        states = ACTIVE_STATES | TERMINAL_STATES
    listformat = _convert_fmt_string(options.listformat)

    #  Figure out "expensive" queries here and bail if they do not have --force
    #  - Does not specify role
    if role is None and not options.force:
        die('--force is required for expensive queries (no role specified)')

    #  - Does not specify job
    if job is None and not options.force:
        die('--force is required for expensive queries (no job specified)')

    #  - Specifies status outside of ACTIVE_STATES
    if not (states <= ACTIVE_STATES) and not options.force:
        die('--force is required for expensive queries (states outside ACTIVE states)')

    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    query_info = api.query(
        api.build_query(role, job, instances=instances, statuses=states))
    if query_info.responseCode != ResponseCode.OK:
        die('Failed to query scheduler: %s' % query_info.message)
    tasks = query_info.result.scheduleStatusResult.tasks
    if tasks is None:
        return

    try:
        for task in tasks:
            d = flatten_task(task)
            print(listformat % map_values(d))
    except KeyError:
        msg = "Unknown key in format string.  Valid keys are:\n"
        msg += ','.join(d.keys())
        die(msg)
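What _convert_fmt_string does: it rewrites %name% tokens from the user-facing --listformat value into printf-style %(name)s mapping keys, which the print call then fills from the flattened task dict. A standalone re-implementation of the same substitution:

import re

def convert_fmt_string(fmtstr):
    return re.sub(r'%(\w+)%', lambda m: '%%(%s)s' % m.group(1), fmtstr)

fmt = convert_fmt_string('%taskId% is %status%')
print(fmt)                                          # %(taskId)s is %(status)s
print(fmt % {'taskId': 'hello-0-abcd', 'status': 'RUNNING'})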
Example No. 20
def _validate_health_check_config(config):
    # TODO(Sathya): Remove this check after health_check_interval_secs deprecation cycle is complete.
    if (config.raw().has_health_check_interval_secs()
            and config.raw().has_health_check_config()):
        die(HEALTH_CHECK_INTERVAL_SECS_ERROR)
Example No. 21
def _validate_health_check_config(config):
  # TODO(Sathya): Remove this check after health_check_interval_secs deprecation cycle is complete.
  if config.raw().has_health_check_interval_secs() and config.raw().has_health_check_config():
    die(HEALTH_CHECK_INTERVAL_SECS_ERROR)
Example No. 22
def query(args, options):
    """usage: query [--shards=N[,N,...]]
                  [--states=State[,State,...]]
                  cluster [role [job]]

  Query Mesos about jobs and tasks.
  """

    def _convert_fmt_string(fmtstr):
        import re

        def convert(match):
            return "%%(%s)s" % match.group(1)

        return re.sub(r"%(\w+)%", convert, fmtstr)

    def flatten_task(t, d=None):
        if d is None:
            d = {}
        for key in t.__dict__.keys():
            val = getattr(t, key)
            try:
                val.__dict__.keys()
            except AttributeError:
                d[key] = val
            else:
                flatten_task(val, d)

        return d

    def map_values(d):
        default_value = lambda v: v
        mapping = {"status": lambda v: ScheduleStatus._VALUES_TO_NAMES[v]}
        return dict((k, mapping.get(k, default_value)(v)) for (k, v) in d.items())

    for state in options.states.split(","):
        if state not in ScheduleStatus._NAMES_TO_VALUES:
            msg = "Unknown state '%s' specified.  Valid states are:\n" % state
            msg += ",".join(ScheduleStatus._NAMES_TO_VALUES.keys())
            die(msg)

    # Role, Job, Instances, States, and the listformat
    if len(args) == 0:
        die("Must specify at least cluster.")

    cluster = args[0]
    role = args[1] if len(args) > 1 else None
    job = args[2] if len(args) > 2 else None
    instances = set(map(int, options.shards.split(","))) if options.shards else set()

    if options.states:
        states = set(map(ScheduleStatus._NAMES_TO_VALUES.get, options.states.split(",")))
    else:
        states = ACTIVE_STATES | TERMINAL_STATES
    listformat = _convert_fmt_string(options.listformat)

    #  Figure out "expensive" queries here and bail if they do not have --force
    #  - Does not specify role
    if role is None and not options.force:
        die("--force is required for expensive queries (no role specified)")

    #  - Does not specify job
    if job is None and not options.force:
        die("--force is required for expensive queries (no job specified)")

    #  - Specifies status outside of ACTIVE_STATES
    if not (states <= ACTIVE_STATES) and not options.force:
        die("--force is required for expensive queries (states outside ACTIVE states")

    api = AuroraClientAPI(CLUSTERS[cluster], options.verbosity)
    query_info = api.query(api.build_query(role, job, instances=instances, statuses=states))
    if query_info.responseCode != ResponseCode.OK:
        die("Failed to query scheduler: %s" % query_info.message)
    tasks = query_info.result.scheduleStatusResult.tasks
    if tasks is None:
        return

    try:
        for task in tasks:
            d = flatten_task(task)
            print(listformat % map_values(d))
    except KeyError:
        msg = "Unknown key in format string.  Valid keys are:\n"
        msg += ",".join(d.keys())
        die(msg)
Example No. 23
def ssh(args, options):
  """usage: ssh cluster/role/env/job shard [args...]

  Initiate an SSH session on the machine that a shard is running on.
  """
  if not args:
    die('Job path is required')
  job_path = args.pop(0)
  try:
    cluster_name, role, env, name = AuroraJobKey.from_path(job_path)
  except AuroraJobKey.Error as e:
    die('Invalid job path "%s": %s' % (job_path, e))
  if not args:
    die('Shard is required')
  try:
    shard = int(args.pop(0))
  except ValueError:
    die('Shard must be an integer')
  api = make_client(cluster_name)
  resp = api.query(api.build_query(role, name, set([shard]), env=env))
  check_and_log_response(resp)

  first_task = resp.result.scheduleStatusResult.tasks[0]
  remote_cmd = 'bash' if not args else ' '.join(args)
  command = DistributedCommandRunner.substitute(remote_cmd, first_task,
      api.cluster, executor_sandbox=options.executor_sandbox)

  ssh_command = ['ssh', '-t']

  role = first_task.assignedTask.task.owner.role
  slave_host = first_task.assignedTask.slaveHost

  for tunnel in options.tunnels:
    try:
      port, name = tunnel.split(':')
      port = int(port)
    except ValueError:
      die('Could not parse tunnel: %s.  Must be of form PORT:NAME' % tunnel)
    if name not in first_task.assignedTask.assignedPorts:
      die('Task %s has no port named %s' % (first_task.assignedTask.taskId, name))
    ssh_command += [
        '-L', '%d:%s:%d' % (port, slave_host, first_task.assignedTask.assignedPorts[name])]

  ssh_command += ['%s@%s' % (options.ssh_user or role, slave_host), command]
  return subprocess.call(ssh_command)
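Tunnel specs follow the PORT:NAME form parsed above; each one becomes an ssh -L forward from the local port to the task's assigned port on the slave host. The parsing step in isolation:

tunnel = '8081:http'
port, name = tunnel.split(':')
port = int(port)
assert (port, name) == (8081, 'http')
# -> ssh -L 8081:<slave_host>:<assigned_ports['http']> ...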