Example #1
0
def mesos_task_instance_from_assigned_task(assigned_task):
  """Deserialize MesosTaskInstance from an AssignedTask thrift."""
  thermos_task = assigned_task.task.executorConfig.data

  if not thermos_task:
    raise TaskInfoError('Task did not have a thermos config!')

  try:
    json_blob = json.loads(thermos_task)
  except (TypeError, ValueError) as e:
    raise TaskInfoError('Could not deserialize thermos config: %s' % e)

  # TODO(wickman) Determine if there are any serialized MesosTaskInstances in the wild;
  # kill this code if not.
  if 'instance' in json_blob:
    return MesosTaskInstance.json_loads(thermos_task)

  # This is a MesosJob
  task_instance = task_instance_from_job(
      MesosJob.json_loads(thermos_task), assigned_task.instanceId)

  try:
    ThermosTaskValidator.assert_valid_task(task_instance.task())
    ThermosTaskValidator.assert_all_refs_bound(task_instance)
  except ThermosTaskValidator.InvalidTaskError as e:
    raise UnexpectedUnboundRefsError('Got invalid task: %s' % e)

  task_instance, _ = task_instance.interpolate()
  return task_instance
Example #2
0
def mesos_task_instance_from_assigned_task(assigned_task):
    """Deserialize MesosTaskInstance from an AssignedTask thrift."""
    thermos_task = assigned_task.task.executorConfig.data

    if not thermos_task:
        raise TaskInfoError('Task did not have a thermos config!')

    try:
        json_blob = json.loads(thermos_task)
    except (TypeError, ValueError) as e:
        raise TaskInfoError('Could not deserialize thermos config: %s' % e)

    # TODO(wickman) Determine if there are any serialized MesosTaskInstances in the wild;
    # kill this code if not.
    if 'instance' in json_blob:
        return MesosTaskInstance.json_loads(thermos_task)

    # This is a MesosJob
    task_instance = task_instance_from_job(MesosJob.json_loads(thermos_task),
                                           assigned_task.instanceId,
                                           assigned_task.slaveHost)

    try:
        ThermosTaskValidator.assert_valid_task(task_instance.task())
        ThermosTaskValidator.assert_all_refs_bound(task_instance)
    except ThermosTaskValidator.InvalidTaskError as e:
        raise UnexpectedUnboundRefsError('Got invalid task: %s' % e)

    task_instance, _ = task_instance.interpolate()
    return task_instance
Example #3
0
    def __init__(self,
                 task,
                 checkpoint_root,
                 sandbox,
                 log_dir=None,
                 task_id=None,
                 portmap=None,
                 user=None,
                 chroot=False,
                 clock=time,
                 universal_handler=None,
                 planner_class=TaskPlanner,
                 hostname=None,
                 process_logger_destination=None,
                 process_logger_mode=None,
                 rotate_log_size_mb=None,
                 rotate_log_backups=None,
                 preserve_env=False):
        """
      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the path will be run
                         [if None, cwd will be assumed, but garbage collection will be
                          disabled for this task.]

      optional:
        log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will be
                            written into the sandbox directory under .logs/
        task_id (string)  = bind to this task id.  if not specified, will synthesize an id based
                            upon task.name()
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as.  if not current user, requires setuid
                            privileges.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                            planning policy.
        process_logger_destination (string) = The destination of logger to use for all processes.
        process_logger_mode (string) = The mode of logger to use for all processes.
        rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
        rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
        preserve_env (boolean) = whether or not env variables for the runner should be in the
                                 env for the task being run
    """
        if not issubclass(planner_class, TaskPlanner):
            raise TypeError('planner_class must be a TaskPlanner.')
        self._clock = clock
        launch_time = self._clock.time()
        launch_time_ms = '%06d' % int(
            (launch_time - int(launch_time)) * (10**6))
        if not task_id:
            self._task_id = '%s-%s.%s' % (
                task.name(),
                time.strftime('%Y%m%d-%H%M%S',
                              time.localtime(launch_time)), launch_time_ms)
        else:
            self._task_id = task_id
        current_user = TaskRunnerHelper.get_actual_user()
        self._user = user or current_user
        # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
        if self._user != current_user:
            if os.geteuid() != 0:
                raise ValueError(
                    'task specifies user as %s, but %s does not have setuid permission!'
                    % (self._user, current_user))
        self._portmap = portmap or {}
        self._launch_time = launch_time
        self._log_dir = log_dir or os.path.join(sandbox, '.logs')
        self._process_logger_destination = process_logger_destination
        self._process_logger_mode = process_logger_mode
        self._rotate_log_size_mb = rotate_log_size_mb
        self._rotate_log_backups = rotate_log_backups
        self._pathspec = TaskPath(root=checkpoint_root,
                                  task_id=self._task_id,
                                  log_dir=self._log_dir)
        self._hostname = hostname or socket.gethostname()
        try:
            ThermosTaskValidator.assert_valid_task(task)
            ThermosTaskValidator.assert_valid_ports(task, self._portmap)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        context = ThermosContext(task_id=self._task_id,
                                 ports=self._portmap,
                                 user=self._user)
        self._task, uninterp = (task %
                                Environment(thermos=context)).interpolate()
        if len(uninterp) > 0:
            raise self.InvalidTask('Failed to interpolate task, missing: %s' %
                                   ', '.join(str(ref) for ref in uninterp))
        try:
            ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
        except ThermosTaskValidator.InvalidTaskError as e:
            raise self.InvalidTask('Invalid task: %s' % e)
        self._plan = None  # plan currently being executed (updated by Handlers)
        self._regular_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is False)
        self._finalizing_plan = planner_class(
            self._task,
            clock=clock,
            process_filter=lambda proc: proc.final().get() is True)
        self._chroot = chroot
        self._sandbox = sandbox
        self._terminal_state = None
        self._ckpt = None
        self._process_map = dict(
            (p.name().get(), p) for p in self._task.processes())
        self._task_processes = {}
        self._stages = dict(
            (state, stage(self)) for state, stage in self.STAGES.items())
        self._finalization_start = None
        self._preemption_deadline = None
        self._watcher = ProcessMuxer(self._pathspec)
        self._state = RunnerState(processes={})
        self._preserve_env = preserve_env

        # create runner state
        universal_handler = universal_handler or TaskRunnerUniversalHandler
        self._dispatcher = CheckpointDispatcher()
        self._dispatcher.register_handler(universal_handler(self))
        self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
        self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

        # recover checkpointed runner state and update plan
        self._recovery = True
        self._replay_runner_ckpt()
Example #4
0
 def on_initialization(self, header):
     log.debug('_on_initialization: %s' % header)
     ThermosTaskValidator.assert_valid_task(self._runner.task)
     ThermosTaskValidator.assert_valid_ports(self._runner.task,
                                             header.ports)
     self._checkpoint(RunnerCkpt(runner_header=header))
Example #5
0
def convert(job, metadata=frozenset(), ports=frozenset()):
    """Convert a Pystachio MesosJob to an Aurora Thrift JobConfiguration."""

    owner = Identity(user=getpass.getuser())
    key = JobKey(
        role=assert_valid_field('role', fully_interpolated(job.role())),
        environment=assert_valid_field('environment',
                                       fully_interpolated(job.environment())),
        name=assert_valid_field('name', fully_interpolated(job.name())))

    task_raw = job.task()

    MB = 1024 * 1024
    task = TaskConfig()

    def not_empty_or(item, default):
        return default if item is Empty else fully_interpolated(item)

    # job components
    task.production = fully_interpolated(job.production(), bool)
    task.isService = select_service_bit(job)
    task.maxTaskFailures = fully_interpolated(job.max_task_failures())
    task.priority = fully_interpolated(job.priority())
    task.contactEmail = not_empty_or(job.contact(), None)
    task.tier = not_empty_or(job.tier(), None)

    if job.has_partition_policy():
        task.partitionPolicy = PartitionPolicy(
            fully_interpolated(job.partition_policy().reschedule()),
            fully_interpolated(job.partition_policy().delay_secs()))

    # Add metadata to a task, to display in the scheduler UI.
    metadata_set = frozenset()
    if job.has_metadata():
        customized_metadata = job.metadata()
        metadata_set |= frozenset(
            (str(fully_interpolated(key_value_metadata.key())),
             str(fully_interpolated(key_value_metadata.value())))
            for key_value_metadata in customized_metadata)
    metadata_set |= frozenset(
        (str(key), str(value)) for key, value in metadata)
    task.metadata = frozenset(
        Metadata(key=key, value=value) for key, value in metadata_set)

    # task components
    if not task_raw.has_resources():
        raise InvalidConfig('Task must specify resources!')

    if (fully_interpolated(task_raw.resources().ram()) == 0
            or fully_interpolated(task_raw.resources().disk()) == 0):
        raise InvalidConfig(
            'Must specify ram and disk resources, got ram:%r disk:%r' %
            (fully_interpolated(task_raw.resources().ram()),
             fully_interpolated(task_raw.resources().disk())))

    numCpus = fully_interpolated(task_raw.resources().cpu())
    ramMb = fully_interpolated(task_raw.resources().ram()) / MB
    diskMb = fully_interpolated(task_raw.resources().disk()) / MB
    if numCpus <= 0 or ramMb <= 0 or diskMb <= 0:
        raise InvalidConfig(
            'Task has invalid resources.  cpu/ramMb/diskMb must all be positive: '
            'cpu:%r ramMb:%r diskMb:%r' % (numCpus, ramMb, diskMb))
    numGpus = fully_interpolated(task_raw.resources().gpu())

    task.resources = frozenset([
        Resource(numCpus=numCpus),
        Resource(ramMb=ramMb),
        Resource(diskMb=diskMb)
    ] + [Resource(namedPort=p)
         for p in ports] + ([Resource(numGpus=numGpus)] if numGpus else []))

    task.job = key
    task.owner = owner
    task.taskLinks = {}  # See AURORA-739
    task.constraints = constraints_to_thrift(
        not_empty_or(job.constraints(), {}))
    task.container = create_container_config(job.container())

    underlying, refs = job.interpolate()

    # need to fake an instance id for the sake of schema checking
    underlying_checked = underlying.bind(mesos={
        'instance': 31337,
        'hostname': ''
    })
    try:
        ThermosTaskValidator.assert_valid_task(underlying_checked.task())
    except ThermosTaskValidator.InvalidTaskError as e:
        raise InvalidConfig('Task is invalid: %s' % e)
    if not underlying_checked.check().ok():
        raise InvalidConfig('Job not fully specified: %s' %
                            underlying.check().message())

    unbound = []
    for ref in refs:
        if ref in (THERMOS_TASK_ID_REF, MESOS_INSTANCE_REF,
                   MESOS_HOSTNAME_REF) or (Ref.subscope(
                       THERMOS_PORT_SCOPE_REF, ref)):
            continue
        unbound.append(ref)

    if unbound:
        raise InvalidConfig('Config contains unbound variables: %s' %
                            ' '.join(map(str, unbound)))

    # set the executor that will be used by the Mesos task. Thermos is the default
    executor = job.executor_config()
    if fully_interpolated(executor.name()) == AURORA_EXECUTOR_NAME:
        task.executorConfig = ExecutorConfig(
            name=AURORA_EXECUTOR_NAME,
            data=filter_aliased_fields(underlying).json_dumps())
    else:
        task.executorConfig = ExecutorConfig(
            name=fully_interpolated(executor.name()),
            data=fully_interpolated(executor.data()))

    return JobConfiguration(
        key=key,
        owner=owner,
        cronSchedule=not_empty_or(job.cron_schedule(), None),
        cronCollisionPolicy=select_cron_policy(job.cron_collision_policy()),
        taskConfig=task,
        instanceCount=fully_interpolated(job.instances()))
Example #6
0
  def __init__(self, task, checkpoint_root, sandbox, log_dir=None,
               task_id=None, portmap=None, user=None, chroot=False, clock=time,
               universal_handler=None, planner_class=TaskPlanner, hostname=None,
               process_logger_destination=None, process_logger_mode=None,
               rotate_log_size_mb=None, rotate_log_backups=None,
               preserve_env=False, mesos_containerizer_path=None, container_sandbox=None):
    """
      required:
        task (config.Task) = the task to run
        checkpoint_root (path) = the checkpoint root
        sandbox (path) = the sandbox in which the path will be run
                         [if None, cwd will be assumed, but garbage collection will be
                          disabled for this task.]

      optional:
        log_dir (string)  = directory to house stdout/stderr logs. If not specified, logs will be
                            written into the sandbox directory under .logs/
        task_id (string)  = bind to this task id.  if not specified, will synthesize an id based
                            upon task.name()
        portmap (dict)    = a map (string => integer) from name to port, e.g. { 'http': 80 }
        user (string)     = the user to run the task as.  if not current user, requires setuid
                            privileges.
        chroot (boolean)  = whether or not to chroot into the sandbox prior to exec.
        clock (time interface) = the clock to use throughout
        universal_handler = checkpoint record handler (only used for testing)
        planner_class (TaskPlanner class) = TaskPlanner class to use for constructing the task
                            planning policy.
        process_logger_destination (string) = The destination of logger to use for all processes.
        process_logger_mode (string) = The mode of logger to use for all processes.
        rotate_log_size_mb (integer) = The maximum size of the rotated stdout/stderr logs in MiB.
        rotate_log_backups (integer) = The maximum number of rotated stdout/stderr log backups.
        preserve_env (boolean) = whether or not env variables for the runner should be in the
                                 env for the task being run
        mesos_containerizer_path = the path to the mesos-containerizer executable that will be used
                                   to isolate the task's filesystem (if using a filesystem image).
        container_sandbox = the path within the isolated filesystem where the task's sandbox is
                            mounted.
    """
    if not issubclass(planner_class, TaskPlanner):
      raise TypeError('planner_class must be a TaskPlanner.')
    self._clock = clock
    launch_time = self._clock.time()
    launch_time_ms = '%06d' % int((launch_time - int(launch_time)) * (10 ** 6))
    if not task_id:
      self._task_id = '%s-%s.%s' % (task.name(),
                                    time.strftime('%Y%m%d-%H%M%S', time.localtime(launch_time)),
                                    launch_time_ms)
    else:
      self._task_id = task_id
    current_user = TaskRunnerHelper.get_actual_user()
    self._user = user or current_user
    # TODO(wickman) This should be delegated to the ProcessPlatform / Helper
    if self._user != current_user:
      if os.geteuid() != 0:
        raise ValueError('task specifies user as %s, but %s does not have setuid permission!' % (
          self._user, current_user))
    self._portmap = portmap or {}
    self._launch_time = launch_time
    self._log_dir = log_dir or os.path.join(sandbox, '.logs')
    self._process_logger_destination = process_logger_destination
    self._process_logger_mode = process_logger_mode
    self._rotate_log_size_mb = rotate_log_size_mb
    self._rotate_log_backups = rotate_log_backups
    self._pathspec = TaskPath(root=checkpoint_root, task_id=self._task_id, log_dir=self._log_dir)
    self._hostname = hostname or socket.gethostname()
    try:
      ThermosTaskValidator.assert_valid_task(task)
      ThermosTaskValidator.assert_valid_ports(task, self._portmap)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    context = ThermosContext(
        task_id=self._task_id,
        ports=self._portmap,
        user=self._user)
    self._task, uninterp = (task % Environment(thermos=context)).interpolate()
    if len(uninterp) > 0:
      raise self.InvalidTask('Failed to interpolate task, missing: %s' %
          ', '.join(str(ref) for ref in uninterp))
    try:
      ThermosTaskValidator.assert_same_task(self._pathspec, self._task)
    except ThermosTaskValidator.InvalidTaskError as e:
      raise self.InvalidTask('Invalid task: %s' % e)
    self._plan = None  # plan currently being executed (updated by Handlers)
    self._regular_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() is False)
    self._finalizing_plan = planner_class(self._task, clock=clock,
        process_filter=lambda proc: proc.final().get() is True)
    self._chroot = chroot
    self._sandbox = sandbox
    self._container_sandbox = container_sandbox
    self._terminal_state = None
    self._ckpt = None
    self._process_map = dict((p.name().get(), p) for p in self._task.processes())
    self._task_processes = {}
    self._stages = dict((state, stage(self)) for state, stage in self.STAGES.items())
    self._finalization_start = None
    self._preemption_deadline = None
    self._watcher = ProcessMuxer(self._pathspec)
    self._state = RunnerState(processes={})
    self._preserve_env = preserve_env
    self._mesos_containerizer_path = mesos_containerizer_path

    # create runner state
    universal_handler = universal_handler or TaskRunnerUniversalHandler
    self._dispatcher = CheckpointDispatcher()
    self._dispatcher.register_handler(universal_handler(self))
    self._dispatcher.register_handler(TaskRunnerProcessHandler(self))
    self._dispatcher.register_handler(TaskRunnerTaskHandler(self))

    # recover checkpointed runner state and update plan
    self._recovery = True
    self._replay_runner_ckpt()
Example #7
0
 def on_initialization(self, header):
   log.debug('_on_initialization: %s', header)
   ThermosTaskValidator.assert_valid_task(self._runner.task)
   ThermosTaskValidator.assert_valid_ports(self._runner.task, header.ports)
   self._checkpoint(RunnerCkpt(runner_header=header))
Example #8
0
def convert(job, metadata=frozenset(), ports=frozenset()):
  """Convert a Pystachio MesosJob to an Aurora Thrift JobConfiguration."""

  owner = Identity(user=getpass.getuser())
  key = JobKey(
    role=assert_valid_field('role', fully_interpolated(job.role())),
    environment=assert_valid_field('environment', fully_interpolated(job.environment())),
    name=assert_valid_field('name', fully_interpolated(job.name())))

  task_raw = job.task()

  MB = 1024 * 1024
  task = TaskConfig()

  def not_empty_or(item, default):
    return default if item is Empty else fully_interpolated(item)

  # job components
  task.production = fully_interpolated(job.production(), bool)
  task.isService = select_service_bit(job)
  task.maxTaskFailures = fully_interpolated(job.max_task_failures())
  task.priority = fully_interpolated(job.priority())
  task.contactEmail = not_empty_or(job.contact(), None)
  task.tier = not_empty_or(job.tier(), None)

  # Add metadata to a task, to display in the scheduler UI.
  task.metadata = frozenset(Metadata(key=str(key), value=str(value)) for key, value in metadata)

  # task components
  if not task_raw.has_resources():
    raise InvalidConfig('Task must specify resources!')

  if (fully_interpolated(task_raw.resources().ram()) == 0
      or fully_interpolated(task_raw.resources().disk()) == 0):
    raise InvalidConfig('Must specify ram and disk resources, got ram:%r disk:%r' % (
      fully_interpolated(task_raw.resources().ram()),
      fully_interpolated(task_raw.resources().disk())))

  task.numCpus = fully_interpolated(task_raw.resources().cpu())
  task.ramMb = fully_interpolated(task_raw.resources().ram()) / MB
  task.diskMb = fully_interpolated(task_raw.resources().disk()) / MB
  if task.numCpus <= 0 or task.ramMb <= 0 or task.diskMb <= 0:
    raise InvalidConfig('Task has invalid resources.  cpu/ramMb/diskMb must all be positive: '
        'cpu:%r ramMb:%r diskMb:%r' % (task.numCpus, task.ramMb, task.diskMb))

  task.job = key
  task.owner = owner
  task.requestedPorts = ports
  task.taskLinks = {}  # See AURORA-739
  task.constraints = constraints_to_thrift(not_empty_or(job.constraints(), {}))
  task.container = create_container_config(job.container())

  underlying, refs = job.interpolate()

  # need to fake an instance id for the sake of schema checking
  underlying_checked = underlying.bind(mesos={'instance': 31337, 'hostname': ''})
  try:
    ThermosTaskValidator.assert_valid_task(underlying_checked.task())
  except ThermosTaskValidator.InvalidTaskError as e:
    raise InvalidConfig('Task is invalid: %s' % e)
  if not underlying_checked.check().ok():
    raise InvalidConfig('Job not fully specified: %s' % underlying.check().message())

  unbound = []
  for ref in refs:
    if ref in (THERMOS_TASK_ID_REF, MESOS_INSTANCE_REF, MESOS_HOSTNAME_REF) or (
        Ref.subscope(THERMOS_PORT_SCOPE_REF, ref)):
      continue
    unbound.append(ref)

  if unbound:
    raise InvalidConfig('Config contains unbound variables: %s' % ' '.join(map(str, unbound)))

  task.executorConfig = ExecutorConfig(
      name=AURORA_EXECUTOR_NAME,
      data=filter_aliased_fields(underlying).json_dumps())

  return JobConfiguration(
      key=key,
      owner=owner,
      cronSchedule=not_empty_or(job.cron_schedule(), None),
      cronCollisionPolicy=select_cron_policy(job.cron_collision_policy()),
      taskConfig=task,
      instanceCount=fully_interpolated(job.instances()))