Exemple #1
0
    def _drain_hosts(self, drainable_hosts):
        """Drain tasks from the given hosts and poll until they report DRAINED.

        This will move active tasks on these hosts to the DRAINING state, causing them to be
        rescheduled elsewhere. After the drain request, the status of the still-pending
        hosts is re-checked once per STATUS_POLL_INTERVAL. Polling stops when no host is
        pending, when the wait event is set, or when the accumulated wait exceeds
        MAX_STATUS_WAIT (in which case the stragglers are logged).

        :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
        :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
        :rtype: set of host names failed to drain
        """
        check_and_log_response(self._client.drain_hosts(drainable_hosts))

        pending = set(drainable_hosts.hostNames)
        # The first poll interval is counted before any waiting has happened.
        waited = self.STATUS_POLL_INTERVAL
        while not self._wait_event.is_set() and pending:
            log.info("Waiting for hosts to be in DRAINED: %s" % pending)
            self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

            statuses = self.check_status(list(pending))
            pending = {entry[0] for entry in statuses if entry[1] != "DRAINED"}
            waited += self.STATUS_POLL_INTERVAL

            if not pending or not (waited > self.MAX_STATUS_WAIT):
                continue

            # Out of time: report every host that is still not DRAINED and give up.
            report = "\n".join(
                "\tHost:%s\tStatus:%s" % entry for entry in sorted(statuses) if entry[1] != "DRAINED"
            )
            log.warning(
                "Failed to move all hosts into DRAINED within %s:\n%s" % (self.MAX_STATUS_WAIT, report)
            )
            break

        return pending
Exemple #2
0
    def _apply_states(self):
        """Apply any checkpoint records appended since the previous call.

        os.stat() the corresponding checkpoint stream of this task; if the file has
        grown past the last consumed offset, read records starting at that offset,
        hand each one to the checkpoint dispatcher and advance the stored offset.
        Reading stops at the first record that cannot be read or whose sequence
        number the dispatcher rejects.

        Returns True if the stored offset advanced, False otherwise (including when
        the checkpoint file does not exist yet).

        Raises OSError for failures other than ENOENT.
        """
        try:
            ckpt_offset = os.stat(self._runner_ckpt).st_size

            updated = False
            if self._ckpt_head < ckpt_offset:
                # Binary mode: st_size, seek() and tell() are all used as byte offsets
                # here, which text mode does not guarantee (and text decoding would
                # corrupt serialized records on Python 3).
                with open(self._runner_ckpt, "rb") as fp:
                    fp.seek(self._ckpt_head)
                    rr = ThriftRecordReader(fp, RunnerCkpt)
                    while True:
                        runner_update = rr.try_read()
                        if not runner_update:
                            break
                        try:
                            self._dispatcher.dispatch(self._runnerstate, runner_update)
                        except CheckpointDispatcher.InvalidSequenceNumber as e:
                            log.error("Checkpoint stream is corrupt: %s" % e)
                            break
                    new_ckpt_head = fp.tell()
                    updated = self._ckpt_head != new_ckpt_head
                    self._ckpt_head = new_ckpt_head
            return updated
        except OSError as e:
            if e.errno == errno.ENOENT:
                # The log doesn't yet exist, will retry later.
                log.warning("Could not read from checkpoint %s" % self._runner_ckpt)
                return False
            else:
                raise
Exemple #3
0
    def _fast_forward_stream(self, process_name):
        """Advance a process's checkpoint stream up to its recorded high watermark.

        Records are read from the stream registered for process_name until one carries
        the high-watermark sequence number or the stream runs out. A record whose
        sequence number is beyond the high watermark is un-read by seeking back to
        where it started. A warning is logged if the high watermark was not reached.
        """
        high_watermark = self._watermarks[process_name]
        log.debug('Fast forwarding %s stream to seq=%s' % (process_name, high_watermark))
        assert self._processes.get(process_name) is not None
        fp = self._processes[process_name]
        reader = ThriftRecordReader(fp, RunnerCkpt)
        reached = -1
        consumed = 0
        while reached < high_watermark:
            record_start = fp.tell()
            record = reader.try_read()
            if record is None:
                break
            seq = record.process_status.seq
            if seq > high_watermark:
                # Read one record too far: put the file position back before it.
                log.debug(
                    'Over-seeked %s [watermark = %s, high watermark = %s], rewinding.'
                    % (process_name, seq, high_watermark))
                fp.seek(record_start)
                break
            reached = seq
            consumed += 1

        if reached < high_watermark:
            log.warning(
                'Only able to fast forward to %s@sequence=%s, high watermark is %s'
                % (process_name, reached, high_watermark))

        if consumed:
            log.debug('Fast forwarded %s %s record(s) to seq=%s.' %
                      (process_name, consumed, reached))
    def method_wrapper(*args):
      """Call `method_name` (from the enclosing scope) on the scheduler client with retries.

      A session key is appended to the arguments unless the method belongs to the
      read-only API. A non-callable attribute is returned as-is. Transport, timeout and
      transient-response errors invalidate the connection and are retried after
      RPC_RETRY_INTERVAL, for as long as RPC_MAXIMUM_WAIT has not elapsed and termination
      has not been requested. Any other exception raised inside the call (including the
      API version mismatch raised here) is re-raised as ThriftInternalError.

      Raises TimeoutError when the retry window closes without a result. Once termination
      has been requested, errors are dropped and None is returned.
      """
      with self._lock:
        began = time.time()
        while not self._terminating.is_set() and (
            time.time() - began) < self.RPC_MAXIMUM_WAIT.as_(Time.SECONDS):

          # Only automatically append a SessionKey if this is not part of the read-only API.
          if hasattr(ReadOnlyScheduler.Iface, method_name):
            call_args = args
          else:
            call_args = args + (self.session_key(),)

          try:
            target = getattr(self.client(), method_name)
            if not callable(target):
              return target

            response = target(*call_args)
            transient = (response is not None
                         and response.responseCode == ResponseCode.ERROR_TRANSIENT)
            if transient:
              messages = [detail.message for detail in response.details] if response.details else []
              raise self.TransientError(", ".join(messages))

            server_version = response.serverInfo.thriftAPIVersion
            if server_version != THRIFT_API_VERSION:
              raise self.APIVersionError("Client Version: %s, Server Version: %s" %
                  (THRIFT_API_VERSION, server_version))
            return response
          except (TTransport.TTransportException, self.TimeoutError, self.TransientError) as err:
            if not self._terminating.is_set():
              log.warning('Connection error with scheduler: %s, reconnecting...' % err)
              self.invalidate()
              self._terminating.wait(self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
          except Exception as err:
            # Take any error that occurs during the RPC call, and transform it
            # into something clients can handle.
            if not self._terminating.is_set():
              raise self.ThriftInternalError("Error during thrift call %s to %s: %s" %
                                            (method_name, self.cluster.name, err))

        if not self._terminating.is_set():
          raise self.TimeoutError('Timed out attempting to issue %s to %s' % (
              method_name, self.cluster.name))
Exemple #5
0
 def on_finish(service_instance):
   """Unpack a service instance, record it under `member_id`, and announce the join.

   `self` and `member_id` come from the enclosing scope. If unpacking or storing
   fails, a warning is logged and the join callback is not invoked.
   """
   try:
     unpacked = ServiceInstance.unpack(service_instance)
     self._members[member_id] = unpacked
   except Exception as e:
     log.warning('Failed to deserialize endpoint: %s' % e)
   else:
     self._on_join(self._members[member_id])
Exemple #6
0
    def _drain_hosts(self, drainable_hosts):
        """Drain tasks from the given hosts and poll until all of them are DRAINED.

        This will move active tasks on these hosts to the DRAINING state, causing them to be
        rescheduled elsewhere. The maintenance status of the outstanding hosts is then
        queried in a loop (with no delay between queries): hosts reported as DRAINED are
        dropped from the outstanding list, all others are logged with their current mode.
        The loop also ends when a query reports no statuses at all.

        :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
        :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
        """
        check_and_log_response(self._client.drain_hosts(drainable_hosts))
        pending = list(drainable_hosts.hostNames)
        while pending:
            response = self._client.maintenance_status(Hosts(set(pending)))
            statuses = response.result.maintenanceStatusResult.statuses
            if not statuses:
                # Nothing was reported: clear the list so the loop terminates.
                pending = None
            for status in statuses:
                if status.mode == MaintenanceMode.DRAINED:
                    pending.remove(status.host)
                    continue
                mode_name = MaintenanceMode._VALUES_TO_NAMES[status.mode]
                log.warning('%s is currently in status %s' % (status.host, mode_name))
Exemple #7
0
def setup_child_subreaping():
  """
  Mark this process as a child subreaper by calling prctl(2) with the
  `PR_SET_CHILD_SUBREAPER` flag. This means if any children processes need to be
  reparented, they will be reparented to this process.

  More documentation here: http://man7.org/linux/man-pages/man2/prctl.2.html
  and here: https://lwn.net/Articles/474787/

  Failures are logged, never raised: if libc cannot be located or the call fails,
  subreaping is simply left disabled.

  Callers should reap terminal children to prevent zombies.
  """
  log.debug("Calling prctl(2) with PR_SET_CHILD_SUBREAPER")
  # Numeric value of PR_SET_CHILD_SUBREAPER, taken from prctl.h
  subreaper_option = 36
  try:
    libc_path = ctypes.util.find_library('c')
    if libc_path is None:
      for message in ("libc is not found. Unable to call prctl!",
                      "Children subreaping is disabled!"):
        log.warning(message)
      return
    # On a system whose libc has no prctl, the attribute lookup below raises
    # AttributeError, which is handled like any other failure.
    status = ctypes.CDLL(libc_path, use_errno=True).prctl(subreaper_option, 1, 0, 0, 0)
    if status != 0:
      code = ctypes.get_errno()
      raise OSError(code, os.strerror(code))
  except Exception as e:
    log.error("Unable to call prctl %s" % e)
    log.error("Children subreaping is disabled!")
Exemple #8
0
  def allocate_port(self, name, port=None):
    """Bind `name` to a port and return that port.

    With an explicit `port`, the binding is recorded as given; PortConflict is raised
    if `name` is already bound to a different port. Without one, an existing binding
    for `name` is returned, otherwise random ports from SOCKET_RANGE are probed (by
    binding a throwaway socket on localhost) until a free one is found.

    Raises:
      EphemeralPortAllocator.PortConflict: `name` is already bound to another port.
      OSError: probing failed for a reason other than the address being in use.
    """
    if port is not None:
      if name in self._ports and self._ports[name] != port:
        raise EphemeralPortAllocator.PortConflict(
            'Port binding %s=>%s conflicts with current binding %s=>%s' % (
          name, port, name, self._ports[name]))
      self._ports[name] = port
      return port

    if name in self._ports:
      return self._ports[name]

    while True:
      rand_port = random.randint(*EphemeralPortAllocator.SOCKET_RANGE)
      # if this ever needs to be performant, make a peer set.
      if rand_port in self._ports.values():
        continue
      try:
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
          s.bind(('localhost', rand_port))
        finally:
          # Always release the probe socket, including when bind() fails.
          s.close()
      except (OSError, socket.error) as e:
        # socket.error is an alias of OSError on Python 3 but a separate class on
        # Python 2, so both are named to make the retry work on either.
        if e.errno == errno.EADDRINUSE:
          log.warning('Could not bind port: %s' % e)
          time.sleep(0.2)
          continue
        raise
      self._ports[name] = rand_port
      return rand_port
 def _complete_maintenance(self, drained_hosts):
     """End the maintenance status for a given set of hosts.

     After ending maintenance, the hosts' maintenance status is queried once and a
     warning is logged for every host whose mode is not NONE.
     """
     check_and_log_response(self._client.end_maintenance(drained_hosts))
     status_result = self._client.maintenance_status(drained_hosts).result.maintenanceStatusResult
     for status in status_result.statuses:
         if status.mode == MaintenanceMode.NONE:
             continue
         log.warning('%s is DRAINING or in DRAINED' % status.host)
  def _drain_hosts(self, drainable_hosts, clock=time):
    """Drain tasks from the given hosts and poll until all of them are DRAINED.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere. Each polling round sleeps for START_MAINTENANCE_DELAY, then
    queries the maintenance status of the outstanding hosts: hosts reported as DRAINED
    are dropped from the outstanding list, all others are logged with their current
    mode. The loop also ends when a query reports no statuses at all.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :param clock: time module for testing
    :type clock: time
    """
    check_and_log_response(self._client.drain_hosts(drainable_hosts))
    pending = list(drainable_hosts.hostNames)
    while pending:
      log.info("Sleeping for %s." % self.START_MAINTENANCE_DELAY)
      clock.sleep(self.START_MAINTENANCE_DELAY.as_(Time.SECONDS))
      response = self._client.maintenance_status(Hosts(set(pending)))
      statuses = response.result.maintenanceStatusResult.statuses
      if not statuses:
        # Nothing was reported: clear the list so the loop terminates.
        pending = None
      for status in statuses:
        if status.mode == MaintenanceMode.DRAINED:
          pending.remove(status.host)
          continue
        mode_name = MaintenanceMode._VALUES_TO_NAMES[status.mode]
        log.warning('%s is currently in status %s' % (status.host, mode_name))
    def _construct_scheduler(self):
        """Locate the scheduler for this cluster and obtain a thrift client from it.

        Populates:
          self._scheduler
          self._client

        Obtaining the client is retried for up to CONNECT_MAXIMUM_WAIT whenever
        SchedulerClient.CouldNotConnect is raised.

        Raises TimeoutError if no client is available after the retry window, and
        APIVersionError if the server version differs from CURRENT_API_VERSION.
        """
        self._scheduler = SchedulerClient.get(self.cluster, verbose=self.verbose)
        assert self._scheduler, "Could not find scheduler (cluster = %s)" % self.cluster.name

        max_wait_secs = self.CONNECT_MAXIMUM_WAIT.as_(Time.SECONDS)
        began = time.time()
        while (time.time() - began) < max_wait_secs:
            try:
                self._client = self._scheduler.get_thrift_client()
            except SchedulerClient.CouldNotConnect as e:
                log.warning('Could not connect to scheduler: %s' % e)
            else:
                break

        if not self._client:
            raise self.TimeoutError(
                'Timed out trying to connect to scheduler at %s' % self.cluster.name)

        server_version = self._client.getVersion().result.getVersionResult
        if server_version == CURRENT_API_VERSION:
            return
        raise self.APIVersionError(
            "Client Version: %s, Server Version: %s" % (CURRENT_API_VERSION, server_version))
  def _apply_states(self):
    """Apply any checkpoint records appended since the previous call.

    os.stat() the corresponding checkpoint stream of this task; if the file has grown
    past the last consumed offset, read records starting at that offset, hand each one
    to the checkpoint dispatcher and advance the stored offset. Reading stops at the
    first record that cannot be read or whose sequence number the dispatcher rejects.

    Returns True if the stored offset advanced, False otherwise (including when the
    checkpoint file does not exist yet).

    Raises OSError for failures other than ENOENT.
    """
    try:
      ckpt_offset = os.stat(self._runner_ckpt).st_size

      updated = False
      if self._ckpt_head < ckpt_offset:
        # Binary mode: st_size, seek() and tell() are all used as byte offsets here,
        # which text mode does not guarantee (and text decoding would corrupt
        # serialized records on Python 3).
        with open(self._runner_ckpt, 'rb') as fp:
          fp.seek(self._ckpt_head)
          rr = ThriftRecordReader(fp, RunnerCkpt)
          while True:
            runner_update = rr.try_read()
            if not runner_update:
              break
            try:
              self._dispatcher.dispatch(self._runnerstate, runner_update)
            except CheckpointDispatcher.InvalidSequenceNumber as e:
              log.error('Checkpoint stream is corrupt: %s' % e)
              break
          new_ckpt_head = fp.tell()
          updated = self._ckpt_head != new_ckpt_head
          self._ckpt_head = new_ckpt_head
      return updated
    except OSError as e:
      if e.errno == errno.ENOENT:
        # The log doesn't yet exist, will retry later.
        log.warning('Could not read from checkpoint %s' % self._runner_ckpt)
        return False
      else:
        raise
 def _complete_maintenance(self, drained_hosts):
   """End the maintenance status for a given set of hosts.

   Maintenance is ended first; the hosts' status is then fetched once, and each host
   whose mode is anything other than NONE is logged as a warning.
   """
   check_and_log_response(self._client.end_maintenance(drained_hosts))
   response = self._client.maintenance_status(drained_hosts)
   statuses = response.result.maintenanceStatusResult.statuses
   for status in (s for s in statuses if s.mode != MaintenanceMode.NONE):
     log.warning('%s is DRAINING or in DRAINED' % status.host)
Exemple #14
0
    def get_completion(result):
      """Handle the result of a children listing and wake monitors whose view is stale.

      On a disconnect the monitor is re-armed for the next CONNECTED state; if the node
      is missing, control passes to wait_exists(); any other Kazoo error is logged and
      dropped. Otherwise the owned children are recorded, info is requested for each
      newly seen child, and every queued monitor whose expected membership differs from
      the current one receives the current membership set. Monitors that still match
      stay queued.
      """
      try:
        children = result.get()
      except self.DISCONNECT_EXCEPTIONS:
        self._once(KazooState.CONNECTED, do_monitor)
        return
      except ke.NoNodeError:
        wait_exists()
        return
      except ke.KazooException as e:
        log.warning('Unexpected get_completion result: (%s)%s' % (type(e), e))
        return

      owned = [znode for znode in children if self.znode_owned(znode)]
      _, added = self._update_children(owned)

      def devnull(*args, **kw):
        pass

      # The info results are not needed here, so the callback discards them.
      for znode in added:
        self.info(znode, callback=devnull)

      waiting = self._monitor_queue[:]
      self._monitor_queue = []
      members = set(Membership(self.znode_to_id(znode)) for znode in owned)
      for membership, capture in waiting:
        if set(membership) == members:
          self._monitor_queue.append((membership, capture))
        else:
          capture.set(members)
    def method_wrapper(*args):
      """Call `method_name` (from the enclosing scope) on the scheduler client with retries.

      A session key is appended to the arguments unless the method is listed in
      UNAUTHENTICATED_RPCS. A non-callable attribute is returned as-is. Transport and
      timeout errors invalidate the connection and are retried after RPC_RETRY_INTERVAL,
      for as long as RPC_MAXIMUM_WAIT has not elapsed and termination has not been
      requested. Any other exception is re-raised as ThriftInternalError.

      Raises TimeoutError when the retry window closes without a result. Once termination
      has been requested, errors are dropped and None is returned.
      """
      with self._lock:
        start = time.time()
        while not self._terminating.is_set() and (
            time.time() - start) < self.RPC_MAXIMUM_WAIT.as_(Time.SECONDS):

          auth_args = () if method_name in self.UNAUTHENTICATED_RPCS else (self.session_key(),)
          try:
            method = getattr(self.client(), method_name)
            if not callable(method):
              return method
            return method(*(args + auth_args))
          except (TTransport.TTransportException, self.TimeoutError) as e:
            # _terminating is an event object and is always truthy; its flag has to be
            # read with is_set(), otherwise this branch never runs and the loop spins.
            if not self._terminating.is_set():
              log.warning('Connection error with scheduler: %s, reconnecting...' % e)
              self.invalidate()
              self._terminating.wait(self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
          except Exception as e:
            # Take any error that occurs during the RPC call, and transform it
            # into something clients can handle.
            if not self._terminating.is_set():
              raise self.ThriftInternalError("Error during thrift call %s to %s: %s" %
                                            (method_name, self.cluster.name, e))
        if not self._terminating.is_set():
          raise self.TimeoutError('Timed out attempting to issue %s to %s' % (
              method_name, self.cluster.name))
Exemple #16
0
    def method_wrapper(*args):
      """Call `method_name` (from the enclosing scope) on the scheduler client with retries.

      A non-callable attribute is returned as-is. A response whose code is
      ERROR_TRANSIENT is treated like a transport or timeout error: the connection is
      invalidated and the call retried after RPC_RETRY_INTERVAL, for as long as
      RPC_MAXIMUM_WAIT has not elapsed and termination has not been requested. A
      transport-level auth failure is logged and re-raised as AuthError; any other
      exception is re-raised as ThriftInternalError.

      Raises TimeoutError when the retry window closes without a result. Once termination
      has been requested, errors are dropped and None is returned.
      """
      with self._lock:
        began = time.time()
        while not self._terminating.is_set() and (
            time.time() - began) < self.RPC_MAXIMUM_WAIT.as_(Time.SECONDS):

          try:
            target = getattr(self.client(), method_name)
            if not callable(target):
              return target
            response = target(*args)
            if response is None or response.responseCode != ResponseCode.ERROR_TRANSIENT:
              return response
            messages = [detail.message for detail in response.details] if response.details else []
            raise self.TransientError(", ".join(messages))
          except TRequestsTransport.AuthError as err:
            log.error(self.scheduler_client().get_failed_auth_message())
            raise self.AuthError(err)
          except (TTransport.TTransportException, self.TimeoutError, self.TransientError) as err:
            if not self._terminating.is_set():
              log.warning('Connection error with scheduler: %s, reconnecting...' % err)
              self.invalidate()
              self._terminating.wait(self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
          except Exception as err:
            # Take any error that occurs during the RPC call, and transform it
            # into something clients can handle.
            if not self._terminating.is_set():
              raise self.ThriftInternalError("Error during thrift call %s to %s: %s" %
                                            (method_name, self.cluster.name, err))

        if not self._terminating.is_set():
          raise self.TimeoutError('Timed out attempting to issue %s to %s' % (
              method_name, self.cluster.name))
Exemple #17
0
    def __call__(self, endpoint, use_post_method=False, expected_response=None, expected_response_code=None):
        """Query an endpoint and compare the reply with the expected body and/or status code.

        Returns a (boolean, string|None) tuple of (call success, failure reason).

        The response body is stripped and lower-cased before it is compared with the
        lower-cased expected_response; the body check takes precedence over the status
        code check. Mismatches are logged as warnings, and the body values in the
        returned reason are truncated to FAILURE_REASON_LENGTH. A QueryError raised
        during the call is reported as a failure with its message as the reason.
        """

        def shorten(string):
            # Cut over-long values and mark the cut with an ellipsis.
            limit = self.FAILURE_REASON_LENGTH
            if len(string) < limit:
                return string
            return "%s..." % string[: limit - 3]

        try:
            # query() receives an empty string when use_post_method is set, otherwise None.
            request_data = "" if use_post_method else None
            response, response_code = self.query(endpoint, request_data)
            response = response.strip().lower()

            if expected_response and response != expected_response.lower():
                reason = 'Response differs from expected response (expected "%s", got "%s")'
                log.warning(reason % (expected_response, response))
                return (False, reason % (shorten(str(expected_response)), shorten(str(response))))

            if expected_response_code and response_code != expected_response_code:
                reason = "Response code differs from expected response (expected %i, got %i)"
                codes = (expected_response_code, response_code)
                log.warning(reason % codes)
                return (False, reason % codes)

            return (True, None)
        except self.QueryError as e:
            return (False, str(e))
    def method_wrapper(*args):
      """Call `method_name` (from the enclosing scope) on the scheduler client with retries.

      A session key is appended to the arguments unless the method belongs to the
      read-only API. A non-callable attribute is returned as-is. Transport and timeout
      errors invalidate the connection and are retried after RPC_RETRY_INTERVAL, for as
      long as RPC_MAXIMUM_WAIT has not elapsed and termination has not been requested.
      Any other exception is re-raised as ThriftInternalError.

      Raises TimeoutError when the retry window closes without a result. Once termination
      has been requested, errors are dropped and None is returned.
      """
      with self._lock:
        start = time.time()
        # TODO(wfarner): The while loop causes failed unit tests to spin for the retry
        # period (currently 10 minutes).  Figure out a better approach.
        while not self._terminating.is_set() and (
            time.time() - start) < self.RPC_MAXIMUM_WAIT.as_(Time.SECONDS):

          # Only automatically append a SessionKey if this is not part of the read-only API.
          auth_args = () if hasattr(ReadOnlyScheduler.Iface, method_name) else (self.session_key(),)
          try:
            method = getattr(self.client(), method_name)
            if not callable(method):
              return method
            return method(*(args + auth_args))
          except (TTransport.TTransportException, self.TimeoutError) as e:
            # _terminating is an event object and is always truthy; its flag has to be
            # read with is_set(), otherwise this branch never runs and the loop spins.
            if not self._terminating.is_set():
              log.warning('Connection error with scheduler: %s, reconnecting...' % e)
              self.invalidate()
              self._terminating.wait(self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
          except Exception as e:
            # Take any error that occurs during the RPC call, and transform it
            # into something clients can handle.
            if not self._terminating.is_set():
              raise self.ThriftInternalError("Error during thrift call %s to %s: %s" %
                                            (method_name, self.cluster.name, e))
        if not self._terminating.is_set():
          raise self.TimeoutError('Timed out attempting to issue %s to %s' % (
              method_name, self.cluster.name))
Exemple #19
0
  def _drain_hosts(self, drainable_hosts):
    """Drain tasks from the given hosts and poll until they report DRAINED.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere. After the drain request, the status of the still-pending
    hosts is re-checked once per STATUS_POLL_INTERVAL. Polling stops when no host is
    pending, when the wait event is set, or when the accumulated wait exceeds
    MAX_STATUS_WAIT (in which case the stragglers are logged).

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :rtype: set of host names failed to drain
    """
    check_and_log_response(self._client.drain_hosts(drainable_hosts))

    pending = set(drainable_hosts.hostNames)
    # The first poll interval is counted before any waiting has happened.
    waited = self.STATUS_POLL_INTERVAL
    while not self._wait_event.is_set() and pending:
      log.info('Waiting for hosts to be in DRAINED: %s' % pending)
      self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

      statuses = self.check_status(list(pending))
      pending = {entry[0] for entry in statuses if entry[1] != 'DRAINED'}
      waited += self.STATUS_POLL_INTERVAL

      if not pending or not (waited > self.MAX_STATUS_WAIT):
        continue

      # Out of time: report every host that is still not DRAINED and give up.
      report = '\n'.join(
          "\tHost:%s\tStatus:%s" % entry for entry in sorted(statuses) if entry[1] != 'DRAINED')
      log.warning('Failed to move all hosts into DRAINED within %s:\n%s' %
          (self.MAX_STATUS_WAIT, report))
      break

    return pending
Exemple #20
0
    def acreate_completion(result):
      """Handle the result of the asynchronous znode creation for a join.

      On a disconnect the join is re-armed for the next CONNECTED state and nothing is
      captured. On any other Kazoo error an error membership is captured. On success
      the membership for the created znode is recorded (its future resolved with
      `blob`), expiry watching is set up when `expire_callback` is given, and the new
      membership is captured. `blob`, `expire_callback`, `membership_capture` and the
      helper callbacks come from the enclosing scope.
      """
      try:
        # TODO(wickman) Kazoo has a bug:
        #    https://github.com/python-zk/kazoo/issues/106
        #    https://github.com/python-zk/kazoo/pull/107
        # Remove this once 1.3 is cut.
        znode_path = self._zk.unchroot(result.get())
      except self.DISCONNECT_EXCEPTIONS:
        self._once(KazooState.CONNECTED, do_join)
        return
      except ke.KazooException as e:
        log.warning('Unexpected Kazoo result in join: (%s)%s' % (type(e), e))
        membership_capture.set(Membership.error())
        return

      membership = Membership(self.znode_to_id(znode_path))
      with self._member_lock:
        # Reuse a future already registered for this membership, if there is one.
        future = self._members.get(membership, Future())
        future.set_result(blob)
        self._members[membership] = future
      if expire_callback:
        self._once(KazooState.CONNECTED, expire_notifier)
        do_exists(znode_path)

      membership_capture.set(membership)
Exemple #21
0
    def __call__(self,
                 endpoint,
                 use_post_method=False,
                 expected_response=None,
                 expected_response_code=None):
        """Query an endpoint and check the reply against the expected body and/or status code.

        Returns a (boolean, string|None) tuple of (call success, failure reason).

        The body (stripped, lower-cased) is checked first against the lower-cased
        expected_response, then the status code against expected_response_code. Each
        mismatch is logged as a warning; body values in the returned reason are cut to
        FAILURE_REASON_LENGTH. A QueryError becomes a failure carrying its message.
        """
        try:
            response, response_code = self.query(
                endpoint, '' if use_post_method else None)
            response = response.strip().lower()

            body_mismatch = expected_response and response != expected_response.lower()
            if body_mismatch:
                reason = 'Response differs from expected response (expected "%s", got "%s")'
                limit = self.FAILURE_REASON_LENGTH

                def shorten(string):
                    if len(string) < limit:
                        return string
                    return "%s..." % string[:limit - 3]

                log.warning(reason % (expected_response, response))
                brief = (shorten(str(expected_response)), shorten(str(response)))
                return (False, reason % brief)

            if expected_response_code and response_code != expected_response_code:
                reason = 'Response code differs from expected response (expected %i, got %i)'
                codes = (expected_response_code, response_code)
                log.warning(reason % codes)
                return (False, reason % codes)

            return (True, None)
        except self.QueryError as e:
            return (False, str(e))
Exemple #22
0
  def _fast_forward_stream(self, process_name):
    """Advance a process's checkpoint stream up to its recorded high watermark.

    Records are read from the stream registered for process_name until one carries the
    high-watermark sequence number or the stream runs out. A record whose sequence
    number is beyond the high watermark is un-read by seeking back to where it started.
    A warning is logged if the high watermark was not reached.
    """
    target_seq = self._watermarks[process_name]
    log.debug('Fast forwarding %s stream to seq=%s' % (process_name, target_seq))
    assert self._processes.get(process_name) is not None
    fp = self._processes[process_name]
    reader = ThriftRecordReader(fp, RunnerCkpt)
    seen_seq = -1
    skipped = 0
    while seen_seq < target_seq:
      rewind_to = fp.tell()
      record = reader.try_read()
      if record is None:
        break
      record_seq = record.process_status.seq
      if record_seq > target_seq:
        # Read one record too far: put the file position back before it.
        log.debug('Over-seeked %s [watermark = %s, high watermark = %s], rewinding.' % (
          process_name, record_seq, target_seq))
        fp.seek(rewind_to)
        break
      seen_seq = record_seq
      skipped += 1

    if seen_seq < target_seq:
      log.warning('Only able to fast forward to %s@sequence=%s, high watermark is %s' % (
         process_name, seen_seq, target_seq))

    if skipped:
      log.debug('Fast forwarded %s %s record(s) to seq=%s.' % (process_name, skipped, seen_seq))
        def get_completion(result):
            """Process a children listing and notify monitors whose membership view changed.

            A disconnect re-arms the monitor for the next CONNECTED state, a missing node
            defers to wait_exists(), and any other Kazoo error is logged and dropped.
            Otherwise the owned children are recorded, info is requested for each newly
            seen child, and queued monitors are either handed the current membership set
            (when it differs from what they expected) or put back on the queue.
            """
            try:
                listed = result.get()
            except self.DISCONNECT_EXCEPTIONS:
                self._once(KazooState.CONNECTED, do_monitor)
                return
            except ke.NoNodeError:
                wait_exists()
                return
            except ke.KazooException as e:
                log.warning('Unexpected get_completion result: (%s)%s' %
                            (type(e), e))
                return

            def devnull(*args, **kw):
                pass

            owned = [znode for znode in listed if self.znode_owned(znode)]
            _, fresh = self._update_children(owned)
            # The info results are not needed here, so the callback discards them.
            for znode in fresh:
                self.info(znode, callback=devnull)

            queued = self._monitor_queue[:]
            self._monitor_queue = []
            current = set(Membership(self.znode_to_id(znode)) for znode in owned)
            for expected, capture in queued:
                if set(expected) == current:
                    self._monitor_queue.append((expected, capture))
                    continue
                capture.set(current)
    def sample(self):
        """Collate and aggregate ProcessSamples for the process and its children.

        Returns None: the aggregate is stored in self._sample, the CPU rate in
        self._rate and the per-pid samples in self._sampled_tree.

        If sampling fails with PsutilError, the aggregate is reset to an empty sample
        and the rate to 0.0. The rate is only recomputed when a previous sample tree
        and timestamp exist and time has advanced since that timestamp.
        """
        try:
            if self._process is None:
                self._process = Process(self._pid)
            parent = self._process
            parent_sample = process_to_sample(parent)
            new_samples = dict((proc.pid, process_to_sample(proc)) for proc in parent.get_children(recursive=True))
            new_samples[self._pid] = parent_sample

        except PsutilError as e:
            log.warning("Error during process sampling: %s" % e)
            self._sample = ProcessSample.empty()
            self._rate = 0.0

        else:
            last_stamp = self._stamp
            self._stamp = time()
            # for most stats, calculate simple sum to aggregate
            self._sample = sum(new_samples.values(), ProcessSample.empty())
            # cpu consumption is more complicated
            # We require at least 2 generations of a process before we can calculate rate, so for all
            # current processes that were not running in the previous sample, compare to an empty sample
            if self._sampled_tree and last_stamp:
                elapsed = self._stamp - last_stamp
                # Two samples taken at the same instant (or a clock that stepped back)
                # cannot yield a meaningful rate; keep the previous one instead of
                # dividing by zero or reporting a negative rate.
                if elapsed > 0:
                    new = new_samples.values()
                    old = [self._sampled_tree.get(pid, ProcessSample.empty()) for pid in new_samples.keys()]
                    new_user_sys = sum(map(attrgetter("user"), new)) + sum(map(attrgetter("system"), new))
                    old_user_sys = sum(map(attrgetter("user"), old)) + sum(map(attrgetter("system"), old))
                    self._rate = (new_user_sys - old_user_sys) / elapsed
                    log.debug("Calculated rate for pid=%s and children: %s" % (self._process.pid, self._rate))
            self._sampled_tree = new_samples
Exemple #25
0
  def disambiguate_args_or_die(cls, args, options, client_factory=AuroraClientAPI):
    """
    Resolve command-line args into a (AuroraClientAPI, AuroraJobKey, AuroraConfigFile:str)
    tuple, potentially querying the scheduler with the returned client.
    Calls die() with an appropriate error message if that is not possible.

    args[0] is first parsed as a job path, with args[1] (if any) taken as the config
    file. If parsing fails, args are treated as (role, name) in compatibility mode
    and the cluster is taken from options.

    Arguments:
      args: args from app command invocation.
      options: options from app command invocation. must have env and cluster attributes.
      client_factory: a callable (cluster) -> AuroraClientAPI.
    """
    if len(args) < 1:
      die('job path is required')
    try:
      job_key = AuroraJobKey.from_path(args[0])
      client = client_factory(job_key.cluster)
      config_file = None  # the config for hooks
      if len(args) > 1:
        config_file = args[1]
      return client, job_key, config_file
    except AuroraJobKey.Error:
      log.warning("Failed to parse job path, falling back to compatibility mode")
      role = name = None
      if len(args) > 0:
        role = args[0]
      if len(args) > 1:
        name = args[1]
      cluster = options.cluster
      if not cluster:
        die('cluster is required')
      client = client_factory(cluster)
      # deprecated form does not support hooks functionality, hence no env and no config file
      resolved_key = cls._disambiguate_or_die(client, role, None, name)
      return client, resolved_key, None
  def _construct_scheduler(self):
    """Locate the scheduler for this cluster and obtain a thrift client from it.

    Populates:
      self._scheduler_client
      self._client

    Obtaining the client is retried for up to CONNECT_MAXIMUM_WAIT whenever
    SchedulerClient.CouldNotConnect is raised; any other exception aborts the retries
    and is re-raised as AuthenticationError.

    Raises TimeoutError if no client is available after the retry window, and
    APIVersionError if the server version differs from CURRENT_API_VERSION.
    """
    self._scheduler_client = SchedulerClient.get(self.cluster, verbose=self.verbose)
    assert self._scheduler_client, "Could not find scheduler (cluster = %s)" % self.cluster.name

    max_wait_secs = self.CONNECT_MAXIMUM_WAIT.as_(Time.SECONDS)
    began = time.time()
    while (time.time() - began) < max_wait_secs:
      try:
        # this can wind up generating any kind of error, because it turns into
        # a call to a dynamically set authentication module.
        self._client = self._scheduler_client.get_thrift_client()
      except SchedulerClient.CouldNotConnect as e:
        log.warning('Could not connect to scheduler: %s' % e)
      except Exception as e:
        # turn any auth module exception into an auth error.
        log.debug('Warning: got an unknown exception during authentication:')
        log.debug(traceback.format_exc())
        raise self.AuthenticationError('Error connecting to scheduler: %s' % e)
      else:
        break

    if not self._client:
      raise self.TimeoutError('Timed out trying to connect to scheduler at %s' % self.cluster.name)

    server_version = self._client.getVersion().result.getVersionResult
    if server_version == CURRENT_API_VERSION:
      return
    raise self.APIVersionError("Client Version: %s, Server Version: %s" %
                               (CURRENT_API_VERSION, server_version))
Exemple #27
0
def setup_child_subreaping():
    """Ask the kernel to make this process a child subreaper.

    This uses the prctl(2) syscall to set the `PR_SET_CHILD_SUBREAPER` flag. This
    means if any children processes need to be reparented, they will be reparented
    to this process.

    More documentation here: http://man7.org/linux/man-pages/man2/prctl.2.html
    and here: https://lwn.net/Articles/474787/

    Callers should reap terminal children to prevent zombies.

    This is best-effort: if libc cannot be located, or the call fails for any
    reason, the problem is logged and the function returns normally.
    """
    log.debug("Calling prctl(2) with PR_SET_CHILD_SUBREAPER")
    # This constant is taken from prctl.h
    PR_SET_CHILD_SUBREAPER = 36
    try:
        libc_path = ctypes.util.find_library('c')
        if libc_path is not None:
            libc = ctypes.CDLL(libc_path, use_errno=True)
            # If we are on a system where prctl doesn't exist, this will throw an
            # attribute error.
            if libc.prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) != 0:
                error_code = ctypes.get_errno()
                raise OSError(error_code, os.strerror(error_code))
        else:
            log.warning("libc is not found. Unable to call prctl!")
            log.warning("Children subreaping is disabled!")
    except Exception as err:
        log.error("Unable to call prctl %s" % err)
        log.error("Children subreaping is disabled!")
Exemple #28
0
 def _construct_scheduler(self):
     """Find the scheduler for this cluster and obtain a thrift client from it.

     Populates:
       self._scheduler_client
       self._client

     Keeps retrying on SchedulerClient.CouldNotConnect until CONNECT_MAXIMUM_WAIT
     has elapsed. Any other exception is re-raised as AuthError. TimeoutError is
     raised when self._client is still falsy after the retry loop.
     """
     self._scheduler_client = SchedulerClient.get(
         self.cluster, verbose=self.verbose, **self._kwargs)
     assert self._scheduler_client, "Could not find scheduler (cluster = %s)" % self.cluster.name

     attempt_started = time.time()
     while (time.time() - attempt_started) < self.CONNECT_MAXIMUM_WAIT.as_(Time.SECONDS):
         try:
             # this can wind up generating any kind of error, because it turns into
             # a call to a dynamically set authentication module.
             self._client = self._scheduler_client.get_thrift_client()
         except SchedulerClient.CouldNotConnect as connect_err:
             log.warning('Could not connect to scheduler: %s' % connect_err)
             continue
         except Exception as auth_err:
             # turn any auth module exception into an auth error.
             log.debug('Warning: got an unknown exception during authentication:')
             log.debug(traceback.format_exc())
             raise self.AuthError('Error connecting to scheduler: %s' % auth_err)
         break

     if self._client:
         return
     raise self.TimeoutError(
         'Timed out trying to connect to scheduler at %s' % self.cluster.name)
  def _drain_hosts(self, drainable_hosts):
    """Drain tasks from the specified hosts.

    This will move active tasks on these hosts to the DRAINING state, causing them to be
    rescheduled elsewhere. After issuing the drain, the hosts are re-checked every
    STATUS_POLL_INTERVAL until none remain undrained, the wait event is set, or the
    accumulated wait exceeds MAX_STATUS_WAIT.

    :param drainable_hosts: Hosts that are in maintenance mode and ready to be drained
    :type drainable_hosts: gen.apache.aurora.ttypes.Hosts
    :rtype: set of host names failed to drain
    """
    check_and_log_response(self._client.drain_hosts(drainable_hosts))
    drainable_hostnames = list(drainable_hosts.hostNames)

    pending_hostnames = set(drainable_hostnames)
    waited = self.STATUS_POLL_INTERVAL
    while not self._wait_event.is_set() and pending_hostnames:
      self._wait_event.wait(self.STATUS_POLL_INTERVAL.as_(Time.SECONDS))

      pending_hostnames = self.check_if_drained(drainable_hostnames)

      waited += self.STATUS_POLL_INTERVAL
      if not pending_hostnames:
        # Nothing left to wait for; the loop condition ends the loop.
        continue
      if waited > self.MAX_STATUS_WAIT:
        log.warning('Failed to move all hosts into DRAINED within %s' % self.MAX_STATUS_WAIT)
        break

    return pending_hostnames
Exemple #30
0
    def method_wrapper(*args):
      """Invoke `method_name` on the current client, retrying on connection-level failures.

      `self` and `method_name` come from the enclosing scope. The whole retry loop runs while
      holding self._lock. Retries continue until RPC_MAXIMUM_WAIT has elapsed or
      self._terminating is set.

      Returns the attribute itself when it is not callable, otherwise the RPC response.
      Returns None implicitly when the loop ends because self._terminating was set.

      Raises:
        AuthError: the transport reported an authentication failure.
        ThriftInternalError: any other unexpected exception occurred (unless terminating).
        TimeoutError: the wait budget was exhausted without a result (unless terminating).
      """
      with self._lock:
        start = time.time()
        while not self._terminating.is_set() and (
            time.time() - start) < self.RPC_MAXIMUM_WAIT.as_(Time.SECONDS):

          try:
            method = getattr(self.client(), method_name)
            # Plain (non-callable) attributes are handed back without any RPC.
            if not callable(method):
              return method

            resp = method(*args)
            # A transient error response is raised as TransientError so that the
            # handler below invalidates the client and retries.
            if resp is not None and resp.responseCode == ResponseCode.ERROR_TRANSIENT:
              raise self.TransientError(", ".join(
                  [m.message for m in resp.details] if resp.details else []))
            return resp
          except TRequestsTransport.AuthError as e:
            # Authentication failures are not retried.
            log.error(self.scheduler_client().get_failed_auth_message())
            raise self.AuthError(e)
          except (TTransport.TTransportException, self.TimeoutError, self.TransientError) as e:
            # Connection-level problem: invalidate, wait RPC_RETRY_INTERVAL (or until
            # terminating is set), then loop again. Skipped entirely when terminating.
            if not self._terminating.is_set():
              log.warning('Connection error with scheduler: %s, reconnecting...' % e)
              self.invalidate()
              self._terminating.wait(self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
          except Exception as e:
            # Take any error that occurs during the RPC call, and transform it
            # into something clients can handle.
            if not self._terminating.is_set():
              raise self.ThriftInternalError("Error during thrift call %s to %s: %s" %
                                            (method_name, self.cluster.name, e))
        # Reached only when the loop ended without returning a result.
        if not self._terminating.is_set():
          raise self.TimeoutError('Timed out attempting to issue %s to %s' % (
              method_name, self.cluster.name))
Exemple #31
0
  def run(self):
    """Thread entrypoint. Loop until self._kill_signal is set, sampling process collectors every
    self._process_collection_interval and the disk collector every
    self._disk_collection_interval, and recording one aggregated result per pass."""

    log.debug('Commencing resource monitoring for task "%s"' % self._task_id)
    # Starting both deadlines at 0 makes the first pass collect immediately.
    next_process_collection = 0
    next_disk_collection = 0

    while not self._kill_signal.is_set():

      now = time.time()

      if now > next_process_collection:
        next_process_collection = now + self._process_collection_interval
        # Reconcile the collectors with the currently active processes: drop collectors
        # for processes that went away, create collectors for newly seen ones.
        actives = set(self._get_active_processes())
        current = set(self._process_collectors)
        for process in current - actives:
          self._process_collectors.pop(process)
        for process in actives - current:
          self._process_collectors[process] = self._process_collector_factory(process.pid)
        for process, collector in self._process_collectors.items():
          collector.sample()

      if now > next_disk_collection:
        next_disk_collection = now + self._disk_collection_interval
        # The disk collector is created lazily, once the task monitor reports a sandbox.
        if not self._disk_collector:
          sandbox = self._task_monitor.get_sandbox()
          if sandbox:
            self._disk_collector = self._disk_collector_class(sandbox)
        if self._disk_collector:
          self._disk_collector.sample()
        else:
          log.debug('No sandbox detected yet for %s' % self._task_id)

      try:
        # Sum the per-process values; the sample sum is seeded with an empty ProcessSample.
        aggregated_procs = sum(map(attrgetter('procs'), self._process_collectors.values()))
        aggregated_sample = sum(map(attrgetter('value'), self._process_collectors.values()),
                                ProcessSample.empty())
        # Disk usage is recorded as 0 until a disk collector exists.
        disk_value = self._disk_collector.value if self._disk_collector else 0
        self._history.add(now, self.ResourceResult(aggregated_procs, aggregated_sample, disk_value))
      except ValueError as err:
        # A ValueError while recording is logged and the loop carries on.
        log.warning("Error recording resource sample: %s" % err)

      # Sleep until any of the following conditions are met:
      # - it's time for the next disk collection
      # - it's time for the next process collection
      # - the result from the last disk collection is available via the DiskCollector
      # - the TaskResourceMonitor has been killed via self._kill_signal
      now = time.time()
      next_collection = min(next_process_collection - now, next_disk_collection - now)

      # Without a disk collector there is no completed_event, so wait on the kill signal alone.
      if self._disk_collector:
        waiter = EventMuxer(self._kill_signal, self._disk_collector.completed_event)
      else:
        waiter = self._kill_signal

      # Clamp to 0 so an already-overdue collection never yields a negative timeout.
      waiter.wait(timeout=max(0, next_collection))

    log.debug('Stopping resource monitoring for task "%s"' % self._task_id)
Exemple #32
0
  def run(self):
    """Thread entrypoint: sample the task's processes and disk until the kill signal is set.

    Each pass reconciles and samples the per-process collectors when the process interval has
    elapsed, samples the disk collector when the disk interval has elapsed (creating it once a
    sandbox is available), records the aggregate in the history, and then waits for the next
    collection time or a wake-up event.
    """
    log.debug('Commencing resource monitoring for task "%s"' % self._task_id)
    process_due_at = 0
    disk_due_at = 0

    while not self._kill_signal.is_set():
      now = time.time()

      if now > process_due_at:
        process_due_at = now + self._process_collection_interval
        active = set(self._get_active_processes())
        monitored = set(self._process_collectors)
        for departed in monitored - active:
          self._process_collectors.pop(departed)
        for arrived in active - monitored:
          self._process_collectors[arrived] = self._process_collector_factory(arrived.pid)
        for collector in self._process_collectors.values():
          collector.sample()

      if now > disk_due_at:
        disk_due_at = now + self._disk_collection_interval
        if not self._disk_collector:
          sandbox = self._task_monitor.get_sandbox()
          if sandbox:
            self._disk_collector = self._disk_collector_class(sandbox)
        if not self._disk_collector:
          log.debug('No sandbox detected yet for %s' % self._task_id)
        else:
          self._disk_collector.sample()

      try:
        collectors = self._process_collectors.values()
        total_procs = sum(collector.procs for collector in collectors)
        total_sample = sum((collector.value for collector in collectors), ProcessSample.empty())
        if self._disk_collector:
          disk_usage = self._disk_collector.value
        else:
          disk_usage = 0
        self._history.add(now, self.ResourceResult(total_procs, total_sample, disk_usage))
      except ValueError as err:
        log.warning("Error recording resource sample: %s" % err)

      # Sleep until any of the following conditions are met:
      # - it's time for the next disk collection
      # - it's time for the next process collection
      # - the result from the last disk collection is available via the DiskCollector
      # - the TaskResourceMonitor has been killed via self._kill_signal
      now = time.time()
      sleep_for = min(process_due_at - now, disk_due_at - now)
      waiter = (EventMuxer(self._kill_signal, self._disk_collector.completed_event)
                if self._disk_collector else self._kill_signal)
      waiter.wait(timeout=max(0, sleep_for))

    log.debug('Stopping resource monitoring for task "%s"' % self._task_id)
Exemple #33
0
 def __iter__(self):
     """Yield a ServiceInstance for each member of this ServerSet's group.

     A member whose info cannot be unpacked is logged and skipped; iteration then
     moves on to the next member.
     """
     for member in self._group.list():
         try:
             packed = self._group.info(member)
             yield ServiceInstance.unpack(packed)
         except Exception as err:
             log.warning('Failed to deserialize endpoint: %s' % err)
Exemple #34
0
 def run(self):
     """Run the finalizing plan once and report how long to wait before the next pass.

     Returns None when the plan is deadlocked, when no finalization time remains, or
     when the plan is complete. Otherwise returns the smaller of the remaining
     finalization time and MAX_ITERATION_WAIT in seconds.
     """
     runner = self.runner
     runner._run_plan(runner._finalizing_plan)
     log.debug("TaskRunnerStage[FINALIZING]: Finalization remaining: %s" % runner._finalization_remaining())

     if runner.deadlocked(runner._finalizing_plan):
         log.warning("Finalizing plan deadlocked.")
         return None

     if runner._finalization_remaining() > 0:
         if not runner._finalizing_plan.is_complete():
             return min(runner._finalization_remaining(), self.MAX_ITERATION_WAIT.as_(Time.SECONDS))
     return None
Exemple #35
0
 def __iter__(self):
   """Iterate over the services (ServiceInstance objects) in this ServerSet.

   Members that fail to deserialize are logged with a warning and omitted.
   """
   for group_member in self._group.list():
     try:
       member_info = self._group.info(group_member)
       yield ServiceInstance.unpack(member_info)
     except Exception as failure:
       log.warning('Failed to deserialize endpoint: %s' % failure)
  def perform_maintenance(self, hostnames, grouping_function=DEFAULT_GROUPING,
                          percentage=None, duration=None, output_file=None):
    """Put hosts into maintenance mode and drain them.

    Walk through the process of putting hosts into maintenance and draining them of tasks. The hosts
    will remain in maintenance mode upon completion.

    Hosts are processed in batches. Within each batch, hosts that fail the SLA check are skipped
    and the rest are drained. Hosts that `_drain_hosts` reports as not drained are excluded from
    the returned set as well.

    :param hostnames: A list of hostnames to operate upon
    :type hostnames: list of strings
    :param grouping_function: How to split up the hostname into groups
    :type grouping_function: function
    :param percentage: SLA percentage to use
    :type percentage: float
    :param duration: SLA duration to use
    :type duration: twitter.common.quantity.Time
    :param output_file: file to write hosts that were not drained due to failed SLA check
    :type output_file: string
    :rtype: set of host names that were successfully drained
    """
    hostnames = self.start_maintenance(hostnames)
    not_drained_hostnames = set()
    # Hosts that passed the SLA check but did not reach DRAINED. Tracked separately so the
    # SLA report and output file keep their existing meaning.
    failed_drain_hostnames = set()

    for hosts in self.iter_batches(hostnames, grouping_function):
      log.info('Beginning SLA check for %s' % hosts.hostNames)
      unsafe_hostnames = self._check_sla(
          list(hosts.hostNames),
          grouping_function,
          percentage,
          duration)

      if unsafe_hostnames:
        log.warning('Some hosts did not pass SLA check and will not be drained! '
                    'Skipping hosts: %s' % unsafe_hostnames)
        not_drained_hostnames |= unsafe_hostnames
        drainable_hostnames = hosts.hostNames - unsafe_hostnames
        if not drainable_hostnames:
          continue
        hosts = Hosts(drainable_hostnames)
      else:
        log.info('All hosts passed SLA check.')

      # _drain_hosts returns the hosts it could not drain; previously this result was
      # discarded and such hosts were reported as successfully drained. The truthiness
      # guard also tolerates an implementation that returns None.
      undrained = self._drain_hosts(hosts)
      if undrained:
        log.warning('Some hosts failed to drain: %s' % undrained)
        failed_drain_hostnames |= set(undrained)

    if not_drained_hostnames:
      output = '\n'.join(not_drained_hostnames)
      log.info('The following hosts did not pass SLA check and were not drained:')
      print(output)
      if output_file:
        try:
          with open(output_file, 'w') as fp:
            fp.write(output)
            fp.write('\n')
          log.info('Written unsafe host names into: %s' % output_file)
        except IOError as e:
          log.error('Failed to write into the output file: %s' % e)

    return set(hostnames) - not_drained_hostnames - failed_drain_hostnames
Exemple #37
0
 def run(self):
   """Advance the finalizing plan and return the wait before the next iteration.

   Returns None if the plan is deadlocked, if no finalization time remains, or if the
   plan has completed. Otherwise returns the remaining finalization time capped at
   MAX_ITERATION_WAIT seconds.
   """
   task_runner = self.runner
   task_runner._run_plan(task_runner._finalizing_plan)
   remaining_message = ('TaskRunnerStage[FINALIZING]: Finalization remaining: %s' %
       task_runner._finalization_remaining())
   log.debug(remaining_message)

   if task_runner.deadlocked(task_runner._finalizing_plan):
     log.warning('Finalizing plan deadlocked.')
     return None

   if task_runner._finalization_remaining() > 0:
     if not task_runner._finalizing_plan.is_complete():
       wait_cap = self.MAX_ITERATION_WAIT.as_(Time.SECONDS)
       return min(task_runner._finalization_remaining(), wait_cap)
   return None
Exemple #38
0
 def get_completion(_, rc, children):
   """Completion callback for a children fetch.

   Retries via do_monitor() on a retryable return code, publishes an error membership
   on any other non-OK code, and otherwise updates the children and publishes the new
   member set. Names other than the arguments come from the enclosing scope.
   """
   if rc in self._zk.COMPLETION_RETRY:
     do_monitor()
   elif rc != zookeeper.OK:
     log.warning('Unexpected get_completion return code: %s' % ZooKeeper.ReturnCode(rc))
     promise.set({Membership.error()})
   else:
     self._update_children(children)
     set_different(promise, membership, self._members)
  def run(self):
    """Thread entrypoint. Loop until self._kill_signal is set, sampling process collectors every
    self._process_collection_interval and the disk collector every
    self._disk_collection_interval, and recording one aggregated result per pass."""

    log.debug('Commencing resource monitoring for task "%s"' % self._task_id)
    # Starting both deadlines at 0 makes the first pass collect immediately.
    next_process_collection = 0
    next_disk_collection = 0

    while not self._kill_signal.is_set():

      now = time.time()

      if now > next_process_collection:
        next_process_collection = now + self._process_collection_interval
        # Reconcile the collectors with the currently active processes.
        actives = set(self._get_active_processes())
        current = set(self._process_collectors)
        for process in current - actives:
          log.debug('Process "%s" (pid %s) no longer active, removing from monitored processes' %
                   (process.process, process.pid))
          self._process_collectors.pop(process)
        for process in actives - current:
          log.debug('Adding process "%s" (pid %s) to resource monitoring' %
                   (process.process, process.pid))
          self._process_collectors[process] = self._process_collector_factory(process.pid)
        for process, collector in self._process_collectors.items():
          log.debug('Collecting sample for process "%s" (pid %s) and children' %
                   (process.process, process.pid))
          collector.sample()

      if now > next_disk_collection:
        next_disk_collection = now + self._disk_collection_interval
        log.debug('Collecting disk sample for %s' % self._sandbox)
        # Unlike the process collectors, the disk collector is used unconditionally here.
        self._disk_collector.sample()

      try:
        # Sum the per-process values; the sample sum is seeded with an empty ProcessSample.
        aggregated_procs = sum(map(attrgetter('procs'), self._process_collectors.values()))
        aggregated_sample = sum(map(attrgetter('value'), self._process_collectors.values()),
                                ProcessSample.empty())
        self._history.add(now, self.ResourceResult(aggregated_procs, aggregated_sample,
                                                   self._disk_collector.value))
        log.debug("Recorded resource sample at %s" % now)
      except ValueError as err:
        # A ValueError while recording is logged and the loop carries on.
        log.warning("Error recording resource sample: %s" % err)

      # Sleep until any of the following conditions are met:
      # - it's time for the next disk collection
      # - it's time for the next process collection
      # - the result from the last disk collection is available via the DiskCollector
      # - the TaskResourceMonitor has been killed via self._kill_signal
      now = time.time()
      next_collection = min(next_process_collection - now, next_disk_collection - now)
      # Clamp to 0 so an already-overdue collection never yields a negative timeout.
      EventMuxer(self._kill_signal, self._disk_collector.completed_event
                ).wait(timeout=max(0, next_collection))

    log.debug('Stopping resource monitoring for task "%s"' % self._task_id)
Exemple #40
0
 def _reconnect(self):
   """Reconnect to ZK and update endpoints once complete.

   Makes up to self._retries attempts, retrying only on ZooKeeper.ConnectionTimeout.
   Raises ServerSetClient.ReconnectFailed when no attempt succeeds.
   """
   for _attempt in range(self._retries):
     try:
       self._zk.restart()
       self._start()
     except ZooKeeper.ConnectionTimeout:
       log.warning('Connection establishment to %r timed out, retrying.' % self._zk)
     else:
       return
   raise ServerSetClient.ReconnectFailed('Re-establishment of connection to ZK servers failed')
Exemple #41
0
 def _reconnect(self):
     """Restart the ZooKeeper connection and re-run start-up, retrying on timeouts.

     At most self._retries attempts are made. If every attempt ends in
     ZooKeeper.ConnectionTimeout (or no attempt is made), ReconnectFailed is raised.
     """
     reconnected = False
     for _ in range(self._retries):
         try:
             self._zk.restart()
             self._start()
             reconnected = True
         except ZooKeeper.ConnectionTimeout:
             log.warning("Connection establishment to %r timed out, retrying." % self._zk)
         if reconnected:
             break
     if not reconnected:
         raise ServerSetClient.ReconnectFailed("Re-establishment of connection to ZK servers failed")
Exemple #42
0
 def get_completion(_, rc, children):
     """Handle the result of a children fetch.

     A retryable code re-issues the monitor. An OK code updates the children and
     publishes the member set. Any other code is logged and an error membership is
     published. `self`, `promise`, `membership` and `do_monitor` come from the
     enclosing scope.
     """
     should_retry = rc in self._zk.COMPLETION_RETRY
     if should_retry:
         do_monitor()
         return

     if rc == zookeeper.OK:
         self._update_children(children)
         set_different(promise, membership, self._members)
         return

     log.warning('Unexpected get_completion return code: %s' %
                 ZooKeeper.ReturnCode(rc))
     promise.set({Membership.error()})
  def _complete_maintenance(self, drained_hosts):
    """End the maintenance status for a given set of hosts.

    After ending maintenance, the hosts' maintenance status is queried and a warning is
    logged for every host whose mode is not MaintenanceMode.NONE.

    :param drained_hosts: Hosts that are drained and finished being operated upon
    :type drained_hosts: gen.apache.aurora.ttypes.Hosts
    """
    end_response = self._client.end_maintenance(drained_hosts)
    check_and_log_response(end_response)

    status_response = self._client.maintenance_status(drained_hosts)
    statuses = status_response.result.maintenanceStatusResult.statuses
    for status in statuses:
      if status.mode == MaintenanceMode.NONE:
        continue
      log.warning('%s is DRAINING or in DRAINED' % status.host)
    def _complete_maintenance(self, drained_hosts):
        """Take the given hosts out of maintenance and warn about any that remain in it.

        :param drained_hosts: Hosts that are drained and finished being operated upon
        :type drained_hosts: gen.apache.aurora.ttypes.Hosts
        """
        check_and_log_response(self._client.end_maintenance(drained_hosts))

        result = self._client.maintenance_status(drained_hosts).result
        lingering = (
            entry for entry in result.maintenanceStatusResult.statuses
            if entry.mode != MaintenanceMode.NONE
        )
        for entry in lingering:
            log.warning('%s is DRAINING or in DRAINED' % entry.host)
 def _maybe_update_failure_count(self, is_healthy, reason):
   """Track consecutive health check failures and mark unhealthy past the limit.

   A healthy result resets the counter to zero. An unhealthy result increments it, and
   once it exceeds max_consecutive_failures, `healthy` is set to False and `reason`
   is recorded.
   """
   if is_healthy:
     if self.current_consecutive_failures > 0:
       log.debug('Reset consecutive failures counter.')
     self.current_consecutive_failures = 0
     return

   log.warning('Health check failure: %s' % reason)
   self.current_consecutive_failures += 1
   if self.current_consecutive_failures > self.max_consecutive_failures:
     log.warning('Reached consecutive failure limit.')
     self.healthy = False
     self.reason = reason
Exemple #46
0
 def _maybe_update_failure_count(self, is_healthy, reason):
     """Update the consecutive-failure counter from one health check result.

     On failure the counter is incremented; exceeding max_consecutive_failures sets
     `healthy` to False and stores `reason`. On success the counter is reset to zero.
     """
     if is_healthy:
         had_failures = self.current_consecutive_failures > 0
         if had_failures:
             log.debug('Reset consecutive failures counter.')
         self.current_consecutive_failures = 0
     else:
         log.warning('Health check failure: %s' % reason)
         self.current_consecutive_failures += 1
         limit_exceeded = self.current_consecutive_failures > self.max_consecutive_failures
         if limit_exceeded:
             log.warning('Reached consecutive failure limit.')
             self.healthy = False
             self.reason = reason
Exemple #47
0
def resolve_ports(mesos_task, portmap):
  """Given a MesosTaskInstance and the portmap of resolved ports from the scheduler,
     create a fully resolved map of port name => port number for the thermos
     runner and discovery manager.

     Ports that do not resolve to an int are logged with a warning and left out of
     the returned dict."""
  if mesos_task.has_announce():
    task_portmap = mesos_task.announce().portmap().get()
  else:
    task_portmap = {}
  task_portmap.update(portmap)
  task_portmap = PortResolver.resolve(task_portmap)

  for port_name, port_value in task_portmap.items():
    if not isinstance(port_value, int):
      log.warning('Task has unmapped port: %s => %s' % (port_name, port_value))

  return {
    port_name: port_value
    for port_name, port_value in task_portmap.items()
    if isinstance(port_value, int)
  }
Exemple #48
0
 def get_completion(_, rc, children):
   """Completion callback for a children fetch.

   Retryable codes re-issue the monitor, NONODE waits for the node to exist, any other
   non-OK code publishes an error membership, and OK updates the children and
   publishes the member set. Names other than the arguments come from the enclosing
   scope.
   """
   if rc in self._zk.COMPLETION_RETRY:
     do_monitor()
   elif rc == zookeeper.NONODE:
     wait_exists()
   elif rc != zookeeper.OK:
     log.warning('Unexpected get_completion return code: %s' % ReturnCode(rc))
     capture.set({Membership.error()})
   else:
     self._update_children(children)
     set_different(capture, membership, self._members)
def resolve_ports(mesos_task, portmap):
  """Build the fully resolved port name => port number map for a task.

     Starts from the task's announce portmap (when the task has an announce section),
     overlays the scheduler-provided portmap, and runs the result through PortResolver.
     Entries whose resolved value is not an int are logged and omitted."""
  announced = mesos_task.announce().portmap().get() if mesos_task.has_announce() else {}
  announced.update(portmap)
  resolved = PortResolver.resolve(announced)

  unmapped = [(name, port) for name, port in resolved.items() if not isinstance(port, int)]
  for entry in unmapped:
    log.warning('Task has unmapped port: %s => %s' % entry)

  return dict(item for item in resolved.items() if isinstance(item[1], int))
Exemple #50
0
 def get_completion(_, rc, children):
     """Handle the result of a children fetch.

     Retryable code: re-issue the monitor. NONODE: wait for the node to exist. OK:
     update the children and publish the member set. Anything else: log a warning
     and publish an error membership. `self`, `capture`, `membership`, `do_monitor`
     and `wait_exists` come from the enclosing scope.
     """
     if rc in self._zk.COMPLETION_RETRY:
         do_monitor()
         return
     if rc == zookeeper.NONODE:
         wait_exists()
         return
     if rc == zookeeper.OK:
         self._update_children(children)
         set_different(capture, membership, self._members)
         return

     unexpected = ReturnCode(rc)
     log.warning('Unexpected get_completion return code: %s' % unexpected)
     capture.set({Membership.error()})
Exemple #51
0
 def _kill(self):
     """Walk the scanned process tree and kill or re-label each process.

     For a process whose current run is terminal, any leftover pids are reported
     and the process is killed. For a non-terminal process, it is marked KILLED
     when any pid is still present; otherwise it is marked LOST unless its state
     is WAITING.
     """
     for process, pid_tuple in TaskRunnerHelper.scan_tree(self._state).items():
         current_run = self._current_process_run(process)
         coordinator_pid, pid, tree = pid_tuple
         has_live_pids = bool(coordinator_pid or pid or tree)

         if TaskRunnerHelper.is_process_terminal(current_run.state):
             if has_live_pids:
                 log.warning('Terminal process (%s) still has running pids:' % process)
                 leftovers = (
                     ('  coordinator_pid: %s', coordinator_pid),
                     ('              pid: %s', pid),
                     ('             tree: %s', tree),
                 )
                 for template, value in leftovers:
                     log.warning(template % value)
             TaskRunnerHelper.kill_process(self.state, process)
             continue

         if has_live_pids:
             log.info('Transitioning %s to KILLED' % process)
             self._set_process_status(
                 process, ProcessState.KILLED, stop_time=self._clock.time(), return_code=-1)
             continue

         log.info('Transitioning %s to LOST' % process)
         if current_run.state != ProcessState.WAITING:
             self._set_process_status(process, ProcessState.LOST)
Exemple #52
0
    def _do_health_check(self):
        """Run one health check and return a (healthy, reason) pair.

        When the deadline must be enforced, no check is performed: the instance is
        marked unhealthy with a fixed reason and that state is returned. Otherwise
        the check runs, the attempt counter advances while it has not passed
        max_attempts_to_running, and the health check count is updated.
        """
        if not self._should_enforce_deadline():
            outcome = self._perform_check_if_not_disabled()
            is_healthy, reason = outcome
            if self.attempts <= self.max_attempts_to_running:
                self.attempts += 1
            self._maybe_update_health_check_count(is_healthy, reason)
            return is_healthy, reason

        # This is needed otherwise it is possible to flap between
        # successful health-checks and failed health-checks, never
        # really satisfying the criteria for either healthy or unhealthy.
        log.warning('Exhausted attempts before satisfying liveness criteria.')
        self.healthy = False
        self.reason = 'Not enough successful health checks in time.'
        return self.healthy, self.reason
        def adelete_completion(result):
            """Completion callback for the asynchronous delete issued by a cancel.

            On a disconnect the cancel is re-armed for the next CONNECTED state and
            nothing else happens. Otherwise the member's future (or a fresh one) is
            resolved with an error membership and `capture` receives whether the
            delete counted as successful; a missing node counts as success. Names
            other than `result` come from the enclosing scope.
            """
            try:
                deleted = result.get()
            except self.DISCONNECT_EXCEPTIONS:
                self._once(KazooState.CONNECTED, do_cancel)
                return
            except ke.NoNodeError:
                deleted = True
            except ke.KazooException as err:
                details = (type(err), err)
                log.warning('Unexpected Kazoo result in cancel: (%s)%s' % details)
                deleted = False

            member_future = self._members.pop(member.id, Future())
            member_future.set_result(Membership.error())
            capture.set(deleted)
 def get_completion(result):
     """Completion callback for an asynchronous children fetch.

     On a disconnect the monitor is re-armed for the next CONNECTED state. A missing
     node waits for it to exist. Any other Kazoo error publishes an error membership.
     On success the children are updated and the member set is published. Names
     other than `result` come from the enclosing scope.
     """
     try:
         children = result.get()
     except self.DISCONNECT_EXCEPTIONS:
         self._once(KazooState.CONNECTED, do_monitor)
     except ke.NoNodeError:
         wait_exists()
     except ke.KazooException as err:
         details = (type(err), err)
         log.warning('Unexpected get_completion result: (%s)%s' % details)
         capture.set({Membership.error()})
     else:
         self._update_children(children)
         set_different(capture, membership, self._members)
        def exists_completion(result):
            """Completion callback for an asynchronous exists check.

            On a disconnect the wait is re-armed for the next CONNECTED state. A
            missing node re-issues the wait. Any other Kazoo error is logged and
            dropped. When the check yields a truthy stat, monitoring starts. Names
            other than `result` come from the enclosing scope.
            """
            try:
                node_stat = result.get()
            except self.DISCONNECT_EXCEPTIONS:
                self._once(KazooState.CONNECTED, wait_exists)
            except ke.NoNodeError:
                wait_exists()
            except ke.KazooException as err:
                details = (type(err), err)
                log.warning('Unexpected exists_completion result: (%s)%s' % details)
            else:
                if node_stat:
                    do_monitor()
        def method_wrapper(*args):
            """Invoke `method_name` on the current client, retrying on connection-level failures.

            `self` and `method_name` come from the enclosing scope. The whole retry loop
            runs while holding self._lock. Retries continue until RPC_MAXIMUM_WAIT has
            elapsed or self._terminating is set.

            Returns the attribute itself when it is not callable, otherwise the RPC
            response. Returns None implicitly when the loop ends because
            self._terminating was set.

            Raises:
              ThriftInternalError: an unexpected exception occurred (unless terminating).
              TimeoutError: the wait budget was exhausted without a result (unless
                terminating).
            """
            with self._lock:
                start = time.time()
                while not self._terminating.is_set() and (
                        time.time() - start) < self.RPC_MAXIMUM_WAIT.as_(
                            Time.SECONDS):

                    # Only automatically append a SessionKey if this is not part of the read-only API.
                    auth_args = () if hasattr(
                        ReadOnlyScheduler.Iface,
                        method_name) else (self.session_key(), )
                    try:
                        method = getattr(self.client(), method_name)
                        # Plain (non-callable) attributes are handed back without any RPC.
                        if not callable(method):
                            return method

                        resp = method(*(args + auth_args))
                        # A transient error response is raised as TransientError so that the
                        # handler below invalidates the client and retries.
                        if resp is not None and resp.responseCode == ResponseCode.ERROR_TRANSIENT:
                            raise self.TransientError(
                                ", ".join([m.message for m in resp.
                                           details] if resp.details else []))
                        # NOTE(review): unlike the check above, this dereferences resp without a
                        # None guard; a None response would surface as an AttributeError handled
                        # by the generic handler below - confirm that is intended.
                        # NOTE(review): APIVersionError is raised inside the try; unless it derives
                        # from one of the exception types handled first, the generic handler will
                        # re-wrap it as ThriftInternalError - confirm against the class hierarchy.
                        if resp.serverInfo.thriftAPIVersion != THRIFT_API_VERSION:
                            raise self.APIVersionError(
                                "Client Version: %s, Server Version: %s" %
                                (THRIFT_API_VERSION,
                                 resp.serverInfo.thriftAPIVersion))
                        return resp
                    except (TTransport.TTransportException, self.TimeoutError,
                            self.TransientError) as e:
                        # Connection-level problem: invalidate, wait RPC_RETRY_INTERVAL (or
                        # until terminating is set), then loop again.
                        if not self._terminating.is_set():
                            log.warning(
                                'Connection error with scheduler: %s, reconnecting...'
                                % e)
                            self.invalidate()
                            self._terminating.wait(
                                self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
                    except Exception as e:
                        # Take any error that occurs during the RPC call, and transform it
                        # into something clients can handle.
                        if not self._terminating.is_set():
                            raise self.ThriftInternalError(
                                "Error during thrift call %s to %s: %s" %
                                (method_name, self.cluster.name, e))
                # Reached only when the loop ended without returning a result.
                if not self._terminating.is_set():
                    raise self.TimeoutError(
                        'Timed out attempting to issue %s to %s' %
                        (method_name, self.cluster.name))
def process_to_sample(process):
  """Given a psutil.Process, return a current ProcessSample.

  If the process cannot be inspected (AccessDenied or NoSuchProcess), a warning is
  logged and an empty ProcessSample is returned instead.
  """
  try:
    # the nonblocking get_cpu_percent call is stateful on a particular Process object, and hence
    # >2 consecutive calls are required before it will return a non-zero value
    cpu_fraction = process.cpu_percent(0.0) / 100.0
    times = process.cpu_times()
    user_time, system_time = times.user, times.system
    memory = process.memory_info()
    resident, virtual = memory.rss, memory.vms
    niceness = process.nice()
    state = process.status()
    thread_count = process.num_threads()
    return ProcessSample(
        cpu_fraction, user_time, system_time, resident, virtual, niceness, state, thread_count)
  except (AccessDenied, NoSuchProcess) as err:
    log.warning('Error during process sampling [pid=%s]: %s' % (process.pid, err))
    return ProcessSample.empty()
Exemple #58
0
  def reap_children(cls):
    """Reap every already-terminated child process without blocking.

    Calls os.wait3 with WNOHANG until it reports no more terminated children or raises
    OSError. An OSError other than ECHILD is logged as a warning.

    :rtype: set of pids that were reaped
    """
    reaped = set()

    while True:
      try:
        child_pid, exit_status, usage = os.wait3(os.WNOHANG)
        if child_pid == 0:
          # Children exist but none has terminated yet.
          return reaped
        reaped.add(child_pid)
        log.debug('Detected terminated process: pid=%s, status=%s, rusage=%s' % (
          child_pid, exit_status, usage))
      except OSError as err:
        # ECHILD simply means there are no children left to wait for.
        if err.errno != errno.ECHILD:
          log.warning('Unexpected error when calling waitpid: %s' % err)
        return reaped