Code example #1
File: recordio.py Project: znewman01/commons
    def __iter__(self):
      """
        May raise:
          RecordIO.PrematureEndOfStream
      """
      fd = os.dup(self._fp.fileno())
      try:
        cur_fp = os.fdopen(fd, self._fp.mode)
        cur_fp.seek(0)
      except OSError as e:
        log.error('Failed to duplicate fd on %s, error = %s' % (self._fp.name, e))
        try:
          os.close(fd)
        except OSError as e:
          if e.errno != errno.EBADF:
            log.error('Failed to close duped fd on %s, error = %s' % (self._fp.name, e))
        return

      try:
        while True:
          blob = RecordIO.Reader.do_read(cur_fp, self._codec)
          if blob:
            yield blob
          else:
            break
      finally:
        cur_fp.close()
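A note on the pattern above: os.dup plus os.fdopen gives the iterator a private file object over the same open file, so closing it at the end of iteration releases only the duplicated descriptor and leaves self._fp usable. A minimal sketch of that property, assuming only a writable scratch path (the path here is hypothetical):

import os

fp = open('/tmp/demo.txt', 'w+')
dup_fp = os.fdopen(os.dup(fp.fileno()), fp.mode)
dup_fp.write('hello')
dup_fp.close()     # closes only the duplicated descriptor
fp.seek(0)         # the original handle is still open
print(fp.read())   # prints 'hello'
fp.close()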
Code example #2
File: mysos_test_client.py Project: dongzerun/mysos
  def delete(args, options):
    validate_common_options(options)

    with open(options.password_file, 'r') as f:
      password = f.read().strip()
      if not password:
        app.error("Empty password file")

    url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
    values = dict(password=password)

    req = urllib2.Request(url, urllib.urlencode(values))
    req.get_method = lambda: 'DELETE'

    try:
      response = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
      log.error("DELETE request failed: %s, %s, %s" % (
          e.code, BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code], e.read()))
      app.quit(1)

    try:
      result = json.loads(response)
      if not isinstance(result, dict):
        raise ValueError()
    except ValueError:
      log.error("Invalid response: %s" % response)
      app.quit(1)

    log.info("Cluster deletion result: %s" % result)

    log.info("Waiting for the cluster to terminate...")
    wait_for_termination(result['cluster_url'])

    log.info("Cluster terminated/deleted")
Code example #3
    def __on_active(self, root, task_id):
        log.debug('on_active(%r, %r)', root, task_id)
        if task_id in self.finished_tasks:
            log.error('Found an active task (%s) in finished tasks?', task_id)
            return
        task_monitor = TaskMonitor(root, task_id)

        if self._disable_task_resource_collection:
            resource_monitor = NullTaskResourceMonitor()
        else:
            disk_collector_provider = DiskCollectorProvider(
                self._enable_mesos_disk_collector,
                self._disk_collector_settings)

            resource_monitor = TaskResourceMonitor(
                task_id,
                task_monitor,
                disk_collector_provider=disk_collector_provider,
                process_collection_interval=self._task_process_collection_interval,
                disk_collection_interval=self._disk_collector_settings.disk_collection_interval)

        resource_monitor.start()
        self._active_tasks[task_id] = ActiveObservedTask(
            root, task_id, task_monitor, resource_monitor)
Code example #4
  def launchTask(self, driver, task):
    """
      Invoked when a task has been launched on this executor (initiated via Scheduler::launchTasks).
      Note that this task can be realized with a thread, a process, or some simple computation,
      however, no other callbacks will be invoked on this executor until this callback has returned.
    """
    self.launched.set()
    self.log('launchTask got task: %s:%s' % (task.name, task.task_id.value))

    # TODO(wickman)  Update the tests to call registered(), then remove this line and issue
    # an assert if self._driver is not populated.
    self._driver = driver

    if self._runner:
      log.error('Already running a task! %s' % self._task_id)
      self.send_update(driver, task.task_id.value, mesos_pb.TASK_LOST,
          "Task already running on this executor: %s" % self._task_id)
      return

    self._slave_id = task.slave_id.value
    self._task_id = task.task_id.value

    try:
      assigned_task = assigned_task_from_mesos_task(task)
      mesos_task = mesos_task_instance_from_assigned_task(assigned_task)
    except Exception as e:
      log.fatal('Could not deserialize AssignedTask')
      log.fatal(traceback.format_exc())
      self.send_update(
          driver, self._task_id, mesos_pb.TASK_FAILED, "Could not deserialize task: %s" % e)
      defer(driver.stop, delay=self.STOP_WAIT)
      return

    defer(lambda: self._run(driver, assigned_task, mesos_task))
Code example #5
  def _shutdown(self, status_result):
    runner_status = self._runner.status

    try:
      deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
    except Timeout:
      log.error('Failed to stop runner within deadline.')

    try:
      deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
    except Timeout:
      log.error('Failed to stop all checkers within deadline.')

    # If the runner was alive when _shutdown was called, defer to the status_result,
    # otherwise the runner's terminal state is the preferred state.
    exit_status = runner_status or status_result

    self.send_update(
        self._driver,
        self._task_id,
        self.translate_exit_state_to_mesos(exit_status.status),
        status_result.reason)

    self.terminated.set()
    defer(self._driver.stop, delay=self.PERSISTENCE_WAIT)
Code example #6
File: pingpong.py Project: ycaihua/twitter-commons
 def send_request(self, endpoint, message, ttl):
     url_base = 'http://%s:%d' % self._target
     try:
         urllib2.urlopen('%s/%s/%s/%d' %
                         (url_base, endpoint, message, ttl)).read()
     except Exception as e:
         log.error('Failed to query %s: %s' % (url_base, e))
Code example #7
 def compute_status(self):
   if self.is_alive:
     return None
   exit_state = self.EXIT_STATE_MAP.get(self.task_state())
   if exit_state is None:
     log.error('Received unexpected exit state from TaskMonitor.')
   return exit_state
Code example #8
File: muxer.py Project: radhikari54/Mastering-Mesos
    def select(self):
        """
      Read and multiplex checkpoint records from all the forked off process coordinators.

      Checkpoint records can come from one of two places:
        in-process: checkpoint records synthesized for FORKED and LOST events
        out-of-process: checkpoint records from from file descriptors of forked coordinators

      Returns a list of RunnerCkpt objects that were successfully read, or an empty
      list if none were read.
    """
        self._bind_processes()
        updates = []
        for handle in filter(None, self._processes.values()):
            try:
                fstat = os.fstat(handle.fileno())
            except OSError:
                log.error('Unable to fstat %s!' % handle.name)
                continue
            if handle.tell() > fstat.st_size:
                log.error('Truncated checkpoint record detected on %s!' %
                          handle.name)
            elif handle.tell() < fstat.st_size:
                rr = ThriftRecordReader(handle, RunnerCkpt)
                while True:
                    process_update = rr.try_read()
                    if process_update:
                        updates.append(process_update)
                    else:
                        break
        if len(updates) > 0:
            log.debug('select() returning %s updates:' % len(updates))
            for update in updates:
                log.debug('  = %s' % update)
        return updates
Code example #9
File: http_observer.py Project: AltanAlpay/aurora
  def handle_process(self, task_id, process_id):
    all_processes = {}
    current_run = self._observer.process(task_id, process_id)
    if not current_run:
      HttpServer.abort(404, 'Invalid task/process combination: %s/%s' % (task_id, process_id))
    process = self._observer.process_from_name(task_id, process_id)
    if process is None:
      msg = 'Could not recover process: %s/%s' % (task_id, process_id)
      log.error(msg)
      HttpServer.abort(404, msg)

    current_run_number = current_run['process_run']
    all_processes[current_run_number] = current_run
    for run in range(current_run_number):
      all_processes[run] = self._observer.process(task_id, process_id, run)

    template = {
      'task_id': task_id,
      'process': {
         'name': process_id,
         'status': all_processes[current_run_number]["state"],
         'cmdline': process.cmdline().get()
      },
    }
    template['process'].update(**all_processes[current_run_number].get('used', {}))
    template['runs'] = all_processes
    log.debug('Rendering template is: %s' % template)
    return template
Code example #10
File: process_util.py Project: bmhatfield/aurora
def setup_child_subreaping():
  """
  This uses the prctl(2) syscall to set the `PR_SET_CHILD_SUBREAPER` flag. This
  means that if any child processes need to be reparented, they will be reparented
  to this process.

  More documentation here: http://man7.org/linux/man-pages/man2/prctl.2.html
  and here: https://lwn.net/Articles/474787/

  Callers should reap terminal children to prevent zombies.
  """
  log.debug("Calling prctl(2) with PR_SET_CHILD_SUBREAPER")
  # This constant is taken from prctl.h
  PR_SET_CHILD_SUBREAPER = 36
  try:
    library_name = ctypes.util.find_library('c')
    if library_name is None:
      log.warning("libc is not found. Unable to call prctl!")
      log.warning("Children subreaping is disabled!")
      return
    libc = ctypes.CDLL(library_name, use_errno=True)
    # If we are on a system where prctl doesn't exist, this will throw an
    # attribute error.
    ret = libc.prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0)
    if ret != 0:
      errno = ctypes.get_errno()
      raise OSError(errno, os.strerror(errno))
  except Exception as e:
    log.error("Unable to call prctl %s" % e)
    log.error("Children subreaping is disabled!")
Code example #11
File: mysos_test_client.py Project: JeeLiu/mysos
  def create(args, options):
    validate_common_options(options)

    if not options.num_nodes:
      app.error("--num_nodes is required")

    if not options.cluster_user:
      app.error("--cluster_user is required")

    url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
    values = dict(
        num_nodes=int(options.num_nodes),
        cluster_user=options.cluster_user,
        size=options.size if options.size else '',
        backup_id=options.backup_id if options.backup_id else '')

    req = urllib2.Request(url, urllib.urlencode(values))
    try:
      response = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
      log.error("POST request failed: %s, %s, %s" % (
          e.code, BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code], e.read()))
      app.quit(1)

    try:
      result = json.loads(response)
      if not isinstance(result, dict):
        raise ValueError()
    except ValueError:
      log.error("Invalid response: %s" % response)
      app.quit(1)

    log.info("Cluster created. Cluster info: %s" % str(result))
    with open(options.password_file, 'w') as f:
      f.write(result["cluster_password"])

    log.info("Waiting for the master for this cluster to be elected...")
    master_endpoint = wait_for_master(result['cluster_url']).service_endpoint

    connection_str = "mysql://%s:%s@%s:%d/" % (
        options.cluster_user,
        result["cluster_password"],
        master_endpoint.host,
        master_endpoint.port)
    log.info("Connecting to the MySQL cluster master: %s" % connection_str)
    engine = create_engine(connection_str)

    for i in range(5):  # Retry up to five times (one second apart) waiting for the master to be promoted.
      try:
        # TODO(jyx): Test writing to the master and reading from the slave.
        result = engine.execute("SELECT 1;").scalar()
        assert 1 == int(result), "Expecting result to be 1 but got %s" % result
        break
      except OperationalError:
        if i == 4:
          raise
        log.debug("MySQL master not ready yet. Sleep for 1 second...")
        time.sleep(1)

    log.info("Cluster successfully started")
Code example #12
File: sink.py Project: apsaltis/anna-molly
 def connect(self):
     try:
         redis_conn = redis.StrictRedis(host=self.host, port=self.port, db=self.db)
         self.redis_pipeline = redis_conn.pipeline()
         return redis_conn
     except Exception as _e:
         log.error("RedisSink: ConnectionError\n %s %s" % (self.config, str(_e)))
Code example #13
File: sink.py Project: apsaltis/anna-molly
 def connect(self):
     try:
         sock = socket.socket()
         sock.connect((self.host, self.port))
         return sock
     except Exception as _e:
         log.error("Cannot connect to Graphite Sink with config:%s\n%s" % (self.config, str(_e)))
Code example #14
    def _run_task(self, task):
        assert self._runner, "_runner should be created before this method is called"

        try:
            self._runner.start()
            log.info("Task runner for task %s started" % task.task_id)

            self._send_update(task.task_id.value, mesos_pb2.TASK_RUNNING)
        except TaskError as e:
            log.error("Task runner for task %s failed to start: %s" %
                      (task.task_id, str(e)))
            # Send TASK_FAILED if the task failed to start.
            self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)
        except Exception as e:
            log.error("Error occurred while executing the task: %s" % e)
            log.error(traceback.format_exc())
            # Send TASK_LOST for unknown errors.
            self._send_update(task.task_id.value, mesos_pb2.TASK_LOST)

        # Wait for the task's return code (when it terminates).
        try:
            returncode = self._runner.join()
            # Regardless of the return code, if '_runner' terminates, it failed!
            log.error("Task process terminated with return code %s" %
                      returncode)
        except TaskError as e:
            log.error("Task terminated: %s" % e)

        if self._killed:
            self._send_update(task.task_id.value, mesos_pb2.TASK_KILLED)
        else:
            self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)

        self._kill()
Code example #15
File: scheduler_client.py Project: bmhatfield/aurora
    def method_wrapper(*args):
      with self._lock:
        start = time.time()
        while not self._terminating.is_set() and (
            time.time() - start) < self.RPC_MAXIMUM_WAIT.as_(Time.SECONDS):

          try:
            method = getattr(self.client(), method_name)
            if not callable(method):
              return method

            resp = method(*args)
            if resp is not None and resp.responseCode == ResponseCode.ERROR_TRANSIENT:
              raise self.TransientError(", ".join(
                  [m.message for m in resp.details] if resp.details else []))
            return resp
          except TRequestsTransport.AuthError as e:
            log.error(self.scheduler_client().get_failed_auth_message())
            raise self.AuthError(e)
          except (TTransport.TTransportException, self.TimeoutError, self.TransientError) as e:
            if not self._terminating.is_set():
              log.warning('Connection error with scheduler: %s, reconnecting...' % e)
              self.invalidate()
              self._terminating.wait(self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
          except Exception as e:
            # Take any error that occurs during the RPC call, and transform it
            # into something clients can handle.
            if not self._terminating.is_set():
              raise self.ThriftInternalError("Error during thrift call %s to %s: %s" %
                                            (method_name, self.cluster.name, e))
        if not self._terminating.is_set():
          raise self.TimeoutError('Timed out attempting to issue %s to %s' % (
              method_name, self.cluster.name))
Code example #16
File: client.py Project: nsanch/commons
    def _update_endpoints(self, _1, event, state, _2):
        """Update endpoints from ZK.

        This function will block until the ZK servers respond or the retry limit is hit.

        :raises ReconnectFailed: If reconnection fails.
        """
        if not (state == zookeeper.CONNECTED_STATE and event == zookeeper.CHILD_EVENT) and not (
            state == zookeeper.EXPIRED_SESSION_STATE
        ):
            return

        try:
            endpoints = []
            endpoint_names = self._zk.get_children(self._endpoint, self._update_endpoints)
            endpoint_names.sort()
            for endpoint in endpoint_names:
                data = self._zk.get(posixpath.join(self._endpoint, endpoint))
                service_endpoint = serverset_types.ServiceInstance()
                endpoints.append(codec.deserialize(service_endpoint, data[0]))

            old = set(map(_format_endpoint, self._endpoints))
            new = set(map(_format_endpoint, endpoints))
            log.debug("ServerSet endpoints at %r changed to: %s" % (self._endpoint, ", ".join(new)))
            log.debug("  Added: %s" % ", ".join(new - old))
            log.debug("  Removed: %s" % ", ".join(old - new))

            with self._lock:
                if self._watcher:
                    self._watcher(self._endpoint, self._endpoints, endpoints)
                self._endpoints = endpoints
        except ZooKeeper.Error as e:
            log.error("Lost connection to ZooKeeper: %s, reestablishing." % e)
            self._reconnect()
Code example #17
 def run(self):
   try:
     log.info("Setting filter: %s", self.config.filter)
     if self.config.iface == "any":  # pragma: no cover
       sniff(
         filter=self.config.filter,
         store=0,
         prn=self.handle_packet,
         stop_filter=self.wants_stop
       )
     else:
       sniff(
         filter=self.config.filter,
         store=0,
         prn=self.handle_packet,
         iface=self.config.iface,
         stop_filter=self.wants_stop
       )
   except socket.error as ex:
     if self._error_to_stderr:
       sys.stderr.write("Error: %s, device: %s\n" % (ex, self.config.iface))
     else:
       log.error("Error: %s, device: %s", ex, self.config.iface)
   finally:
     log.info("The sniff loop exited")
     os.kill(os.getpid(), signal.SIGINT)
Code example #18
 def compute_status(self):
     if self.is_alive:
         return None
     if self._popen_signal != 0:
         return StatusResult(
             'Task killed by signal %s.' % self._popen_signal,
             mesos_pb2.TASK_KILLED)
     if self._popen_rc == 0 or self._popen_rc == TERMINAL_TASK:
         exit_state = self.EXIT_STATE_MAP.get(self.task_state())
         if exit_state is None:
             log.error('Received unexpected exit state from TaskMonitor.')
             return StatusResult('Task checkpoint could not be read.',
                                 mesos_pb2.TASK_LOST)
         else:
             return exit_state
     elif self._popen_rc == UNKNOWN_USER:
         return StatusResult('Task started with unknown user.',
                             mesos_pb2.TASK_FAILED)
     elif self._popen_rc == INTERNAL_ERROR:
         return StatusResult('Thermos failed with internal error.',
                             mesos_pb2.TASK_LOST)
     elif self._popen_rc == INVALID_TASK:
         return StatusResult('Thermos received an invalid task.',
                             mesos_pb2.TASK_FAILED)
     elif self._popen_rc == UNKNOWN_ERROR:
         return StatusResult('Thermos failed with an unknown error.',
                             mesos_pb2.TASK_LOST)
     else:
         return StatusResult(
             'Thermos exited for unknown reason (exit status: %s)' %
             self._popen_rc, mesos_pb2.TASK_LOST)
Code example #19
    def handle_process(self, task_id, process_id):
        all_processes = {}
        current_run = self._observer.process(task_id, process_id)
        if not current_run:
            HttpServer.abort(
                404, 'Invalid task/process combination: %s/%s' %
                (task_id, process_id))
        process = self._observer.process_from_name(task_id, process_id)
        if process is None:
            msg = 'Could not recover process: %s/%s' % (task_id, process_id)
            log.error(msg)
            HttpServer.abort(404, msg)

        current_run_number = current_run['process_run']
        all_processes[current_run_number] = current_run
        for run in range(current_run_number):
            all_processes[run] = self._observer.process(
                task_id, process_id, run)

        template = {
            'task_id': task_id,
            'process': {
                'name': process_id,
                'status': all_processes[current_run_number]["state"],
                'cmdline': process.cmdline().get()
            },
        }
        template['process'].update(
            **all_processes[current_run_number].get('used', {}))
        template['runs'] = all_processes
        log.debug('Rendering template is: %s', template)
        return template
Code example #20
 def open_checkpoint(cls, filename, force=False, state=None):
     """
      Acquire a locked checkpoint stream.
      """
     safe_mkdir(os.path.dirname(filename))
     fp = lock_file(filename, "a+")
     if fp in (None, False):
         if force:
             log.info('Found existing runner, forcing leadership forfeit.')
             state = state or CheckpointDispatcher.from_file(filename)
             if cls.kill_runner(state):
                 log.info('Successfully killed leader.')
                 # TODO(wickman)  Blocking may not be the best idea here.  Perhaps block up to
                 # a maximum timeout.  But blocking is necessary because os.kill does not immediately
                 # release the lock if we're in force mode.
                 fp = lock_file(filename, "a+", blocking=True)
         else:
             log.error('Found existing runner, cannot take control.')
     if fp in (None, False):
         raise cls.PermissionError(
             'Could not open locked checkpoint: %s, lock_file = %s' %
             (filename, fp))
     ckpt = ThriftRecordWriter(fp)
     ckpt.set_sync(True)
     return ckpt
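The lock_file helper used above is not shown in this snippet. A minimal sketch of what such a helper might look like, assuming an fcntl.flock-based implementation (hypothetical; the project's real helper may differ and may return False rather than None on failure):

import fcntl

def lock_file(filename, mode, blocking=False):
    fp = open(filename, mode)
    flags = fcntl.LOCK_EX if blocking else (fcntl.LOCK_EX | fcntl.LOCK_NB)
    try:
        fcntl.flock(fp.fileno(), flags)
    except IOError:  # the lock is held by another process
        fp.close()
        return None
    return fp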
Code example #21
File: process_util.py Project: isabella232/client-3
def setup_child_subreaping():
    """
    This uses the prctl(2) syscall to set the `PR_SET_CHILD_SUBREAPER` flag. This
    means that if any child processes need to be reparented, they will be reparented
    to this process.

    More documentation here: http://man7.org/linux/man-pages/man2/prctl.2.html
    and here: https://lwn.net/Articles/474787/

    Callers should reap terminal children to prevent zombies.
    """
    log.debug("Calling prctl(2) with PR_SET_CHILD_SUBREAPER")
    # This constant is taken from prctl.h
    PR_SET_CHILD_SUBREAPER = 36
    try:
        library_name = ctypes.util.find_library('c')
        if library_name is None:
            log.warning("libc is not found. Unable to call prctl!")
            log.warning("Children subreaping is disabled!")
            return
        libc = ctypes.CDLL(library_name, use_errno=True)
        # If we are on a system where prctl doesn't exist, this will throw an
        # attribute error.
        ret = libc.prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0)
        if ret != 0:
            errno = ctypes.get_errno()
            raise OSError(errno, os.strerror(errno))
    except Exception as e:
        log.error("Unable to call prctl %s" % e)
        log.error("Children subreaping is disabled!")
Code example #22
File: monitor.py Project: rowoot/aurora
    def _apply_states(self):
        """
        os.stat() the corresponding checkpoint stream of this task and determine if there are new ckpt
        records.  Attempt to read those records and update the high watermark for that stream.
        Returns True if new states were applied, False otherwise.
        """
        ckpt_offset = None
        try:
            ckpt_offset = os.stat(self._runner_ckpt).st_size

            updated = False
            if self._ckpt_head < ckpt_offset:
                with open(self._runner_ckpt, "r") as fp:
                    fp.seek(self._ckpt_head)
                    rr = ThriftRecordReader(fp, RunnerCkpt)
                    while True:
                        runner_update = rr.try_read()
                        if not runner_update:
                            break
                        try:
                            self._dispatcher.dispatch(self._runnerstate, runner_update)
                        except CheckpointDispatcher.InvalidSequenceNumber as e:
                            log.error("Checkpoint stream is corrupt: %s" % e)
                            break
                    new_ckpt_head = fp.tell()
                    updated = self._ckpt_head != new_ckpt_head
                    self._ckpt_head = new_ckpt_head
            return updated
        except OSError as e:
            if e.errno == errno.ENOENT:
                # The log doesn't yet exist, will retry later.
                log.warning("Could not read from checkpoint %s" % self._runner_ckpt)
                return False
            else:
                raise
Code example #23
File: aurora_executor.py Project: bmhatfield/aurora
  def launchTask(self, driver, task):
    """
      Invoked when a task has been launched on this executor (initiated via Scheduler::launchTasks).
      Note that this task can be realized with a thread, a process, or some simple computation,
      however, no other callbacks will be invoked on this executor until this callback has returned.
    """
    self.launched.set()
    self.log('TaskInfo: %s' % task)
    self.log('launchTask got task: %s:%s' % (task.name, task.task_id.value))

    # TODO(wickman)  Update the tests to call registered(), then remove this line and issue
    # an assert if self._driver is not populated.
    self._driver = driver

    if self._runner:
      log.error('Already running a task! %s' % self._task_id)
      self.send_update(driver, task.task_id.value, mesos_pb2.TASK_LOST,
          "Task already running on this executor: %s" % self._task_id)
      return

    self._slave_id = task.slave_id.value
    self._task_id = task.task_id.value

    assigned_task = self.validate_task(task)
    self.log("Assigned task: %s" % assigned_task)
    if not assigned_task:
      self.send_update(driver, self._task_id, mesos_pb2.TASK_FAILED,
          'Could not deserialize task.')
      defer(driver.stop, delay=self.STOP_WAIT)
      return

    defer(lambda: self._run(driver, assigned_task, self.extract_mount_paths_from_task(task)))
Code example #24
    def wait_start(self, timeout=MAX_WAIT):
        log.debug('Waiting for task to start.')

        def is_started():
            return self._monitor and (self._monitor.active
                                      or self._monitor.finished)

        waited = Amount(0, Time.SECONDS)

        while waited < timeout:
            if not is_started():
                log.debug('  - sleeping...')
                self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
                waited += self.POLL_INTERVAL
            else:
                break

            if not self.is_alive:
                if self._popen_rc != 0:
                    raise TaskError('Task failed: %s' %
                                    self.compute_status().reason)
                else:
                    # We can end up here if the process exited between the call to Popen and
                    # waitpid (in is_alive), which is fine.
                    log.info('Task runner exited: %s' %
                             self.compute_status().reason)
                    break

        if not is_started():
            log.error('Task did not start within deadline, forcing loss.')
            self.lose()
            raise TaskError('Task did not start within deadline.')
Code example #25
File: scheduler.py Project: a-nldisr/mysos
  def statusUpdate(self, driver, status):
    with self._lock:
      # Forward the status update to the corresponding launcher.
      task_id = status.task_id.value
      launcher = self._get_launcher_by_task_id(task_id)
      if not launcher:
        log.info("Cluster for task %s doesn't exist. It could have been removed" % task_id)
        return

      try:
        launcher.status_update(status)
      except MySQLClusterLauncher.Error as e:
        log.error("Status update failed due to launcher error: %s" % e.message)
        self._stop()

      # Update metrics.
      # TODO(xujyan): This doesn't rule out duplicates, etc. We can consider updating these metrics
      # in the launcher.
      if status.state == mesos_pb2.TASK_FINISHED:
        self._metrics.tasks_finished.increment()
      elif status.state == mesos_pb2.TASK_FAILED:
        self._metrics.tasks_failed.increment()
      elif status.state == mesos_pb2.TASK_KILLED:
        self._metrics.tasks_killed.increment()
      elif status.state == mesos_pb2.TASK_LOST:
        self._metrics.tasks_lost.increment()

      if launcher.terminated:
        log.info("Deleting the launcher for cluster %s because the cluster has terminated" %
                 launcher.cluster_name)
        self._delete_launcher(launcher)
Code example #26
    def wait_start(self, timeout=MAX_WAIT):
        log.debug("Waiting for task to start.")

        def is_started():
            return self._monitor and (self._monitor.active or self._monitor.finished)

        waited = Amount(0, Time.SECONDS)

        while waited < timeout:
            if not is_started():
                log.debug("  - sleeping...")
                self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
                waited += self.POLL_INTERVAL
            else:
                break

            if not self.is_alive:
                if self._popen_rc != 0:
                    raise TaskError("Task failed: %s" % self.compute_status().reason)
                else:
                    # We can end up here if the process exited between the call to Popen and
                    # waitpid (in is_alive), which is fine.
                    log.info("Task runner exited: %s" % self.compute_status().reason)
                    break

        if not is_started():
            log.error("Task did not start with in deadline, forcing loss.")
            self.lose()
            raise TaskError("Task did not start within deadline.")
Code example #27
    def statusUpdate(self, driver, status):
        with self._lock:
            # Forward the status update to the corresponding launcher.
            task_id = status.task_id.value
            launcher = self._get_launcher_by_task_id(task_id)
            if not launcher:
                log.info(
                    "Cluster for task %s doesn't exist. It could have been removed"
                    % task_id)
                return

            try:
                launcher.status_update(status)
            except MySQLClusterLauncher.Error as e:
                log.error("Status update failed due to launcher error: %s" %
                          e.message)
                self._stop()

            # Update metrics.
            # TODO(xujyan): This doesn't rule out duplicates, etc. We can consider updating these metrics
            # in the launcher.
            if status.state == mesos_pb2.TASK_FINISHED:
                self._metrics.tasks_finished.increment()
            elif status.state == mesos_pb2.TASK_FAILED:
                self._metrics.tasks_failed.increment()
            elif status.state == mesos_pb2.TASK_KILLED:
                self._metrics.tasks_killed.increment()
            elif status.state == mesos_pb2.TASK_LOST:
                self._metrics.tasks_lost.increment()

            if launcher.terminated:
                log.info(
                    "Deleting the launcher for cluster %s because the cluster has terminated"
                    % launcher.cluster_name)
                self._delete_launcher(launcher)
Code example #28
    def is_alive(self):
        """
        Is the process underlying the Thermos task runner alive?
        """
        if not self._popen:
            return False

        if self._dead.is_set():
            return False

        # N.B. You cannot mix this code and any code that relies upon os.wait
        # mechanisms with blanket child process collection.  One example is the
        # Thermos task runner which calls os.wait4 -- without refactoring, you
        # should not mix a Thermos task runner in the same process as this
        # thread.
        try:
            pid, status = os.waitpid(self._popen.pid, os.WNOHANG)
            if pid == 0:
                return True
            else:
                self._popen_signal, self._popen_rc = self._decode_status(
                    status)
                log.info(
                    'Detected runner termination: pid=%s, signal=%s, rc=%s' %
                    (pid, self._popen_signal, self._popen_rc))
        except OSError as e:
            log.error('is_alive got OSError: %s' % e)
            if e.errno != errno.ECHILD:
                raise

        self._dead.set()
        return False
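The _decode_status helper referenced above is not shown in this snippet. A hedged sketch of what it plausibly does (hypothetical): split the status word returned by os.waitpid into a (signal, returncode) pair using the os.W* predicates:

import os

def _decode_status(status):
    if os.WIFSIGNALED(status):
        return os.WTERMSIG(status), None    # killed by a signal
    if os.WIFEXITED(status):
        return 0, os.WEXITSTATUS(status)    # exited normally
    return 0, None                          # stopped/continued: not terminal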
Code example #29
  def _shutdown(self, status_result):
    runner_status = self._runner.status

    try:
      deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
    except Timeout:
      log.error('Failed to stop runner within deadline.')

    try:
      deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
    except Timeout:
      log.error('Failed to stop all checkers within deadline.')

    # If the runner was alive when _shutdown was called, defer to the status_result,
    # otherwise the runner's terminal state is the preferred state.
    exit_status = runner_status or status_result

    self.send_update(
        self._driver,
        self._task_id,
        self.translate_exit_state_to_mesos(exit_status.status),
        status_result.reason)

    self.terminated.set()
    defer(self._driver.stop, delay=self.PERSISTENCE_WAIT)
Code example #30
File: runner.py Project: StephanErb/aurora
 def get(cls, task_id, checkpoint_root):
     """
      Get a TaskRunner bound to the task_id in checkpoint_root.
      """
     path = TaskPath(root=checkpoint_root, task_id=task_id, state="active")
     task_json = path.getpath("task_path")
     task_checkpoint = path.getpath("runner_checkpoint")
     if not os.path.exists(task_json):
         return None
     task = ThermosConfigLoader.load_json(task_json)
     if task is None:
         return None
     if len(task.tasks()) == 0:
         return None
     try:
         checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
         if checkpoint is None or checkpoint.header is None:
             return None
         return cls(
             task.tasks()[0].task(),
             checkpoint_root,
             checkpoint.header.sandbox,
             log_dir=checkpoint.header.log_dir,
             task_id=task_id,
             portmap=checkpoint.header.ports,
             hostname=checkpoint.header.hostname,
         )
     except Exception as e:
         log.error("Failed to reconstitute checkpoint in TaskRunner.get: %s" % e, exc_info=True)
         return None
Code example #31
File: muxer.py Project: sumanau7/incubator-aurora
  def select(self):
    """
      Read and multiplex checkpoint records from all the forked off process coordinators.

      Checkpoint records can come from one of two places:
        in-process: checkpoint records synthesized for FORKED and LOST events
        out-of-process: checkpoint records from file descriptors of forked coordinators

      Returns a list of RunnerCkpt objects that were successfully read, or an empty
      list if none were read.
    """
    self._bind_processes()
    updates = []
    for handle in filter(None, self._processes.values()):
      try:
        fstat = os.fstat(handle.fileno())
      except OSError as e:
        log.error('Unable to fstat %s!' % handle.name)
        continue
      if handle.tell() > fstat.st_size:
        log.error('Truncated checkpoint record detected on %s!' % handle.name)
      elif handle.tell() < fstat.st_size:
        rr = ThriftRecordReader(handle, RunnerCkpt)
        while True:
          process_update = rr.try_read()
          if process_update:
            updates.append(process_update)
          else:
            break
    if len(updates) > 0:
      log.debug('select() returning %s updates:' % len(updates))
      for update in updates:
        log.debug('  = %s' % update)
    return updates
Code example #32
  def _update_endpoints(self, _1, event, state, _2):
    """Update endpoints from ZK.

    This function will block until the ZK servers respond or the retry limit is hit.

    :raises ReconnectFailed: If reconnection fails.
    """
    if not (state == zookeeper.CONNECTED_STATE and event == zookeeper.CHILD_EVENT) and not (
        state == zookeeper.EXPIRED_SESSION_STATE):
      return

    try:
      endpoints = []
      endpoint_names = self._zk.get_children(self._endpoint, self._update_endpoints)
      endpoint_names.sort()
      for endpoint in endpoint_names:
        data = self._zk.get(posixpath.join(self._endpoint, endpoint))
        service_endpoint = serverset_types.ServiceInstance()
        endpoints.append(codec.deserialize(service_endpoint, data[0]))

      old = set(map(_format_endpoint, self._endpoints))
      new = set(map(_format_endpoint, endpoints))
      log.debug('ServerSet endpoints at %r changed to: %s' % (self._endpoint, ', '.join(new)))
      log.debug('  Added: %s' % ', '.join(new - old))
      log.debug('  Removed: %s' % ', '.join(old - new))

      with self._lock:
        if self._watcher:
          self._watcher(self._endpoint, self._endpoints, endpoints)
        self._endpoints = endpoints
    except ZooKeeper.Error as e:
      log.error('Lost connection to ZooKeeper: %s, reestablishing.' % e)
      self._reconnect()
Code example #33
  def _run(self, driver, assigned_task, mounted_volume_paths):
    """
      Commence running a Task.
        - Initialize the sandbox
        - Start the ThermosTaskRunner (fork the Thermos TaskRunner)
        - Set up necessary HealthCheckers
        - Set up StatusManager, and attach HealthCheckers
    """
    self.send_update(driver, self._task_id, mesos_pb2.TASK_STARTING)

    if not self._initialize_sandbox(driver, assigned_task, mounted_volume_paths):
      return

    # start the process on a separate thread and give the message processing thread back
    # to the driver
    try:
      self._runner = self._runner_provider.from_assigned_task(assigned_task, self._sandbox)
    except TaskError as e:
      self.runner_aborted.set()
      self._die(driver, mesos_pb2.TASK_FAILED, str(e))
      return

    if not isinstance(self._runner, TaskRunner):
      self._die(driver, mesos_pb2.TASK_FAILED, 'Unrecognized task!')
      return

    if not self._start_runner(driver, assigned_task):
      return

    try:
      self._start_status_manager(driver, assigned_task)
    except Exception:
      log.error(traceback.format_exc())
      self._die(driver, mesos_pb2.TASK_FAILED, "Internal error")
Code example #34
  def _rollback(self, instances_to_rollback, instance_configs):
    """Performs a rollback operation for the failed instances.

    Arguments:
    instances_to_rollback -- instance ids to rollback.
    instance_configs -- instance configuration to use for rollback.
    """
    if not self._update_config.rollback_on_failure:
      log.info('Rollback on failure is disabled in config. Aborting rollback')
      return

    log.info('Reverting update for %s' % instances_to_rollback)
    instance_operation = self.OperationConfigs(
        from_config=instance_configs.local_config_map,
        to_config=instance_configs.remote_config_map
    )
    instances_to_rollback.sort(reverse=True)
    failed_instances = []
    while instances_to_rollback:
      batch_instances = instances_to_rollback[0 : self._update_config.batch_size]
      instances_to_rollback = list(set(instances_to_rollback) - set(batch_instances))
      instances_to_rollback.sort(reverse=True)
      instances_to_watch = self._update_instances(batch_instances, instance_operation)
      failed_instances += self._watcher.watch(instances_to_watch)

    if failed_instances:
      log.error('Rollback failed for instances: %s' % failed_instances)
Code example #35
File: runner.py Project: theevocater/aurora
 def control(self, force=False):
     """
      Bind to the checkpoint associated with this task, position to the end of the log if
      it exists, or create it if it doesn't.  Fails if we cannot get "leadership" i.e. a
      file lock on the checkpoint stream.
      """
     if self.is_terminal():
         raise self.StateError(
             'Cannot take control of a task in terminal state.')
     if self._sandbox:
         safe_mkdir(self._sandbox)
     ckpt_file = self._pathspec.getpath('runner_checkpoint')
     try:
         self._ckpt = TaskRunnerHelper.open_checkpoint(ckpt_file,
                                                       force=force,
                                                       state=self._state)
     except TaskRunnerHelper.PermissionError:
         raise self.PermissionError('Unable to open checkpoint %s' %
                                    ckpt_file)
     log.debug('Flipping recovery mode off.')
     self._recovery = False
     self._set_task_status(self.task_state())
     self._resume_task()
     try:
         yield
     except Exception as e:
         log.error('Caught exception in self.control(): %s' % e)
         log.error('  %s' % traceback.format_exc())
     self._ckpt.close()
Code example #36
File: runner.py Project: theevocater/aurora
 def get(cls, task_id, checkpoint_root):
     """
      Get a TaskRunner bound to the task_id in checkpoint_root.
      """
     path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
     task_json = path.getpath('task_path')
     task_checkpoint = path.getpath('runner_checkpoint')
     if not os.path.exists(task_json):
         return None
     task = ThermosConfigLoader.load_json(task_json)
     if task is None:
         return None
     if len(task.tasks()) == 0:
         return None
     try:
         checkpoint = CheckpointDispatcher.from_file(task_checkpoint)
         if checkpoint is None or checkpoint.header is None:
             return None
         return cls(task.tasks()[0].task(),
                    checkpoint_root,
                    checkpoint.header.sandbox,
                    log_dir=checkpoint.header.log_dir,
                    task_id=task_id,
                    portmap=checkpoint.header.ports,
                    hostname=checkpoint.header.hostname)
     except Exception as e:
         log.error(
             'Failed to reconstitute checkpoint in TaskRunner.get: %s' % e,
             exc_info=True)
         return None
Code example #37
File: runner.py Project: shirchen/aurora
  def _initialize_ckpt_header(self):
    """
      Initializes the RunnerHeader for this checkpoint stream if it has not already
      been constructed.
    """
    if self._state.header is None:
      try:
        uid = pwd.getpwnam(self._user).pw_uid
      except KeyError:
        # This will cause failures downstream, but they will at least be correctly
        # reflected in the process state.
        log.error('Unknown user %s.' % self._user)
        uid = None

      header = RunnerHeader(
          task_id=self._task_id,
          launch_time_ms=int(self._launch_time * 1000),
          sandbox=self._sandbox,
          log_dir=self._log_dir,
          hostname=self._hostname,
          user=self._user,
          uid=uid,
          ports=self._portmap)
      runner_ckpt = RunnerCkpt(runner_header=header)
      self._dispatcher.dispatch(self._state, runner_ckpt)
Code example #38
 def compute_status(self):
   if self.is_alive:
     return None
   exit_state = self.EXIT_STATE_MAP.get(self.task_state())
   if exit_state is None:
     log.error('Received unexpected exit state from TaskMonitor.')
   return exit_state
Code example #39
  def validate_quota_from_requested(self, job_key, production, released, acquired):
    """Validates requested change will not exceed the available quota.

    Arguments:
    job_key -- job key.
    production -- production flag.
    released -- production CapacityRequest to be released (in case of job update).
    acquired -- production CapacityRequest to be acquired.

    Returns: ResponseCode.OK if check is successful.
    """
    resp_ok = Response(responseCode=ResponseCode.OK, messageDEPRECATED='Quota check successful.')
    if not production:
      return resp_ok

    resp = self._scheduler.getQuota(job_key.role)
    if resp.responseCode != ResponseCode.OK:
      log.error('Failed to get quota from scheduler: %s' % resp.messageDEPRECATED)
      return resp

    allocated = CapacityRequest(resp.result.getQuotaResult.quota)
    consumed = CapacityRequest(resp.result.getQuotaResult.prodConsumption)
    requested = acquired - released
    effective = allocated - consumed - requested

    if not effective.valid():
      log.info('Not enough quota to create/update job.')
      print_quota(allocated.quota(), 'Total allocated quota', job_key.role)
      print_quota(consumed.quota(), 'Consumed quota', job_key.role)
      print_quota(requested.quota(), 'Requested', job_key.name)
      return Response(
          responseCode=ResponseCode.INVALID_REQUEST,
          messageDEPRECATED='Failed quota check.')

    return resp_ok
Code example #40
File: executor.py Project: repls/mysos
  def _run_task(self, task):
    assert self._runner, "_runner should be created before this method is called"

    try:
      self._runner.start()
      log.info("Task runner for task %s started" % task.task_id)

      self._send_update(task.task_id.value, mesos_pb2.TASK_RUNNING)
    except TaskError as e:
      log.error("Task runner for task %s failed to start: %s" % (task.task_id, str(e)))
      # Send TASK_FAILED if the task failed to start.
      self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)
    except Exception as e:
      log.error("Error occurred while executing the task: %s" % e)
      log.error(traceback.format_exc())
      # Send TASK_LOST for unknown errors.
      self._send_update(task.task_id.value, mesos_pb2.TASK_LOST)

    # Wait for the task's return code (when it terminates).
    try:
      returncode = self._runner.join()
      # Regardless of the return code, if '_runner' terminates, it failed!
      log.error("Task process terminated with return code %s" % returncode)
    except TaskError as e:
      log.error("Task terminated: %s" % e)

    if self._killed:
      self._send_update(task.task_id.value, mesos_pb2.TASK_KILLED)
    else:
      self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)

    self._kill()
Code example #41
  def is_alive(self):
    """
      Is the process underlying the Thermos task runner alive?
    """
    if not self._popen:
      return False
    if self._dead.is_set():
      return False

    # N.B. You cannot mix this code and any code that relies upon os.wait
    # mechanisms with blanket child process collection.  One example is the
    # Thermos task runner which calls os.wait4 -- without refactoring, you
    # should not mix a Thermos task runner in the same process as this
    # thread.
    try:
      pid, _ = os.waitpid(self._popen.pid, os.WNOHANG)
      if pid == 0:
        return True
      else:
        log.info('Detected runner termination: pid=%s' % pid)
    except OSError as e:
      log.error('is_alive got OSError: %s' % e)
      if e.errno != errno.ECHILD:
        raise

    self._dead.set()
    return False
Code example #42
File: runner.py Project: apache/aurora
  def _initialize_ckpt_header(self):
    """
      Initializes the RunnerHeader for this checkpoint stream if it has not already
      been constructed.
    """
    if self._state.header is None:
      try:
        uid = pwd.getpwnam(self._user).pw_uid
      except KeyError:
        # This will cause failures downstream, but they will at least be correctly
        # reflected in the process state.
        log.error('Unknown user %s.', self._user)
        uid = None

      header = RunnerHeader(
          task_id=self._task_id,
          launch_time_ms=int(self._launch_time * 1000),
          sandbox=self._sandbox,
          log_dir=self._log_dir,
          hostname=self._hostname,
          user=self._user,
          uid=uid,
          ports=self._portmap)
      runner_ckpt = RunnerCkpt(runner_header=header)
      self._dispatcher.dispatch(self._state, runner_ckpt)
Code example #43
    def launchTask(self, driver, task):
        """
        Invoked when a task has been launched on this executor (initiated via Scheduler::launchTasks).
        Note that this task can be realized with a thread, a process, or some simple computation,
        however, no other callbacks will be invoked on this executor until this callback has returned.
        """
        self.launched.set()
        self.log('launchTask got task: %s:%s' %
                 (task.name, task.task_id.value))

        # TODO(wickman)  Update the tests to call registered(), then remove this line and issue
        # an assert if self._driver is not populated.
        self._driver = driver

        if self._runner:
            log.error('Already running a task! %s' % self._task_id)
            self.send_update(
                driver, task.task_id.value, mesos_pb2.TASK_LOST,
                "Task already running on this executor: %s" % self._task_id)
            return

        self._slave_id = task.slave_id.value
        self._task_id = task.task_id.value

        assigned_task = self.validate_task(task)
        if not assigned_task:
            self.send_update(driver, self._task_id, mesos_pb2.TASK_FAILED,
                             'Could not deserialize task.')
            defer(driver.stop, delay=self.STOP_WAIT)
            return

        defer(lambda: self._run(driver, assigned_task))
Code example #44
File: updater.py Project: isomer/incubator-aurora
    def _rollback(self, instances_to_rollback, instance_configs):
        """Performs a rollback operation for the failed instances.

        Arguments:
        instances_to_rollback -- instance ids to rollback.
        instance_configs -- instance configuration to use for rollback.
        """
        log.info('Reverting update for %s' % instances_to_rollback)
        instance_operation = self.OperationConfigs(
            from_config=instance_configs.local_config_map,
            to_config=instance_configs.remote_config_map)
        instances_to_rollback.sort(reverse=True)
        failed_instances = []
        while instances_to_rollback:
            batch_instances = instances_to_rollback[0:self._update_config.batch_size]
            instances_to_rollback = list(
                set(instances_to_rollback) - set(batch_instances))
            instances_to_rollback.sort(reverse=True)
            instances_to_watch = self._update_instances(
                batch_instances, instance_operation)
            failed_instances += self._watcher.watch(instances_to_watch)

        if failed_instances:
            log.error('Rollback failed for instances: %s' % failed_instances)
Code example #45
File: runner.py Project: apache/aurora
 def control(self, force=False):
   """
     Bind to the checkpoint associated with this task, position to the end of the log if
     it exists, or create it if it doesn't.  Fails if we cannot get "leadership" i.e. a
     file lock on the checkpoint stream.
   """
   if self.is_terminal():
     raise self.StateError('Cannot take control of a task in terminal state.')
   if self._sandbox:
     safe_mkdir(self._sandbox)
   ckpt_file = self._pathspec.getpath('runner_checkpoint')
   try:
     self._ckpt = TaskRunnerHelper.open_checkpoint(ckpt_file, force=force, state=self._state)
   except TaskRunnerHelper.PermissionError:
     raise self.PermissionError('Unable to open checkpoint %s' % ckpt_file)
   log.debug('Flipping recovery mode off.')
   self._recovery = False
   self._set_task_status(self.task_state())
   self._resume_task()
   try:
     yield
   except Exception as e:
     log.error('Caught exception in self.control(): %s', e)
     log.error('  %s', traceback.format_exc())
   self._ckpt.close()
Code example #46
    def addattachment(self, page, filename):
        """Add an attachment to an existing page.
        Note: this will first read the entire file into memory."""
        mime_type = mimetypes.guess_type(filename, strict=False)[0]
        if not mime_type:
            raise ConfluenceError('Failed to detect MIME type of %s' %
                                  filename)

        try:
            with open(filename, 'rb') as f:
                file_data = f.read()

            attachment = dict(fileName=basename(filename),
                              contentType=mime_type)
            return self._api_entrypoint.addAttachment(self._session_token,
                                                      page['id'], attachment,
                                                      Binary(file_data))
        except (IOError, OSError) as e:
            log.error('Failed to read data from file %s: %s' %
                      (filename, str(e)))
            return None
        except XMLRPCError as e:
            log.error('Failed to add file attachment %s to page: %s' %
                      (filename, page.get('title', '[unknown title]')))
            return None
Code example #47
  def _apply_states(self):
    """
      os.stat() the corresponding checkpoint stream of this task and determine if there are new ckpt
      records.  Attempt to read those records and update the high watermark for that stream.
      Returns True if new states were applied, False otherwise.
    """
    ckpt_offset = None
    try:
      ckpt_offset = os.stat(self._runner_ckpt).st_size

      updated = False
      if self._ckpt_head < ckpt_offset:
        with open(self._runner_ckpt, 'r') as fp:
          fp.seek(self._ckpt_head)
          rr = ThriftRecordReader(fp, RunnerCkpt)
          while True:
            runner_update = rr.try_read()
            if not runner_update:
              break
            try:
              self._dispatcher.dispatch(self._runnerstate, runner_update)
            except CheckpointDispatcher.InvalidSequenceNumber as e:
              log.error('Checkpoint stream is corrupt: %s' % e)
              break
          new_ckpt_head = fp.tell()
          updated = self._ckpt_head != new_ckpt_head
          self._ckpt_head = new_ckpt_head
      return updated
    except OSError as e:
      if e.errno == errno.ENOENT:
        # The log doesn't yet exist, will retry later.
        log.warning('Could not read from checkpoint %s' % self._runner_ckpt)
        return False
      else:
        raise
Code example #48
File: host_maintenance.py Project: rosmo/aurora
    def _check_sla(self, hostnames, grouping_function, percentage, duration):
        """Check if the provided list of hosts passes the job uptime SLA check.

        This is an all-or-nothing check, meaning that all provided hosts must pass their job
        SLA check for the maintenance to proceed.

        :param hostnames: list of host names to check SLA for
        :type hostnames: list of strings
        :param grouping_function: grouping function to apply to the given hosts
        :type grouping_function: function
        :param percentage: SLA uptime percentage override
        :type percentage: float
        :param duration: SLA uptime duration override
        :type duration: twitter.common.quantity.Amount
        :rtype: set of unsafe hosts
        """
        vector = self._client.sla_get_safe_domain_vector(self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
        host_groups = vector.probe_hosts(percentage, duration.as_(Time.SECONDS), grouping_function)

        unsafe_hostnames = set()
        # Given that maintenance is performed 1 group at a time, any result longer than 1 group
        # should be considered a batch failure.
        if host_groups:
            if len(host_groups) > 1:
                log.error("Illegal multiple groups detected in SLA results. Skipping hosts: %s" % hostnames)
                return set(hostnames)

            results, unsafe_hostnames = format_sla_results(host_groups, unsafe_only=True)
            if results:
                print_results(results)
                return unsafe_hostnames

        return unsafe_hostnames
Code example #49
    def method_wrapper(*args):
      with self._lock:
        start = time.time()
        while not self._terminating.is_set() and (
            time.time() - start) < self.RPC_MAXIMUM_WAIT.as_(Time.SECONDS):

          try:
            method = getattr(self.client(), method_name)
            if not callable(method):
              return method
            resp = method(*args)
            if resp is not None and resp.responseCode == ResponseCode.ERROR_TRANSIENT:
              raise self.TransientError(", ".join(
                  [m.message for m in resp.details] if resp.details else []))
            return resp
          except TRequestsTransport.AuthError as e:
            log.error(self.scheduler_client().get_failed_auth_message())
            raise self.AuthError(e)
          except (TTransport.TTransportException, self.TimeoutError, self.TransientError) as e:
            if not self._terminating.is_set():
              log.warning('Connection error with scheduler: %s, reconnecting...' % e)
              self.invalidate()
              self._terminating.wait(self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
          except Exception as e:
            # Take any error that occurs during the RPC call, and transform it
            # into something clients can handle.
            if not self._terminating.is_set():
              raise self.ThriftInternalError("Error during thrift call %s to %s: %s" %
                                            (method_name, self.cluster.name, e))
        if not self._terminating.is_set():
          raise self.TimeoutError('Timed out attempting to issue %s to %s' % (
              method_name, self.cluster.name))
Code example #50
 def safe_signal(cls, pid, sig=signal.SIGTERM):
   try:
     os.kill(pid, sig)
   except OSError as e:
     if e.errno not in (errno.ESRCH, errno.EPERM):
       log.error('Unexpected error in os.kill: %s' % e)
   except Exception as e:
     log.error('Unexpected error in os.kill: %s' % e)
Code example #51
File: helper.py Project: betepahos/incubator-aurora
 def safe_signal(cls, pid, sig=signal.SIGTERM):
   try:
     os.kill(pid, sig)
   except OSError as e:
     if e.errno not in (errno.ESRCH, errno.EPERM):
       log.error('Unexpected error in os.kill: %s' % e)
   except Exception as e:
     log.error('Unexpected error in os.kill: %s' % e)
Code example #52
 def resolve(self):
     for job in self._jobs:
         resp = self._api.query(self.query_from(self._role, self._env, job))
         if resp.responseCode != ResponseCode.OK:
             log.error("Failed to query job: %s" % job)
             continue
         for task in resp.result.scheduleStatusResult.tasks:
             yield task
Code example #53
def delete(j, name):
    """
  delete job
  """
    try:
        j.delete_job(name)
    except JenkinsAPIException as e:
        log.error("error deleting job: %s" % e)
Code example #54
File: jenkins.py Project: makewhatis/commons
def delete(j, name):
  """
  delete job
  """
  try:
    j.delete_job(name)
  except JenkinsAPIException as e:
    log.error("error deleting job: %s" % e)
Code example #55
 def connect(self):
     try:
         sock = socket.socket()
         sock.connect((self.host, self.port))
         return sock
     except Exception as _e:
         log.error("Cannot connect to Graphite Sink with config:%s\n%s" %
                   (self.config, str(_e)))
Code example #56
 def resolve(self):
     for job in self._jobs:
         resp = self._api.query(self.query_from(self._role, self._env, job))
         if resp.responseCode != ResponseCode.OK:
             log.error('Failed to query job: %s' % job)
             continue
         for task in resp.result.scheduleStatusResult.tasks:
             yield task