Example #1
0
    def __iter__(self):
      """
        Yield every record of the underlying file, starting from offset 0.

        Reading happens through a duplicate of the file descriptor of self._fp, and
        the duplicate is closed once iteration finishes.  If the duplicate cannot
        be opened or rewound, the failure is logged and nothing is yielded.

        May raise:
          RecordIO.PrematureEndOfStream
      """
      dup_fd = os.dup(self._fp.fileno())
      try:
        reader_fp = os.fdopen(dup_fd, self._fp.mode)
        reader_fp.seek(0)
      except OSError as dup_error:
        log.error('Failed to duplicate fd on %s, error = %s' % (self._fp.name, dup_error))
        try:
          os.close(dup_fd)
        except OSError as close_error:
          # EBADF means the descriptor is already gone; anything else is worth reporting.
          if close_error.errno != errno.EBADF:
            log.error('Failed to close duped fd on %s, error = %s' % (self._fp.name, close_error))
        return

      try:
        # A falsy record from do_read ends the iteration.
        record = RecordIO.Reader.do_read(reader_fp, self._codec)
        while record:
          yield record
          record = RecordIO.Reader.do_read(reader_fp, self._codec)
      finally:
        reader_fp.close()
Example #2
0
  def delete(args, options):
    """
      Delete the cluster named by options.cluster_name through the HTTP API.

      The password is read from options.password_file and sent as the form body of a
      DELETE request to http://<api_host>:<api_port>/clusters/<cluster_name>.  The
      response must be a JSON object; its 'cluster_url' is then passed to
      wait_for_termination.  HTTP errors and malformed responses are logged and
      followed by app.quit(1).
    """
    validate_common_options(options)

    with open(options.password_file, 'r') as password_fp:
      password = password_fp.read().strip()
      if not password:
        app.error("Empty password file")

    endpoint = 'http://%s:%s/clusters/%s' % (
        options.api_host, options.api_port, options.cluster_name)
    payload = urllib.urlencode({'password': password})

    request = urllib2.Request(endpoint, payload)
    # urllib2 only issues GET/POST on its own; override the verb explicitly.
    request.get_method = lambda: 'DELETE'

    try:
      body = urllib2.urlopen(request).read()
    except urllib2.HTTPError as http_error:
      log.error("DELETE request failed: %s, %s, %s" % (
          http_error.code,
          BaseHTTPServer.BaseHTTPRequestHandler.responses[http_error.code],
          http_error.read()))
      app.quit(1)

    try:
      result = json.loads(body)
      if not isinstance(result, dict):
        raise ValueError()
    except ValueError:
      log.error("Invalid response: %s" % body)
      app.quit(1)

    log.info("Cluster deletion result: %s" % result)

    log.info("Waiting for the cluster to terminate...")
    wait_for_termination(result['cluster_url'])

    log.info("Cluster terminated/deleted")
Example #3
0
    def __on_active(self, root, task_id):
        """Start monitoring a task that has been reported as active.

        Builds a TaskMonitor plus a resource monitor (a NullTaskResourceMonitor when
        resource collection is disabled, otherwise a TaskResourceMonitor), starts the
        resource monitor and records the task in self._active_tasks.  A task id that
        is already in self.finished_tasks is logged as an error and ignored.
        """
        log.debug('on_active(%r, %r)', root, task_id)
        if task_id in self.finished_tasks:
            log.error('Found an active task (%s) in finished tasks?', task_id)
            return

        task_monitor = TaskMonitor(root, task_id)

        if not self._disable_task_resource_collection:
            provider = DiskCollectorProvider(
                self._enable_mesos_disk_collector,
                self._disk_collector_settings)
            disk_interval = self._disk_collector_settings.disk_collection_interval
            resource_monitor = TaskResourceMonitor(
                task_id,
                task_monitor,
                disk_collector_provider=provider,
                process_collection_interval=self._task_process_collection_interval,
                disk_collection_interval=disk_interval)
        else:
            resource_monitor = NullTaskResourceMonitor()

        resource_monitor.start()
        observed_task = ActiveObservedTask(root, task_id, task_monitor, resource_monitor)
        self._active_tasks[task_id] = observed_task
  def launchTask(self, driver, task):
    """
      Invoked when a task has been launched on this executor (initiated via Scheduler::launchTasks).
      Note that this task can be realized with a thread, a process, or some simple computation,
      however, no other callbacks will be invoked on this executor until this callback has returned.

      If a runner already exists the new task is answered with TASK_LOST.  If the task cannot be
      deserialized, TASK_FAILED is sent and a driver stop is deferred by STOP_WAIT.  Otherwise
      self._run is deferred with the deserialized task.
    """
    self.launched.set()
    self.log('launchTask got task: %s:%s' % (task.name, task.task_id.value))

    # TODO(wickman)  Update the tests to call registered(), then remove this line and issue
    # an assert if self._driver is not populated.
    self._driver = driver

    if self._runner:
      log.error('Already running a task! %s' % self._task_id)
      rejection = "Task already running on this executor: %s" % self._task_id
      self.send_update(driver, task.task_id.value, mesos_pb.TASK_LOST, rejection)
      return

    self._slave_id = task.slave_id.value
    self._task_id = task.task_id.value

    try:
      assigned_task = assigned_task_from_mesos_task(task)
      mesos_task = mesos_task_instance_from_assigned_task(assigned_task)
    except Exception as error:
      log.fatal('Could not deserialize AssignedTask')
      log.fatal(traceback.format_exc())
      self.send_update(
          driver,
          self._task_id,
          mesos_pb.TASK_FAILED,
          "Could not deserialize task: %s" % error)
      defer(driver.stop, delay=self.STOP_WAIT)
    else:
      defer(lambda: self._run(driver, assigned_task, mesos_task))
  def _shutdown(self, status_result):
    """
      Stop the runner and the chained checker, report the final task state and schedule a
      driver stop.

      Each stop call is bounded by STOP_TIMEOUT; a timeout is logged and shutdown continues.
      The reported state comes from the runner status captured on entry when there is one,
      otherwise from status_result.  The reported reason always comes from status_result.
    """
    # Capture the runner status before stopping anything.
    runner_status = self._runner.status

    try:
      deadline(self._runner.stop, timeout=self.STOP_TIMEOUT)
    except Timeout:
      log.error('Failed to stop runner within deadline.')

    try:
      deadline(self._chained_checker.stop, timeout=self.STOP_TIMEOUT)
    except Timeout:
      log.error('Failed to stop all checkers within deadline.')

    # If the runner was alive when _shutdown was called, defer to the status_result,
    # otherwise the runner's terminal state is the preferred state.
    exit_status = runner_status if runner_status else status_result
    mesos_state = self.translate_exit_state_to_mesos(exit_status.status)

    self.send_update(self._driver, self._task_id, mesos_state, status_result.reason)

    self.terminated.set()
    defer(self._driver.stop, delay=self.PERSISTENCE_WAIT)
Example #6
0
 def send_request(self, endpoint, message, ttl):
     """Issue a GET to <target>/<endpoint>/<message>/<ttl> and discard the response body.

     self._target supplies the (host, port) pair for the base URL.  Any exception raised
     while building the URL or performing the request is logged, not propagated.
     """
     url_base = 'http://%s:%d' % self._target
     try:
         request_url = '%s/%s/%s/%d' % (url_base, endpoint, message, ttl)
         response = urllib2.urlopen(request_url)
         response.read()
     except Exception as error:
         log.error('Failed to query %s: %s' % (url_base, error))
 def compute_status(self):
   """Return the exit state mapped from the current task state, or None.

   None is returned while self.is_alive is true, and also (after logging an error)
   when the task state has no entry in EXIT_STATE_MAP.
   """
   if not self.is_alive:
     mapped_state = self.EXIT_STATE_MAP.get(self.task_state())
     if mapped_state is None:
       log.error('Received unexpected exit state from TaskMonitor.')
     return mapped_state
   return None
Example #8
0
    def select(self):
        """
        Read and multiplex checkpoint records from all the forked off process coordinators.

        Checkpoint records can come from one of two places:
          in-process: checkpoint records synthesized for FORKED and LOST events
          out-of-process: checkpoint records from file descriptors of forked coordinators

        Returns a list of RunnerCkpt objects that were successfully read, or an empty
        list if none were read.
        """
        self._bind_processes()
        updates = []
        for handle in self._processes.values():
            if not handle:
                continue
            try:
                size_on_disk = os.fstat(handle.fileno()).st_size
            except OSError:
                log.error('Unable to fstat %s!' % handle.name)
                continue

            position = handle.tell()
            if position > size_on_disk:
                # The read position is past the end of the file.
                log.error('Truncated checkpoint record detected on %s!' %
                          handle.name)
            elif position < size_on_disk:
                # Unread bytes are available: drain records until try_read yields nothing.
                reader = ThriftRecordReader(handle, RunnerCkpt)
                record = reader.try_read()
                while record:
                    updates.append(record)
                    record = reader.try_read()

        if updates:
            log.debug('select() returning %s updates:' % len(updates))
            for update in updates:
                log.debug('  = %s' % update)
        return updates
Example #9
0
  def handle_process(self, task_id, process_id):
    """
      Build the template dict describing a process and all of its runs.

      Aborts with a 404 when the observer knows no run for the task/process pair or
      cannot recover the process by name.  The returned dict carries 'task_id', a
      'process' entry (name, status, cmdline, plus whatever the latest run reports
      under 'used') and 'runs', keyed by run number.
    """
    latest_run = self._observer.process(task_id, process_id)
    if not latest_run:
      HttpServer.abort(404, 'Invalid task/process combination: %s/%s' % (task_id, process_id))
    process = self._observer.process_from_name(task_id, process_id)
    if process is None:
      msg = 'Could not recover process: %s/%s' % (task_id, process_id)
      log.error(msg)
      HttpServer.abort(404, msg)

    latest_run_number = latest_run['process_run']
    runs = {latest_run_number: latest_run}
    for earlier_run in range(latest_run_number):
      runs[earlier_run] = self._observer.process(task_id, process_id, earlier_run)

    process_info = {
      'name': process_id,
      'status': latest_run["state"],
      'cmdline': process.cmdline().get()
    }
    process_info.update(**latest_run.get('used', {}))

    template = {
      'task_id': task_id,
      'process': process_info,
      'runs': runs,
    }
    log.debug('Rendering template is: %s' % template)
    return template
Example #10
0
def setup_child_subreaping():
  """
  Mark this process as a child subreaper via prctl(2) with `PR_SET_CHILD_SUBREAPER`,
  so that children which need to be reparented are reparented to this process.

  More documentation here: http://man7.org/linux/man-pages/man2/prctl.2.html
  and here: https://lwn.net/Articles/474787/

  Failures (libc not found, prctl missing, non-zero return) are logged and leave
  subreaping disabled; nothing is raised to the caller.

  Callers should reap terminal children to prevent zombies.
  """
  log.debug("Calling prctl(2) with PR_SET_CHILD_SUBREAPER")
  # This constant is taken from prctl.h
  PR_SET_CHILD_SUBREAPER = 36
  try:
    libc_path = ctypes.util.find_library('c')
    if libc_path is None:
      for warning in ("libc is not found. Unable to call prctl!",
                      "Children subreaping is disabled!"):
        log.warning(warning)
      return
    libc = ctypes.CDLL(libc_path, use_errno=True)
    # If we are on a system where prctl doesn't exist, this will throw an
    # attribute error.
    if libc.prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) != 0:
      error_code = ctypes.get_errno()
      raise OSError(error_code, os.strerror(error_code))
  except Exception as e:
    log.error("Unable to call prctl %s" % e)
    log.error("Children subreaping is disabled!")
Example #11
0
  def create(args, options):
    """
      Create a cluster through the HTTP API and verify that its master accepts queries.

      Requires options.num_nodes and options.cluster_user.  POSTs the cluster parameters to
      http://<api_host>:<api_port>/clusters/<cluster_name>, expects a JSON object back, writes
      its 'cluster_password' to options.password_file, waits for a master via wait_for_master
      and then runs "SELECT 1;" against it, retrying once per second for up to 5 attempts.

      HTTP errors and malformed responses are logged and followed by app.quit(1).  An
      OperationalError on the final attempt is re-raised; an unexpected query result raises
      AssertionError.

      The cluster password is deliberately kept out of the log output; it is only written to
      options.password_file.
    """
    validate_common_options(options)

    if not options.num_nodes:
      app.error("--num_nodes is required")

    if not options.cluster_user:
      app.error("--cluster_user is required")

    url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port, options.cluster_name)
    values = dict(
        num_nodes=int(options.num_nodes),
        cluster_user=options.cluster_user,
        size=options.size if options.size else '',
        backup_id=options.backup_id if options.backup_id else '')

    req = urllib2.Request(url, urllib.urlencode(values))
    try:
      response = urllib2.urlopen(req).read()
    except urllib2.HTTPError as e:
      log.error("POST request failed: %s, %s, %s" % (
          e.code, BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code], e.read()))
      app.quit(1)

    try:
      result = json.loads(response)
      if not isinstance(result, dict):
        raise ValueError()
    except ValueError:
      log.error("Invalid response: %s" % response)
      app.quit(1)

    # Log a copy of the response with the credential masked.
    loggable_result = dict(result)
    if "cluster_password" in loggable_result:
      loggable_result["cluster_password"] = "<redacted>"
    log.info("Cluster created. Cluster info: %s" % str(loggable_result))
    with open(options.password_file, 'w') as f:
      f.write(result["cluster_password"])

    log.info("Waiting for the master for this cluster to be elected...")
    master_endpoint = wait_for_master(result['cluster_url']).service_endpoint

    connection_str = "mysql://%s:%s@%s:%d/" % (
        options.cluster_user,
        result["cluster_password"],
        master_endpoint.host,
        master_endpoint.port)
    # Same URL as connection_str but without the password, for logging only.
    log.info("Connecting to the MySQL cluster master: mysql://%s:<redacted>@%s:%d/" % (
        options.cluster_user,
        master_endpoint.host,
        master_endpoint.port))
    engine = create_engine(connection_str)

    max_attempts = 5  # One attempt per second while waiting for the master to be promoted.
    for attempt in range(max_attempts):
      try:
        # TODO(jyx): Test writing to the master and reading from the slave.
        probe = engine.execute("SELECT 1;").scalar()
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if 1 != int(probe):
          raise AssertionError("Expecting result to be 1 but got %s" % probe)
        break
      except OperationalError:
        if attempt == max_attempts - 1:
          raise
        log.debug("MySQL master not ready yet. Sleep for 1 second...")
        time.sleep(1)

    log.info("Cluster successfully started")
Example #12
0
 def connect(self):
     """Create the Redis client for this sink and cache a pipeline on self.redis_pipeline.

     Returns the StrictRedis client, or None (after logging) if creating the client
     or its pipeline raised.
     """
     try:
         client = redis.StrictRedis(host=self.host, port=self.port, db=self.db)
         self.redis_pipeline = client.pipeline()
     except Exception as _e:
         log.error("RedisSink: ConnectionError\n %s %s" % (self.config, str(_e)))
         return None
     else:
         return client
Example #13
0
 def connect(self):
     """Open a TCP connection to the Graphite sink at (self.host, self.port).

     Returns the connected socket, or None if the socket could not be created or
     connected (the failure is logged).  On failure, a socket that was already
     created is closed so its file descriptor is not leaked.
     """
     sock = None
     try:
         sock = socket.socket()
         sock.connect((self.host, self.port))
         return sock
     except Exception as _e:
         log.error("Cannot connect to Graphite Sink with config:%s\n%s" % (self.config, str(_e)))
         if sock is not None:
             # Release the half-opened socket now rather than waiting for garbage collection.
             try:
                 sock.close()
             except socket.error as close_error:
                 log.error("Failed to close Graphite Sink socket: %s" % close_error)
         return None
Example #14
0
    def _run_task(self, task):
        """Start the task runner, wait for it to terminate and report the outcome.

        Sends TASK_RUNNING once the runner has started, TASK_FAILED if starting raised
        TaskError, or TASK_LOST for any other exception during start.  After the runner's
        join() returns (or raises TaskError), sends TASK_KILLED when self._killed is set
        and TASK_FAILED otherwise, then calls self._kill().

        NOTE(review): a failed start does not return early, so execution still reaches
        join() and a second status update is sent -- confirm this is intended.
        """
        assert self._runner, "_runner should be created before this method is called"

        try:
            self._runner.start()
            log.info("Task runner for task %s started" % task.task_id)

            self._send_update(task.task_id.value, mesos_pb2.TASK_RUNNING)
        except TaskError as e:
            log.error("Task runner for task %s failed to start: %s" %
                      (task.task_id, str(e)))
            # Send TASK_FAILED if the task failed to start.
            self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)
        except Exception as e:
            log.error("Error occurred while executing the task: %s" % e)
            log.error(traceback.format_exc())
            # Send TASK_LOST for unknown errors.
            self._send_update(task.task_id.value, mesos_pb2.TASK_LOST)

        # Wait for the task's return code (when it terminates).
        try:
            returncode = self._runner.join()
            # Regardless of the return code, if '_runner' terminates, it failed!
            log.error("Task process terminated with return code %s" %
                      returncode)
        except TaskError as e:
            log.error("Task terminated: %s" % e)

        # A termination is reported as KILLED only when the kill flag is set;
        # every other termination is reported as FAILED.
        if self._killed:
            self._send_update(task.task_id.value, mesos_pb2.TASK_KILLED)
        else:
            self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)

        self._kill()
Example #15
0
    def method_wrapper(*args):
      """
        Call `method_name` on the current client, retrying on connection-level failures.

        `self` and `method_name` come from the enclosing scope.  While holding self._lock,
        the call is retried until it succeeds, RPC_MAXIMUM_WAIT has elapsed, or
        self._terminating is set.  If the attribute is not callable it is returned as-is.

        Raises:
          self.AuthError: the transport reported an authentication failure.
          self.ThriftInternalError: any other exception occurred during the call (unless
            terminating).
          self.TimeoutError: the retry window elapsed without a successful call (unless
            terminating).

        When self._terminating is set the function falls through and returns None.
      """
      with self._lock:
        start = time.time()
        while not self._terminating.is_set() and (
            time.time() - start) < self.RPC_MAXIMUM_WAIT.as_(Time.SECONDS):

          try:
            method = getattr(self.client(), method_name)
            if not callable(method):
              return method

            resp = method(*args)
            # A transient error response is turned into an exception so that it takes
            # the same retry path as transport failures below.
            if resp is not None and resp.responseCode == ResponseCode.ERROR_TRANSIENT:
              raise self.TransientError(", ".join(
                  [m.message for m in resp.details] if resp.details else []))
            return resp
          except TRequestsTransport.AuthError as e:
            log.error(self.scheduler_client().get_failed_auth_message())
            raise self.AuthError(e)
          except (TTransport.TTransportException, self.TimeoutError, self.TransientError) as e:
            # Retryable: invalidate the client, then wait RPC_RETRY_INTERVAL (or until
            # termination is signalled) before the next attempt.
            if not self._terminating.is_set():
              log.warning('Connection error with scheduler: %s, reconnecting...' % e)
              self.invalidate()
              self._terminating.wait(self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
          except Exception as e:
            # Take any error that occurs during the RPC call, and transform it
            # into something clients can handle.
            if not self._terminating.is_set():
              raise self.ThriftInternalError("Error during thrift call %s to %s: %s" %
                                            (method_name, self.cluster.name, e))
        # The loop ended without returning: report a timeout unless we are terminating.
        if not self._terminating.is_set():
          raise self.TimeoutError('Timed out attempting to issue %s to %s' % (
              method_name, self.cluster.name))
Example #16
0
    def _update_endpoints(self, _1, event, state, _2):
        """Update endpoints from ZK.

        Only acts on a child event in the connected state or on an expired session;
        every other notification is ignored.  The children of self._endpoint are
        re-read (re-registering this method as the watch), deserialized, diffed against
        the previous endpoints for logging, passed to self._watcher if one is set and
        then stored on self._endpoints.

        This function will block until the ZK servers respond or retry limit is hit.

        :raises ReconnectFailed: If reconnection fails.
        """
        child_changed = (state == zookeeper.CONNECTED_STATE
                         and event == zookeeper.CHILD_EVENT)
        session_expired = state == zookeeper.EXPIRED_SESSION_STATE
        if not (child_changed or session_expired):
            return

        try:
            child_names = self._zk.get_children(self._endpoint, self._update_endpoints)
            child_names.sort()

            endpoints = []
            for child in child_names:
                payload = self._zk.get(posixpath.join(self._endpoint, child))
                instance = serverset_types.ServiceInstance()
                endpoints.append(codec.deserialize(instance, payload[0]))

            old = set(map(_format_endpoint, self._endpoints))
            new = set(map(_format_endpoint, endpoints))
            log.debug("ServerSet endpoints at %r changed to: %s" % (self._endpoint, ", ".join(new)))
            log.debug("  Added: %s" % ", ".join(new - old))
            log.debug("  Removed: %s" % ", ".join(old - new))

            with self._lock:
                # The watcher sees both the previous and the new endpoints before the swap.
                if self._watcher:
                    self._watcher(self._endpoint, self._endpoints, endpoints)
                self._endpoints = endpoints
        except ZooKeeper.Error as zk_error:
            log.error("Lost connection to ZooKeeper: %s, reestablishing." % zk_error)
            self._reconnect()
Example #17
0
 def run(self):
   """Run the sniff loop with the configured filter until self.wants_stop ends it.

   The `iface` argument is passed to sniff only when the configured interface is not
   "any".  A socket.error is reported to stderr or to the log depending on
   self._error_to_stderr.  Whatever the outcome, SIGINT is sent to the current
   process once the loop exits.
   """
   try:
     log.info("Setting filter: %s", self.config.filter)
     sniff_kwargs = dict(
       filter=self.config.filter,
       store=0,
       prn=self.handle_packet,
       stop_filter=self.wants_stop
     )
     if self.config.iface != "any":
       sniff_kwargs['iface'] = self.config.iface
     sniff(**sniff_kwargs)
   except socket.error as ex:
     if not self._error_to_stderr:
       log.error("Error: %s, device: %s", ex, self.config.iface)
     else:
       sys.stderr.write("Error: %s, device: %s\n" % (ex, self.config.iface))
   finally:
     log.info("The sniff loop exited")
     os.kill(os.getpid(), signal.SIGINT)
Example #18
0
 def compute_status(self):
     """Translate how the runner process ended into a status result.

     Returns None while self.is_alive is true.  Otherwise the result is decided, in
     order, by the signal that ended the process, then by its return code: a return
     code of 0 or TERMINAL_TASK defers to EXIT_STATE_MAP for the current task state
     (TASK_LOST if there is no mapping), the known error codes map to fixed results,
     and anything else is reported as TASK_LOST.
     """
     if self.is_alive:
         return None

     if self._popen_signal != 0:
         return StatusResult(
             'Task killed by signal %s.' % self._popen_signal,
             mesos_pb2.TASK_KILLED)

     rc = self._popen_rc
     if rc == 0 or rc == TERMINAL_TASK:
         exit_state = self.EXIT_STATE_MAP.get(self.task_state())
         if exit_state is not None:
             return exit_state
         log.error('Received unexpected exit state from TaskMonitor.')
         return StatusResult('Task checkpoint could not be read.',
                             mesos_pb2.TASK_LOST)

     # Checked in this order; the first matching return code wins.
     known_failures = (
         (UNKNOWN_USER, 'Task started with unknown user.', mesos_pb2.TASK_FAILED),
         (INTERNAL_ERROR, 'Thermos failed with internal error.', mesos_pb2.TASK_LOST),
         (INVALID_TASK, 'Thermos received an invalid task.', mesos_pb2.TASK_FAILED),
         (UNKNOWN_ERROR, 'Thermos failed with an unknown error.', mesos_pb2.TASK_LOST),
     )
     for code, reason, mesos_state in known_failures:
         if rc == code:
             return StatusResult(reason, mesos_state)

     return StatusResult(
         'Thermos exited for unknown reason (exit status: %s)' % rc,
         mesos_pb2.TASK_LOST)
Example #19
0
    def handle_process(self, task_id, process_id):
        """Build the template dict describing a process and all of its runs.

        Aborts with a 404 when the observer knows no run for the task/process pair or
        cannot recover the process by name.  The returned dict carries 'task_id', a
        'process' entry (name, status, cmdline, plus whatever the latest run reports
        under 'used') and 'runs', keyed by run number.
        """
        latest_run = self._observer.process(task_id, process_id)
        if not latest_run:
            HttpServer.abort(
                404, 'Invalid task/process combination: %s/%s' %
                (task_id, process_id))
        process = self._observer.process_from_name(task_id, process_id)
        if process is None:
            msg = 'Could not recover process: %s/%s' % (task_id, process_id)
            log.error(msg)
            HttpServer.abort(404, msg)

        latest_run_number = latest_run['process_run']
        runs = {latest_run_number: latest_run}
        for earlier_run in range(latest_run_number):
            runs[earlier_run] = self._observer.process(
                task_id, process_id, earlier_run)

        process_info = {
            'name': process_id,
            'status': latest_run["state"],
            'cmdline': process.cmdline().get()
        }
        process_info.update(**latest_run.get('used', {}))

        template = {
            'task_id': task_id,
            'process': process_info,
            'runs': runs,
        }
        log.debug('Rendering template is: %s', template)
        return template
Example #20
0
 def open_checkpoint(cls, filename, force=False, state=None):
     """
     Acquire a locked checkpoint stream.

     Creates the parent directory of `filename` and tries to lock the file in "a+"
     mode.  If the lock is unavailable and `force` is set, the existing runner
     (taken from `state`, or loaded from the checkpoint file) is killed and the lock
     is re-acquired in blocking mode.  Returns a ThriftRecordWriter with sync
     enabled; raises cls.PermissionError if no lock could be obtained.
     """
     safe_mkdir(os.path.dirname(filename))
     lock_fp = lock_file(filename, "a+")
     if lock_fp in (None, False):
         if not force:
             log.error('Found existing runner, cannot take control.')
         else:
             log.info('Found existing runner, forcing leadership forfeit.')
             state = state or CheckpointDispatcher.from_file(filename)
             if cls.kill_runner(state):
                 log.info('Successfully killed leader.')
                 # TODO(wickman)  Blocking may not be the best idea here.  Perhaps block up to
                 # a maximum timeout.  But blocking is necessary because os.kill does not immediately
                 # release the lock if we're in force mode.
                 lock_fp = lock_file(filename, "a+", blocking=True)
     if lock_fp in (None, False):
         raise cls.PermissionError(
             'Could not open locked checkpoint: %s, lock_file = %s' %
             (filename, lock_fp))
     writer = ThriftRecordWriter(lock_fp)
     writer.set_sync(True)
     return writer
Example #21
0
def setup_child_subreaping():
    """
    Mark this process as a child subreaper via prctl(2) with `PR_SET_CHILD_SUBREAPER`,
    so that children which need to be reparented are reparented to this process.

    More documentation here: http://man7.org/linux/man-pages/man2/prctl.2.html
    and here: https://lwn.net/Articles/474787/

    Failures (libc not found, prctl missing, non-zero return) are logged and leave
    subreaping disabled; nothing is raised to the caller.

    Callers should reap terminal children to prevent zombies.
    """
    log.debug("Calling prctl(2) with PR_SET_CHILD_SUBREAPER")
    # This constant is taken from prctl.h
    PR_SET_CHILD_SUBREAPER = 36
    try:
        libc_path = ctypes.util.find_library('c')
        if libc_path is None:
            for warning in ("libc is not found. Unable to call prctl!",
                            "Children subreaping is disabled!"):
                log.warning(warning)
            return
        libc = ctypes.CDLL(libc_path, use_errno=True)
        # If we are on a system where prctl doesn't exist, this will throw an
        # attribute error.
        if libc.prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) != 0:
            error_code = ctypes.get_errno()
            raise OSError(error_code, os.strerror(error_code))
    except Exception as e:
        log.error("Unable to call prctl %s" % e)
        log.error("Children subreaping is disabled!")
Example #22
0
    def _apply_states(self):
        """
        os.stat() the corresponding checkpoint stream of this task and determine if there are
        new ckpt records.  Attempt to read those records, dispatch them into the runner state
        and update the high watermark (self._ckpt_head) for that stream.

        Returns True if the high watermark moved, False otherwise -- including when the
        checkpoint file does not exist yet.  Any other OSError is re-raised.
        """
        try:
            ckpt_size = os.stat(self._runner_ckpt).st_size
            if self._ckpt_head >= ckpt_size:
                # Nothing beyond the high watermark.
                return False

            with open(self._runner_ckpt, "r") as fp:
                fp.seek(self._ckpt_head)
                reader = ThriftRecordReader(fp, RunnerCkpt)
                while True:
                    runner_update = reader.try_read()
                    if not runner_update:
                        break
                    try:
                        self._dispatcher.dispatch(self._runnerstate, runner_update)
                    except CheckpointDispatcher.InvalidSequenceNumber as seq_error:
                        log.error("Checkpoint stream is corrupt: %s" % seq_error)
                        break
                new_head = fp.tell()
                moved = self._ckpt_head != new_head
                self._ckpt_head = new_head
            return moved
        except OSError as os_error:
            if os_error.errno != errno.ENOENT:
                raise
            # The log doesn't yet exist, will retry later.
            log.warning("Could not read from checkpoint %s" % self._runner_ckpt)
            return False
Example #23
0
  def launchTask(self, driver, task):
    """
      Invoked when a task has been launched on this executor (initiated via Scheduler::launchTasks).
      Note that this task can be realized with a thread, a process, or some simple computation,
      however, no other callbacks will be invoked on this executor until this callback has returned.

      If a runner already exists the new task is answered with TASK_LOST.  If validate_task
      returns nothing usable, TASK_FAILED is sent and a driver stop is deferred by STOP_WAIT.
      Otherwise self._run is deferred with the assigned task and its mount paths.
    """
    self.launched.set()
    self.log('TaskInfo: %s' % task)
    self.log('launchTask got task: %s:%s' % (task.name, task.task_id.value))

    # TODO(wickman)  Update the tests to call registered(), then remove this line and issue
    # an assert if self._driver is not populated.
    self._driver = driver

    if self._runner:
      log.error('Already running a task! %s' % self._task_id)
      rejection = "Task already running on this executor: %s" % self._task_id
      self.send_update(driver, task.task_id.value, mesos_pb2.TASK_LOST, rejection)
      return

    self._slave_id = task.slave_id.value
    self._task_id = task.task_id.value

    assigned_task = self.validate_task(task)
    self.log("Assigned task: %s" % assigned_task)
    if assigned_task:
      defer(lambda: self._run(driver, assigned_task, self.extract_mount_paths_from_task(task)))
      return

    self.send_update(
        driver,
        self._task_id,
        mesos_pb2.TASK_FAILED,
        'Could not deserialize task.')
    defer(driver.stop, delay=self.STOP_WAIT)
Example #24
0
    def wait_start(self, timeout=MAX_WAIT):
        """Block until the monitor reports the task as active or finished.

        Polls every POLL_INTERVAL for up to `timeout`.  Raises TaskError if the runner
        process dies with a non-zero return code while waiting, or if the task has
        still not started when polling stops (in which case self.lose() is called
        first).
        """
        log.debug('Waiting for task to start.')

        def is_started():
            return self._monitor and (self._monitor.active
                                      or self._monitor.finished)

        elapsed = Amount(0, Time.SECONDS)

        while elapsed < timeout:
            if is_started():
                break

            log.debug('  - sleeping...')
            self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
            elapsed += self.POLL_INTERVAL

            if self.is_alive:
                continue

            if self._popen_rc != 0:
                raise TaskError('Task failed: %s' %
                                self.compute_status().reason)

            # We can end up here if the process exited between the call to Popen and
            # waitpid (in is_alive), which is fine.
            log.info('Task runner exited: %s' %
                     self.compute_status().reason)
            break

        if not is_started():
            log.error('Task did not start with in deadline, forcing loss.')
            self.lose()
            raise TaskError('Task did not start within deadline.')
Example #25
0
  def statusUpdate(self, driver, status):
    """
      Forward a task status update to the launcher that owns the task.

      Under self._lock: an update for a task without a launcher is logged and dropped.  A
      launcher error is logged and triggers self._stop().  The per-state task metrics are then
      incremented, and a launcher that reports itself terminated is deleted.
    """
    with self._lock:
      # Forward the status update to the corresponding launcher.
      task_id = status.task_id.value
      launcher = self._get_launcher_by_task_id(task_id)
      if not launcher:
        log.info("Cluster for task %s doesn't exist. It could have been removed" % task_id)
        return

      try:
        launcher.status_update(status)
      except MySQLClusterLauncher.Error as launcher_error:
        log.error("Status update failed due to launcher error: %s" % launcher_error.message)
        self._stop()

      # Update metrics.
      # TODO(xujyan): This doesn't rule out duplicates, etc. We can consider updating these metrics
      # in the launcher.
      counters_by_state = (
          (mesos_pb2.TASK_FINISHED, 'tasks_finished'),
          (mesos_pb2.TASK_FAILED, 'tasks_failed'),
          (mesos_pb2.TASK_KILLED, 'tasks_killed'),
          (mesos_pb2.TASK_LOST, 'tasks_lost'),
      )
      for terminal_state, counter_name in counters_by_state:
        if status.state == terminal_state:
          getattr(self._metrics, counter_name).increment()
          break

      if launcher.terminated:
        log.info("Deleting the launcher for cluster %s because the cluster has terminated" %
                 launcher.cluster_name)
        self._delete_launcher(launcher)
    def wait_start(self, timeout=MAX_WAIT):
        """Block until the monitor reports the task as active or finished.

        Polls every POLL_INTERVAL for up to `timeout`.  Raises TaskError if the runner
        process dies with a non-zero return code while waiting, or if the task has
        still not started when polling stops (in which case self.lose() is called
        first).
        """
        log.debug("Waiting for task to start.")

        def is_started():
            return self._monitor and (self._monitor.active or self._monitor.finished)

        elapsed = Amount(0, Time.SECONDS)

        while elapsed < timeout:
            if is_started():
                break

            log.debug("  - sleeping...")
            self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
            elapsed += self.POLL_INTERVAL

            if self.is_alive:
                continue

            if self._popen_rc != 0:
                raise TaskError("Task failed: %s" % self.compute_status().reason)

            # We can end up here if the process exited between the call to Popen and
            # waitpid (in is_alive), which is fine.
            log.info("Task runner exited: %s" % self.compute_status().reason)
            break

        if not is_started():
            log.error("Task did not start with in deadline, forcing loss.")
            self.lose()
            raise TaskError("Task did not start within deadline.")
Example #27
0
    def statusUpdate(self, driver, status):
        """Forward a task status update to the launcher that owns the task.

        Under self._lock: an update for a task without a launcher is logged and
        dropped.  A launcher error is logged and triggers self._stop().  The per-state
        task metrics are then incremented, and a launcher that reports itself
        terminated is deleted.
        """
        with self._lock:
            # Forward the status update to the corresponding launcher.
            task_id = status.task_id.value
            launcher = self._get_launcher_by_task_id(task_id)
            if not launcher:
                log.info(
                    "Cluster for task %s doesn't exist. It could have been removed"
                    % task_id)
                return

            try:
                launcher.status_update(status)
            except MySQLClusterLauncher.Error as launcher_error:
                log.error("Status update failed due to launcher error: %s" %
                          launcher_error.message)
                self._stop()

            # Update metrics.
            # TODO(xujyan): This doesn't rule out duplicates, etc. We can consider updating these metrics
            # in the launcher.
            counters_by_state = (
                (mesos_pb2.TASK_FINISHED, 'tasks_finished'),
                (mesos_pb2.TASK_FAILED, 'tasks_failed'),
                (mesos_pb2.TASK_KILLED, 'tasks_killed'),
                (mesos_pb2.TASK_LOST, 'tasks_lost'),
            )
            for terminal_state, counter_name in counters_by_state:
                if status.state == terminal_state:
                    getattr(self._metrics, counter_name).increment()
                    break

            if launcher.terminated:
                log.info(
                    "Deleting the launcher for cluster %s because the cluster has terminated"
                    % launcher.cluster_name)
                self._delete_launcher(launcher)
Example #28
0
    def is_alive(self):
        """
        Is the process underlying the Thermos task runner alive?

        Returns False when no process was started or it was already marked dead.
        Otherwise polls the child with a non-blocking waitpid: a still-running child
        yields True; a reaped child has its signal and return code recorded, is marked
        dead and yields False.  An OSError other than ECHILD is re-raised.
        """
        if not self._popen or self._dead.is_set():
            return False

        # N.B. You cannot mix this code and any code that relies upon os.wait
        # mechanisms with blanket child process collection.  One example is the
        # Thermos task runner which calls os.wait4 -- without refactoring, you
        # should not mix a Thermos task runner in the same process as this
        # thread.
        try:
            pid, status = os.waitpid(self._popen.pid, os.WNOHANG)
            if pid == 0:
                return True
            self._popen_signal, self._popen_rc = self._decode_status(status)
            log.info(
                'Detected runner termination: pid=%s, signal=%s, rc=%s' %
                (pid, self._popen_signal, self._popen_rc))
        except OSError as wait_error:
            log.error('is_alive got OSError: %s' % wait_error)
            if wait_error.errno != errno.ECHILD:
                raise

        self._dead.set()
        return False
  def _shutdown(self, status_result):
    """
      Stop the runner and the chained checker, publish the final task state and
      schedule the driver to stop.

      status_result -- fallback exit status; its `reason` is always attached to
                       the status update that is sent.
    """
    # Snapshot the runner status before anything is stopped.
    last_runner_status = self._runner.status

    stop_targets = (
        ('_runner', 'Failed to stop runner within deadline.'),
        ('_chained_checker', 'Failed to stop all checkers within deadline.'),
    )
    for attr_name, timeout_message in stop_targets:
      try:
        deadline(getattr(self, attr_name).stop, timeout=self.STOP_TIMEOUT)
      except Timeout:
        log.error(timeout_message)

    # If the runner was alive when _shutdown was called, defer to the status_result,
    # otherwise the runner's terminal state is the preferred state.
    final_status = last_runner_status if last_runner_status else status_result
    mesos_state = self.translate_exit_state_to_mesos(final_status.status)

    self.send_update(self._driver, self._task_id, mesos_state, status_result.reason)

    self.terminated.set()
    defer(self._driver.stop, delay=self.PERSISTENCE_WAIT)
Example #30
0
 def get(cls, task_id, checkpoint_root):
     """
     Build a TaskRunner for task_id from its active state under checkpoint_root.

     Returns None when the task JSON file is missing, when the loaded config is
     None or has no tasks, when the checkpoint (or its header) is missing, or
     when reconstructing the runner raises (the error is logged).
     """
     active_path = TaskPath(root=checkpoint_root, task_id=task_id, state="active")
     config_file = active_path.getpath("task_path")
     checkpoint_file = active_path.getpath("runner_checkpoint")
     if not os.path.exists(config_file):
         return None
     loaded = ThermosConfigLoader.load_json(config_file)
     if loaded is None or len(loaded.tasks()) == 0:
         return None
     try:
         restored = CheckpointDispatcher.from_file(checkpoint_file)
         if restored is None or restored.header is None:
             return None
         first_task = loaded.tasks()[0].task()
         header = restored.header
         return cls(
             first_task,
             checkpoint_root,
             header.sandbox,
             log_dir=header.log_dir,
             task_id=task_id,
             portmap=header.ports,
             hostname=header.hostname,
         )
     except Exception as err:
         log.error("Failed to reconstitute checkpoint in TaskRunner.get: %s" % err, exc_info=True)
         return None
Example #31
0
  def select(self):
    """
      Read and multiplex checkpoint records from all the forked off process coordinators.

      Checkpoint records can come from one of two places:
        in-process: checkpoint records synthesized for FORKED and LOST events
        out-of-process: checkpoint records from file descriptors of forked coordinators

      Returns a list of RunnerCkpt objects that were successfully read, or an empty
      list if none were read.
    """
    self._bind_processes()
    collected = []
    for ckpt_fp in (fp for fp in self._processes.values() if fp):
      try:
        size_on_disk = os.fstat(ckpt_fp.fileno()).st_size
      except OSError:
        log.error('Unable to fstat %s!' % ckpt_fp.name)
        continue
      position = ckpt_fp.tell()
      if position > size_on_disk:
        # The read offset is beyond the end of the file.
        log.error('Truncated checkpoint record detected on %s!' % ckpt_fp.name)
      elif position < size_on_disk:
        # Unread bytes are available: drain records until the reader yields nothing.
        reader = ThriftRecordReader(ckpt_fp, RunnerCkpt)
        record = reader.try_read()
        while record:
          collected.append(record)
          record = reader.try_read()
    if collected:
      log.debug('select() returning %s updates:' % len(collected))
      for record in collected:
        log.debug('  = %s' % record)
    return collected
Example #32
0
  def _update_endpoints(self, _1, event, state, _2):
    """Refresh the endpoint list from ZK (used as the ZK watch callback).

    Only acts on a child event in the connected state or on an expired session;
    every other notification is ignored.  This function will block until the ZK
    servers respond or retry limit is hit.

    The registered watcher, if any, is invoked under self._lock with
    (path, old endpoints, new endpoints) before the new list is stored.

    :raises ReconnectFailed: If reconnection fails.
    """
    children_changed = (
        state == zookeeper.CONNECTED_STATE and event == zookeeper.CHILD_EVENT)
    if not (children_changed or state == zookeeper.EXPIRED_SESSION_STATE):
      return

    try:
      refreshed = []
      # Passing this method again re-arms the watch on the endpoint node.
      child_names = self._zk.get_children(self._endpoint, self._update_endpoints)
      child_names.sort()
      for child in child_names:
        payload = self._zk.get(posixpath.join(self._endpoint, child))
        instance = serverset_types.ServiceInstance()
        refreshed.append(codec.deserialize(instance, payload[0]))

      previous = set(map(_format_endpoint, self._endpoints))
      current = set(map(_format_endpoint, refreshed))
      log.debug('ServerSet endpoints at %r changed to: %s' % (self._endpoint, ', '.join(current)))
      log.debug('  Added: %s' % ', '.join(current - previous))
      log.debug('  Removed: %s' % ', '.join(previous - current))

      with self._lock:
        if self._watcher:
          self._watcher(self._endpoint, self._endpoints, refreshed)
        self._endpoints = refreshed
    except ZooKeeper.Error as err:
      log.error('Lost connection to ZooKeeper: %s, reestablishing.' % err)
      self._reconnect()
Example #33
0
  def _run(self, driver, assigned_task, mounted_volume_paths):
    """
      Commence running a Task.
        - Announce TASK_STARTING
        - Initialize the sandbox
        - Build and start the ThermosTaskRunner (fork the Thermos TaskRunner)
        - Start the status manager

      Every failure path returns early; failures building the runner or starting
      the status manager go through self._die with TASK_FAILED.
    """
    self.send_update(driver, self._task_id, mesos_pb2.TASK_STARTING)

    sandbox_ready = self._initialize_sandbox(driver, assigned_task, mounted_volume_paths)
    if not sandbox_ready:
      return

    # start the process on a separate thread and give the message processing thread back
    # to the driver
    try:
      self._runner = self._runner_provider.from_assigned_task(assigned_task, self._sandbox)
    except TaskError as err:
      self.runner_aborted.set()
      self._die(driver, mesos_pb2.TASK_FAILED, str(err))
      return

    if not isinstance(self._runner, TaskRunner):
      self._die(driver, mesos_pb2.TASK_FAILED, 'Unrecognized task!')
      return

    if self._start_runner(driver, assigned_task):
      try:
        self._start_status_manager(driver, assigned_task)
      except Exception:
        log.error(traceback.format_exc())
        self._die(driver, mesos_pb2.TASK_FAILED, "Internal error")
  def _rollback(self, instances_to_rollback, instance_configs):
    """Roll the failed instances back, one batch at a time.

    Does nothing (beyond logging) when rollback_on_failure is disabled in the
    update config.  Instances are processed in descending id order in batches of
    self._update_config.batch_size; instances the watcher reports as failed are
    logged at the end.

    Arguments:
    instances_to_rollback -- instance ids to rollback (sorted in place, descending).
    instance_configs -- instance configuration to use for rollback.
    """
    if not self._update_config.rollback_on_failure:
      log.info('Rollback on failure is disabled in config. Aborting rollback')
      return

    log.info('Reverting update for %s' % instances_to_rollback)
    # Rolling back goes from the local config to the remote config.
    rollback_operation = self.OperationConfigs(
        from_config=instance_configs.local_config_map,
        to_config=instance_configs.remote_config_map
    )
    instances_to_rollback.sort(reverse=True)
    pending = instances_to_rollback
    failures = []
    while pending:
      current_batch = pending[:self._update_config.batch_size]
      pending = sorted(set(pending) - set(current_batch), reverse=True)
      watched = self._update_instances(current_batch, rollback_operation)
      failures.extend(self._watcher.watch(watched))

    if failures:
      log.error('Rollback failed for instances: %s' % failures)
Example #35
0
 def control(self, force=False):
     """
     Bind to the checkpoint associated with this task, position to the end of the log if
     it exists, or create it if it doesn't.  Fails if we cannot get "leadership" i.e. a
     file lock on the checkpoint stream.

     This is a generator with a single yield; the caller's work runs while the
     generator is suspended there.  Exceptions (subclasses of Exception) raised
     into the yield are logged and swallowed.  The checkpoint is closed on the
     way out no matter how the yield is left, including KeyboardInterrupt,
     SystemExit and GeneratorExit.

     Raises:
       self.StateError -- the task is already in a terminal state.
       self.PermissionError -- the checkpoint could not be opened.
     """
     if self.is_terminal():
         raise self.StateError(
             'Cannot take control of a task in terminal state.')
     if self._sandbox:
         safe_mkdir(self._sandbox)
     ckpt_file = self._pathspec.getpath('runner_checkpoint')
     try:
         self._ckpt = TaskRunnerHelper.open_checkpoint(ckpt_file,
                                                       force=force,
                                                       state=self._state)
     except TaskRunnerHelper.PermissionError:
         raise self.PermissionError('Unable to open checkpoint %s' %
                                    ckpt_file)
     log.debug('Flipping recovery mode off.')
     self._recovery = False
     self._set_task_status(self.task_state())
     self._resume_task()
     try:
         yield
     except Exception as e:
         log.error('Caught exception in self.control(): %s' % e)
         log.error('  %s' % traceback.format_exc())
     finally:
         # Always release the checkpoint, even for BaseExceptions that the
         # handler above does not catch.
         self._ckpt.close()
Example #36
0
 def get(cls, task_id, checkpoint_root):
     """
     Build a TaskRunner for task_id from its active state under checkpoint_root.

     Returns None when the task JSON file is missing, when the loaded config is
     None or has no tasks, when the checkpoint (or its header) is missing, or
     when reconstructing the runner raises (the error is logged).
     """
     active_path = TaskPath(root=checkpoint_root, task_id=task_id, state='active')
     config_file = active_path.getpath('task_path')
     checkpoint_file = active_path.getpath('runner_checkpoint')
     if not os.path.exists(config_file):
         return None
     loaded = ThermosConfigLoader.load_json(config_file)
     if loaded is None or len(loaded.tasks()) == 0:
         return None
     try:
         restored = CheckpointDispatcher.from_file(checkpoint_file)
         if restored is None or restored.header is None:
             return None
         first_task = loaded.tasks()[0].task()
         header = restored.header
         return cls(first_task,
                    checkpoint_root,
                    header.sandbox,
                    log_dir=header.log_dir,
                    task_id=task_id,
                    portmap=header.ports,
                    hostname=header.hostname)
     except Exception as err:
         log.error(
             'Failed to reconstitute checkpoint in TaskRunner.get: %s' % err,
             exc_info=True)
         return None
Example #37
0
  def _initialize_ckpt_header(self):
    """
      Initializes the RunnerHeader for this checkpoint stream if it has not already
      been constructed.
    """
    if self._state.header is None:
      try:
        uid = pwd.getpwnam(self._user).pw_uid
      except KeyError:
        # This will cause failures downstream, but they will at least be correctly
        # reflected in the process state.
        log.error('Unknown user %s.' % self._user)
        uid = None

      header = RunnerHeader(
          task_id=self._task_id,
          launch_time_ms=int(self._launch_time * 1000),
          sandbox=self._sandbox,
          log_dir=self._log_dir,
          hostname=self._hostname,
          user=self._user,
          uid=uid,
          ports=self._portmap)
      runner_ckpt = RunnerCkpt(runner_header=header)
      self._dispatcher.dispatch(self._state, runner_ckpt)
 def compute_status(self):
   """
     Return None while self.is_alive is truthy; otherwise return the entry of
     EXIT_STATE_MAP for the current task state (None, with an error logged,
     when the state has no mapping).
   """
   if not self.is_alive:
     mapped_state = self.EXIT_STATE_MAP.get(self.task_state())
     if mapped_state is None:
       log.error('Received unexpected exit state from TaskMonitor.')
     return mapped_state
   return None
Example #39
0
  def validate_quota_from_requested(self, job_key, production, released, acquired):
    """Validates requested change will not exceed the available quota.

    Non-production requests always pass.  For production requests the role's
    quota is fetched from the scheduler and the check passes when
    allocated - consumed - (acquired - released) is still valid.

    Arguments:
    job_key -- job key.
    production -- production flag.
    released -- production CapacityRequest to be released (in case of job update).
    acquired -- production CapacityRequest to be acquired.

    Returns: a Response with ResponseCode.OK if the check is successful, the
    scheduler's own Response if the quota lookup failed, or a Response with
    ResponseCode.INVALID_REQUEST if the quota would be exceeded.
    """
    success = Response(responseCode=ResponseCode.OK, messageDEPRECATED='Quota check successful.')
    if not production:
      return success

    quota_resp = self._scheduler.getQuota(job_key.role)
    if quota_resp.responseCode != ResponseCode.OK:
      log.error('Failed to get quota from scheduler: %s' % quota_resp.messageDEPRECATED)
      return quota_resp

    quota_result = quota_resp.result.getQuotaResult
    allocated = CapacityRequest(quota_result.quota)
    consumed = CapacityRequest(quota_result.prodConsumption)
    requested = acquired - released
    remaining = allocated - consumed - requested

    if remaining.valid():
      return success

    log.info('Not enough quota to create/update job.')
    print_quota(allocated.quota(), 'Total allocated quota', job_key.role)
    print_quota(consumed.quota(), 'Consumed quota', job_key.role)
    print_quota(requested.quota(), 'Requested', job_key.name)
    return Response(
        responseCode=ResponseCode.INVALID_REQUEST,
        messageDEPRECATED='Failed quota check.')
Example #40
0
  def _run_task(self, task):
    """
      Start the task runner, wait for it to exit and report the outcome.

      TASK_RUNNING is sent after a successful start, TASK_FAILED when start
      raises TaskError and TASK_LOST for any other start error.  The runner is
      then joined regardless; afterwards TASK_KILLED is sent when self._killed is
      set and TASK_FAILED otherwise, and self._kill() is called.
    """
    assert self._runner, "_runner should be created before this method is called"

    try:
      self._runner.start()
      log.info("Task runner for task %s started" % task.task_id)
      self._send_update(task.task_id.value, mesos_pb2.TASK_RUNNING)
    except TaskError as start_error:
      log.error("Task runner for task %s failed to start: %s" % (task.task_id, str(start_error)))
      # Send TASK_FAILED if the task failed to start.
      self._send_update(task.task_id.value, mesos_pb2.TASK_FAILED)
    except Exception as unexpected:
      log.error("Error occurred while executing the task: %s" % unexpected)
      log.error(traceback.format_exc())
      # Send TASK_LOST for unknown errors.
      self._send_update(task.task_id.value, mesos_pb2.TASK_LOST)

    # Wait for the task's return code (when it terminates).
    try:
      exit_code = self._runner.join()
      # Regardless of the return code, if '_runner' terminates, it failed!
      log.error("Task process terminated with return code %s" % exit_code)
    except TaskError as join_error:
      log.error("Task terminated: %s" % join_error)

    final_state = mesos_pb2.TASK_KILLED if self._killed else mesos_pb2.TASK_FAILED
    self._send_update(task.task_id.value, final_state)

    self._kill()
  def is_alive(self):
    """
      Report whether the process behind the Thermos task runner is still running.

      Returns False straight away when no process was started or when it has
      already been marked dead.  Otherwise the child is polled with a
      non-blocking waitpid: True while it is still running, False (after setting
      the dead flag) once it has exited.  ECHILD from waitpid is treated as
      "dead"; any other OSError is logged and re-raised.
    """
    if not self._popen or self._dead.is_set():
      return False

    # N.B. You cannot mix this code and any code that relies upon os.wait
    # mechanisms with blanket child process collection.  One example is the
    # Thermos task runner which calls os.wait4 -- without refactoring, you
    # should not mix a Thermos task runner in the same process as this
    # thread.
    try:
      reaped_pid, _ = os.waitpid(self._popen.pid, os.WNOHANG)
      if reaped_pid == 0:
        # WNOHANG reports pid 0 while the child has not changed state.
        return True
      log.info('Detected runner termination: pid=%s' % reaped_pid)
    except OSError as err:
      log.error('is_alive got OSError: %s' % err)
      if err.errno != errno.ECHILD:
        raise

    self._dead.set()
    return False
Example #42
0
  def _initialize_ckpt_header(self):
    """
      Initializes the RunnerHeader for this checkpoint stream if it has not already
      been constructed.
    """
    if self._state.header is None:
      try:
        uid = pwd.getpwnam(self._user).pw_uid
      except KeyError:
        # This will cause failures downstream, but they will at least be correctly
        # reflected in the process state.
        log.error('Unknown user %s.', self._user)
        uid = None

      header = RunnerHeader(
          task_id=self._task_id,
          launch_time_ms=int(self._launch_time * 1000),
          sandbox=self._sandbox,
          log_dir=self._log_dir,
          hostname=self._hostname,
          user=self._user,
          uid=uid,
          ports=self._portmap)
      runner_ckpt = RunnerCkpt(runner_header=header)
      self._dispatcher.dispatch(self._state, runner_ckpt)
Example #43
0
    def launchTask(self, driver, task):
        """
        Invoked when a task has been launched on this executor (initiated via Scheduler::launchTasks).
        Note that this task can be realized with a thread, a process, or some simple computation,
        however, no other callbacks will be invoked on this executor until this callback has returned.

        A second launch while a runner already exists is answered with TASK_LOST.
        A task that fails validation is answered with TASK_FAILED and the driver
        is scheduled to stop; otherwise the actual run is deferred.
        """
        self.launched.set()
        self.log('launchTask got task: %s:%s' % (task.name, task.task_id.value))

        # TODO(wickman)  Update the tests to call registered(), then remove this line and issue
        # an assert if self._driver is not populated.
        self._driver = driver

        if self._runner:
            log.error('Already running a task! %s' % self._task_id)
            already_running = "Task already running on this executor: %s" % self._task_id
            self.send_update(driver, task.task_id.value, mesos_pb2.TASK_LOST, already_running)
            return

        self._slave_id = task.slave_id.value
        self._task_id = task.task_id.value

        deserialized = self.validate_task(task)
        if deserialized:
            defer(lambda: self._run(driver, deserialized))
            return

        self.send_update(driver, self._task_id, mesos_pb2.TASK_FAILED,
                         'Could not deserialize task.')
        defer(driver.stop, delay=self.STOP_WAIT)
Example #44
0
    def _rollback(self, instances_to_rollback, instance_configs):
        """Roll the failed instances back, one batch at a time.

        Instances are processed in descending id order in batches of
        self._update_config.batch_size; instances the watcher reports as failed
        are logged at the end.

        Arguments:
        instances_to_rollback -- instance ids to rollback (sorted in place, descending).
        instance_configs -- instance configuration to use for rollback.
        """
        log.info('Reverting update for %s' % instances_to_rollback)
        # Rolling back goes from the local config to the remote config.
        rollback_operation = self.OperationConfigs(
            from_config=instance_configs.local_config_map,
            to_config=instance_configs.remote_config_map)
        instances_to_rollback.sort(reverse=True)
        pending = instances_to_rollback
        failures = []
        while pending:
            current_batch = pending[:self._update_config.batch_size]
            pending = sorted(set(pending) - set(current_batch), reverse=True)
            watched = self._update_instances(current_batch, rollback_operation)
            failures.extend(self._watcher.watch(watched))

        if failures:
            log.error('Rollback failed for instances: %s' % failures)
Example #45
0
 def control(self, force=False):
   """
     Bind to the checkpoint associated with this task, position to the end of the log if
     it exists, or create it if it doesn't.  Fails if we cannot get "leadership" i.e. a
     file lock on the checkpoint stream.

     This is a generator with a single yield; the caller's work runs while the
     generator is suspended there.  Exceptions (subclasses of Exception) raised
     into the yield are logged and swallowed.  The checkpoint is closed on the
     way out no matter how the yield is left, including KeyboardInterrupt,
     SystemExit and GeneratorExit.

     Raises:
       self.StateError -- the task is already in a terminal state.
       self.PermissionError -- the checkpoint could not be opened.
   """
   if self.is_terminal():
     raise self.StateError('Cannot take control of a task in terminal state.')
   if self._sandbox:
     safe_mkdir(self._sandbox)
   ckpt_file = self._pathspec.getpath('runner_checkpoint')
   try:
     self._ckpt = TaskRunnerHelper.open_checkpoint(ckpt_file, force=force, state=self._state)
   except TaskRunnerHelper.PermissionError:
     raise self.PermissionError('Unable to open checkpoint %s' % ckpt_file)
   log.debug('Flipping recovery mode off.')
   self._recovery = False
   self._set_task_status(self.task_state())
   self._resume_task()
   try:
     yield
   except Exception as e:
     log.error('Caught exception in self.control(): %s', e)
     log.error('  %s', traceback.format_exc())
   finally:
     # Always release the checkpoint, even for BaseExceptions that the
     # handler above does not catch.
     self._ckpt.close()
    def addattachment(self, page, filename):
        """Attach the file `filename` to the existing page `page`.

        Note: this will first read the entire file into memory.

        Returns whatever the API's addAttachment call returns, or None when the
        file could not be read or the API call raised XMLRPCError (both are
        logged).  Raises ConfluenceError when no MIME type can be guessed for
        the file name.
        """
        guessed_type, _ = mimetypes.guess_type(filename, strict=False)
        if not guessed_type:
            raise ConfluenceError('Failed to detect MIME type of %s' % filename)

        try:
            with open(filename, 'rb') as stream:
                payload = stream.read()

            metadata = {'fileName': basename(filename), 'contentType': guessed_type}
            return self._api_entrypoint.addAttachment(
                self._session_token, page['id'], metadata, Binary(payload))
        except (IOError, OSError) as err:
            log.error('Failed to read data from file %s: %s' % (filename, str(err)))
            return None
        except XMLRPCError:
            page_title = page.get('title', '[unknown title]')
            log.error('Failed to add file attachment %s to page: %s' % (filename, page_title))
            return None
Example #47
0
  def _apply_states(self):
    """
      os.stat() the corresponding checkpoint stream of this task and determine if there are new ckpt
      records.  Attempt to read those records and update the high watermark for that stream.
      Returns True if new states were applied, False otherwise.
    """
    ckpt_offset = None
    try:
      ckpt_offset = os.stat(self._runner_ckpt).st_size

      updated = False
      if self._ckpt_head < ckpt_offset:
        with open(self._runner_ckpt, 'r') as fp:
          fp.seek(self._ckpt_head)
          rr = ThriftRecordReader(fp, RunnerCkpt)
          while True:
            runner_update = rr.try_read()
            if not runner_update:
              break
            try:
              self._dispatcher.dispatch(self._runnerstate, runner_update)
            except CheckpointDispatcher.InvalidSequenceNumber as e:
              log.error('Checkpoint stream is corrupt: %s' % e)
              break
          new_ckpt_head = fp.tell()
          updated = self._ckpt_head != new_ckpt_head
          self._ckpt_head = new_ckpt_head
      return updated
    except OSError as e:
      if e.errno == errno.ENOENT:
        # The log doesn't yet exist, will retry later.
        log.warning('Could not read from checkpoint %s' % self._runner_ckpt)
        return False
      else:
        raise
Example #48
0
    def _check_sla(self, hostnames, grouping_function, percentage, duration):
        """Check if the provided list of hosts passes the job uptime SLA check.

        This is an all-or-nothing check, meaning that all provided hosts must pass their job
        SLA check for the maintenance to proceed.  If the probe returns more than one
        group, every provided host is reported as unsafe.

        :param hostnames: list of host names to check SLA for
        :type hostnames: list of strings
        :param grouping_function: grouping function to apply to the given hosts
        :type grouping_function: function
        :param percentage: SLA uptime percentage override
        :type percentage: float
        :param duration: SLA uptime duration override
        :type duration: twitter.common.quantity.Amount
        :rtype: set of unsafe hosts
        """
        vector = self._client.sla_get_safe_domain_vector(self.SLA_MIN_JOB_INSTANCE_COUNT, hostnames)
        probed_groups = vector.probe_hosts(percentage, duration.as_(Time.SECONDS), grouping_function)

        if not probed_groups:
            return set()

        # Given that maintenance is performed 1 group at a time, any result longer than 1 group
        # should be considered a batch failure.
        if len(probed_groups) > 1:
            log.error("Illegal multiple groups detected in SLA results. Skipping hosts: %s" % hostnames)
            return set(hostnames)

        report, unsafe = format_sla_results(probed_groups, unsafe_only=True)
        if report:
            print_results(report)
        return unsafe
Example #49
0
    def method_wrapper(*args):
      """Look up `method_name` on the current client and call it, retrying on transient failures.

      `self` and `method_name` come from the enclosing scope.  self._lock is held
      for the whole call, including retries and waits.

      Returns the attribute itself when it is not callable, otherwise the RPC
      response.  Returns None (falls through) when self._terminating is set
      before a response is obtained.

      Raises:
        self.AuthError -- the transport reported an authentication error.
        self.ThriftInternalError -- any other unexpected error during the call.
        self.TimeoutError -- no response within RPC_MAXIMUM_WAIT.
      """
      with self._lock:
        start = time.time()
        # Keep trying until termination is requested or the overall wait budget is spent.
        while not self._terminating.is_set() and (
            time.time() - start) < self.RPC_MAXIMUM_WAIT.as_(Time.SECONDS):

          try:
            method = getattr(self.client(), method_name)
            if not callable(method):
              return method
            resp = method(*args)
            # A transient error response is turned into an exception so that the
            # retry handler below deals with it.
            if resp is not None and resp.responseCode == ResponseCode.ERROR_TRANSIENT:
              raise self.TransientError(", ".join(
                  [m.message for m in resp.details] if resp.details else []))
            return resp
          except TRequestsTransport.AuthError as e:
            # Authentication failures are not retried.
            log.error(self.scheduler_client().get_failed_auth_message())
            raise self.AuthError(e)
          except (TTransport.TTransportException, self.TimeoutError, self.TransientError) as e:
            # Retryable: invalidate the client, wait for the retry interval (or
            # until termination is signalled), then loop again.
            if not self._terminating.is_set():
              log.warning('Connection error with scheduler: %s, reconnecting...' % e)
              self.invalidate()
              self._terminating.wait(self.RPC_RETRY_INTERVAL.as_(Time.SECONDS))
          except Exception as e:
            # Take any error that occurs during the RPC call, and transform it
            # into something clients can handle.
            # While terminating, the error is swallowed instead.
            if not self._terminating.is_set():
              raise self.ThriftInternalError("Error during thrift call %s to %s: %s" %
                                            (method_name, self.cluster.name, e))
        # The loop ended without returning: report a timeout unless we are terminating.
        if not self._terminating.is_set():
          raise self.TimeoutError('Timed out attempting to issue %s to %s' % (
              method_name, self.cluster.name))
Example #50
0
 def safe_signal(cls, pid, sig=signal.SIGTERM):
   """
     Send `sig` to `pid` without ever raising an Exception.

     ESRCH and EPERM errors from os.kill are ignored silently; any other
     Exception is logged.
   """
   ignorable_errnos = (errno.ESRCH, errno.EPERM)
   try:
     os.kill(pid, sig)
   except Exception as err:
     if isinstance(err, OSError) and err.errno in ignorable_errnos:
       return
     log.error('Unexpected error in os.kill: %s' % err)
Example #51
0
 def safe_signal(cls, pid, sig=signal.SIGTERM):
   """
     Send `sig` to `pid` without ever raising an Exception.

     ESRCH and EPERM errors from os.kill are ignored silently; any other
     Exception is logged.
   """
   ignorable_errnos = (errno.ESRCH, errno.EPERM)
   try:
     os.kill(pid, sig)
   except Exception as err:
     if isinstance(err, OSError) and err.errno in ignorable_errnos:
       return
     log.error('Unexpected error in os.kill: %s' % err)
 def resolve(self):
     """
     Yield the tasks of every job in self._jobs, one query per job.

     A job whose query does not come back with ResponseCode.OK is logged and
     skipped.
     """
     for job_name in self._jobs:
         job_query = self.query_from(self._role, self._env, job_name)
         response = self._api.query(job_query)
         if response.responseCode == ResponseCode.OK:
             for scheduled_task in response.result.scheduleStatusResult.tasks:
                 yield scheduled_task
         else:
             log.error("Failed to query job: %s" % job_name)
Example #53
0
def delete(j, name):
    """
    Delete the job called `name` through `j`.

    A JenkinsAPIException raised by the call is logged instead of propagated.
    """
    try:
        j.delete_job(name)
    except JenkinsAPIException as err:
        failure = "error deleting job: %s" % err
        log.error(failure)
Example #54
0
def delete(j, name):
  """
  Delete the job called `name` through `j`.

  A JenkinsAPIException raised by the call is logged instead of propagated.
  """
  try:
    j.delete_job(name)
  except JenkinsAPIException as err:
    failure = "error deleting job: %s" % err
    log.error(failure)
Example #55
0
 def connect(self):
     """
     Open a TCP connection to (self.host, self.port).

     Returns the connected socket, or None when creating or connecting the
     socket fails.  The failure is logged, and a socket that was created but
     could not be connected is closed so its descriptor is not leaked.
     """
     sock = None
     try:
         sock = socket.socket()
         sock.connect((self.host, self.port))
         return sock
     except Exception as _e:
         log.error("Cannot connect to Graphite Sink with config:%s\n%s" %
                   (self.config, str(_e)))
         if sock is not None:
             # Do not wait for garbage collection to release the descriptor.
             sock.close()
         return None
Example #56
0
 def resolve(self):
     """
     Yield the tasks of every job in self._jobs, one query per job.

     A job whose query does not come back with ResponseCode.OK is logged and
     skipped.
     """
     for job_name in self._jobs:
         job_query = self.query_from(self._role, self._env, job_name)
         response = self._api.query(job_query)
         if response.responseCode == ResponseCode.OK:
             for scheduled_task in response.result.scheduleStatusResult.tasks:
                 yield scheduled_task
         else:
             log.error('Failed to query job: %s' % job_name)