Example #1
0
 def on_finalizing(self, task_update):
   log.debug('Task on_finalizing(%s)', task_update)
   if not self._runner._recovery:
     self._runner._kill()
   self._runner._plan = self._runner._finalizing_plan
   if self._runner._finalization_start is None:
     self._runner._finalization_start = task_update.timestamp_ms / 1000.0
Example #2
0
 def _spawn(self, cmd, **subprocess_args):
   with self._maybe_scrubbed_env():
     log.debug('Executing: %s' % ' '.join(cmd))
     try:
       return subprocess.Popen(cmd, cwd=self._buildroot, **subprocess_args)
     except OSError as e:
       raise self.Error('Problem executing %s: %s' % (self._distribution.java, e))
Example #3
0
 def on_active(self, task_update):
   log.debug('Task on_active(%s)', task_update)
   self._runner._plan = self._runner._regular_plan
   if self._runner._recovery:
     return
   TaskRunnerHelper.initialize_task(self._pathspec,
       ThermosTaskWrapper(self._runner._task).to_json())
Example #4
0
  def is_process_lost(self, process_name):
    """Determine whether or not we should mark a task as LOST and do so if necessary."""
    current_run = self._current_process_run(process_name)
    if not current_run:
      raise self.InternalError('No current_run for process %s!' % process_name)

    def forked_but_never_came_up():
      return current_run.state == ProcessState.FORKED and (
        self._clock.time() - current_run.fork_time > self.LOST_TIMEOUT.as_(Time.SECONDS))

    def running_but_coordinator_died():
      if current_run.state != ProcessState.RUNNING:
        return False
      coordinator_pid, _, _ = TaskRunnerHelper.scan_process(self.state, process_name)
      if coordinator_pid is not None:
        return False
      elif self._watcher.has_data(process_name):
        return False
      return True

    if forked_but_never_came_up() or running_but_coordinator_died():
      log.info('Detected a LOST task: %s', current_run)
      log.debug('  forked_but_never_came_up: %s', forked_but_never_came_up())
      log.debug('  running_but_coordinator_died: %s', running_but_coordinator_died())
      return True

    return False
Example #5
0
  def __init__(
      self,
      task_id,
      task_monitor,
      disk_collector_provider=DiskCollectorProvider(),
      process_collection_interval=PROCESS_COLLECTION_INTERVAL,
      disk_collection_interval=DiskCollectorSettings.DISK_COLLECTION_INTERVAL,
      history_time=HISTORY_TIME,
      history_provider=HistoryProvider()):

    """
      task_id: ID of the task whose resources should be monitored
      task_monitor: TaskMonitor object specifying the task whose resources should be monitored
    """
    self._task_monitor = task_monitor  # exposes PIDs, sandbox
    self._task_id = task_id
    log.debug('Initialising resource collection for task %s', self._task_id)
    self._process_collectors = dict()  # ProcessStatus => ProcessTreeCollector

    self._disk_collector_provider = disk_collector_provider
    self._disk_collector = None
    self._process_collection_interval = process_collection_interval.as_(Time.SECONDS)
    self._disk_collection_interval = disk_collection_interval.as_(Time.SECONDS)
    min_collection_interval = min(self._process_collection_interval, self._disk_collection_interval)
    self._history = history_provider.provides(history_time, min_collection_interval)
    self._kill_signal = threading.Event()
    ExceptionalThread.__init__(self, name='%s[%s]' % (self.__class__.__name__, task_id))
    self.daemon = True
Example #6
0
    def wait_start(self, timeout=MAX_WAIT):
        log.debug("Waiting for task to start.")

        def is_started():
            return self._monitor and (self._monitor.active or self._monitor.finished)

        waited = Amount(0, Time.SECONDS)

        while waited < timeout:
            if not is_started():
                log.debug("  - sleeping...")
                self._clock.sleep(self.POLL_INTERVAL.as_(Time.SECONDS))
                waited += self.POLL_INTERVAL
            else:
                break

            if not self.is_alive:
                if self._popen_rc != 0:
                    raise TaskError("Task failed: %s" % self.compute_status().reason)
                else:
                    # We can end up here if the process exited between the call to Popen and
                    # waitpid (in is_alive), which is fine.
                    log.info("Task runner exited: %s" % self.compute_status().reason)
                    break

        if not is_started():
            log.error("Task did not start with in deadline, forcing loss.")
            self.lose()
            raise TaskError("Task did not start within deadline.")
Example #7
0
  def _await_nailgun_server(self, workunit):
    nailgun_timeout_seconds = 5
    max_socket_connect_attempts = 10
    nailgun = None
    port_parse_start = time.time()
    with _safe_open(self._ng_out, 'r') as ng_out:
      while not nailgun:
        started = ng_out.readline()
        if started:
          port = self._parse_nailgun_port(started)
          with open(self._pidfile, 'a') as pidfile:
            pidfile.write(':%d\n' % port)
          nailgun = self._create_ngclient(port, workunit)
          log.debug('Detected ng server up on port %d' % port)
        elif time.time() - port_parse_start > nailgun_timeout_seconds:
          raise NailgunError('Failed to read ng output after %s seconds' % nailgun_timeout_seconds)

    attempt = 0
    while nailgun:
      sock = nailgun.try_connect()
      if sock:
        sock.close()
        log.info('Connected to ng server pid: %d @ port: %d' % self._get_nailgun_endpoint())
        return nailgun
      elif attempt > max_socket_connect_attempts:
        raise NailgunError('Failed to connect to ng output after %d connect attempts'
                            % max_socket_connect_attempts)
      attempt += 1
      log.debug('Failed to connect on attempt %d' % attempt)
      time.sleep(0.1)
Example #8
0
  def flush(self):
    if self.isOpen():
      self.close()

    self.open()

    data = self.__wbuf.getvalue()
    self.__wbuf = BytesIO()

    self._session.headers['Content-Type'] = 'application/x-thrift'
    self._session.headers['Content-Length'] = str(len(data))
    self._session.headers['Host'] = self.__urlparse.hostname

    response = None
    try:
      response = self._session.post(
          self.__uri,
          data=data,
          timeout=self.__timeout,
          auth=self.__auth)
      response.raise_for_status()
    except request_exceptions.Timeout:
      raise TTransportException(
          type=TTransportException.TIMED_OUT,
          message='Timed out talking to %s' % self.__uri)
    except request_exceptions.RequestException as e:
      if response:
        log.debug('Error connecting, logging response headers:')
        for field_name, field_value in response.headers.items():
          log.debug('  %s: %s' % (field_name, field_value))
      raise TTransportException(
          type=TTransportException.UNKNOWN,
          message='Unknown error talking to %s: %s' % (self.__uri, e))

    self.__rbuf = BytesIO(response.content)
Example #9
0
 def _get_process_resource_consumption(self, task_id, process_name):
   if task_id not in self.active_tasks:
     log.debug("Task %s not found in active tasks" % task_id)
     return ProcessSample.empty().to_dict()
   sample = self.active_tasks[task_id].resource_monitor.sample_by_process(process_name).to_dict()
   log.debug('Resource consumption (%s, %s) => %s' % (task_id, process_name, sample))
   return sample
Example #10
0
def select_binary(base_path, version, name, config=None):
  """Selects a binary matching the current os and architecture.

  :raises: :class:`pants.binary_util.BinaryUtil.BinaryNotFound` if no binary of the given version
    and name could be found.
  """
  # TODO(John Sirois): finish doc of the path structure expected under base_path
  config = config or Config.load()
  bootstrap_dir = config.getdefault('pants_bootstrapdir')

  binary_path = select_binary_base_path(base_path, version, name)
  bootstrapped_binary_path = os.path.join(bootstrap_dir, binary_path)
  if not os.path.exists(bootstrapped_binary_path):
    downloadpath = bootstrapped_binary_path + '~'
    try:
      with select_binary_stream(base_path, version, name, config) as stream:
        with safe_open(downloadpath, 'wb') as bootstrapped_binary:
          bootstrapped_binary.write(stream())
        os.rename(downloadpath, bootstrapped_binary_path)
        chmod_plus_x(bootstrapped_binary_path)
    finally:
      safe_delete(downloadpath)

  log.debug('Selected {binary} binary bootstrapped to: {path}'
            .format(binary=name, path=bootstrapped_binary_path))
  return bootstrapped_binary_path
Example #11
0
  def genlang(self, lang, targets):
    bases, sources = self._calculate_sources(targets)

    if lang == 'java':
      safe_mkdir(self.java_out)
      gen = '--java_out=%s' % self.java_out
    elif lang == 'python':
      safe_mkdir(self.py_out)
      gen = '--python_out=%s' % self.py_out
    else:
      raise TaskError('Unrecognized protobuf gen lang: %s' % lang)

    args = [
      self.protobuf_binary,
      gen
    ]

    for base in bases:
      args.append('--proto_path=%s' % base)

    args.extend(sources)
    log.debug('Executing: %s' % ' '.join(args))
    process = subprocess.Popen(args)
    result = process.wait()
    if result != 0:
      raise TaskError
Example #12
0
 def run(self):
   log.debug('Health checker thread started.')
   self._clock.sleep(self._initial_interval)
   log.debug('Initial interval expired.')
   while not self._dead.is_set():
     self._maybe_update_failure_count(*self._checker())
     self._clock.sleep(self._interval)
Example #13
0
  def _update_instances_in_parallel(self, target, instances_to_update):
    """Processes instance updates in parallel and waits for completion.

    Arguments:
    target -- target method to handle instance update.
    instances_to_update -- list of InstanceData with update details.

    Returns Queue with non-updated instance data.
    """
    log.info('Processing in parallel with %s worker thread(s)' % self._update_config.batch_size)
    instance_queue = Queue()
    for instance_to_update in instances_to_update:
      instance_queue.put(instance_to_update)

    try:
      threads = []
      for _ in range(self._update_config.batch_size):
        threads.append(spawn_worker(target, kwargs={'instance_queue': instance_queue}))

      for thread in threads:
        thread.join_and_raise()
    except Exception as e:
      log.debug('Caught unhandled exception: %s' % e)
      self._terminate()
      raise

    return instance_queue
Example #14
0
  def _construct_scheduler(self):
    """
      Populates:
        self._scheduler_client
        self._client
    """
    self._scheduler_client = SchedulerClient.get(self.cluster, verbose=self.verbose)
    assert self._scheduler_client, "Could not find scheduler (cluster = %s)" % self.cluster.name
    start = time.time()
    while (time.time() - start) < self.CONNECT_MAXIMUM_WAIT.as_(Time.SECONDS):
      try:
        # this can wind up generating any kind of error, because it turns into
        # a call to a dynamically set authentication module.
        self._client = self._scheduler_client.get_thrift_client()
        break
      except SchedulerClient.CouldNotConnect as e:
        log.warning('Could not connect to scheduler: %s' % e)
      except Exception as e:
        # turn any auth module exception into an auth error.
        log.debug('Warning: got an unknown exception during authentication:')
        log.debug(traceback.format_exc())
        raise self.AuthenticationError('Error connecting to scheduler: %s' % e)
    if not self._client:
      raise self.TimeoutError('Timed out trying to connect to scheduler at %s' % self.cluster.name)

    server_version = self._client.getVersion().result.getVersionResult
    if server_version != CURRENT_API_VERSION:
      raise self.APIVersionError("Client Version: %s, Server Version: %s" %
                                 (CURRENT_API_VERSION, server_version))
Example #15
0
    def sample(self):
        """ Collate and aggregate ProcessSamples for process and children
        Returns None: result is stored in self.value
    """
        try:
            last_sample, last_stamp = self._sample, self._stamp
            if self._process is None:
                self._process = Process(self._pid)
            parent = self._process
            parent_sample = process_to_sample(parent)
            new_samples = dict((proc.pid, process_to_sample(proc)) for proc in parent.get_children(recursive=True))
            new_samples[self._pid] = parent_sample

        except PsutilError as e:
            log.warning("Error during process sampling: %s" % e)
            self._sample = ProcessSample.empty()
            self._rate = 0.0

        else:
            last_stamp = self._stamp
            self._stamp = time()
            # for most stats, calculate simple sum to aggregate
            self._sample = sum(new_samples.values(), ProcessSample.empty())
            # cpu consumption is more complicated
            # We require at least 2 generations of a process before we can calculate rate, so for all
            # current processes that were not running in the previous sample, compare to an empty sample
            if self._sampled_tree and last_stamp:
                new = new_samples.values()
                old = [self._sampled_tree.get(pid, ProcessSample.empty()) for pid in new_samples.keys()]
                new_user_sys = sum(map(attrgetter("user"), new)) + sum(map(attrgetter("system"), new))
                old_user_sys = sum(map(attrgetter("user"), old)) + sum(map(attrgetter("system"), old))
                self._rate = (new_user_sys - old_user_sys) / (self._stamp - last_stamp)
                log.debug("Calculated rate for pid=%s and children: %s" % (self._process.pid, self._rate))
            self._sampled_tree = new_samples
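The rate computation above sums user+system CPU time across the whole process tree for two consecutive generations and divides the delta by the elapsed wall-clock time. A minimal, self-contained sketch of that arithmetic (using a hypothetical Sample tuple rather than the actual ProcessSample class):

from collections import namedtuple

# Hypothetical, simplified stand-in for ProcessSample: only the fields the rate needs.
Sample = namedtuple('Sample', ['user', 'system'])

def cpu_rate(old_samples, new_samples, old_stamp, new_stamp):
  # Aggregate user+system CPU seconds across the process tree for each generation,
  # then divide the delta by the elapsed wall-clock time.
  new_user_sys = sum(s.user + s.system for s in new_samples)
  old_user_sys = sum(s.user + s.system for s in old_samples)
  return (new_user_sys - old_user_sys) / (new_stamp - old_stamp)

# 0.9 CPU-seconds consumed over a 2-second window => a rate of 0.45 cores.
print(cpu_rate([Sample(1.0, 0.5)], [Sample(1.6, 0.8)], 10.0, 12.0))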
Example #16
0
def setup_child_subreaping():
  """
  This uses the prctl(2) syscall to set the `PR_SET_CHILD_SUBREAPER` flag. This
  means if any children processes need to be reparented, they will be reparented
  to this process.

  More documentation here: http://man7.org/linux/man-pages/man2/prctl.2.html
  and here: https://lwn.net/Articles/474787/

  Callers should reap terminal children to prevent zombies.
  """
  log.debug("Calling prctl(2) with PR_SET_CHILD_SUBREAPER")
  # This constant is taken from prctl.h
  PR_SET_CHILD_SUBREAPER = 36
  try:
    library_name = ctypes.util.find_library('c')
    if library_name is None:
      log.warning("libc is not found. Unable to call prctl!")
      log.warning("Children subreaping is disabled!")
      return
    libc = ctypes.CDLL(library_name, use_errno=True)
    # If we are on a system where prctl doesn't exist, this will throw an
    # attribute error.
    ret = libc.prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0)
    if ret != 0:
      errno = ctypes.get_errno()
      raise OSError(errno, os.strerror(errno))
  except Exception as e:
    log.error("Unable to call prctl %s" % e)
    log.error("Children subreaping is disabled!")
Example #17
0
  def run(self):
    """
      The internal thread for the observer.  This periodically polls the
      checkpoint root for new tasks, or transitions of tasks from active to
      finished state.
    """
    while not self._stop_event.is_set():
      time.sleep(self.POLLING_INTERVAL.as_(Time.SECONDS))

      active_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='active')]
      finished_tasks = [task_id for _, task_id in self._detector.get_task_ids(state='finished')]

      with self.lock:

        # Ensure all tasks currently detected on the system are observed appropriately
        for active in active_tasks:
          if active not in self.active_tasks:
            log.debug('task_id %s (unknown) -> active' % active)
            self.add_active_task(active)
        for finished in finished_tasks:
          if finished in self.active_tasks:
            log.debug('task_id %s active -> finished' % finished)
            self.active_to_finished(finished)
          elif finished not in self.finished_tasks:
            log.debug('task_id %s (unknown) -> finished' % finished)
            self.add_finished_task(finished)

        # Remove ObservedTasks for tasks no longer detected on the system
        for unknown in set(self.active_tasks) - set(active_tasks + finished_tasks):
          log.debug('task_id %s active -> (unknown)' % unknown)
          self.remove_active_task(unknown)
        for unknown in set(self.finished_tasks) - set(active_tasks + finished_tasks):
          log.debug('task_id %s finished -> (unknown)' % unknown)
          self.remove_finished_task(unknown)
Example #18
0
  def _maybe_update_health_check_count(self, is_healthy, reason):
    if not is_healthy:
      log.warning('Health check failure: %s' % reason)

      if self.current_consecutive_successes > 0:
        log.debug('Reset consecutive successes counter.')
        self.current_consecutive_successes = 0

      if self._should_ignore_failure():
        return

      if self._should_fail_fast():
        log.warning('Not enough attempts left to prove health, failing fast.')
        self.healthy = False
        self.reason = reason

      self.current_consecutive_failures += 1
      if self.current_consecutive_failures > self.max_consecutive_failures:
        log.warning('Reached consecutive failure limit.')
        self.healthy = False
        self.reason = reason
    else:
      self.current_consecutive_successes += 1

      if not self.running:
        if self.current_consecutive_successes >= self.min_consecutive_successes:
          log.info('Reached consecutive success limit.')
          self.running = True

      if self.current_consecutive_failures > 0:
        log.debug('Reset consecutive failures counter.')
        self.current_consecutive_failures = 0
Example #19
0
  def _create_kill_add_lists(self, instance_ids, operation_configs):
    """Determines a particular action (kill or add) to use for every instance in instance_ids.

    Arguments:
    instance_ids -- current batch of IDs to process.
    operation_configs -- OperationConfigs with update details.

    Returns lists of instances to kill and to add.
    """
    to_kill = []
    to_add = []
    for instance_id in instance_ids:
      from_config = operation_configs.from_config.get(instance_id)
      to_config = operation_configs.to_config.get(instance_id)

      if from_config and to_config:
        diff_output = self._diff_configs(from_config, to_config)
        if diff_output:
          log.debug('Task configuration changed for instance [%s]:\n%s' % (instance_id, diff_output))
          to_kill.append(instance_id)
          to_add.append(instance_id)
      elif from_config and not to_config:
        to_kill.append(instance_id)
      elif not from_config and to_config:
        to_add.append(instance_id)
      else:
        raise self.Error('Instance %s is outside of supported range' % instance_id)

    return to_kill, to_add
Example #20
0
  def flush(self):
    if not self.isOpen():
      self.open()

    data = self.__wbuf.getvalue()
    self.__wbuf = BytesIO()

    self._session.headers['Accept'] = 'application/vnd.apache.thrift.binary'
    self._session.headers['Content-Type'] = 'application/vnd.apache.thrift.binary'
    self._session.headers['Content-Length'] = str(len(data))
    self._session.headers['Host'] = self.__urlparse.hostname

    try:
      response = self._session.post(
          self.__uri,
          data=data,
          timeout=self.__timeout,
          auth=self.__auth)
      response.raise_for_status()
      self.__rbuf = BytesIO(response.content)
    except request_exceptions.Timeout:
      raise TTransportException(
          type=TTransportException.TIMED_OUT,
          message='Timed out talking to %s' % self.__uri)
    except request_exceptions.RequestException as e:
      if e.response is not None:
        log.debug('Request failed, response headers:')
        for field_name, field_value in e.response.headers.items():
          log.debug('  %s: %s' % (field_name, field_value))
        if e.response.status_code in (401, 403):
          raise self.AuthError(e)
      raise TTransportException(
          type=TTransportException.UNKNOWN,
          message='Unknown error talking to %s: %s' % (self.__uri, e))
Example #21
0
 def terminate_process(cls, state, process_name):
   log.debug('TaskRunnerHelper.terminate_process(%s)' % process_name)
   _, pid, _ = cls._get_process_tuple(state, process_name)
   if pid:
     log.debug('   => SIGTERM pid %s' % pid)
     cls.terminate_pid(pid)
   return bool(pid)
Example #22
0
  def genlang(self, lang, targets):
    protobuf_binary = select_binary(
      self.protoc_supportdir,
      self.protoc_version,
      'protoc',
      self.context.config
    )

    bases, sources = self._calculate_sources(targets)

    if lang == 'java':
      safe_mkdir(self.java_out)
      gen = '--java_out=%s' % self.java_out
    elif lang == 'python':
      safe_mkdir(self.py_out)
      gen = '--python_out=%s' % self.py_out
    else:
      raise TaskError('Unrecognized protobuf gen lang: %s' % lang)

    args = [self.protobuf_binary, gen]

    for base in bases:
      args.append('--proto_path=%s' % base)

    args.extend(sources)
    log.debug('Executing: %s' % ' '.join(args))
    process = subprocess.Popen(args)
    result = process.wait()
    if result != 0:
      raise TaskError('%s ... exited non-zero (%i)' % (self.protobuf_binary, result))
Example #23
0
 def write(self, slice_, data):
   log.debug('Disk writing %s' % slice_)
   if len(data) != slice_.length:
     raise self.WriteError('Block must be of appropriate size!')
   with open(slice_._filename, 'r+b') as fp:
     fp.seek(slice_.start)
     fp.write(data)
Example #24
0
  def genlang(self, lang, targets):
    bases, sources = self._calculate_sources(targets)
    bases = bases.union(self._proto_path_imports(targets))

    if lang == 'java':
      output_dir = self.java_out
      gen_flag = '--java_out'
    elif lang == 'python':
      output_dir = self.py_out
      gen_flag = '--python_out'
    else:
      raise TaskError('Unrecognized protobuf gen lang: %s' % lang)

    safe_mkdir(output_dir)
    gen = '%s=%s' % (gen_flag, output_dir)

    args = [self.protobuf_binary, gen]

    if self.plugins:
      for plugin in self.plugins:
        # TODO(Eric Ayers) Is it a good assumption that the generated source output dir is
        # acceptable for all plugins?
        args.append("--%s_protobuf_out=%s" % (plugin, output_dir))

    for base in bases:
      args.append('--proto_path=%s' % base)

    args.extend(sources)
    log.debug('Executing: %s' % ' '.join(args))
    process = subprocess.Popen(args)
    result = process.wait()
    if result != 0:
      raise TaskError('%s ... exited non-zero (%i)' % (self.protobuf_binary, result))
Example #25
0
 def __init__(self,
              task_monitor,
              sandbox,
              process_collector=ProcessTreeCollector,
              disk_collector=DiskCollector,
              process_collection_interval=Amount(20, Time.SECONDS),
              disk_collection_interval=Amount(1, Time.MINUTES),
              history_time=Amount(1, Time.HOURS)):
   """
     task_monitor: TaskMonitor object specifying the task whose resources should be monitored
     sandbox: Directory for which to monitor disk utilisation
   """
   self._task_monitor = task_monitor  # exposes PIDs, sandbox
   self._task_id = task_monitor._task_id
   log.debug('Initialising resource collection for task %s' % self._task_id)
   self._process_collectors = dict()  # ProcessStatus => ProcessTreeCollector
   # TODO(jon): sandbox is also available through task_monitor, but typically the first checkpoint
   # isn't written (and hence the header is not available) by the time we initialise here
   self._sandbox = sandbox
   self._process_collector_factory = process_collector
   self._disk_collector = disk_collector(self._sandbox)
   self._process_collection_interval = process_collection_interval.as_(Time.SECONDS)
   self._disk_collection_interval = disk_collection_interval.as_(Time.SECONDS)
   min_collection_interval = min(self._process_collection_interval, self._disk_collection_interval)
   history_length = int(history_time.as_(Time.SECONDS) / min_collection_interval)
   if history_length > self.MAX_HISTORY:
     raise ValueError("Requested history length too large")
   log.debug("Initialising ResourceHistory of length %s" % history_length)
   self._history = ResourceHistory(history_length)
   self._kill_signal = threading.Event()
   threading.Thread.__init__(self)
   self.daemon = True
Example #26
0
 def terminal_state(self):
   if self._terminal_state:
     log.debug('Forced terminal state: %s' %
         TaskState._VALUES_TO_NAMES.get(self._terminal_state, 'UNKNOWN'))
     return self._terminal_state
   else:
     return TaskState.SUCCESS if self.is_healthy() else TaskState.FAILED
Example #27
0
 def _get_user_topics(self):
   with CheatSheetsCache(self.ttl) as cs:
     if self.force or not cs.has_valid_user_cheat_sheets(self.user_name):
       log.debug('Fetching user cheats from server')
       user_sheet = self.api.sheets(self.user_name)
       cs.add_or_update_user_sheet(self.user_name, user_sheet)
     return cs.get_user_topics(self.user_name)
Example #28
0
 def on_killed(self, process_update):
   log.debug('Process on_killed %s', process_update)
   self._cleanup(process_update)
   self._runner._task_processes.pop(process_update.process)
   self._runner._watcher.unregister(process_update.process)
   log.debug('Process killed, marking it as a loss.')
   self._runner._plan.lost(process_update.process)
Example #29
0
  def select(self):
    """
      Read and multiplex checkpoint records from all the forked off process coordinators.

      Checkpoint records can come from one of two places:
        in-process: checkpoint records synthesized for FORKED and LOST events
        out-of-process: checkpoint records read from file descriptors of forked coordinators

      Returns a list of RunnerCkpt objects that were successfully read, or an empty
      list if none were read.
    """
    self._bind_processes()
    updates = []
    for handle in filter(None, self._processes.values()):
      try:
        fstat = os.fstat(handle.fileno())
      except OSError as e:
        log.error('Unable to fstat %s!' % handle.name)
        continue
      if handle.tell() > fstat.st_size:
        log.error('Truncated checkpoint record detected on %s!' % handle.name)
      elif handle.tell() < fstat.st_size:
        rr = ThriftRecordReader(handle, RunnerCkpt)
        while True:
          process_update = rr.try_read()
          if process_update:
            updates.append(process_update)
          else:
            break
    if len(updates) > 0:
      log.debug('select() returning %s updates:' % len(updates))
      for update in updates:
        log.debug('  = %s' % update)
    return updates
Example #30
0
 def control(self, force=False):
   """
     Bind to the checkpoint associated with this task, position to the end of the log if
     it exists, or create it if it doesn't.  Fails if we cannot get "leadership" i.e. a
     file lock on the checkpoint stream.
   """
   if self.is_terminal():
     raise self.StateError('Cannot take control of a task in terminal state.')
   if self._sandbox:
     safe_mkdir(self._sandbox)
   ckpt_file = self._pathspec.getpath('runner_checkpoint')
   try:
     self._ckpt = TaskRunnerHelper.open_checkpoint(ckpt_file, force=force, state=self._state)
   except TaskRunnerHelper.PermissionError:
     raise self.PermissionError('Unable to open checkpoint %s' % ckpt_file)
   log.debug('Flipping recovery mode off.')
   self._recovery = False
   self._set_task_status(self.task_state())
   self._resume_task()
   try:
     yield
   except Exception as e:
     log.error('Caught exception in self.control(): %s', e)
     log.error('  %s', traceback.format_exc())
   self._ckpt.close()
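The docstring above describes taking "leadership" as acquiring a file lock on the checkpoint stream; the real mechanics live in TaskRunnerHelper.open_checkpoint, which is not shown here. A generic sketch of that pattern on a POSIX system (an illustration, not the project's actual helper):

import fcntl

def open_locked_checkpoint(path):
  # Open the checkpoint stream and take an exclusive, non-blocking advisory lock;
  # failure to lock means another runner already holds "leadership".
  fp = open(path, 'a+b')
  try:
    fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
  except IOError:
    fp.close()
    raise
  return fp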
Example #31
0
    def create(self):
        log.debug('DirectorySandbox: mkdir %s' % self.root)

        try:
            safe_mkdir(self.root)
        except (IOError, OSError) as e:
            raise self.CreationError('Failed to create the sandbox: %s' % e)

        if self._user:
            pwent, grent = self.get_user_and_group()

            try:
                log.debug('DirectorySandbox: chown %s:%s %s' %
                          (self._user, grent.gr_name, self.root))
                os.chown(self.root, pwent.pw_uid, pwent.pw_gid)
                log.debug('DirectorySandbox: chmod 700 %s' % self.root)
                os.chmod(self.root, 0700)
            except (IOError, OSError) as e:
                raise self.CreationError(
                    'Failed to chown/chmod the sandbox: %s' % e)
Example #32
0
 def stats_uploader_daemon(self, stats):
     """
 Starts the StatsUploader as a daemon process if it is already not running
 """
     log.debug("Checking if the statsUploaderDaemon is already running")
     stats_pid = os.path.join("/tmp", self._user, ".pid_stats")
     stats_uploader_dir = os.path.join("/tmp", self._user)
     dirutil.safe_mkdir(stats_uploader_dir)
     if not os.path.exists(stats_pid):
         log.debug("Starting the daemon")
         stats_log_file = os.path.join("/tmp", self._user,
                                       "buildtime_uploader")
         log.debug("The logs are writen to %s" % stats_log_file)
         if spawn_daemon(pidfile=stats_pid, quiet=True):
             force_stats_upload = False
             if "--force_stats_upload" in sys.argv:
                 force_stats_upload = True
             su = StatsUploader(STATS_COLLECTION_URL, STATS_COLLECTION_PORT,
                                STATS_COLLECTION_ENDPOINT, self._max_delay,
                                self._get_default_stats_file(), self._user,
                                force_stats_upload)
             su.upload_sync(stats)
Example #33
0
    def _await_nailgun_server(self, stdout, stderr):
        nailgun_timeout_seconds = 5
        max_socket_connect_attempts = 10
        nailgun = None
        port_parse_start = time.time()
        with safe_open(self._ng_out, 'r') as ng_out:
            while not nailgun:
                started = ng_out.readline()
                if started:
                    port = self._parse_nailgun_port(started)
                    nailgun = self._create_ngclient(port, stdout, stderr)
                    log.debug('Detected ng server up on port %d' % port)
                elif time.time() - port_parse_start > nailgun_timeout_seconds:
                    raise NailgunClient.NailgunError(
                        'Failed to read ng output after'
                        ' %s seconds' % nailgun_timeout_seconds)

        attempt = 0
        while nailgun:
            sock = nailgun.try_connect()
            if sock:
                sock.close()
                endpoint = self._get_nailgun_endpoint()
                if endpoint:
                    log.debug(
                        'Connected to ng server with fingerprint %s pid: %d @ port: %d'
                        % endpoint)
                else:
                    raise NailgunClient.NailgunError(
                        'Failed to connect to ng server.')
                return nailgun
            elif attempt > max_socket_connect_attempts:
                raise nailgun.NailgunError(
                    'Failed to connect to ng output after %d connect attempts'
                    % max_socket_connect_attempts)
            attempt += 1
            log.debug('Failed to connect on attempt %d' % attempt)
            time.sleep(0.1)
Example #34
0
  def create(self):
    log.debug('DirectorySandbox: mkdir %s' % self.root)

    try:
      safe_mkdir(self.root)
    except (IOError, OSError) as e:
      raise self.CreationError('Failed to create the sandbox: %s' % e)

    if self._user:
      try:
        pwent = pwd.getpwnam(self._user)
        grent = grp.getgrgid(pwent.pw_gid)
      except KeyError:
        raise self.CreationError(
            'Could not create sandbox because user does not exist: %s' % self._user)

      try:
        log.debug('DirectorySandbox: chown %s:%s %s' % (self._user, grent.gr_name, self.root))
        os.chown(self.root, pwent.pw_uid, pwent.pw_gid)
        log.debug('DirectorySandbox: chmod 700 %s' % self.root)
        os.chmod(self.root, 0700)
      except (IOError, OSError) as e:
        raise self.CreationError('Failed to chown/chmod the sandbox: %s' % e)
Example #35
0
 def on_lost(self, task_update):
     log.debug('Task on_lost(%s)' % task_update)
     self._cleanup()
Example #36
0
 def shutdown(self):
     if log: log.debug('Shutting down metric sampler.')
     self._shutdown = True
Example #37
0
 def on_failed(self, task_update):
     log.debug('Task on_failed(%s)' % task_update)
     self._cleanup()
Example #38
0
 def schedule_cron(self, config, lock=None):
   log.info("Registering job %s with cron" % config.name())
   log.debug('Full configuration: %s' % config.job())
   log.debug('Lock %s' % lock)
   return self._scheduler_proxy.scheduleCronJob(config.job(), lock)
Example #39
0
 def run(self):
     while True:
         self._event.wait()
         log.debug('Join event triggered, joining serverset.')
         self._event.clear()
         self._joiner()
Example #40
0
 def on_initialization(self, header):
     log.debug('_on_initialization: %s' % header)
     ThermosTaskValidator.assert_valid_task(self._runner.task)
     ThermosTaskValidator.assert_valid_ports(self._runner.task,
                                             header.ports)
     self._checkpoint(RunnerCkpt(runner_header=header))
Example #41
0
 def on_forked(self, process_update):
     log.debug('Process on_forked %s' % process_update)
     task_process = self._runner._task_processes[process_update.process]
     task_process.rebind(process_update.coordinator_pid,
                         process_update.fork_time)
     self._runner._plan.set_running(process_update.process)
Example #42
0
    def execute(self):
        """Perform final initialization and launch target process commandline in a subprocess."""

        user, _ = self._getpwuid()
        username, homedir = user.pw_name, user.pw_dir

        # TODO(wickman) reconsider setsid now that we're invoking in a subshell
        os.setsid()
        if self._use_chroot:
            self._chroot()

        # If the mesos containerizer path is set, then this process will be launched from within an
        # isolated filesystem image by the mesos-containerizer executable. This executable needs to be
        # run as root so that it can properly set up the filesystem; as such, we'll skip calling setuid at
        # this point. We'll instead setuid after the process has been forked (mesos-containerizer itself
        # ensures the forked process is run as the correct user).
        taskfs_isolated = self._mesos_containerizer_path is not None
        if not taskfs_isolated:
            self._setuid()

        # start process
        start_time = self._platform.clock().time()

        if not self._sandbox:
            cwd = subprocess_cwd = sandbox = os.getcwd()
        else:
            if self._use_chroot:
                cwd = subprocess_cwd = sandbox = '/'
            elif taskfs_isolated:
                cwd = homedir = sandbox = self._container_sandbox
                subprocess_cwd = self._sandbox
            else:
                cwd = subprocess_cwd = homedir = sandbox = self._sandbox

        thermos_profile = os.path.join(sandbox, self.RCFILE)

        if self._preserve_env:
            env = deepcopy(os.environ)
        else:
            env = {}

        env.update({
            'HOME': homedir,
            'LOGNAME': username,
            'USER': username,
            'PATH': os.environ['PATH']
        })

        wrapped_cmdline = self.wrapped_cmdline(cwd)
        log.debug('Wrapped cmdline: %s' % wrapped_cmdline)

        real_thermos_profile_path = os.path.join(
            os.environ['MESOS_DIRECTORY'], TASK_FILESYSTEM_MOUNT_POINT,
            thermos_profile.lstrip(
                '/')) if taskfs_isolated else thermos_profile

        if os.path.exists(real_thermos_profile_path):
            env.update(BASH_ENV=thermos_profile)

        log.debug('ENV is: %s' % env)
        subprocess_args = {
            'args': wrapped_cmdline,
            'close_fds': self.FD_CLOEXEC,
            'cwd': subprocess_cwd,
            'env': env,
            'pathspec': self._pathspec
        }

        log_destination_resolver = LogDestinationResolver(
            self._pathspec,
            destination=self._logger_destination,
            mode=self._logger_mode,
            rotate_log_size=self._rotate_log_size,
            rotate_log_backups=self._rotate_log_backups)
        stdout, stderr, handlers_are_files = log_destination_resolver.get_handlers()
        if handlers_are_files:
            executor = SubprocessExecutor(stdout=stdout,
                                          stderr=stderr,
                                          **subprocess_args)
        else:
            executor = PipedSubprocessExecutor(stdout=stdout,
                                               stderr=stderr,
                                               **subprocess_args)

        pid = executor.start()

        # Now that we've forked the process, if the task's filesystem is isolated it's now safe to
        # setuid.
        if taskfs_isolated:
            self._setuid()

        self._write_process_update(state=ProcessState.RUNNING,
                                   pid=pid,
                                   start_time=start_time)

        rc = executor.wait()

        # indicate that we have finished/failed
        if rc < 0:
            state = ProcessState.KILLED
        elif rc == 0:
            state = ProcessState.SUCCESS
        else:
            state = ProcessState.FAILED

        self._write_process_update(state=state,
                                   return_code=rc,
                                   stop_time=self._platform.clock().time())
        self._rc = rc
Example #43
0
    def watch(self, instance_ids, health_check=None):
        """Watches a set of instances and detects failures based on a delegated health check.

        Arguments:
        instance_ids -- set of instances to watch.
        health_check -- health checker to use; defaults to StatusHealthCheck.

        Returns a set of instances that are considered failed.
        """
        log.info('Watching instances: %s' % instance_ids)
        instance_ids = set(instance_ids)
        health_check = health_check or StatusHealthCheck()

        instance_states = {}

        def finished_instances():
            return dict(
                (s_id, s) for s_id, s in instance_states.items() if s.finished)

        def set_instance_healthy(instance_id, now):
            if instance_id not in instance_states:
                instance_states[instance_id] = Instance(now)
            instance = instance_states.get(instance_id)
            if now > (instance.birthday + self._watch_secs):
                log.info(
                    'Instance %s has been up and healthy for at least %d seconds'
                    % (instance_id, self._watch_secs))
                instance.set_healthy(True)

        def set_instance_unhealthy(instance_id):
            log.info('Instance %s is unhealthy' % instance_id)
            if instance_id in instance_states:
                # An instance that was previously healthy and currently unhealthy has failed.
                instance_states[instance_id].set_healthy(False)
            else:
                # An instance never passed a health check (e.g.: failed before the first health check).
                instance_states[instance_id] = Instance(finished=True)

        while not self._terminating.is_set():
            running_tasks = self._status_helper.get_tasks(instance_ids)
            now = self._clock.time()
            tasks_by_instance = dict(
                (task.assignedTask.instanceId, task) for task in running_tasks)
            for instance_id in instance_ids:
                if instance_id not in finished_instances():
                    running_task = tasks_by_instance.get(instance_id)
                    if running_task is not None:
                        task_healthy = health_check.health(running_task)
                        if task_healthy:
                            set_instance_healthy(instance_id, now)
                        else:
                            set_instance_unhealthy(instance_id)

            log.debug('Instances health: %s' %
                      ['%s: %s' % val for val in instance_states.items()])

            # Return if all tasks are finished.
            if set(finished_instances().keys()) == instance_ids:
                return set([
                    s_id for s_id, s in instance_states.items()
                    if not s.healthy
                ])

            self._terminating.wait(self._health_check_interval_seconds)
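watch() relies on an Instance helper that is not shown. From its usage above (Instance(now), Instance(finished=True), set_healthy()), a plausible minimal reconstruction, offered purely as a sketch, is:

class Instance(object):
  # Hypothetical reconstruction: tracks when an instance was first seen healthy
  # and whether watching of it has finished, and with what verdict.
  def __init__(self, birthday=None, finished=False):
    self.birthday = birthday
    self.finished = finished
    self.healthy = False

  def set_healthy(self, value):
    self.healthy = value
    self.finished = True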
Example #44
0
 def __on_removed(self, root, task_id):
     log.debug('on_removed(%r, %r)', root, task_id)
     active_task = self._active_tasks.pop(task_id, None)
     if active_task:
         active_task.resource_monitor.kill()
     self._finished_tasks.pop(task_id, None)
Example #45
0
 def _log(self, msg, exc_info=None):
     log.debug('[process:%5s=%s]: %s' % (self._pid, self.name(), msg),
               exc_info=exc_info)
Example #46
0
    def create(args, options):
        validate_common_options(options)

        if not options.num_nodes:
            app.error("--num_nodes is required")

        if not options.cluster_user:
            app.error("--cluster_user is required")

        url = 'http://%s:%s/clusters/%s' % (options.api_host, options.api_port,
                                            options.cluster_name)
        values = dict(
            num_nodes=int(options.num_nodes),
            cluster_user=options.cluster_user,
            size=options.size
            if options.size else '',  # 'urlencode()' doesn't accept None.
            backup_id=options.backup_id if options.backup_id else '',
            cluster_password=options.cluster_password
            if options.cluster_password else '')

        req = urllib2.Request(url, urllib.urlencode(values))
        try:
            response = urllib2.urlopen(req).read()
        except urllib2.HTTPError as e:
            log.error("POST request failed: %s, %s, %s" %
                      (e.code,
                       BaseHTTPServer.BaseHTTPRequestHandler.responses[e.code],
                       e.read()))
            app.quit(1)

        try:
            result = json.loads(response)
            if not isinstance(result, dict):
                raise ValueError()
        except ValueError:
            log.error("Invalid response: %s" % response)
            app.quit(1)

        log.info("Cluster created. Cluster info: %s" % str(result))
        with open(options.password_file, 'w') as f:
            f.write(result["cluster_password"])

        log.info("Waiting for the master for this cluster to be elected...")
        master_endpoint = wait_for_master(
            result['cluster_url']).service_endpoint

        connection_str = "mysql://%s:%s@%s:%d/" % (
            options.cluster_user, result["cluster_password"],
            master_endpoint.host, master_endpoint.port)
        log.info("Connecting to the MySQL cluster master: %s" % connection_str)
        engine = create_engine(connection_str)

        for i in range(5):  # Retry up to 5 times (~5 seconds) waiting for the master to be promoted.
            try:
                # TODO(jyx): Test writing to the master and reading from the slave.
                result = engine.execute("SELECT 1;").scalar()
                assert 1 == int(
                    result), "Expecting result to be 1 but got %s" % result
                break
            except OperationalError:
                if i == 4:
                    raise
                log.debug("MySQL master not ready yet. Sleep for 1 second...")
                time.sleep(1)

        log.info("Cluster successfully started")
Example #47
0
    def run(self):
        """Thread entrypoint. Loop indefinitely, polling collectors at self._collection_interval and
    collating samples."""

        log.debug('Commencing resource monitoring for task "%s"' %
                  self._task_id)
        next_process_collection = 0
        next_disk_collection = 0

        while not self._kill_signal.is_set():
            now = time.time()

            if now > next_process_collection:
                next_process_collection = now + self._process_collection_interval
                actives = set(self._get_active_processes())
                current = set(self._process_collectors)
                for process in current - actives:
                    self._process_collectors.pop(process)
                for process in actives - current:
                    self._process_collectors[process] = ProcessTreeCollector(
                        process.pid)
                for process, collector in self._process_collectors.items():
                    collector.sample()

            if now > next_disk_collection:
                next_disk_collection = now + self._disk_collection_interval
                if not self._disk_collector:
                    sandbox = self._task_monitor.get_sandbox()
                    if sandbox:
                        self._disk_collector = self._disk_collector_class(
                            sandbox)
                if self._disk_collector:
                    self._disk_collector.sample()
                else:
                    log.debug('No sandbox detected yet for %s' % self._task_id)

            try:
                disk_usage = self._disk_collector.value if self._disk_collector else 0

                proc_usage_dict = dict()
                for process, collector in self._process_collectors.items():
                    proc_usage_dict.update({
                        process:
                        self.ProcResourceResult(collector.value,
                                                collector.procs)
                    })

                self._history.add(
                    now, self.FullResourceResult(proc_usage_dict, disk_usage))
            except ValueError as err:
                log.warning("Error recording resource sample: %s" % err)

            log.debug(
                "TaskResourceMonitor: finished collection of %s in %.2fs" %
                (self._task_id, (time.time() - now)))

            # Sleep until any of the following conditions are met:
            # - it's time for the next disk collection
            # - it's time for the next process collection
            # - the result from the last disk collection is available via the DiskCollector
            # - the TaskResourceMonitor has been killed via self._kill_signal
            now = time.time()
            next_collection = min(next_process_collection - now,
                                  next_disk_collection - now)

            if self._disk_collector:
                waiter = EventMuxer(self._kill_signal,
                                    self._disk_collector.completed_event)
            else:
                waiter = self._kill_signal

            if next_collection > 0:
                waiter.wait(timeout=next_collection)
            else:
                log.warning(
                    'Task resource collection is backlogged. Consider increasing '
                    'process_collection_interval and disk_collection_interval.'
                )

        log.debug('Stopping resource monitoring for task "%s"' % self._task_id)
Example #48
0
 def stop(self):
     log.debug('Health checker thread stopped.')
     self.dead.set()
Example #49
0
 def on_process_transition(self, state, process_update):
     log.debug('_on_process_transition: %s' % process_update)
     self._checkpoint(RunnerCkpt(process_status=process_update))
Example #50
0
def read_opcode(data, offset):
    opcode, offset = read_number(data, offset)
    if opcode not in ZK_REQUEST_TYPES:
        log.debug("Bad request type: %s", opcode)
        raise DeserializationError("Invalid request type: %d" % (opcode))
    return (opcode, offset)
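read_opcode() delegates to a read_number() helper that is not shown. Assuming the standard ZooKeeper wire format, where these fields are 4-byte big-endian integers, a minimal sketch of such a helper might be:

import struct

def read_number(data, offset):
  # ZooKeeper's jute encoding stores ints as 4-byte big-endian values (an assumption here).
  (value,) = struct.unpack_from('!i', data, offset)
  return value, offset + 4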
Example #51
0
 def on_task_transition(self, state, task_update):
     log.debug('_on_task_transition: %s' % task_update)
     self._checkpoint(RunnerCkpt(task_status=task_update))
Example #52
0
 def on_running(self, process_update):
     log.debug('Process on_running %s' % process_update)
     self._runner._plan.set_running(process_update.process)
Example #53
0
 def close_ckpt(self):
     """Force close the checkpoint stream.  This is necessary for runners terminated through
    exception propagation."""
     log.debug('Closing the checkpoint stream.')
     self._ckpt.close()
Example #54
0
 def on_lost(self, process_update):
     log.debug('Process on_lost %s' % process_update)
     self._cleanup(process_update)
     self._on_abnormal(process_update)
     self._runner._plan.lost(process_update.process)
Example #55
0
    def genlang(self, lang, targets):
        bases, sources = calculate_compile_roots(targets, self.is_gentarget)

        if lang == 'java':
            gen = self.gen_java.gen
        elif lang == 'python':
            gen = self.gen_python.gen
        else:
            raise TaskError('Unrecognized thrift gen lang: %s' % lang)

        args = [
            self.thrift_binary,
            '--gen',
            gen,
            '-recurse',
        ]

        if self.strict:
            args.append('-strict')
        if self.verbose:
            args.append('-verbose')
        for base in bases:
            args.extend(('-I', base))

        sessions = []
        for source in sources:
            self.context.log.info('Generating thrift for %s\n' % source)
            # Create a unique session dir for this thrift root.  Sources may be full paths but we only
            # need the path relative to the build root to ensure uniqueness.
            # TODO(John Sirois): file paths should be normalized early on and uniformly, fix the need to
            # relpath here at all.
            relsource = os.path.relpath(source, get_buildroot())

            if lang == "python":
                copied_source = os.path.join(self._workdir, relsource)
                safe_mkdir(os.path.dirname(copied_source))
                shutil.copyfile(source, copied_source)
                replace_python_keywords_in_file(copied_source)
                source = relsource = copied_source

            outdir = os.path.join(self.session_dir,
                                  '.'.join(relsource.split(os.path.sep)))
            safe_mkdir(outdir)

            cmd = args[:]
            cmd.extend(('-o', outdir))
            cmd.append(source)
            log.debug('Executing: %s' % ' '.join(cmd))
            sessions.append(
                self.ThriftSession(outdir, cmd, subprocess.Popen(cmd)))

        result = 0
        for session in sessions:
            if result != 0:
                session.process.kill()
            else:
                result = session.process.wait()
                if result != 0:
                    self.context.log.error('Failed: %s' %
                                           ' '.join(session.cmd))
                else:
                    _copytree(session.outdir, self.combined_dir)
        if result != 0:
            raise TaskError('%s ... exited non-zero (%i)' %
                            (self.thrift_binary, result))
Example #56
0
 def wrapped_func(self, *args):
     log.debug('%s(%s)' % (func.__name__, ', '.join(
         '%s=%s' % (name, arg) for (name, arg) in zip(arg_names, args))))
     return func(self, *args)
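wrapped_func closes over func and arg_names, so it is evidently the inner function of a call-logging decorator. A self-contained sketch of what that enclosing decorator plausibly looks like (a reconstruction, not the original source):

import functools
import logging

log = logging.getLogger(__name__)

def log_call(func):
  # Capture the method's positional argument names once, skipping 'self'.
  arg_names = func.__code__.co_varnames[1:func.__code__.co_argcount]

  @functools.wraps(func)
  def wrapped_func(self, *args):
    log.debug('%s(%s)' % (func.__name__, ', '.join(
        '%s=%s' % (name, arg) for (name, arg) in zip(arg_names, args))))
    return func(self, *args)
  return wrapped_func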
Example #57
0
 def create_job(self, config, lock=None):
   log.info('Creating job %s' % config.name())
   log.debug('Full configuration: %s' % config.job())
   log.debug('Lock %s' % lock)
   return self._scheduler_proxy.createJob(config.job(), lock)
Example #58
0
 def on_success(self, task_update):
     log.debug('Task on_success(%s)' % task_update)
     self._cleanup()
     log.info('Task succeeded.')
Example #59
0
 def on_cleaning(self, task_update):
     log.debug('Task on_cleaning(%s)' % task_update)
     self._runner._finalization_start = task_update.timestamp_ms / 1000.0
     self._runner._terminate_plan(self._runner._regular_plan)
Example #60
0
 def _log(self, msg):
     log.debug('[process:%5s=%s]: %s' % (self._pid, self.name(), msg))