Esempio n. 1
0
def exec_python(args):
    """Runs sys.executable with |args|, through exec() wherever it is usable.

    Everywhere except Windows and Cygwin the current process is replaced by
    the new interpreter via os.execv().

    On Windows and Cygwin a detached child process is spawned instead. While
    waiting for it, the stop signal (SIGBREAK on win32, SIGTERM otherwise) is
    forwarded to the child as SIGTERM.

    Returns:
      The child's return code on Windows/Cygwin. 1 if anything raised while
      starting or waiting for the child, or if os.execv() came back. The
      caller must exit at the earliest opportunity.
    """
    argv = [sys.executable] + args
    can_exec = sys.platform not in ('cygwin', 'win32')
    if can_exec:
        os.execv(argv[0], argv)
        # Only reached if os.execv() came back instead of replacing us.
        return 1

    # There is no sane exec() on Windows, so the interpreter runs as a child
    # process and any signal the bot may care about is forwarded to it. The
    # downside is that processes pile up.
    # TODO(maruel): If stdin closes, it tells the child process that the parent
    # process died.
    try:
        child = subprocess42.Popen(
            argv, detached=True, stdin=subprocess42.PIPE)

        def forward_to_child(signum, _frame):
            logging.info('Got signal %s', signum)
            # SIGTERM is sent no matter which signal came in, since it is
            # properly translated.
            child.send_signal(signal.SIGTERM)

        if sys.platform == 'win32':
            watched = signal.SIGBREAK
        else:
            watched = signal.SIGTERM
        with subprocess42.set_signal_handler([watched], forward_to_child):
            child.wait()
            return child.returncode
    except Exception as exc:
        logging.exception('failed to start: %s', exc)
        # Deliberately swallowed; the caller only sees the failure code.
        return 1
Esempio n. 2
0
def exec_python(args):
  """Starts a new python interpreter with |args|, preferring exec().

  On platforms other than Windows and Cygwin, os.execv() replaces the current
  process with the new interpreter.

  On Windows and Cygwin, the interpreter is started as a detached child. The
  function waits for it and relays the stop signal (SIGBREAK on win32, SIGTERM
  otherwise) to the child as SIGTERM.

  Returns:
    The child process return code on Windows/Cygwin, or 1 when an exception
    was raised or os.execv() returned. The caller must exit at the earliest
    opportunity.
  """
  command = [sys.executable] + args
  if sys.platform not in ('cygwin', 'win32'):
    interpreter = command[0]
    os.execv(interpreter, command)
    # Fallback for the case where os.execv() hands control back.
    return 1

  try:
    # exec() cannot be used sanely on Windows, so shell out the child process
    # and forward the signals the bot may care about. Sadly this means
    # processes accumulate.
    # TODO(maruel): If stdin closes, it tells the child process that the parent
    # process died.
    child = subprocess42.Popen(
        command, stdin=subprocess42.PIPE, detached=True)

    def relay(signum, _frame):
      logging.info('Got signal %s', signum)
      # SIGTERM is always what is sent; it is properly translated.
      child.send_signal(signal.SIGTERM)

    on_win32 = sys.platform == 'win32'
    stop_signal = signal.SIGBREAK if on_win32 else signal.SIGTERM
    with subprocess42.set_signal_handler([stop_signal], relay):
      child.wait()
      return child.returncode
  except Exception as err:
    logging.exception('failed to start: %s', err)
    # The exception is swallowed on purpose.
    return 1
Esempio n. 3
0
def load_and_run(in_file, swarming_server, cost_usd_hour, start, out_file):
    """Reads the task description from |in_file|, runs it and saves the result.

    Whatever happens, |out_file| is written on the way out with the JSON
    encoded task result, which is None when no result was produced.

    Only MustExit is handled here. Any other exception propagates and it is up
    to the caller to trap it; those shall be considered 'internal_failure'
    instead of 'failure' from a TaskRunResult standpoint.
    """
    def on_signal(signum, _frame):
        logging.info('Got signal %s', signum)
        raise MustExit(signum)

    result = None
    try:
        with subprocess42.set_signal_handler([SIG_BREAK_OR_TERM], on_signal):
            # bot_main.py created the work directory and put the manifest in
            # it, so it must already be there. Temporary files are downloaded
            # into it and bot_main.py deletes it afterward. Tests are not run
            # from there.
            task_dir = os.path.abspath('work')
            if not os.path.isdir(task_dir):
                raise ValueError('%s expected to exist' % task_dir)

            with open(in_file, 'rb') as manifest:
                details = TaskDetails(json.load(manifest))

            # Fetch the script to run into the temporary directory.
            # TODO(maruel): Remove.
            download_data(task_dir, details.data)

            result = run_command(
                swarming_server, details, task_dir, cost_usd_hour, start)
    except MustExit as exc:
        # Getting here normally means run_command() never got to run, since it
        # traps MustExit itself and reports accordingly. The parent process is
        # then the one that has to send the message.
        if not result:
            result = {
                u'exit_code': None,
                u'hard_timeout': False,
                u'io_timeout': False,
                u'must_signal_internal_failure':
                    u'task_runner received signal %s' % exc.signal,
                u'version': OUT_VERSION,
            }
    finally:
        # Tests have been seen deleting 'work' when quitting, which made the
        # write below raise. Recreate the directory if it is gone.
        out_dir = os.path.dirname(out_file)
        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)
        with open(out_file, 'wb') as out:
            json.dump(result, out)
Esempio n. 4
0
def load_and_run(in_file, swarming_server, cost_usd_hour, start, out_file):
  """Loads the task metadata from |in_file| and executes the task.

  The task result is always dumped as JSON into |out_file| before returning or
  propagating an exception; it is None if no result could be produced.

  MustExit is the only exception trapped here. Everything else is left to the
  caller and shall be considered 'internal_failure' instead of 'failure' from
  a TaskRunResult standpoint.
  """
  outcome = None

  def raise_must_exit(signum, _frame):
    logging.info('Got signal %s', signum)
    raise MustExit(signum)

  try:
    with subprocess42.set_signal_handler(
        [SIG_BREAK_OR_TERM], raise_must_exit):
      # The work directory is expected to exist already: bot_main.py creates
      # it, stores the manifest there and deletes it afterward. Temporary
      # files are downloaded there. Tests are not run from there.
      work_root = os.path.abspath('work')
      if not os.path.isdir(work_root):
        raise ValueError('%s expected to exist' % work_root)

      with open(in_file, 'rb') as src:
        manifest = json.load(src)
      task = TaskDetails(manifest)

      # Download the script to run in the temporary directory.
      # TODO(maruel): Remove.
      download_data(work_root, task.data)

      outcome = run_command(
          swarming_server, task, work_root, cost_usd_hour, start)
  except MustExit as stop:
    # run_command() traps MustExit on its own and reports accordingly, so this
    # normally means it did not get the chance to run. Let the parent process
    # send the message instead.
    if not outcome:
      outcome = {
        u'exit_code': None,
        u'hard_timeout': False,
        u'io_timeout': False,
        u'must_signal_internal_failure':
            u'task_runner received signal %s' % stop.signal,
        u'version': OUT_VERSION,
      }
  finally:
    # Some tests delete 'work' when quitting, which used to cause an exception
    # here. Try to recreate the directory if necessary.
    result_dir = os.path.dirname(out_file)
    if not os.path.isdir(result_dir):
      os.mkdir(result_dir)
    with open(out_file, 'wb') as dst:
      json.dump(outcome, dst)
Esempio n. 5
0
def load_and_run(
    in_file, swarming_server, is_grpc, cost_usd_hour, start, out_file,
    run_isolated_flags, bot_file, auth_params_file):
  """Loads the task metadata, sets up the auth environment and runs the task.

  The directory containing |out_file| is used as the work directory. On the way
  out, the auth system (if one was started) is stopped and the task result is
  dumped as JSON into |out_file|; the result is None when none was produced.

  ExitSignal, InternalError and remote_client.InternalError are converted into
  a result carrying 'must_signal_internal_failure'. Any other exception
  propagates; it is up to the caller to trap it and it shall be considered
  'internal_failure' instead of 'failure' from a TaskRunResult standpoint.
  """
  work_dir = os.path.dirname(out_file)
  task_result = None
  local_auth_context = None
  auth_system = None

  def on_signal(signum, _frame):
    logging.info('Got signal %s', signum)
    raise ExitSignal(signum)

  try:
    with subprocess42.set_signal_handler([SIG_BREAK_OR_TERM], on_signal):
      # bot_main.py creates the work directory and stores the manifest in it,
      # so it has to exist by now. Temporary files are downloaded there and
      # bot_main.py deletes it afterward. Tests are not run from there.
      if not os.path.isdir(work_dir):
        raise InternalError('%s expected to exist' % work_dir)

      # Raises InternalError on errors.
      task_details = TaskDetails.load(in_file)

      # Starting the auth system spawns a thread that occasionally reads bot
      # authentication headers from 'auth_params_file'. It may also launch a
      # local HTTP server that serves OAuth tokens to the task processes; the
      # location of that service ends up in the file referenced by the
      # LUCI_CONTEXT env var below.
      if auth_params_file:
        try:
          auth_system = bot_auth.AuthSystem(auth_params_file)
          local_auth_context = auth_system.start()
        except bot_auth.AuthSystemError as auth_err:
          raise InternalError('Failed to init auth: %s' % auth_err)

      # LUCI_CONTEXT['local_auth'] is always overridden. When the task does
      # not use auth, None is passed so that an already existing local_auth
      # is kicked out rather than inherited.
      context_edits = {'local_auth': local_auth_context}

      # Extend existing LUCI_CONTEXT['swarming'], if any.
      if task_details.secret_bytes is not None:
        swarming_section = luci_context.read('swarming') or {}
        swarming_section['secret_bytes'] = task_details.secret_bytes
        context_edits['swarming'] = swarming_section

      def headers_cb():
        """Returns bot authentication headers or raises InternalError."""
        try:
          if not auth_system:
            # A timeout of "None" means "don't use auth"
            return (None, None)
          return auth_system.get_bot_headers()
        except bot_auth.AuthSystemError as auth_err:
          raise InternalError(
              'Failed to grab bot auth headers: %s' % auth_err)

      # Make a client that can send request to Swarming using bot auth
      # headers. With gRPC the given server is used as the proxy and the
      # plain server URL is left empty.
      if is_grpc:
        server_url, grpc_proxy = '', swarming_server
      else:
        server_url, grpc_proxy = swarming_server, ''
      # The hostname and work dir provided here don't really matter, since the
      # task runner is always called with a specific versioned URL.
      hostname = os_utilities.get_hostname_short()
      remote = remote_client.createRemoteClient(
          server_url, headers_cb, hostname, work_dir, grpc_proxy)
      remote.initialize()

      # From now on AuthSystem may send RPCs to Swarming (to grab OAuth
      # tokens). Note the circular dependency: AuthSystem indirectly relies on
      # its own 'get_bot_headers' method to authenticate the RPCs it sends
      # through the provided client.
      if auth_system:
        auth_system.set_remote_client(remote)

      # Auth environment is up, start the command. task_result is dumped to
      # disk in 'finally' block.
      with luci_context.stage(_tmpdir=work_dir, **context_edits) as ctx_file:
        task_result = run_command(
            remote, task_details, work_dir, cost_usd_hour, start,
            run_isolated_flags, bot_file, ctx_file)
  except (ExitSignal, InternalError, remote_client.InternalError) as err:
    # run_command() traps exceptions itself and reports accordingly, so this
    # normally means it never got the chance to run. The parent process is
    # then the one that has to send the message.
    if not task_result:
      failure_message = str(err.message or 'unknown error')
      task_result = {
        u'exit_code': -1,
        u'hard_timeout': False,
        u'io_timeout': False,
        u'must_signal_internal_failure': failure_message,
        u'version': OUT_VERSION,
      }

  finally:
    # Tests have been seen deleting the working directory when quitting, which
    # caused an exception here. Try to recreate the directory if necessary.
    if not os.path.isdir(work_dir):
      os.mkdir(work_dir)
    if auth_system:
      auth_system.stop()
    with open(out_file, 'wb') as out:
      json.dump(task_result, out)
Esempio n. 6
0
def run_command(command, cwd, tmp_dir, hard_timeout, grace_period):
  """Runs the command as a detached child process and waits for it to exit.

  The child gets a copy of os.environ in which the temporary directory
  variable (TMPDIR on darwin, TMP on platforms other than darwin and win32)
  points at tmp_dir.

  If the wait times out, or one of subprocess42.STOP_SIGNALS is received while
  waiting, the child is terminated with proc.terminate() and then waited on
  for grace_period before proc.kill() is called.

  Args:
    command: command line handed to subprocess42.Popen().
    cwd: working directory of the child process.
    tmp_dir: temporary directory exposed to the child through its environment;
        .encode('ascii') is called on it.
    hard_timeout: timeout for the first wait on the child. A falsy value is
        passed to wait() as None.
    grace_period: timeout for the wait that follows terminate(). A falsy value
        is passed to wait() as None.

  Returns:
    tuple(process exit code, bool if had a hard timeout). The exit code is 1
    when OSError was raised. The bool is only set when the first wait timed
    out without a signal having been received.
  """
  logging.info('run_command(%s, %s)' % (command, cwd))
  sys.stdout.flush()

  env = os.environ.copy()
  if sys.platform == 'darwin':
    env['TMPDIR'] = tmp_dir.encode('ascii')
  elif sys.platform == 'win32':
    # Temporarily disable this behavior on Windows while investigating
    # https://crbug.com/533552.
    # env['TEMP'] = tmp_dir.encode('ascii')
    pass
  else:
    env['TMP'] = tmp_dir.encode('ascii')
  exit_code = None
  had_hard_timeout = False
  with tools.Profiler('RunTest'):
    proc = None
    # A list is used as a flag so the nested handler can mutate it without
    # rebinding the name.
    had_signal = []
    try:
      # TODO(maruel): This code is imperfect. It doesn't handle well signals
      # during the download phase and there's short windows where things can go
      # wrong.
      def handler(signum, _frame):
        # Only the first signal received once proc exists is acted upon. It is
        # turned into TimeoutExpired so that it goes through the same except
        # clause as a hard timeout.
        if proc and not had_signal:
          logging.info('Received signal %d', signum)
          had_signal.append(True)
          raise subprocess42.TimeoutExpired(command, None)

      proc = subprocess42.Popen(command, cwd=cwd, env=env, detached=True)
      with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, handler):
        try:
          exit_code = proc.wait(hard_timeout or None)
        except subprocess42.TimeoutExpired:
          # Either wait() timed out or the handler above raised; had_signal
          # tells the two cases apart.
          if not had_signal:
            logging.warning('Hard timeout')
            had_hard_timeout = True
          logging.warning('Sending SIGTERM')
          proc.terminate()

      # Ignore signals in grace period. Forcibly give the grace period to the
      # child process.
      # exit_code is still None only when the first wait() did not return,
      # i.e. terminate() was just called.
      if exit_code is None:
        ignore = lambda *_: None
        with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, ignore):
          try:
            exit_code = proc.wait(grace_period or None)
          except subprocess42.TimeoutExpired:
            # Now kill for real. The user can distinguish between the
            # following states:
            # - signal but process exited within grace period,
            #   hard_timed_out will be set but the process exit code will be
            #   script provided.
            # - processed exited late, exit code will be -9 on posix.
            logging.warning('Grace exhausted; sending SIGKILL')
            proc.kill()
      logging.info('Waiting for proces exit')
      # Done unconditionally; after kill() this is what collects the exit code.
      exit_code = proc.wait()
    except OSError:
      # This is not considered to be an internal error. The executable simply
      # does not exist.
      exit_code = 1
  logging.info(
      'Command finished with exit code %d (%s)',
      exit_code, hex(0xffffffff & exit_code))
  return exit_code, had_hard_timeout
Esempio n. 7
0
def run_command(command, cwd, env, hard_timeout, grace_period):
    """Runs the command as a detached child process and waits for it to exit.

    If the wait times out, or one of subprocess42.STOP_SIGNALS is received
    while waiting, the child is terminated with proc.terminate() and then
    waited on for grace_period before proc.kill() is called.

    When OSError is raised, a diagnostic message is written to sys.stderr and
    the exit code is reported as 1.

    Args:
      command: command line handed to subprocess42.Popen().
      cwd: working directory of the child process.
      env: environment handed to subprocess42.Popen().
      hard_timeout: timeout for the first wait on the child. A falsy value is
          passed to wait() as None.
      grace_period: timeout for the wait that follows terminate(). A falsy
          value is passed to wait() as None.

    Returns:
      tuple(process exit code, bool if had a hard timeout). The bool is only
      set when the first wait timed out without a signal having been received.
    """
    logging.info('run_command(%s, %s)' % (command, cwd))

    exit_code = None
    had_hard_timeout = False
    with tools.Profiler('RunTest'):
        proc = None
        # A list is used as a flag so the nested handler can mutate it without
        # rebinding the name.
        had_signal = []
        try:
            # TODO(maruel): This code is imperfect. It doesn't handle well signals
            # during the download phase and there's short windows where things can go
            # wrong.
            def handler(signum, _frame):
                # Only the first signal received once proc exists is acted
                # upon. It is turned into TimeoutExpired so that it goes
                # through the same except clause as a hard timeout.
                if proc and not had_signal:
                    logging.info('Received signal %d', signum)
                    had_signal.append(True)
                    raise subprocess42.TimeoutExpired(command, None)

            proc = subprocess42.Popen(command, cwd=cwd, env=env, detached=True)
            with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS,
                                                 handler):
                try:
                    exit_code = proc.wait(hard_timeout or None)
                except subprocess42.TimeoutExpired:
                    # Either wait() timed out or the handler above raised;
                    # had_signal tells the two cases apart.
                    if not had_signal:
                        logging.warning('Hard timeout')
                        had_hard_timeout = True
                    logging.warning('Sending SIGTERM')
                    proc.terminate()

            # Ignore signals in grace period. Forcibly give the grace period to the
            # child process.
            # exit_code is still None only when the first wait() did not
            # return, i.e. terminate() was just called.
            if exit_code is None:
                ignore = lambda *_: None
                with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS,
                                                     ignore):
                    try:
                        exit_code = proc.wait(grace_period or None)
                    except subprocess42.TimeoutExpired:
                        # Now kill for real. The user can distinguish between the
                        # following states:
                        # - signal but process exited within grace period,
                        #   hard_timed_out will be set but the process exit code will be
                        #   script provided.
                        # - processed exited late, exit code will be -9 on posix.
                        logging.warning('Grace exhausted; sending SIGKILL')
                        proc.kill()
            logging.info('Waiting for proces exit')
            # Done unconditionally; after kill() this is what collects the
            # exit code.
            exit_code = proc.wait()
        except OSError:
            # This is not considered to be an internal error. The executable simply
            # does not exist.
            sys.stderr.write(
                '<The executable does not exist or a dependent library is missing>\n'
                '<Check for missing .so/.dll in the .isolate or GN file>\n'
                '<Command: %s>\n' % command)
            if os.environ.get('SWARMING_TASK_ID'):
                # Give an additional hint when running as a swarming task.
                sys.stderr.write(
                    '<See the task\'s page for commands to help diagnose this issue '
                    'by reproducing the task locally>\n')
            exit_code = 1
    logging.info('Command finished with exit code %d (%s)', exit_code,
                 hex(0xffffffff & exit_code))
    return exit_code, had_hard_timeout
Esempio n. 8
0
def run_command(command, cwd, tmp_dir, hard_timeout, grace_period):
  """Runs the command as a detached child process and waits for it to exit.

  The child gets a copy of os.environ in which the temporary directory
  variable (TMPDIR on darwin, TMP on platforms other than darwin and win32)
  points at tmp_dir.

  If the wait times out, or one of subprocess42.STOP_SIGNALS is received while
  waiting, the child is terminated with proc.terminate() and then waited on
  for grace_period before proc.kill() is called.

  Args:
    command: command line handed to subprocess42.Popen().
    cwd: working directory of the child process.
    tmp_dir: temporary directory exposed to the child through its environment;
        .encode('ascii') is called on it.
    hard_timeout: timeout for the first wait on the child. A falsy value is
        passed to wait() as None.
    grace_period: timeout for the wait that follows terminate(). A falsy value
        is passed to wait() as None.

  Returns:
    tuple(process exit code, bool if had a hard timeout). The exit code is 1
    when OSError was raised. The bool is only set when the first wait timed
    out without a signal having been received.
  """
  logging.info('run_command(%s, %s)' % (command, cwd))
  sys.stdout.flush()

  env = os.environ.copy()
  if sys.platform == 'darwin':
    env['TMPDIR'] = tmp_dir.encode('ascii')
  elif sys.platform == 'win32':
    # Temporarily disable this behavior on Windows while investigating
    # https://crbug.com/533552.
    # env['TEMP'] = tmp_dir.encode('ascii')
    pass
  else:
    env['TMP'] = tmp_dir.encode('ascii')
  exit_code = None
  had_hard_timeout = False
  with tools.Profiler('RunTest'):
    proc = None
    # A list is used as a flag so the nested handler can mutate it without
    # rebinding the name.
    had_signal = []
    try:
      # TODO(maruel): This code is imperfect. It doesn't handle well signals
      # during the download phase and there's short windows where things can go
      # wrong.
      def handler(signum, _frame):
        # Only the first signal received once proc exists is acted upon. It is
        # turned into TimeoutExpired so that it goes through the same except
        # clause as a hard timeout.
        if proc and not had_signal:
          logging.info('Received signal %d', signum)
          had_signal.append(True)
          raise subprocess42.TimeoutExpired(command, None)

      proc = subprocess42.Popen(command, cwd=cwd, env=env, detached=True)
      with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, handler):
        try:
          exit_code = proc.wait(hard_timeout or None)
        except subprocess42.TimeoutExpired:
          # Either wait() timed out or the handler above raised; had_signal
          # tells the two cases apart.
          if not had_signal:
            logging.warning('Hard timeout')
            had_hard_timeout = True
          logging.warning('Sending SIGTERM')
          proc.terminate()

      # Ignore signals in grace period. Forcibly give the grace period to the
      # child process.
      # exit_code is still None only when the first wait() did not return,
      # i.e. terminate() was just called.
      if exit_code is None:
        ignore = lambda *_: None
        with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, ignore):
          try:
            exit_code = proc.wait(grace_period or None)
          except subprocess42.TimeoutExpired:
            # Now kill for real. The user can distinguish between the
            # following states:
            # - signal but process exited within grace period,
            #   hard_timed_out will be set but the process exit code will be
            #   script provided.
            # - processed exited late, exit code will be -9 on posix.
            logging.warning('Grace exhausted; sending SIGKILL')
            proc.kill()
      logging.info('Waiting for proces exit')
      # Done unconditionally; after kill() this is what collects the exit code.
      exit_code = proc.wait()
    except OSError:
      # This is not considered to be an internal error. The executable simply
      # does not exist.
      exit_code = 1
  logging.info(
      'Command finished with exit code %d (%s)',
      exit_code, hex(0xffffffff & exit_code))
  return exit_code, had_hard_timeout
Esempio n. 9
0
def run_bot(arg_error):
    """Runs the bot's polling loop until the quit event is set.

    The quit event is set by the handler installed for
    subprocess42.STOP_SIGNALS; it is also handed to poll_server(). If it is
    already set at one of the checkpoints during startup, the function returns
    right away without notifying the server.

    Args:
      arg_error: bootstrapping error message; when truthy it is posted to the
          server through the bot object.

    Returns:
      0, always.
    """
    stop_requested = threading.Event()

    def on_signal(signum, _frame):
        logging.info('Got signal %s', signum)
        stop_requested.set()

    # TODO(maruel): Set the quit event when stdin is closed on Windows.

    with subprocess42.set_signal_handler(
            subprocess42.STOP_SIGNALS, on_signal):
        config = get_config()
        try:
            # Fetching an arbitrary url comes first. It also makes sure the
            # network is up and running, which is required before trying to
            # get the FQDN below.
            ping_url = config['server'] + '/swarming/api/v1/bot/server_ping'
            if net.url_read(ping_url) is None:
                logging.error('No response from server_ping')
        except Exception:
            # url_read() already traps pretty much every exception; this
            # clause is only kept "just in case".
            logging.exception('server_ping threw')

        if stop_requested.is_set():
            logging.info('Early quit 1')
            return 0

        # Hardly anything can be done if this fails: the bot cannot even get
        # to the point where it is able to self-update.
        bot = get_bot()
        handshake = net.url_read_json(
            bot.server + '/swarming/api/v1/bot/handshake',
            data=bot._attributes)
        if handshake:
            logging.info('Connected to %s', handshake.get('server_version'))
            expected_version = handshake.get('bot_version')
            current_version = bot._attributes['version']
            if expected_version != current_version:
                logging.warning(
                    'Found out we\'ll need to update: server said %s; we\'re %s',
                    expected_version, current_version)
        else:
            logging.error('Failed to contact for handshake')

        if arg_error:
            bot.post_error('Bootstrapping error: %s' % arg_error)

        if stop_requested.is_set():
            logging.info('Early quit 2')
            return 0

        clean_isolated_cache(bot)

        call_hook(bot, 'on_bot_startup')

        if stop_requested.is_set():
            logging.info('Early quit 3')
            return 0

        # Tasks executed by this bot can read this environment variable.
        os.environ['SWARMING_BOT_ID'] = bot.id.encode('utf-8')

        # A leftover 'work' directory may keep the bot quarantined with no way
        # out of that state, so remove it if present.
        stale_work_dir = os.path.join(bot.base_dir, 'work')
        try:
            if os.path.isdir(stale_work_dir):
                file_path.rmtree(stale_work_dir)
        except Exception as exc:
            bot.post_error('Failed to remove work: %s' % exc)

        idle_polls = 0
        while not stop_requested.is_set():
            try:
                bot.update_dimensions(get_dimensions(bot))
                bot.update_state(get_state(bot, idle_polls))
                if poll_server(bot, stop_requested):
                    idle_polls = 0
                else:
                    idle_polls += 1
            except Exception as exc:
                logging.exception('poll_server failed')
                # Only the tail of the traceback is reported.
                bot.post_error(
                    '%s\n%s' % (exc, traceback.format_exc()[-2048:]))
                idle_polls = 0
        logging.info('Quitting')

    # Let the server know the bot is going away.
    bot.post_event('bot_shutdown', 'Signal was received')
    bot.cancel_all_timers()
    return 0
Esempio n. 10
0
def run_bot(arg_error):
  """Runs the bot's polling loop until the quit event is set.

  The quit event is set by the handler installed for
  subprocess42.STOP_SIGNALS; it is also handed to poll_server(). If it is
  already set at one of the checkpoints during startup, the function returns
  right away without notifying the server.

  Args:
    arg_error: bootstrapping error message; when truthy it is posted to the
        server through the bot object.

  Returns:
    0, always.
  """
  stop_event = threading.Event()

  def on_signal(signum, _frame):
    logging.info('Got signal %s', signum)
    stop_event.set()

  # TODO(maruel): Set the quit event when stdin is closed on Windows.

  with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, on_signal):
    try:
      # Fetching an arbitrary url comes first. It also makes sure the network
      # is up and running, which is required before trying to get the FQDN
      # below.
      ping = get_remote().url_read('/swarming/api/v1/bot/server_ping')
      if ping is None:
        logging.error('No response from server_ping')
    except Exception:
      # url_read() already traps pretty much every exception; this clause is
      # only kept "just in case".
      logging.exception('server_ping threw')

    if stop_event.is_set():
      return 0

    # Hardly anything can be done if this fails: the bot cannot even get to
    # the point where it is able to self-update.
    bot = get_bot()
    if arg_error:
      bot.post_error('Bootstrapping error: %s' % arg_error)

    if stop_event.is_set():
      return 0

    call_hook(bot, 'on_bot_startup')

    if stop_event.is_set():
      return 0

    # Tasks executed by this bot can read this environment variable.
    os.environ['SWARMING_BOT_ID'] = bot.id.encode('utf-8')

    # A leftover 'work' directory may keep the bot quarantined with no way out
    # of that state, so remove it if present.
    stale_work_dir = os.path.join(bot.base_dir, 'work')
    try:
      if os.path.isdir(stale_work_dir):
        file_path.rmtree(stale_work_dir)
    except Exception as exc:
      bot.post_error('Failed to remove work: %s' % exc)

    # TODO(maruel): Run 'health check' on startup.
    # https://code.google.com/p/swarming/issues/detail?id=112
    idle_rounds = 0
    while not stop_event.is_set():
      try:
        bot.update_dimensions(get_dimensions(bot))
        bot.update_state(get_state(bot, idle_rounds))
        if poll_server(bot, stop_event):
          idle_rounds = 0
        else:
          idle_rounds += 1
      except Exception as exc:
        logging.exception('poll_server failed')
        # Only the tail of the traceback is reported.
        bot.post_error('%s\n%s' % (exc, traceback.format_exc()[-2048:]))
        idle_rounds = 0

  # Let the server know the bot is going away.
  bot.post_event('bot_shutdown', 'Signal was received')
  bot.cancel_all_timers()
  return 0
Esempio n. 11
0
def load_and_run(in_file, swarming_server, cost_usd_hour, start, out_file,
                 min_free_space, bot_file, auth_params_file):
    """Loads the task metadata, sets up the auth environment and runs the task.

    The directory containing |out_file| is used as the work directory. On the
    way out, the auth system (if one was created) is stopped and the task
    result is dumped as JSON into |out_file|; the result is None when none was
    produced.

    ExitSignal and InternalError are converted into a result carrying
    'must_signal_internal_failure'. Any other exception propagates; it is up
    to the caller to trap it and it shall be considered 'internal_failure'
    instead of 'failure' from a TaskRunResult standpoint.
    """
    work_dir = os.path.dirname(out_file)
    task_result = None
    auth_system = None

    def on_signal(signum, _frame):
        logging.info('Got signal %s', signum)
        raise ExitSignal(signum)

    try:
        with subprocess42.set_signal_handler([SIG_BREAK_OR_TERM], on_signal):
            # bot_main.py creates the work directory and stores the manifest
            # in it, so it has to exist by now. Temporary files are downloaded
            # there and bot_main.py deletes it afterward. Tests are not run
            # from there.
            if not os.path.isdir(work_dir):
                raise InternalError('%s expected to exist' % work_dir)

            # Raises InternalError on errors.
            task_details = TaskDetails.load(in_file)

            # Starting the auth system spawns a thread that occasionally reads
            # bot authentication headers from 'auth_params_file'. It may also
            # launch a local HTTP server that serves OAuth tokens to the task
            # processes; the location of that service ends up in the file
            # referenced by the LUCI_CONTEXT env var below.
            if auth_params_file:
                try:
                    auth_system = bot_auth.AuthSystem(auth_params_file)
                    auth_system.start()
                except bot_auth.AuthSystemError as auth_err:
                    raise InternalError('Failed to init auth: %s' % auth_err)

            # local_auth details only go into LUCI_CONTEXT when the task is
            # using service accounts.
            context_edits = {}
            if auth_system and auth_system.local_auth_context:
                context_edits['local_auth'] = auth_system.local_auth_context

            def headers_cb():
                """Returns bot auth headers or raises InternalError."""
                try:
                    if not auth_system:
                        # A timeout of "None" means "don't use auth"
                        return (None, None)
                    # The second item is the time until which the remote
                    # client should cache the headers. auth_system already
                    # does the caching, so "0" is sent, i.e. the Epoch
                    # (Jan 1 1970), which effectively means "never cache."
                    return (auth_system.bot_headers, 0)
                except bot_auth.AuthSystemError as auth_err:
                    raise InternalError(
                        'Failed to grab bot auth headers: %s' % auth_err)

            # Auth environment is up, start the command. task_result is dumped
            # to disk in 'finally' block.
            remote = remote_client.createRemoteClient(
                swarming_server, headers_cb)
            with luci_context.write(_tmpdir=work_dir, **context_edits):
                task_result = run_command(
                    remote, task_details, work_dir, cost_usd_hour, start,
                    min_free_space, bot_file)

    except (ExitSignal, InternalError) as err:
        # run_command() traps exceptions itself and reports accordingly, so
        # this normally means it never got the chance to run. The parent
        # process is then the one that has to send the message.
        if not task_result:
            failure_message = str(err.message or 'unknown error')
            task_result = {
                u'exit_code': -1,
                u'hard_timeout': False,
                u'io_timeout': False,
                u'must_signal_internal_failure': failure_message,
                u'version': OUT_VERSION,
            }

    finally:
        # Tests have been seen deleting the working directory when quitting,
        # which caused an exception here. Try to recreate the directory if
        # necessary.
        if not os.path.isdir(work_dir):
            os.mkdir(work_dir)
        if auth_system:
            auth_system.stop()
        with open(out_file, 'wb') as out:
            json.dump(task_result, out)
Esempio n. 12
0
def run_bot(arg_error):
  """Bootstraps the bot, then polls the server until a stop signal arrives.

  A stop signal only sets an internal event. That event is checked between
  the bootstrap steps (in which case the function returns right away) and at
  the top of every polling iteration.

  Arguments:
    arg_error: when truthy, it is reported to the server as a bootstrapping
        error once the bot object exists.

  Returns:
    0 on every code path.
  """
  quit_bit = threading.Event()

  def on_stop_signal(signum, _frame):
    logging.info('Got signal %s', signum)
    quit_bit.set()

  # TODO(maruel): Set quit_bit when stdin is closed on Windows.

  with subprocess42.set_signal_handler(
      subprocess42.STOP_SIGNALS, on_stop_signal):
    config = get_config()
    try:
      # Fetching any URL first makes sure the network is up, which has to be
      # the case before the FQDN can be retrieved later on.
      ping_url = config['server'] + '/swarming/api/v1/bot/server_ping'
      if net.url_read(ping_url) is None:
        logging.error('No response from server_ping')
    except Exception:
      # url_read() is expected to trap nearly everything by itself; this
      # clause is only a safety net.
      logging.exception('server_ping threw')

    if quit_bit.is_set():
      logging.info('Early quit 1')
      return 0

    # Nothing useful can be done if this raises: the bot would not even be
    # able to reach the point where it can self-update.
    botobj = get_bot()
    handshake_url = botobj.server + '/swarming/api/v1/bot/handshake'
    handshake = net.url_read_json(handshake_url, data=botobj._attributes)
    if handshake:
      logging.info('Connected to %s', handshake.get('server_version'))
      expected_version = handshake.get('bot_version')
      running_version = botobj._attributes['version']
      if expected_version != running_version:
        logging.warning(
            'Found out we\'ll need to update: server said %s; we\'re %s',
            expected_version, running_version)
    else:
      logging.error('Failed to contact for handshake')

    if arg_error:
      botobj.post_error('Bootstrapping error: %s' % arg_error)

    if quit_bit.is_set():
      logging.info('Early quit 2')
      return 0

    clean_isolated_cache(botobj)

    call_hook(botobj, 'on_bot_startup')

    if quit_bit.is_set():
      logging.info('Early quit 3')
      return 0

    # Tasks executed by this bot can read this environment variable.
    os.environ['SWARMING_BOT_ID'] = botobj.id.encode('utf-8')

    # A leftover 'work' directory may keep the bot quarantined with no way
    # out of that state, so delete it when it exists.
    work_dir = os.path.join(botobj.base_dir, 'work')
    try:
      if os.path.isdir(work_dir):
        file_path.rmtree(work_dir)
    except Exception as exc:
      botobj.post_error('Failed to remove work: %s' % exc)

    consecutive_sleeps = 0
    while not quit_bit.is_set():
      try:
        botobj.update_dimensions(get_dimensions(botobj))
        botobj.update_state(get_state(botobj, consecutive_sleeps))
        # The counter restarts from zero as soon as a poll did something.
        consecutive_sleeps = (
            0 if poll_server(botobj, quit_bit) else consecutive_sleeps + 1)
      except Exception as exc:
        logging.exception('poll_server failed')
        # Only the last 2048 characters of the traceback are reported.
        botobj.post_error(
            '%s\n%s' % (exc, traceback.format_exc()[-2048:]))
        consecutive_sleeps = 0
    logging.info('Quitting')

  # Let the server know this bot is going away.
  botobj.post_event('bot_shutdown', 'Signal was received')
  botobj.cancel_all_timers()
  return 0
Esempio n. 13
0
def run_command(command, cwd, tmp_dir, hard_timeout, grace_period):
  """Runs the command with a hard timeout followed by a grace period.

  The child process is started detached, with tmp_dir exported as its
  temporary directory. If it is still running once hard_timeout expires, or
  if a stop signal is received while it runs, proc.terminate() is called and
  the child is given grace_period to exit by itself before proc.kill() is
  called. Stop signals received during the grace period are ignored.

  Arguments:
    command: command line to execute, passed as-is to subprocess42.Popen().
    cwd: working directory of the child process.
    tmp_dir: path exported through TMPDIR (OSX), TEMP (Windows) or TMP
        (anything else). It must be encodable as ASCII.
    hard_timeout: timeout given to the first proc.wait(); a falsy value is
        passed as None (presumably meaning "no timeout" -- confirm against
        subprocess42).
    grace_period: timeout given to proc.wait() after proc.terminate(); a
        falsy value is passed as None as well.

  Returns:
    tuple(process exit code, bool if had a hard timeout). The exit code is 1
    when an OSError was raised while starting or waiting for the process.
  """
  # Lazy logging arguments: the message is only formatted if it is emitted.
  logging.info('run_command(%s, %s)', command, cwd)
  sys.stdout.flush()

  env = os.environ.copy()
  if sys.platform == 'darwin':
    env['TMPDIR'] = tmp_dir.encode('ascii')
  elif sys.platform == 'win32':
    env['TEMP'] = tmp_dir.encode('ascii')
  else:
    env['TMP'] = tmp_dir.encode('ascii')
  exit_code = None
  had_hard_timeout = False
  with tools.Profiler('RunTest'):
    proc = None
    # A list is used so the nested handler can record the signal without
    # rebinding a name of the enclosing scope.
    had_signal = []
    try:
      # TODO(maruel): This code is imperfect. It doesn't handle well signals
      # during the download phase and there's short windows were things can go
      # wrong.
      def handler(signum, _frame):
        # Only the first signal is acted upon. It is converted into a
        # TimeoutExpired so it follows the same path as the hard timeout.
        if proc and not had_signal:
          logging.info('Received signal %d', signum)
          had_signal.append(True)
          raise subprocess42.TimeoutExpired(command, None)

      proc = subprocess42.Popen(command, cwd=cwd, env=env, detached=True)
      with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, handler):
        try:
          exit_code = proc.wait(hard_timeout or None)
        except subprocess42.TimeoutExpired:
          # Raised either by wait() itself or by the signal handler above;
          # had_signal tells them apart.
          if not had_signal:
            logging.warning('Hard timeout')
            had_hard_timeout = True
          logging.warning('Sending SIGTERM')
          proc.terminate()

      # Ignore signals in grace period. Forcibly give the grace period to the
      # child process.
      if exit_code is None:
        ignore = lambda *_: None
        with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, ignore):
          try:
            exit_code = proc.wait(grace_period or None)
          except subprocess42.TimeoutExpired:
            # Now kill for real. The user can distinguish between the
            # following states:
            # - signal but process exited within grace period,
            #   hard_timed_out will be set but the process exit code will be
            #   script provided.
            # - processed exited late, exit code will be -9 on posix.
            logging.warning('Grace exhausted; sending SIGKILL')
            proc.kill()
      logging.info('Waiting for proces exit')
      exit_code = proc.wait()
    except OSError:
      # This is not considered to be an internal error. The executable simply
      # does not exist.
      # command is wrapped in a 1-tuple: formatting with a bare tuple would
      # spread its items over the placeholders and raise TypeError, hiding the
      # actual failure.
      sys.stderr.write(
          '<The executable does not exist or a dependent library is missing>\n'
          '<Check for missing .so/.dll in the .isolate or GN file>\n'
          '<Command: %s>\n' % (command,))
      if os.environ.get('SWARMING_TASK_ID'):
        # Give an additional hint when running as a swarming task.
        sys.stderr.write(
            '<See the task\'s page for commands to help diagnose this issue '
            'by reproducing the task locally>\n')
      exit_code = 1
  # The hexadecimal form is masked to 32 bits so negative codes stay readable.
  logging.info(
      'Command finished with exit code %d (%s)',
      exit_code, hex(0xffffffff & exit_code))
  return exit_code, had_hard_timeout
Esempio n. 14
0
def run_bot(arg_error):
    """Bootstraps the bot, handshakes with the server, then polls it.

    A stop signal only sets an internal event. That event is checked between
    the bootstrap steps (in which case the function returns right away),
    while retrying the handshake and at the top of every polling iteration.

    Arguments:
      arg_error: when truthy, it is reported to the server as a bootstrapping
          error once the bot object exists.

    Returns:
      0 on every code path.
    """
    quit_bit = threading.Event()

    def on_stop_signal(signum, _frame):
        logging.info('Got signal %s', signum)
        quit_bit.set()

    # TODO(maruel): Set quit_bit when stdin is closed on Windows.

    with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS,
                                         on_stop_signal):
        config = get_config()
        try:
            # Reaching out to the server first makes sure the network is up,
            # which has to be the case before the FQDN can be retrieved later
            # on. The outcome of the ping is deliberately not inspected: it
            # only "wakes up" the network, and anything seriously wrong will
            # show up (and be handled) during the handshake.
            remote_client.createRemoteClient(config['server'], None).ping()
        except Exception:
            # Kept "just in case"; the call is expected to trap nearly every
            # exception by itself.
            logging.exception('server_ping threw')

        # On GCE the bot ID, dimensions and state are derived from the
        # metadata, so make sure the metadata server answers first.
        if platforms.is_gce():
            logging.info('Running on GCE, waiting for the metadata server')
            platforms.gce.wait_for_metadata(quit_bit)
            if quit_bit.is_set():
                logging.info('Early quit 1')
                return 0

        # Make sure authenticated calls are possible by grabbing the auth
        # headers, which retries on errors a number of times. A failure is
        # reported but is not fatal (the bot may "fix itself" later).
        botobj = get_bot()
        try:
            botobj.remote.initialize(quit_bit)
        except remote_client.InitializationError as init_error:
            botobj.post_error('failed to grab auth headers: %s' %
                              init_error.last_error)
            logging.error('Can\'t grab auth headers, continuing anyway...')

        if arg_error:
            botobj.post_error('Bootstrapping error: %s' % arg_error)

        if quit_bit.is_set():
            logging.info('Early quit 2')
            return 0

        call_hook(botobj, 'on_bot_startup')

        # get_bot() built the initial attributes for a 'fake' bot ID ('none').
        # Recompute them for the real bot ID now that the bot.Bot object is
        # fully initialized. This is done here instead of in get_bot() since
        # 'get_dimensions' and 'get_state' may depend on what the
        # 'on_bot_startup' hook did.
        botobj._update_dimensions(get_dimensions(botobj))
        botobj._update_state(get_state(botobj, 0))

        if quit_bit.is_set():
            logging.info('Early quit 3')
            return 0

        # First authenticated request to the server. A misconfigured bot may
        # get HTTP 401 or HTTP 403 here. Rather than dying immediately, keep
        # retrying with a growing delay in the hope the bot "fixes itself".
        # Authentication errors in /handshake are logged on the server and
        # generate error reports, so bots stuck here are discoverable.
        retry_delay = 5
        while not quit_bit.is_set():
            handshake = botobj.remote.do_handshake(botobj._attributes)
            if not handshake:
                logging.error(
                    'Failed to contact for handshake, retrying in %d sec...',
                    retry_delay)
                # Returns early if a stop signal arrives in the meantime.
                quit_bit.wait(retry_delay)
                # Double the delay on each failure, capped at 300.
                retry_delay = min(300, retry_delay * 2)
                continue

            logging.info('Connected to %s', handshake.get('server_version'))
            expected_version = handshake.get('bot_version')
            running_version = botobj._attributes['version']
            if expected_version != running_version:
                logging.warning(
                    'Found out we\'ll need to update: server said %s; we\'re %s',
                    expected_version, running_version)
            # '/handshake' is the only place where the server returns the
            # per-bot configuration, so remember it. 'bot_group_cfg_version'
            # is sent back in each /poll (as part of 'state'), which lets the
            # server tell the bot to restart itself when the config changes.
            cfg_version = handshake.get('bot_group_cfg_version')
            if cfg_version:
                botobj._update_bot_group_cfg(cfg_version,
                                             handshake.get('bot_group_cfg'))
            break

        if quit_bit.is_set():
            logging.info('Early quit 4')
            return 0

        # The server defined dimensions are known at this point; let the bot
        # complete its initialization.
        call_hook(botobj, 'on_handshake')

        cleanup_bot_directory(botobj)
        clean_cache(botobj)

        if quit_bit.is_set():
            logging.info('Early quit 5')
            return 0

        # Tasks executed by this bot can read this environment variable.
        os.environ['SWARMING_BOT_ID'] = botobj.id.encode('utf-8')

        consecutive_sleeps = 0
        last_action = time.time()
        while not quit_bit.is_set():
            try:
                botobj._update_dimensions(get_dimensions(botobj))
                botobj._update_state(get_state(botobj, consecutive_sleeps))
                if poll_server(botobj, quit_bit, last_action):
                    # Something was done: restart the idle bookkeeping.
                    consecutive_sleeps = 0
                    last_action = time.time()
                else:
                    consecutive_sleeps += 1
            except Exception as exc:
                logging.exception('poll_server failed')
                # Only the last 2048 characters of the traceback are reported.
                botobj.post_error(
                    '%s\n%s' % (exc, traceback.format_exc()[-2048:]))
                consecutive_sleeps = 0
        logging.info('Quitting')

    # Let the server know this bot is going away.
    botobj.post_event('bot_shutdown', 'Signal was received')
    return 0