Example #1
0
 def test_should_restart_bot_not_set(self):
   """No restart is expected when 'periodic_reboot_secs' is not set."""
   # The state deliberately carries no 'periodic_reboot_secs' key.
   bot_state = {'running_time': 0, 'started_ts': 1410989556.174}
   expected = (False, '')
   actual = bot_management.should_restart_bot('id', bot_state)
   self.assertEqual(expected, actual)
 def test_should_restart_bot_not_set(self):
   """No restart is expected when 'periodic_reboot_secs' is not set."""
   # The state deliberately carries no 'periodic_reboot_secs' key.
   bot_state = {'running_time': 0, 'started_ts': 1410989556.174}
   expected = (False, '')
   actual = bot_management.should_restart_bot('id', bot_state)
   self.assertEqual(expected, actual)
Example #3
0
 def test_should_restart_bot_bad_type(self):
   """A string 'periodic_reboot_secs' is expected not to cause a restart."""
   # The period is the string '100' rather than a number, even though
   # running_time (105) is larger than 100.
   bot_state = {
       'periodic_reboot_secs': '100',
       'running_time': 105,
       'started_ts': 1410989556.174,
   }
   expected = (False, '')
   actual = bot_management.should_restart_bot('id', bot_state)
   self.assertEqual(expected, actual)
 def test_should_restart_bot_bad_type(self):
   """A string 'periodic_reboot_secs' is expected not to cause a restart."""
   # The period is the string '100' rather than a number, even though
   # running_time (105) is larger than 100.
   bot_state = {
       'periodic_reboot_secs': '100',
       'running_time': 105,
       'started_ts': 1410989556.174,
   }
   expected = (False, '')
   actual = bot_management.should_restart_bot('id', bot_state)
   self.assertEqual(expected, actual)
Example #5
0
 def test_should_restart_bot(self):
   """Expects a restart plus a truthy message for a bot past its period."""
   # running_time (105) is larger than periodic_reboot_secs (100).
   bot_state = {
       'periodic_reboot_secs': 100,
       'running_time': 105,
       'started_ts': 1410989556.174,
   }
   outcome = bot_management.should_restart_bot('id', bot_state)
   restart_needed, reason = outcome
   self.assertTrue(restart_needed)
   self.assertTrue(reason)
 def test_should_restart_bot(self):
   """Expects a restart plus a truthy message for a bot past its period."""
   # running_time (105) is larger than periodic_reboot_secs (100).
   bot_state = {
       'periodic_reboot_secs': 100,
       'running_time': 105,
       'started_ts': 1410989556.174,
   }
   outcome = bot_management.should_restart_bot('id', bot_state)
   restart_needed, reason = outcome
   self.assertTrue(restart_needed)
   self.assertTrue(reason)
Example #7
0
    def post(self):
        """Handles a polling request.

        Be very permissive on missing values. This can happen because of errors
        on the bot, *we don't want to deny them the capacity to update*, so that
        the bot code is eventually fixed and the bot self-updates to this
        working code.

        It makes recovery of the fleet in case of catastrophic failure much
        easier.

        The checks below run in this order; the first one that matches ends the
        request:
          1. The bot's version differs from the expected one: update.
          2. The bot is quarantined: sleep.
          3. bot_management.should_restart_bot() asks for it: restart.
          4. task_scheduler.bot_reap_task() returned no request: sleep.
          5. Otherwise: terminate the bot or hand it the reaped task.

        A runtime.DeadlineExceededError raised while reaping or handing out the
        task aborts the request with HTTP 500.
        """
        # _request is part of the tuple returned by _process() but is not used
        # in this handler.
        (_request, bot_id, version, state, dimensions,
         quarantined_msg) = self._process()
        # A missing 'sleep_streak' key is treated as 0.
        sleep_streak = state.get('sleep_streak', 0)
        # Any truthy quarantine message marks the bot as quarantined.
        quarantined = bool(quarantined_msg)

        # Note bot existence at two places, one for stats at 1 minute resolution,
        # the other for the list of known bots.
        action = 'bot_inactive' if quarantined else 'bot_active'
        stats.add_entry(action=action, bot_id=bot_id, dimensions=dimensions)

        def bot_event(event_type, task_id=None, task_name=None):
            """Forwards to bot_management.bot_event() for this poll.

            Only the event type and the optional task details vary per call;
            every other argument is captured from the enclosing request.
            """
            bot_management.bot_event(event_type=event_type,
                                     bot_id=bot_id,
                                     external_ip=self.request.remote_addr,
                                     dimensions=dimensions,
                                     state=state,
                                     version=version,
                                     quarantined=quarantined,
                                     task_id=task_id,
                                     task_name=task_name,
                                     message=quarantined_msg)

        # Bot version is host-specific because the host URL is embedded in
        # swarming_bot.zip
        expected_version = bot_code.get_bot_version(self.request.host_url)
        # The version check comes before the quarantine check, so a quarantined
        # bot running the wrong version is still told to update.
        if version != expected_version:
            bot_event('request_update')
            self._cmd_update(expected_version)
            return
        if quarantined:
            bot_event('request_sleep')
            self._cmd_sleep(sleep_streak, quarantined)
            return

        #
        # At that point, the bot should be in relatively good shape since it's
        # running the right version. It is still possible that invalid code was
        # pushed to the server, so be diligent about it.
        #

        # Bot may need a reboot if it is running for too long. We do not reboot
        # quarantined bots (they already returned above).
        needs_restart, restart_message = bot_management.should_restart_bot(
            bot_id, state)
        if needs_restart:
            bot_event('request_restart')
            self._cmd_restart(restart_message)
            return

        # The bot is in good shape. Try to grab a task.
        try:
            # This is a fairly complex function call, exceptions are expected.
            request, run_result = task_scheduler.bot_reap_task(
                dimensions, bot_id, version)
            if not request:
                # No task found, tell it to sleep a bit. 'quarantined' is always
                # False here because quarantined bots returned earlier.
                bot_event('request_sleep')
                self._cmd_sleep(sleep_streak, quarantined)
                return

            try:
                # This part is tricky since it intentionally runs a transaction after
                # another one.
                if request.properties.is_terminate:
                    bot_event('bot_terminate', task_id=run_result.task_id)
                    self._cmd_terminate(run_result.task_id)
                else:
                    bot_event('request_task',
                              task_id=run_result.task_id,
                              task_name=request.name)
                    self._cmd_run(request, run_result.key, bot_id)
            except:
                # Deliberately bare: log whatever went wrong after a task was
                # reaped, then re-raise it unchanged so the outer handler (or the
                # caller) still sees the original exception.
                logging.exception('Dang, exception after reaping')
                raise
        except runtime.DeadlineExceededError:
            # If the timeout happened before a task was assigned, there is no
            # problem. If the timeout occurred after a task was assigned, that
            # task will time out (BOT_DIED), since the bot didn't get the details
            # required to run it, and it will automatically get retried (TODO)
            # when the task times out.
            # TODO(maruel): Note the task if possible and hand it out on next poll.
            # https://code.google.com/p/swarming/issues/detail?id=130
            self.abort(500, 'Deadline')
Example #8
0
    def post(self):
        """Handles a polling request.

        Be very permissive on missing values. This can happen because of errors
        on the bot, *we don't want to deny them the capacity to update*, so that
        the bot code is eventually fixed and the bot self-updates to this
        working code.

        It makes recovery of the fleet in case of catastrophic failure much
        easier.

        The checks below run in this order; the first one that matches ends the
        request:
          1. The bot's version differs from the expected one: update.
          2. The bot is quarantined: sleep.
          3. bot_management.should_restart_bot() asks for it: restart.
          4. task_scheduler.bot_reap_task() returned no request: sleep.
          5. Otherwise: terminate the bot or hand it the reaped task.

        A runtime.DeadlineExceededError raised while reaping or handing out the
        task aborts the request with HTTP 500.
        """
        # _request is part of the tuple returned by _process() but is not used
        # in this handler.
        (_request, bot_id, version, state, dimensions, quarantined_msg) = self._process()
        # A missing "sleep_streak" key is treated as 0.
        sleep_streak = state.get("sleep_streak", 0)
        # Any truthy quarantine message marks the bot as quarantined.
        quarantined = bool(quarantined_msg)

        # Note bot existence at two places, one for stats at 1 minute resolution,
        # the other for the list of known bots.
        action = "bot_inactive" if quarantined else "bot_active"
        stats.add_entry(action=action, bot_id=bot_id, dimensions=dimensions)

        def bot_event(event_type, task_id=None, task_name=None):
            """Forwards to bot_management.bot_event() for this poll.

            Only the event type and the optional task details vary per call;
            every other argument is captured from the enclosing request.
            """
            bot_management.bot_event(
                event_type=event_type,
                bot_id=bot_id,
                external_ip=self.request.remote_addr,
                dimensions=dimensions,
                state=state,
                version=version,
                quarantined=quarantined,
                task_id=task_id,
                task_name=task_name,
                message=quarantined_msg,
            )

        # Bot version is host-specific because the host URL is embedded in
        # swarming_bot.zip
        expected_version = bot_code.get_bot_version(self.request.host_url)
        # The version check comes before the quarantine check, so a quarantined
        # bot running the wrong version is still told to update.
        if version != expected_version:
            bot_event("request_update")
            self._cmd_update(expected_version)
            return
        if quarantined:
            bot_event("request_sleep")
            self._cmd_sleep(sleep_streak, quarantined)
            return

        #
        # At that point, the bot should be in relatively good shape since it's
        # running the right version. It is still possible that invalid code was
        # pushed to the server, so be diligent about it.
        #

        # Bot may need a reboot if it is running for too long. We do not reboot
        # quarantined bots (they already returned above).
        needs_restart, restart_message = bot_management.should_restart_bot(bot_id, state)
        if needs_restart:
            bot_event("request_restart")
            self._cmd_restart(restart_message)
            return

        # The bot is in good shape. Try to grab a task.
        try:
            # This is a fairly complex function call, exceptions are expected.
            request, run_result = task_scheduler.bot_reap_task(dimensions, bot_id, version)
            if not request:
                # No task found, tell it to sleep a bit. 'quarantined' is always
                # False here because quarantined bots returned earlier.
                bot_event("request_sleep")
                self._cmd_sleep(sleep_streak, quarantined)
                return

            try:
                # This part is tricky since it intentionally runs a transaction after
                # another one.
                if request.properties.is_terminate:
                    bot_event("bot_terminate", task_id=run_result.task_id)
                    self._cmd_terminate(run_result.task_id)
                else:
                    bot_event("request_task", task_id=run_result.task_id, task_name=request.name)
                    self._cmd_run(request, run_result.key, bot_id)
            except:
                # Deliberately bare: log whatever went wrong after a task was
                # reaped, then re-raise it unchanged so the outer handler (or the
                # caller) still sees the original exception.
                logging.exception("Dang, exception after reaping")
                raise
        except runtime.DeadlineExceededError:
            # If the timeout happened before a task was assigned, there is no
            # problem. If the timeout occurred after a task was assigned, that
            # task will time out (BOT_DIED), since the bot didn't get the details
            # required to run it, and it will automatically get retried (TODO)
            # when the task times out.
            # TODO(maruel): Note the task if possible and hand it out on next poll.
            # https://code.google.com/p/swarming/issues/detail?id=130
            self.abort(500, "Deadline")
Example #9
0
    def post(self):
        """Handles a polling request.

        Be very permissive on missing values. This can happen because of errors
        on the bot, *we don't want to deny them the capacity to update*, so that
        the bot code is eventually fixed and the bot self-updates to this
        working code.

        It makes recovery of the fleet in case of catastrophic failure much
        easier.

        The checks below run in this order; the first one that matches ends the
        request:
          1. The force_bots_to_sleep_and_not_run_task setting is on: sleep,
             before the request is even processed.
          2. The bot's version differs from the expected one: update.
          3. The bot is quarantined: sleep.
          4. The bot reports a 'bot_group_cfg_version' that differs from the
             server's: restart.
          5. bot_management.should_restart_bot() asks for it: restart.
          6. task_scheduler.bot_reap_task() returned no request: sleep.
          7. Otherwise: terminate the bot or hand it the reaped task.

        A runtime.DeadlineExceededError raised while reaping or handing out the
        task aborts the request with HTTP 500.
        """
        if config.settings().force_bots_to_sleep_and_not_run_task:
            # Ignore everything, just sleep. Tell the bot it is quarantined to inform
            # it that it won't be running anything anyway. Use a large streak so it
            # will sleep for 60s.
            # Note: this returns before _process(), stats and bot_event are
            # reached, so nothing is recorded for this poll.
            self._cmd_sleep(1000, True)
            return

        res = self._process()
        # A missing 'sleep_streak' key is treated as 0.
        sleep_streak = res.state.get('sleep_streak', 0)
        # Any truthy quarantine message marks the bot as quarantined.
        quarantined = bool(res.quarantined_msg)

        # Note bot existence at two places, one for stats at 1 minute resolution,
        # the other for the list of known bots.
        action = 'bot_inactive' if quarantined else 'bot_active'
        stats.add_entry(action=action,
                        bot_id=res.bot_id,
                        dimensions=res.dimensions)

        def bot_event(event_type, task_id=None, task_name=None):
            """Forwards to bot_management.bot_event() for this poll.

            Only the event type and the optional task details vary per call;
            every other argument is captured from the enclosing request.
            """
            bot_management.bot_event(
                event_type=event_type,
                bot_id=res.bot_id,
                external_ip=self.request.remote_addr,
                authenticated_as=auth.get_peer_identity().to_bytes(),
                dimensions=res.dimensions,
                state=res.state,
                version=res.version,
                quarantined=quarantined,
                task_id=task_id,
                task_name=task_name,
                message=res.quarantined_msg)

        # Bot version is host-specific because the host URL is embedded in
        # swarming_bot.zip
        expected_version = bot_code.get_bot_version(self.request.host_url)
        # The version check comes before the quarantine check, so a quarantined
        # bot running the wrong version is still told to update.
        if res.version != expected_version:
            bot_event('request_update')
            self._cmd_update(expected_version)
            return
        if quarantined:
            bot_event('request_sleep')
            self._cmd_sleep(sleep_streak, quarantined)
            return

        # If the server-side per-bot config for the bot has changed, we need
        # to restart this particular bot, so it picks up new config in /handshake.
        # Do this check only for bots that know about server-side per-bot configs
        # already (such bots send 'bot_group_cfg_version' state attribute).
        cur_bot_cfg_ver = res.state.get('bot_group_cfg_version')
        if cur_bot_cfg_ver and cur_bot_cfg_ver != res.bot_group_cfg.version:
            bot_event('request_restart')
            self._cmd_restart('Restarting to pick up new bots.cfg config')
            return

        #
        # At that point, the bot should be in relatively good shape since it's
        # running the right version. It is still possible that invalid code was
        # pushed to the server, so be diligent about it.
        #

        # Bot may need a reboot if it is running for too long. We do not reboot
        # quarantined bots (they already returned above).
        needs_restart, restart_message = bot_management.should_restart_bot(
            res.bot_id, res.state)
        if needs_restart:
            bot_event('request_restart')
            self._cmd_restart(restart_message)
            return

        # The bot is in good shape. Try to grab a task.
        try:
            # This is a fairly complex function call, exceptions are expected.
            # 'lease_expiration_ts' is passed as None when the state lacks it.
            request, run_result = task_scheduler.bot_reap_task(
                res.dimensions, res.bot_id, res.version,
                res.state.get('lease_expiration_ts'))
            if not request:
                # No task found, tell it to sleep a bit. 'quarantined' is always
                # False here because quarantined bots returned earlier.
                bot_event('request_sleep')
                self._cmd_sleep(sleep_streak, quarantined)
                return

            try:
                # This part is tricky since it intentionally runs a transaction after
                # another one.
                if request.properties.is_terminate:
                    bot_event('bot_terminate', task_id=run_result.task_id)
                    self._cmd_terminate(run_result.task_id)
                else:
                    bot_event('request_task',
                              task_id=run_result.task_id,
                              task_name=request.name)
                    self._cmd_run(request, run_result.key, res.bot_id)
            except:
                # Deliberately bare: log whatever went wrong after a task was
                # reaped, then re-raise it unchanged so the outer handler (or the
                # caller) still sees the original exception.
                logging.exception('Dang, exception after reaping')
                raise
        except runtime.DeadlineExceededError:
            # If the timeout happened before a task was assigned, there is no
            # problem. If the timeout occurred after a task was assigned, that
            # task will time out (BOT_DIED), since the bot didn't get the details
            # required to run it, and it will automatically get retried (TODO)
            # when the task times out.
            # TODO(maruel): Note the task if possible and hand it out on next poll.
            # https://code.google.com/p/swarming/issues/detail?id=130
            self.abort(500, 'Deadline')