Example #1
0
    def terminate(self, request):
        """Asks a bot to terminate itself gracefully.

        The bot stays in the DB; use 'delete' to remove it from the DB
        afterward. The response carries a pseudo-taskid that can be waited on
        to learn when the bot has turned down.

        This command is particularly useful when a privileged user needs to
        safely debug a machine specific issue. The user can trigger a
        terminate for one of the bots exhibiting the issue, wait for the
        pseudo-task to run, then access the machine with the guarantee that
        the bot is not running anymore.

        Args:
          request: incoming message; only its `bot_id` field is read here.

        Returns:
          A swarming_rpcs.TerminateResponse whose `task_id` is the packed key
          of the result summary of the scheduled termination task.

        Raises:
          endpoints.BadRequestException: when building the termination task
            fails with BadValueError, TypeError or ValueError.
        """
        # TODO(maruel): Disallow a terminate task when there's one currently
        # pending or if the bot is considered 'dead', e.g. no contact since 10
        # minutes.
        logging.debug('%s', request)
        target_bot = unicode(request.bot_id)
        # Raises 404 if there is no such bot.
        get_or_raise(bot_management.get_info_key(target_bot))
        try:
            # Craft a special priority 0 task to tell the bot to shutdown.
            termination_request = task_request.create_termination_task(
                target_bot, wait_for_capacity=True)
        except (datastore_errors.BadValueError, TypeError, ValueError) as err:
            raise endpoints.BadRequestException(err.message)

        summary = task_scheduler.schedule_request(
            termination_request, secret_bytes=None)
        packed_task_id = task_pack.pack_result_summary_key(summary.key)
        return swarming_rpcs.TerminateResponse(task_id=packed_task_id)
Example #2
0
def handle_early_release(machine_lease):
    """Schedules a termination task for a lease that is about to expire.

    Nothing is done unless the lease expiration falls within
    `early_release_secs` seconds of the current time. Otherwise a termination
    task is scheduled for the leased hostname and associated with the lease.

    Args:
      machine_lease: MachineLease instance.
    """
    release_window = datetime.timedelta(
        seconds=machine_lease.early_release_secs)
    if machine_lease.lease_expiration_ts > utils.utcnow() + release_window:
        # Still outside the early release window.
        return

    logging.info('MachineLease ready to be released: %s', machine_lease.key)
    termination_request = task_request.create_termination_task(
        machine_lease.hostname, True)
    summary = task_scheduler.schedule_request(
        termination_request,
        check_acls=False,
    )
    associate_termination_task(
        machine_lease.key, machine_lease.hostname, summary.task_id)
Example #3
0
def handle_early_release(machine_lease):
    """Schedules a termination task for a drained or nearly expired lease.

    The lease must not already have a termination task. A termination task is
    scheduled when the lease is marked drained, or when the current time has
    reached the lease expiration minus `early_release_secs` seconds.

    Args:
      machine_lease: MachineLease instance.
    """
    assert not machine_lease.termination_task, machine_lease.termination_task

    release_window = datetime.timedelta(
        seconds=machine_lease.early_release_secs)
    early_expiration_ts = machine_lease.lease_expiration_ts - release_window

    # The clock is only consulted when the lease is not drained.
    if not machine_lease.drained and utils.utcnow() < early_expiration_ts:
        return

    logging.info(
        'MachineLease ready to be released:\nKey: %s\nHostname: %s',
        machine_lease.key,
        machine_lease.hostname,
    )
    termination_request = task_request.create_termination_task(
        machine_lease.hostname)
    summary = task_scheduler.schedule_request(termination_request, None)
    associate_termination_task(
        machine_lease.key, machine_lease.hostname, summary.task_id)
Example #4
0
 def test_create_termination_task(self):
   """The first slice of a termination task is flagged as a terminate."""
   termination = task_request.create_termination_task(
       u'some-bot', wait_for_capacity=True)
   first_slice_properties = termination.task_slice(0).properties
   self.assertTrue(first_slice_properties.is_terminate)
Example #5
0
def check_for_connection(machine_lease):
    """Looks for a bot_connected event and records the connection time.

    If a bot_connected event is found that is not older than the connection
    instruction, its timestamp is associated with the lease and reported as a
    metric. Otherwise, when the bot's BotInfo is missing or the bot is dead, a
    termination task is scheduled and the lease is released.

    Args:
      machine_lease: MachineLease instance.
    """
    assert machine_lease.instruction_ts

    # Technically this query is wrong: it walks events in reverse chronological
    # order, so the connection found is the most recent one while the earliest
    # is wanted. However, this function is only called for new bots and is no
    # longer called once the connection time is recorded, so the recorded time
    # should end up being the first connection anyway. Iterating in the correct
    # order would require building a new, large index.
    for event in bot_management.get_events_query(machine_lease.bot_id, True):
        # Ignore anything from before the connection instruction was sent: a
        # bot_connected event that old (e.g. in the event of hostname reuse)
        # does not belong to this lease.
        if event.ts < machine_lease.instruction_ts:
            break
        if event.event_type != 'bot_connected':
            continue
        logging.info(
            'Bot connected:\nKey: %s\nHostname: %s\nTime: %s',
            machine_lease.key,
            machine_lease.hostname,
            event.ts,
        )
        associate_connection_ts(machine_lease.key, event.ts)
        time_to_connect = event.ts - machine_lease.instruction_ts
        ts_mon_metrics.on_machine_connected_time(
            time_to_connect.total_seconds(),
            fields={'machine_type': machine_lease.machine_type.id()},
        )
        return

    # The bot hasn't connected yet. If it's dead or missing, release the lease.
    # The connection instruction has already been sent, so the bot could still
    # connect after the lease is released but before Machine Provider actually
    # deletes the bot. A termination task is therefore scheduled alongside the
    # release, so that a late-connecting bot just shuts itself down.
    bot_info = bot_management.get_info_key(machine_lease.hostname).get()
    bot_missing = not bot_info
    if bot_missing:
        logging.error(
            'BotInfo missing:\nKey: %s\nHostname: %s',
            machine_lease.key,
            machine_lease.hostname,
        )
    elif bot_info.is_dead:
        logging.warning(
            'Bot failed to connect in time:\nKey: %s\nHostname: %s',
            machine_lease.key,
            machine_lease.hostname,
        )
    else:
        # The bot exists and is alive; keep waiting for it to connect.
        return

    termination_request = task_request.create_termination_task(
        machine_lease.hostname)
    task_scheduler.schedule_request(termination_request, None)
    if not release(machine_lease):
        return
    if bot_missing:
        # There is no BotInfo, so only the lease request is cleared.
        clear_lease_request(machine_lease.key, machine_lease.client_request_id)
    else:
        cleanup_bot(machine_lease)
Example #6
0
 def test_create_termination_task(self):
   """The properties of a termination task are flagged as a terminate."""
   termination = task_request.create_termination_task(u'some-bot', True)
   termination_properties = termination.properties
   self.assertTrue(termination_properties.is_terminate)