    def test_or_dimensions_new_tasks(self):
        # Bots are already registered, then new tasks show up
        self.mock_now(datetime.datetime(2020, 1, 2, 3, 4, 5))
        self.assertEqual(
            0,
            _assert_bot(bot_id=u'bot1',
                        dimensions={
                            u'os': [u'v1', u'v2'],
                            u'gpu': [u'nv'],
                        }))
        self.assertEqual(
            0,
            _assert_bot(bot_id=u'bot2',
                        dimensions={
                            u'os': [u'v2'],
                            u'gpu': [u'amd'],
                        }))

        payloads = self._mock_enqueue_task_async_for_rebuild_task_cache()

        request1 = _gen_request(properties=_gen_properties(
            dimensions={
                u'pool': [u'default'],
                u'os': [u'v1|v2'],
                u'gpu': [u'nv|amd'],
            }))
        task_queues.assert_task_async(request1).get_result()
        self.assertEqual(1, len(payloads))
        f = task_queues.rebuild_task_cache_async(payloads[-1])
        self.assertEqual(True, f.get_result())
        payloads.pop()

        # Both bots should be able to handle |request1|
        self.assert_count(2, task_queues.BotDimensions)
        self.assert_count(2, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        self.assertEqual(4, len(task_queues.TaskDimensions.query().get().sets))
        bot1_root_key = bot_management.get_root_key(u'bot1')
        bot2_root_key = bot_management.get_root_key(u'bot2')
        self.assertEqual(1, len(task_queues.get_queues(bot1_root_key)))
        self.assertEqual(1, len(task_queues.get_queues(bot2_root_key)))

        request2 = _gen_request(properties=_gen_properties(
            dimensions={
                u'pool': [u'default'],
                u'os': [u'v1'],
                u'gpu': [u'nv|amd'],
            }))
        task_queues.assert_task_async(request2).get_result()
        self.assertEqual(1, len(payloads))
        f = task_queues.rebuild_task_cache_async(payloads[-1])
        self.assertEqual(True, f.get_result())
        payloads.pop()

        # Only bot1 can handle |request2|
        self.assert_count(3, task_queues.BotTaskDimensions)
        self.assert_count(2, task_queues.TaskDimensions)
        self.assertEqual(2, len(task_queues.get_queues(bot1_root_key)))
        self.assertEqual(1, len(task_queues.get_queues(bot2_root_key)))

    def test_assert_bot_dimensions_changed(self):
        # Ensure that stale BotTaskDimensions are deleted when the bot
        # dimensions change.
        now = datetime.datetime(2010, 1, 2, 3, 4, 5)
        self.mock_now(now)
        request = self._assert_task()
        exp = (request.expiration_ts - request.created_ts +
               task_queues._EXTEND_VALIDITY)
        self.assertEqual(1, _assert_bot())
        # One hour later, the bot changes dimensions.
        self.mock_now(now, task_queues._EXTEND_VALIDITY.total_seconds())
        self.assertEqual(1, _assert_bot(dimensions={u'gpu': u'Matrox'}))
        self.assert_count(1, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        bot_root_key = bot_management.get_root_key(u'bot1')
        self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

        # One second before expiration.
        self.mock_now(now, exp.total_seconds())
        self.assertEqual(None, _assert_bot(dimensions={u'gpu': u'Matrox'}))
        self.assert_count(1, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

        # TaskDimension expired. The fact that the bot changed dimensions after an
        # hour didn't impact BotTaskDimensions expiration.
        self.mock_now(now, exp.total_seconds() + 1)
        self.assertEqual(0, _assert_bot())
        self.assert_count(1, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        self.assertEqual([], task_queues.get_queues(bot_root_key))
Example 3
def cleanup_bot(machine_lease):
    """Cleans up entities after a bot is removed."""
    bot_root_key = bot_management.get_root_key(machine_lease.hostname)
    # The bot is being removed, remove it from the task queues.
    task_queues.cleanup_after_bot(bot_root_key)
    bot_management.get_info_key(machine_lease.hostname).delete()
    clear_lease_request(machine_lease.key, machine_lease.client_request_id)
Example 4
  def test_cron_tidy_stale(self):
    now = datetime.datetime(2010, 1, 2, 3, 4, 5)
    self.mock_now(now)
    self.assertEqual(0, _assert_bot())
    request = self._assert_task()
    exp = (request.expiration_ts - request.created_ts) + task_queues._ADVANCE
    self.assert_count(1, task_queues.BotTaskDimensions)
    self.assert_count(1, task_queues.TaskDimensions)
    bot_root_key = bot_management.get_root_key(u'bot1')
    self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

    # No-op.
    task_queues.cron_tidy_stale()
    self.assert_count(1, task_queues.BotTaskDimensions)
    self.assert_count(1, task_queues.TaskDimensions)
    self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

    # One second before expiration.
    self.mock_now(now, exp.total_seconds())
    task_queues.cron_tidy_stale()
    self.assert_count(1, task_queues.BotTaskDimensions)
    self.assert_count(1, task_queues.TaskDimensions)
    self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

    # TaskDimension expired.
    self.mock_now(now, exp.total_seconds() + 1)
    task_queues.cron_tidy_stale()
    self.assert_count(0, task_queues.BotTaskDimensions)
    self.assert_count(0, task_queues.TaskDimensions)
    self.assertEqual([], task_queues.get_queues(bot_root_key))

    def test_assert_task_async_call_rebuild_task_cache_async(self):
        self.assertEqual(0, _assert_bot())
        dimensions = {
            u'id': [u'bot1'],
        }
        self.mock_now(datetime.datetime(2020, 1, 2, 3, 4, 5))
        request1 = _gen_request(properties=_gen_properties(
            dimensions=dimensions))
        task_queues.assert_task_async(request1).get_result()
        self.assert_count(1, task_queues.BotDimensions)
        self.assert_count(1, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        bot_root_key = bot_management.get_root_key('bot1')
        self.assertEqual(1, len(task_queues.get_queues(bot_root_key)))

        # expire BotTaskDimensions by changing time.
        memcache.flush_all()
        bot_task_dimensions = task_queues.BotTaskDimensions.query(
            ancestor=bot_root_key).fetch()[0]
        self.mock_now(bot_task_dimensions.valid_until_ts +
                      datetime.timedelta(seconds=1))
        self.assertEqual(0, len(task_queues.get_queues(bot_root_key)))

        # request a task with the same dimensions.
        memcache.flush_all()
        request2 = _gen_request(properties=_gen_properties(
            dimensions=dimensions))
        task_queues.assert_task_async(request2).get_result()
        self.assert_count(1, task_queues.BotDimensions)
        self.assert_count(1, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        self.assertEqual(1, len(task_queues.get_queues(bot_root_key)))

    def test_assert_task_async_then_bot(self):
        self._assert_task()
        self.assertEqual(1, _assert_bot())
        self.assert_count(1, task_queues.BotDimensions)
        self.assert_count(1, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        bot_root_key = bot_management.get_root_key(u'bot1')
        self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

    def test_cleanup_after_bot(self):
        self.assertEqual(0, _assert_bot())
        self._assert_task()
        task_queues.cleanup_after_bot(bot_management.get_root_key('bot1'))
        # BotInfo is deleted separately.
        self.assert_count(1, bot_management.BotInfo)
        self.assert_count(0, task_queues.BotDimensions)
        self.assert_count(0, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
Example 8
    def Events(self, request, context):
        logging.debug('%s', request)
        try:
            if not request.bot_id:
                # TODO(maruel): Allow not specifying one. Or specifying a pool.
                raise ValueError('specify bot_id')

            # Transparently limit to 1000, default to 200.
            page_size = request.page_size or 200
            if page_size > 1000:
                page_size = 1000
            if page_size < 0:
                raise ValueError('page_size must be positive')

            start = None
            end = None
            if request.HasField('start_time'):
                start = request.start_time.ToDatetime()
            if request.HasField('end_time'):
                end = request.end_time.ToDatetime()
            if (start and end) and start >= end:
                raise ValueError('start_time must be before end_time')

            # The BotEvent key is already in the right chronological order, but
            # querying per BotEvent.ts *requires* ordering per BotEvent.ts.
            order = not (start or end)
            q = bot_management.get_events_query(request.bot_id, order)
            if not order:
                q = q.order(-bot_management.BotEvent.ts,
                            bot_management.BotEvent.key)
            if start:
                q = q.filter(bot_management.BotEvent.ts >= start)
            if end:
                q = q.filter(bot_management.BotEvent.ts < end)

            items, cursor = datastore_utils.fetch_page(q, page_size,
                                                       request.page_token)
            if not items:
                # Check if the bot exists; if not, return a 404. We check
                # BotRoot, not BotInfo, so that even deleted bots can be
                # queried. See bot_management for more information.
                if not bot_management.get_root_key(request.bot_id).get():
                    context.set_code(StatusCode.NOT_FOUND)
                    context.set_details('Bot does not exist')
                    return None
        except ValueError as e:
            context.set_code(StatusCode.INVALID_ARGUMENT)
            context.set_details(str(e))
            return None
        logging.info('Returning %d events', len(items))
        out = swarming_pb2.BotEventsResponse(next_page_token=cursor)
        for r in items:
            i = out.events.add()
            r.to_proto(i)
        return out
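
# A hedged client-side sketch of paging through Events(). The request type
# name (BotEventsRequest), the stub and process() are assumptions; the fields
# (bot_id, page_size, page_token, next_page_token) are the ones the handler
# above actually reads and writes.
def fetch_all_events(stub, bot_id):
    req = swarming_pb2.BotEventsRequest(bot_id=bot_id, page_size=200)
    while True:
        resp = stub.Events(req)
        for event in resp.events:
            process(event)  # hypothetical consumer
        if not resp.next_page_token:
            break
        req.page_token = resp.next_page_token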
Example 9
def _yield_next_available_task_to_dispatch(bot_dimensions):
  bot_id = bot_dimensions[u'id'][0]
  bot_management.bot_event(
      'bot_connected', bot_id, '1.2.3.4', 'joe@localhost',
      bot_dimensions, {'state': 'real'}, '1234', False, None, None, None)
  bot_root_key = bot_management.get_root_key(bot_id)
  task_queues.assert_bot_async(bot_root_key, bot_dimensions).get_result()
  return [
    to_run.to_dict()
    for _request, to_run in
    task_to_run.yield_next_available_task_to_dispatch(bot_dimensions)
  ]
Example 10
def _assert_bot(bot_id=u'bot1', dimensions=None):
  bot_dimensions = {
    u'cpu': [u'x86-64', u'x64'],
    u'id': [bot_id],
    u'os': [u'Ubuntu-16.04', u'Ubuntu'],
    u'pool': [u'default'],
  }
  bot_dimensions.update(dimensions or {})
  bot_management.bot_event(
      'bot_connected', bot_id, '1.2.3.4', bot_id, bot_dimensions, {},
      '1234', False, None, None, None)
  bot_root_key = bot_management.get_root_key(bot_id)
  return task_queues.assert_bot_async(bot_root_key, bot_dimensions).get_result()

    def test_or_dimensions_same_hash(self):
        self.mock_now(datetime.datetime(2020, 1, 2, 3, 4, 5))
        self.assertEqual(
            0, _assert_bot(bot_id=u'bot1', dimensions={u'os': [u'v1']}))
        self.assertEqual(
            0, _assert_bot(bot_id=u'bot2', dimensions={u'os': [u'v2']}))
        self.assertEqual(
            0, _assert_bot(bot_id=u'bot3', dimensions={u'os': [u'v3']}))

        payloads = self._mock_enqueue_task_async_for_rebuild_task_cache()
        # Both requests should have the same dimension_hash
        request1 = _gen_request(properties=_gen_properties(dimensions={
            u'pool': [u'default'],
            u'os': [u'v1|v2|v3'],
        }))
        request2 = _gen_request(properties=_gen_properties(dimensions={
            u'pool': [u'default'],
            u'os': [u'v3|v2|v1'],
        }))
        task_queues.assert_task_async(request1).get_result()
        task_queues.assert_task_async(request2).get_result()
        self.assertEqual(2, len(payloads))
        while payloads:
            f = task_queues.rebuild_task_cache_async(payloads[-1])
            self.assertEqual(True, f.get_result())
            payloads.pop()

        self.assert_count(3, task_queues.BotDimensions)
        self.assert_count(3, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        self.assertEqual(3, len(task_queues.TaskDimensions.query().get().sets))
        bot1_root_key = bot_management.get_root_key(u'bot1')
        bot2_root_key = bot_management.get_root_key(u'bot2')
        bot3_root_key = bot_management.get_root_key(u'bot3')
        self.assertEqual(1, len(task_queues.get_queues(bot1_root_key)))
        self.assertEqual(1, len(task_queues.get_queues(bot2_root_key)))
        self.assertEqual(1, len(task_queues.get_queues(bot3_root_key)))

    def test_probably_has_capacity_get_queues(self):
        d = {u'pool': [u'default'], u'os': [u'Ubuntu-16.04']}
        # Capacity is registered only once there's a request enqueued and
        # get_queues() is called.
        _assert_bot()
        request = _gen_request(properties=_gen_properties(dimensions=d))
        task_queues.assert_task_async(request).get_result()
        self.assertEqual(1, self.execute_tasks())
        self.assertEqual(None, task_queues.probably_has_capacity(d))

        # It only gets set once get_queues() is called.
        bot_root_key = bot_management.get_root_key(u'bot1')
        task_queues.get_queues(bot_root_key)
        self.assertEqual(True, task_queues.probably_has_capacity(d))
        self.assertEqual([1843498234],
                         memcache.get('bot1', namespace='task_queues'))
Example 13
def cleanup_after_bot(bot_id):
    """Removes all BotDimensions and BotTaskDimensions for this bot.

  Do not clean up TaskDimensions. There could be pending tasks, and a bot with
  the same ID could come up afterward (unlikely in practice, but possible). In
  that case, if TaskDimensions were deleted, the pending task would not be run
  correctly when the bot comes back online, as assert_bot_async() would fail to
  create the corresponding BotTaskDimensions.
  """
    bot_root_key = bot_management.get_root_key(bot_id)

    q = BotTaskDimensions.query(ancestor=bot_root_key).iter(keys_only=True)
    futures = ndb.delete_multi_async(q)
    futures.append(
        ndb.Key(BotDimensions, 1, parent=bot_root_key).delete_async())

    for f in futures:
        f.check_success()
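
# A minimal usage sketch for this revision, which takes the bot id (other
# snippets on this page pass a root key instead). BotInfo is deleted
# separately by the caller, as cleanup_bot() in Example 3 does.
cleanup_after_bot(u'bot1')
bot_management.get_info_key(u'bot1').delete()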
Example 14
@ndb.tasklet
def assert_bot_async(bot_dimensions):
    """Prepares BotTaskDimensions entities as needed.

  Coupled with assert_task(), enables get_queues() to work by knowing which
  TaskDimensions applies to this bot.

  Returns:
    Number of matches or None if hit the cache, thus nothing was updated.
  """
    assert len(bot_dimensions[u'id']) == 1, bot_dimensions
    # Check if the bot dimensions changed since last _rebuild_bot_cache_async()
    # call.
    bot_root_key = bot_management.get_root_key(bot_dimensions[u'id'][0])
    obj = yield ndb.Key(BotDimensions, 1, parent=bot_root_key).get_async()
    if obj and obj.dimensions_flat == _flatten_bot_dimensions(bot_dimensions):
        # Cache hit, no need to look further.
        raise ndb.Return(None)

    matches = yield _rebuild_bot_cache_async(bot_dimensions, bot_root_key)
    raise ndb.Return(matches)
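
# A minimal caller sketch: None means the bot dimensions were unchanged (cache
# hit), otherwise the number of matches is returned. The dict shape for
# bot_dimensions mirrors _assert_bot() in Example 10.
def _log_assert_bot(bot_dimensions):
    matches = assert_bot_async(bot_dimensions).get_result()
    if matches is None:
        logging.debug('bot dimensions unchanged; nothing rebuilt')
    else:
        logging.info('rebuilt bot cache; %d matching task queues', matches)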

    def test_cron_tidy_stale(self):
        now = datetime.datetime(2010, 1, 2, 3, 4, 5)
        self.mock_now(now)
        self.assertEqual(0, _assert_bot())
        request = self._assert_task()
        exp = (request.expiration_ts - request.created_ts +
               task_queues._EXTEND_VALIDITY)
        self.assert_count(1, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        bot_root_key = bot_management.get_root_key(u'bot1')
        self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

        # No-op.
        task_queues.cron_tidy_stale()
        self.assert_count(1, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

        # TaskDimension expired but is still kept; get_queues() doesn't return it
        # anymore even if still in the DB. BotTaskDimensions was evicted.
        self.mock_now(now, exp.total_seconds() + 1)
        task_queues.cron_tidy_stale()
        self.assert_count(0, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        self.assertEqual([], task_queues.get_queues(bot_root_key))

        # Just before _KEEP_DEAD.
        self.mock_now(now, (exp + task_queues._KEEP_DEAD).total_seconds())
        task_queues.cron_tidy_stale()
        self.assert_count(0, task_queues.BotTaskDimensions)
        self.assert_count(1, task_queues.TaskDimensions)
        self.assertEqual([], task_queues.get_queues(bot_root_key))

        # TaskDimension expired, after KEEP_DEAD.
        self.mock_now(now, (exp + task_queues._KEEP_DEAD).total_seconds() + 1)
        task_queues.cron_tidy_stale()
        self.assert_count(0, task_queues.BotTaskDimensions)
        self.assert_count(0, task_queues.TaskDimensions)
        self.assertEqual([], task_queues.get_queues(bot_root_key))
Example 16
def get_queues(bot_id):
    """Returns the dimension hashes of the task queues this bot can poll from.

  The result is a sorted list of integers, cached in memcache.
  """
    assert isinstance(bot_id, unicode), repr(bot_id)
    data = memcache.get(bot_id, namespace='task_queues')
    if data is not None:
        logging.debug('get_queues(%s): can run from %d queues (memcache)\n%s',
                      bot_id, len(data), data)
        return data

    # Retrieve all the dimensions_hash that this bot could run and that have
    # actually been triggered in the past. Since this is under a root entity,
    # this should be fast.
    now = utils.utcnow()
    bot_root_key = bot_management.get_root_key(bot_id)
    data = sorted(obj.key.integer_id()
                  for obj in BotTaskDimensions.query(ancestor=bot_root_key)
                  if obj.valid_until_ts >= now)
    memcache.set(bot_id, data, namespace='task_queues')
    logging.info('get_queues(%s): Query in %.3fs: can run from %d queues\n%s',
                 bot_id, (utils.utcnow() - now).total_seconds(), len(data),
                 data)
    return data
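
# A short sketch of the memcache interplay, mirroring
# test_probably_has_capacity_get_queues above: the first call runs the
# datastore query and primes memcache; flushing forces a fresh query.
hashes = get_queues(u'bot1')
assert hashes == memcache.get('bot1', namespace='task_queues')
memcache.flush_all()
hashes = get_queues(u'bot1')  # hits the datastore again, then re-caches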

  def test_get_root_key(self):
    self.assertEqual(
        ndb.Key(bot_management.BotRoot, 'foo'),
        bot_management.get_root_key('foo'))
Example 18
    def _process(self):
        """Fetches bot info and settings, does authorization and quarantine checks.

    Returns:
      _ProcessResult instance, see its fields for more info.

    Raises:
      auth.AuthorizationError if bot's credentials are invalid.
    """
        request = self.parse_body()
        version = request.get('version', None)

        dimensions = request.get('dimensions') or {}
        state = request.get('state') or {}
        bot_id = None
        if dimensions.get('id'):
            dimension_id = dimensions['id']
            if (isinstance(dimension_id, list) and len(dimension_id) == 1
                    and isinstance(dimension_id[0], unicode)):
                bot_id = dimensions['id'][0]

        bot_settings = None
        if bot_id:
            logging.debug('Fetching bot settings for bot id: %s', bot_id)
            bot_settings = bot_management.get_settings_key(bot_id).get()

        # Make sure bot self-reported ID matches the authentication token. Raises
        # auth.AuthorizationError if not.
        bot_group_cfg = bot_auth.validate_bot_id_and_fetch_config(bot_id)

        # The server side dimensions from bot_group_cfg override bot-provided
        # ones. If both the server side config and the bot report some
        # dimension, the server side config wins. We still emit a warning if
        # the bot tries to supply the dimension and it disagrees with the
        # server defined one. Note that this may happen on the first poll
        # after the server side config for a bot has changed; the bot doesn't
        # know about new server-assigned dimensions yet in this case. Also
        # don't warn about ['default'], since the bot sends it in the
        # handshake before it knows anything at all.
        for dim_key, from_cfg in bot_group_cfg.dimensions.iteritems():
            from_bot = sorted(dimensions.get(dim_key) or [])
            from_cfg = sorted(from_cfg)
            if from_bot and from_bot != ['default'] and from_bot != from_cfg:
                logging.warning(
                    'Dimensions in bots.cfg don\'t match ones provided by the bot\n'
                    'bot_id: "%s", key: "%s", from_bot: %s, from_cfg: %s',
                    bot_id, dim_key, from_bot, from_cfg)
            dimensions[dim_key] = from_cfg

        # Fill in all result fields except 'quarantined_msg'.
        result = _ProcessResult(request=request,
                                bot_id=bot_id,
                                version=version,
                                state=state,
                                dimensions=dimensions,
                                bot_group_cfg=bot_group_cfg,
                                maintenance_msg=state.get('maintenance'))

        # The bot may decide to "self-quarantine" itself. Accept both via
        # dimensions or via state. See bot_management._BotCommon.quarantined for
        # more details.
        if (bool(dimensions.get('quarantined'))
                or bool(state.get('quarantined'))):
            result.quarantined_msg = 'Bot self-quarantined'
            return result

        quarantined_msg = None
        # Use a dummy 'for' to be able to break early from the block.
        for _ in [0]:

            quarantined_msg = has_unexpected_keys(self.EXPECTED_KEYS, request,
                                                  'keys')
            if quarantined_msg:
                break

            quarantined_msg = has_missing_keys(self.REQUIRED_STATE_KEYS, state,
                                               'state')
            if quarantined_msg:
                break

            if not bot_id:
                quarantined_msg = 'Missing bot id'
                break
            if not dimensions.get('pool'):
                quarantined_msg = 'Missing \'pool\' dimension'
                break

            for key, values in sorted(dimensions.items()):
                if not config.validate_dimension_key(key):
                    quarantined_msg = "Invalid dimension key: %r" % key
                    break
                if not isinstance(values, list):
                    quarantined_msg = "Key %s has non-list value: %s" % (
                        key, values)
                    break
                if len(values) != len(set(values)):
                    quarantined_msg = "Key %s has duplicate values: %s" % (
                        key, values)
                    break
                for value in sorted(values):
                    if not config.validate_dimension_value(value):
                        quarantined_msg = "Key %s has invalid value: %r" % (
                            key, value)
                        break
                if quarantined_msg:
                    break

        if quarantined_msg:
            line = 'Quarantined Bot\nhttps://%s/restricted/bot/%s\n%s' % (
                app_identity.get_default_version_hostname(), bot_id,
                quarantined_msg)
            ereporter2.log_request(self.request, source='bot', message=line)
            result.quarantined_msg = quarantined_msg
            return result

        # Look for admin enforced quarantine.
        if bool(bot_settings and bot_settings.quarantined):
            result.quarantined_msg = 'Quarantined by admin'
            return result

        # TODO(maruel): Parallelise.
        bot_root_key = bot_management.get_root_key(bot_id)
        task_queues.assert_bot_async(bot_root_key, dimensions).get_result()
        return result
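
# A toy illustration of the dimension-override rule implemented in _process()
# above, with assumed example values: the server-side config always wins over
# the bot-reported dimensions.
dimensions = {'pool': ['bot-reported']}
cfg = {'pool': ['server-defined']}
for dim_key, from_cfg in cfg.items():
    dimensions[dim_key] = sorted(from_cfg)
assert dimensions == {'pool': ['server-defined']}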

    def test_rebuild_task_cache_async(self):
        # Assert that expiration works.
        now = datetime.datetime(2010, 1, 2, 3, 4, 5)
        self.mock_now(now)

        # We want _yield_BotTaskDimensions_keys() to return multiple
        # BotTaskDimensions ndb.Key to confirm that the inner loops work. This
        # requires a few bots.
        _assert_bot(bot_id=u'bot1')
        _assert_bot(bot_id=u'bot2')
        _assert_bot(bot_id=u'bot3')
        bot_root_key = bot_management.get_root_key(u'bot1')
        self.assertEqual(0, task_queues.BotTaskDimensions.query().count())
        self.assertEqual(0, task_queues.TaskDimensions.query().count())

        # Intentionally force the code to throttle the number of concurrent
        # RPCs, otherwise the inner loops wouldn't be reached with fewer than
        # 50 bots, and testing with 50 bots would make the unit test slow.
        self.mock(task_queues, '_CAP_FUTURES_LIMIT', 1)

        payloads = self._mock_enqueue_task_async_for_rebuild_task_cache()

        # The equivalent of self._assert_task(tasks=1) except that we snapshot the
        # payload.
        # Trigger multiple task queues to go deeper in the code.
        request_1 = _gen_request(properties=_gen_properties(dimensions={
            u'cpu': [u'x86-64'],
            u'pool': [u'default'],
        }))
        task_queues.assert_task_async(request_1).get_result()
        self.assertEqual(1, len(payloads))
        f = task_queues.rebuild_task_cache_async(payloads[-1])
        self.assertEqual(True, f.get_result())
        self.assertEqual(3, task_queues.BotTaskDimensions.query().count())
        self.assertEqual(1, task_queues.TaskDimensions.query().count())
        self.assertEqual(60, request_1.expiration_secs)
        expected = now + task_queues._EXTEND_VALIDITY + datetime.timedelta(
            seconds=request_1.expiration_secs)
        self.assertEqual(
            expected,
            task_queues.TaskDimensions.query().get().valid_until_ts)

        request_2 = _gen_request(properties=_gen_properties(
            dimensions={
                u'os': [u'Ubuntu-16.04'],
                u'pool': [u'default'],
            }))
        task_queues.assert_task_async(request_2).get_result()
        self.assertEqual(2, len(payloads))
        f = task_queues.rebuild_task_cache_async(payloads[-1])
        self.assertEqual(True, f.get_result())
        self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
        self.assertEqual(2, task_queues.TaskDimensions.query().count())
        self.assertEqual([227177418, 1843498234],
                         task_queues.get_queues(bot_root_key))
        memcache.flush_all()
        self.assertEqual([227177418, 1843498234],
                         task_queues.get_queues(bot_root_key))

        # Now expire the two TaskDimensions, one at a time, and rebuild the task
        # queue.
        offset = (task_queues._EXTEND_VALIDITY + datetime.timedelta(
            seconds=request_1.expiration_secs)).total_seconds() + 1
        self.mock_now(now, offset)
        f = task_queues.rebuild_task_cache_async(payloads[0])
        self.assertEqual(True, f.get_result())
        self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
        self.assertEqual(2, task_queues.TaskDimensions.query().count())
        self.assertEqual([227177418, 1843498234],
                         task_queues.get_queues(bot_root_key))
        # Observe the effect of memcache. See comment in get_queues().
        memcache.flush_all()
        self.assertEqual([], task_queues.get_queues(bot_root_key))

        # Re-running still does not delete TaskDimensions because they are
        # kept until _KEEP_DEAD.
        f = task_queues.rebuild_task_cache_async(payloads[1])
        self.assertEqual(True, f.get_result())
        self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
        self.assertEqual(2, task_queues.TaskDimensions.query().count())
        self.assertEqual([], task_queues.get_queues(bot_root_key))

        # Get past _KEEP_DEAD.
        offset = (task_queues._EXTEND_VALIDITY + task_queues._KEEP_DEAD +
                  datetime.timedelta(seconds=request_1.expiration_secs)
                  ).total_seconds() + 1
        self.mock_now(now, offset)
        self.assertEqual([], task_queues.get_queues(bot_root_key))
        f = task_queues.rebuild_task_cache_async(payloads[0])
        self.assertEqual(True, f.get_result())
        self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
        self.assertEqual(1, task_queues.TaskDimensions.query().count())
        self.assertEqual([], task_queues.get_queues(bot_root_key))
Example 20
    def post(self, task_id=None):
        # Unlike handshake and poll, we do not accept invalid keys here. This code
        # path is much more strict.
        request = self.parse_body()
        msg = log_unexpected_subset_keys(self.ACCEPTED_KEYS,
                                         self.REQUIRED_KEYS, request,
                                         self.request, 'bot', 'keys')
        if msg:
            self.abort_with_error(400, error=msg)

        bot_id = request['id']
        task_id = request['task_id']

        # Make sure bot self-reported ID matches the authentication token. Raises
        # auth.AuthorizationError if not.
        bot_auth.validate_bot_id_and_fetch_config(bot_id)

        bot_overhead = request.get('bot_overhead')
        cipd_pins = request.get('cipd_pins')
        cipd_stats = request.get('cipd_stats')
        cost_usd = request.get('cost_usd', 0)
        duration = request.get('duration')
        exit_code = request.get('exit_code')
        hard_timeout = request.get('hard_timeout')
        io_timeout = request.get('io_timeout')
        isolated_stats = request.get('isolated_stats')
        output = request.get('output')
        output_chunk_start = request.get('output_chunk_start')
        outputs_ref = request.get('outputs_ref')

        if (isolated_stats or cipd_stats) and bot_overhead is None:
            ereporter2.log_request(request=self.request,
                                   source='server',
                                   category='task_failure',
                                   message='Failed to update task: %s' %
                                   task_id)
            self.abort_with_error(
                400,
                error='isolated_stats and cipd_stats require bot_overhead to '
                'be set\nbot_overhead: %s\nisolated_stats: %s' %
                (bot_overhead, isolated_stats))

        run_result_key = task_pack.unpack_run_result_key(task_id)
        performance_stats = None
        if bot_overhead is not None:
            performance_stats = task_result.PerformanceStats(
                bot_overhead=bot_overhead)
            if isolated_stats:
                download = isolated_stats.get('download') or {}
                upload = isolated_stats.get('upload') or {}

                def unpack_base64(d, k):
                    x = d.get(k)
                    if x:
                        return base64.b64decode(x)

                performance_stats.isolated_download = task_result.OperationStats(
                    duration=download.get('duration'),
                    initial_number_items=download.get('initial_number_items'),
                    initial_size=download.get('initial_size'),
                    items_cold=unpack_base64(download, 'items_cold'),
                    items_hot=unpack_base64(download, 'items_hot'))
                performance_stats.isolated_upload = task_result.OperationStats(
                    duration=upload.get('duration'),
                    items_cold=unpack_base64(upload, 'items_cold'),
                    items_hot=unpack_base64(upload, 'items_hot'))
            if cipd_stats:
                performance_stats.package_installation = task_result.OperationStats(
                    duration=cipd_stats.get('duration'))

        if output is not None:
            try:
                output = base64.b64decode(output)
            except UnicodeEncodeError as e:
                logging.error('Failed to decode output\n%s\n%r', e, output)
                output = output.encode('ascii', 'replace')
            except TypeError as e:
                # Save the output as-is instead. The error will be logged in ereporter2
                # and returning a HTTP 500 would only force the bot to stay in a retry
                # loop.
                logging.error('Failed to decode output\n%s\n%r', e, output)
        if outputs_ref:
            outputs_ref = task_request.FilesRef(**outputs_ref)

        if cipd_pins:
            cipd_pins = task_result.CipdPins(
                client_package=task_request.CipdPackage(
                    **cipd_pins['client_package']),
                packages=[
                    task_request.CipdPackage(**args)
                    for args in cipd_pins['packages']
                ])

        # Tell the task queues management engine that the bot is still alive, and
        # it shall refresh the task queues.
        bot_root_key = bot_management.get_root_key(bot_id)
        task_queues.get_queues(bot_root_key)

        try:
            state = task_scheduler.bot_update_task(
                run_result_key=run_result_key,
                bot_id=bot_id,
                output=output,
                output_chunk_start=output_chunk_start,
                exit_code=exit_code,
                duration=duration,
                hard_timeout=hard_timeout,
                io_timeout=io_timeout,
                cost_usd=cost_usd,
                outputs_ref=outputs_ref,
                cipd_pins=cipd_pins,
                performance_stats=performance_stats)
            if not state:
                logging.info('Failed to update, please retry')
                self.abort_with_error(500,
                                      error='Failed to update, please retry')

            if state in (task_result.State.COMPLETED,
                         task_result.State.TIMED_OUT):
                action = 'task_completed'
            elif state == task_result.State.KILLED:
                action = 'task_killed'
            else:
                assert state in (task_result.State.BOT_DIED,
                                 task_result.State.RUNNING), state
                action = 'task_update'
            bot_management.bot_event(
                event_type=action,
                bot_id=bot_id,
                external_ip=self.request.remote_addr,
                authenticated_as=auth.get_peer_identity().to_bytes(),
                dimensions=None,
                state=None,
                version=None,
                quarantined=None,
                maintenance_msg=None,
                task_id=task_id,
                task_name=None)
        except ValueError as e:
            ereporter2.log_request(request=self.request,
                                   source='server',
                                   category='task_failure',
                                   message='Failed to update task: %s' % e)
            self.abort_with_error(400, error=str(e))
        except webob.exc.HTTPException:
            raise
        except Exception as e:
            logging.exception('Internal error: %s', e)
            self.abort_with_error(500, error=str(e))
        # - BOT_DIED will occur when the following conditions are true:
        #   - The bot polled correctly, but then stopped updating for at least
        #     task_result.BOT_PING_TOLERANCE. (It can occur if the host went to
        #     sleep, or the OS was overwhelmed).
        #   - /internal/cron/abort_bot_died runs, detects the bot is MIA, kills the
        #     task.
        #   - Bot wakes up, starts sending updates again.
        # - KILLED is when the client uses the kill API to forcibly stop a running
        #   task.
        must_stop = state in (task_result.State.BOT_DIED,
                              task_result.State.KILLED)
        if must_stop:
            logging.info('asking bot to kill the task')
        self.send_response({'must_stop': must_stop, 'ok': True})
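
# A hedged sketch of the bot-side counterpart (helper names are assumptions,
# not part of this module): the bot posts its task update and kills the
# running task when the server replies must_stop.
def _report_update_and_maybe_stop(params):
    resp = post_task_update(params)  # hypothetical HTTP helper
    if resp['must_stop']:
        kill_running_task()  # hypothetical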
Example 22
def _yield_potential_tasks(bot_id):
    """Queries all the known task queues in parallel and yields the task in order
  of priority.

  The ordering is opportunistic, not strict. There's a risk of not returning
  exactly in the priority order depending on index staleness and query execution
  latency. The number of queries is unbounded.

  Yields:
    TaskToRun entities, trying to yield the highest priority one first. To have
    finite execution time, starts yielding results once one of these conditions
    are met:
    - 1 second elapsed; in this case, continue iterating in the background
    - First page of every query returned
    - All queries exhausted
  """
    bot_root_key = bot_management.get_root_key(bot_id)
    potential_dimensions_hashes = task_queues.get_queues(bot_root_key)
    # Note that the default ndb.EVENTUAL_CONSISTENCY is used so stale items may be
    # returned. It's handled specifically by consumers of this function.
    start = time.time()
    queries = [_get_task_to_run_query(d) for d in potential_dimensions_hashes]
    yielders = [_yield_pages_async(q, 10) for q in queries]
    # We do care about the first page of each query, so we cannot merge all
    # the results of every query indiscriminately.
    futures = []

    try:
        for y in yielders:
            futures.append(next(y, None))

        while (time.time() - start) < 1 and not all(f.done()
                                                    for f in futures if f):
            r = ndb.eventloop.run0()
            if r is None:
                break
            time.sleep(r)
        logging.debug(
            '_yield_potential_tasks(%s): waited %.3fs for %d items from %d Futures',
            bot_id,
            time.time() - start,
            sum(len(f.get_result()) for f in futures if f.done()),
            len(futures))
        # items is a list of TaskToRun. The entities are needed because property
        # queue_number is used to sort according to each task's priority.
        items = []
        for i, f in enumerate(futures):
            if f and f.done():
                # The ndb.Future returns a list of up to 10 TaskToRun entities.
                r = f.get_result()
                if r:
                    # The ndb.Query asks for a valid queue_number, but under
                    # load the value may no longer be valid.
                    items.extend(i for i in r if i.queue_number)
                    # Prime the next page, in case.
                    futures[i] = next(yielders[i], None)

        # That's going to be our search space for now.
        items.sort(key=_queue_number_fifo_priority)

        # It is possible that there are no items yet, if all futures are
        # taking more than 1 second.
        # It is possible that all futures are done if every queue has fewer
        # than 10 tasks pending.
        while any(futures) or items:
            if items:
                yield items[0]
                items = items[1:]
            else:
                # Let activity happen.
                ndb.eventloop.run1()
            changed = False
            for i, f in enumerate(futures):
                if f and f.done():
                    # See loop above for explanation.
                    items.extend(i for i in f.get_result() if i.queue_number)
                    futures[i] = next(yielders[i], None)
                    changed = True
            if changed:
                items.sort(key=_queue_number_fifo_priority)
    except apiproxy_errors.DeadlineExceededError as e:
        # This is normally due to: "The API call datastore_v3.RunQuery() took too
        # long to respond and was cancelled."
        # At that point, the Cloud DB index is not able to sustain the load. So the
        # best thing to do is to back off a bit and not return any task to the bot
        # for this poll.
        logging.error(
            'Failed to yield a task due to an RPC timeout. Returning no '
            'task to the bot: %s', e)
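
# A minimal consumer sketch, mirroring _yield_next_available_task_to_dispatch
# in Example 9; _try_to_reap() is a hypothetical placeholder for the actual
# reaping logic.
def _poll_one_task(bot_dimensions):
    for request, to_run in task_to_run.yield_next_available_task_to_dispatch(
            bot_dimensions):
        if _try_to_reap(request, to_run):  # hypothetical
            return to_run
    return None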