def test_or_dimensions_new_tasks(self):
  """Checks that tasks using '|' (OR) dimension values match pre-registered
  bots: each alternative value can be satisfied by a different bot."""
  # Bots are already registered, then new tasks show up
  self.mock_now(datetime.datetime(2020, 1, 2, 3, 4, 5))
  self.assertEqual(
      0,
      _assert_bot(bot_id=u'bot1', dimensions={
          u'os': [u'v1', u'v2'],
          u'gpu': [u'nv'],
      }))
  self.assertEqual(
      0,
      _assert_bot(bot_id=u'bot2', dimensions={
          u'os': [u'v2'],
          u'gpu': [u'amd'],
      }))
  payloads = self._mock_enqueue_task_async_for_rebuild_task_cache()

  # request1 ORs both os values and both gpu values, so both bots match.
  request1 = _gen_request(properties=_gen_properties(
      dimensions={
          u'pool': [u'default'],
          u'os': [u'v1|v2'],
          u'gpu': [u'nv|amd'],
      }))
  task_queues.assert_task_async(request1).get_result()
  self.assertEqual(1, len(payloads))
  f = task_queues.rebuild_task_cache_async(payloads[-1])
  self.assertEqual(True, f.get_result())
  payloads.pop()
  # Both bots should be able to handle |request1|
  self.assert_count(2, task_queues.BotDimensions)
  self.assert_count(2, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  # 4 sets: the expansion of the 2x2 OR combinations of request1.
  self.assertEqual(4, len(task_queues.TaskDimensions.query().get().sets))
  bot1_root_key = bot_management.get_root_key(u'bot1')
  bot2_root_key = bot_management.get_root_key(u'bot2')
  self.assertEqual(1, len(task_queues.get_queues(bot1_root_key)))
  self.assertEqual(1, len(task_queues.get_queues(bot2_root_key)))

  # request2 pins os to v1, which only bot1 reports.
  request2 = _gen_request(properties=_gen_properties(
      dimensions={
          u'pool': [u'default'],
          u'os': [u'v1'],
          u'gpu': [u'nv|amd'],
      }))
  task_queues.assert_task_async(request2).get_result()
  self.assertEqual(1, len(payloads))
  f = task_queues.rebuild_task_cache_async(payloads[-1])
  self.assertEqual(True, f.get_result())
  payloads.pop()
  # Only bot1 can handle |request2|
  self.assert_count(3, task_queues.BotTaskDimensions)
  self.assert_count(2, task_queues.TaskDimensions)
  self.assertEqual(2, len(task_queues.get_queues(bot1_root_key)))
  self.assertEqual(1, len(task_queues.get_queues(bot2_root_key)))
def test_assert_bot_dimensions_changed(self):
  """A bot changing dimensions does not extend the validity of its existing
  BotTaskDimensions; they still expire on the original schedule."""
  # Ensure that stale BotTaskDimensions are deleted when the bot dimensions
  # changes.
  now = datetime.datetime(2010, 1, 2, 3, 4, 5)
  self.mock_now(now)
  request = self._assert_task()
  # Total validity window: task lifetime plus the extension buffer.
  exp = (request.expiration_ts - request.created_ts +
         task_queues._EXTEND_VALIDITY)
  self.assertEqual(1, _assert_bot())

  # One hour later, the bot changes dimensions.
  self.mock_now(now, task_queues._EXTEND_VALIDITY.total_seconds())
  self.assertEqual(1, _assert_bot(dimensions={u'gpu': u'Matrox'}))
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  bot_root_key = bot_management.get_root_key(u'bot1')
  self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

  # One second before expiration.
  self.mock_now(now, exp.total_seconds())
  # None means the assert hit the dimensions cache (no change detected).
  self.assertEqual(None, _assert_bot(dimensions={u'gpu': u'Matrox'}))
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

  # TaskDimension expired. The fact that the bot changed dimensions after an
  # hour didn't impact BotTaskDimensions expiration.
  self.mock_now(now, exp.total_seconds() + 1)
  self.assertEqual(0, _assert_bot())
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([], task_queues.get_queues(bot_root_key))
def cleanup_bot(machine_lease):
  """Cleans up entities after a bot is removed."""
  hostname = machine_lease.hostname
  # The bot is being removed, remove it from the task queues.
  task_queues.cleanup_after_bot(bot_management.get_root_key(hostname))
  # Then drop its BotInfo entity and release the lease.
  bot_management.get_info_key(hostname).delete()
  clear_lease_request(machine_lease.key, machine_lease.client_request_id)
def test_cron_tidy_stale(self):
  """cron_tidy_stale() keeps entities while fresh and purges both
  BotTaskDimensions and TaskDimensions once past expiration.

  NOTE(review): another test with this exact name (built around
  _EXTEND_VALIDITY/_KEEP_DEAD) appears elsewhere in this source; if both
  live in the same class, one definition silently shadows the other —
  confirm they belong to different test files.
  """
  now = datetime.datetime(2010, 1, 2, 3, 4, 5)
  self.mock_now(now)
  self.assertEqual(0, _assert_bot())
  request = self._assert_task()
  exp = (request.expiration_ts-request.created_ts) + task_queues._ADVANCE
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  bot_root_key = bot_management.get_root_key(u'bot1')
  self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

  # No-op.
  task_queues.cron_tidy_stale()
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

  # One second before expiration.
  self.mock_now(now, exp.total_seconds())
  task_queues.cron_tidy_stale()
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

  # TaskDimension expired.
  self.mock_now(now, exp.total_seconds() + 1)
  task_queues.cron_tidy_stale()
  self.assert_count(0, task_queues.BotTaskDimensions)
  self.assert_count(0, task_queues.TaskDimensions)
  self.assertEqual([], task_queues.get_queues(bot_root_key))
def test_assert_task_async_call_rebuild_task_cache_async(self):
  """assert_task_async() recreates an expired BotTaskDimensions entry when
  a task with identical dimensions is asserted again.

  Renamed from test_assert_task_async_call_rebuld_task_cache_async to fix
  the "rebuld" typo; unittest discovers tests by the 'test_' prefix so
  nothing references the old name.
  """
  self.assertEqual(0, _assert_bot())
  dimensions = {
      u'id': [u'bot1'],
  }
  self.mock_now(datetime.datetime(2020, 1, 2, 3, 4, 5))
  request1 = _gen_request(properties=_gen_properties(dimensions=dimensions))
  task_queues.assert_task_async(request1).get_result()
  self.assert_count(1, task_queues.BotDimensions)
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  bot_root_key = bot_management.get_root_key('bot1')
  self.assertEqual(1, len(task_queues.get_queues(bot_root_key)))

  # Expire BotTaskDimensions by advancing time past valid_until_ts.
  memcache.flush_all()
  bot_task_dimensions = task_queues.BotTaskDimensions.query(
      ancestor=bot_root_key).fetch()[0]
  self.mock_now(bot_task_dimensions.valid_until_ts +
                datetime.timedelta(seconds=1))
  self.assertEqual(0, len(task_queues.get_queues(bot_root_key)))

  # Request a task with the same dimensions; the cache is rebuilt.
  memcache.flush_all()
  request2 = _gen_request(properties=_gen_properties(dimensions=dimensions))
  task_queues.assert_task_async(request2).get_result()
  self.assert_count(1, task_queues.BotDimensions)
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual(1, len(task_queues.get_queues(bot_root_key)))
def test_assert_task_async_then_bot(self):
  """A task asserted before the bot registers is still matched to it."""
  self._assert_task()
  self.assertEqual(1, _assert_bot())
  # One entity of each kind links the bot to the queue.
  for expected, model in (
      (1, task_queues.BotDimensions),
      (1, task_queues.BotTaskDimensions),
      (1, task_queues.TaskDimensions)):
    self.assert_count(expected, model)
  root_key = bot_management.get_root_key(u'bot1')
  self.assertEqual([2980491642], task_queues.get_queues(root_key))
def test_cleanup_after_bot(self):
  """cleanup_after_bot() removes the bot's queue entities while leaving
  TaskDimensions (and BotInfo, deleted elsewhere) in place."""
  self.assertEqual(0, _assert_bot())
  self._assert_task()

  task_queues.cleanup_after_bot(bot_management.get_root_key('bot1'))

  # BotInfo is deleted separately.
  self.assert_count(1, bot_management.BotInfo)
  expectations = [
      (0, task_queues.BotDimensions),
      (0, task_queues.BotTaskDimensions),
      (1, task_queues.TaskDimensions),
  ]
  for count, model in expectations:
    self.assert_count(count, model)
def Events(self, request, context):
  """gRPC handler returning a page of BotEvents for one bot.

  Validation failures are surfaced as INVALID_ARGUMENT on the context; an
  unknown bot with no events yields NOT_FOUND. Returns a
  swarming_pb2.BotEventsResponse or None on error.
  """
  logging.debug('%s', request)
  try:
    if not request.bot_id:
      # TODO(maruel): Allows not specifying one. Or specifying a pool.
      raise ValueError('specify bot_id')

    # Transparently limit to 1000, default to 200.
    # Note: page_size == 0 falls through the `or` to the 200 default; a
    # negative value is truthy so it reaches the explicit check below.
    page_size = request.page_size or 200
    if page_size > 1000:
      page_size = 1000
    if page_size < 0:
      raise ValueError('page_size must be positive')

    start = None
    end = None
    if request.HasField('start_time'):
      start = request.start_time.ToDatetime()
    if request.HasField('end_time'):
      end = request.end_time.ToDatetime()
    if (start and end) and start >= end:
      raise ValueError('start_time must be before end_time')

    # The BotEvent key is already in the right chronological order, but
    # querying per BotEvent.ts *requires* ordering per BotEvent.ts.
    order = not (start or end)
    q = bot_management.get_events_query(request.bot_id, order)
    if not order:
      q = q.order(-bot_management.BotEvent.ts, bot_management.BotEvent.key)
    if start:
      q = q.filter(bot_management.BotEvent.ts >= start)
    if end:
      q = q.filter(bot_management.BotEvent.ts < end)

    items, cursor = datastore_utils.fetch_page(
        q, page_size, request.page_token)
    if not items:
      # Check if the bot exists, if not, return a 404. We check BotRoot, not
      # BotInfo, so that even deleted bots can be queried. See bot_management
      # for more information.
      if not bot_management.get_root_key(request.bot_id).get():
        context.set_code(StatusCode.NOT_FOUND)
        context.set_details('Bot does not exist')
        return None
  except ValueError as e:
    context.set_code(StatusCode.INVALID_ARGUMENT)
    context.set_details(str(e))
    return None

  logging.info('Returning %d events', len(items))
  out = swarming_pb2.BotEventsResponse(next_page_token=cursor)
  for r in items:
    i = out.events.add()
    r.to_proto(i)
  return out
def _yield_next_available_task_to_dispatch(bot_dimensions):
  """Registers a fake bot with the given dimensions, then returns the
  dispatchable TaskToRun entities as dicts."""
  bot_id = bot_dimensions[u'id'][0]
  bot_management.bot_event(
      'bot_connected', bot_id, '1.2.3.4', 'joe@localhost', bot_dimensions,
      {'state': 'real'}, '1234', False, None, None, None)
  root_key = bot_management.get_root_key(bot_id)
  task_queues.assert_bot_async(root_key, bot_dimensions).get_result()
  results = []
  for _request, to_run in task_to_run.yield_next_available_task_to_dispatch(
      bot_dimensions):
    results.append(to_run.to_dict())
  return results
def _assert_bot(bot_id=u'bot1', dimensions=None):
  """Simulates a bot connection event and registers it in task_queues.

  Returns the result of task_queues.assert_bot_async(): the number of
  matching queues, or None on a dimensions-cache hit.
  """
  bot_dimensions = {
      u'cpu': [u'x86-64', u'x64'],
      u'id': [bot_id],
      u'os': [u'Ubuntu-16.04', u'Ubuntu'],
      u'pool': [u'default'],
  }
  if dimensions:
    bot_dimensions.update(dimensions)
  bot_management.bot_event(
      'bot_connected', bot_id, '1.2.3.4', bot_id, bot_dimensions, {},
      '1234', False, None, None, None)
  root_key = bot_management.get_root_key(bot_id)
  future = task_queues.assert_bot_async(root_key, bot_dimensions)
  return future.get_result()
def test_or_dimensions_same_hash(self):
  """Tasks whose OR'ed dimension values differ only in order collapse to a
  single dimensions hash, hence a single TaskDimensions entity."""
  self.mock_now(datetime.datetime(2020, 1, 2, 3, 4, 5))
  self.assertEqual(
      0, _assert_bot(bot_id=u'bot1', dimensions={u'os': [u'v1']}))
  self.assertEqual(
      0, _assert_bot(bot_id=u'bot2', dimensions={u'os': [u'v2']}))
  self.assertEqual(
      0, _assert_bot(bot_id=u'bot3', dimensions={u'os': [u'v3']}))
  payloads = self._mock_enqueue_task_async_for_rebuild_task_cache()

  # Both requests should have the same dimension_hash
  request1 = _gen_request(properties=_gen_properties(dimensions={
      u'pool': [u'default'],
      u'os': [u'v1|v2|v3'],
  }))
  request2 = _gen_request(properties=_gen_properties(dimensions={
      u'pool': [u'default'],
      u'os': [u'v3|v2|v1'],
  }))
  task_queues.assert_task_async(request1).get_result()
  task_queues.assert_task_async(request2).get_result()
  self.assertEqual(2, len(payloads))
  while payloads:
    f = task_queues.rebuild_task_cache_async(payloads[-1])
    self.assertEqual(True, f.get_result())
    payloads.pop()

  self.assert_count(3, task_queues.BotDimensions)
  self.assert_count(3, task_queues.BotTaskDimensions)
  # Single TaskDimensions despite two requests: the hashes are identical.
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual(3, len(task_queues.TaskDimensions.query().get().sets))
  bot1_root_key = bot_management.get_root_key(u'bot1')
  bot2_root_key = bot_management.get_root_key(u'bot2')
  bot3_root_key = bot_management.get_root_key(u'bot3')
  self.assertEqual(1, len(task_queues.get_queues(bot1_root_key)))
  self.assertEqual(1, len(task_queues.get_queues(bot2_root_key)))
  self.assertEqual(1, len(task_queues.get_queues(bot3_root_key)))
def test_probably_has_capacity_get_queues(self): d = {u'pool': [u'default'], u'os': [u'Ubuntu-16.04']} # Capacity registers there only once there's a request enqueued and # get_queues() is called. _assert_bot() request = _gen_request(properties=_gen_properties(dimensions=d)) task_queues.assert_task_async(request).get_result() self.assertEqual(1, self.execute_tasks()) self.assertEqual(None, task_queues.probably_has_capacity(d)) # It get sets only once get_queues() is called. bot_root_key = bot_management.get_root_key(u'bot1') task_queues.get_queues(bot_root_key) self.assertEqual(True, task_queues.probably_has_capacity(d)) self.assertEqual([1843498234], memcache.get('bot1', namespace='task_queues'))
def cleanup_after_bot(bot_root_key):
  """Removes all BotDimensions and BotTaskDimensions for this bot.

  Arguments:
    bot_root_key: ndb.Key to the bot's BotRoot entity, as returned by
        bot_management.get_root_key(). All callers already pass a root key;
        the previous signature took a bot id and re-derived the key, which
        disagreed with every call site.

  Do not clean up TaskDimensions. There could be pending tasks and there's a
  possibility that a bot with the same ID could come up afterward (low chance
  in practice but it's a possibility). In this case, if TaskDimensions is
  deleted, the pending task would not be correctly run even when a bot comes
  back online as assert_bot_async() would fail to create the corresponding
  BotTaskDimensions.
  """
  q = BotTaskDimensions.query(ancestor=bot_root_key).iter(keys_only=True)
  futures = ndb.delete_multi_async(q)
  # BotDimensions is a singleton child entity with fixed id 1.
  futures.append(
      ndb.Key(BotDimensions, 1, parent=bot_root_key).delete_async())
  for f in futures:
    f.check_success()
def assert_bot_async(bot_root_key, bot_dimensions):
  """Prepares BotTaskDimensions entities as needed.

  Coupled with assert_task(), enables get_queues() to work by knowing which
  TaskDimensions applies to this bot.

  Arguments:
    bot_root_key: ndb.Key to the bot's BotRoot entity. Every caller already
        passes it alongside the dimensions; the previous single-argument
        signature disagreed with all call sites.
    bot_dimensions: dict of the bot's dimensions; must contain exactly one
        'id' value.

  Returns:
    Number of matches or None if hit the cache, thus nothing was updated.
  """
  assert len(bot_dimensions[u'id']) == 1, bot_dimensions
  # Check if the bot dimensions changed since last _rebuild_bot_cache_async()
  # call.
  obj = yield ndb.Key(BotDimensions, 1, parent=bot_root_key).get_async()
  if obj and obj.dimensions_flat == _flatten_bot_dimensions(bot_dimensions):
    # Cache hit, no need to look further.
    raise ndb.Return(None)

  matches = yield _rebuild_bot_cache_async(bot_dimensions, bot_root_key)
  raise ndb.Return(matches)
def test_cron_tidy_stale(self):
  """cron_tidy_stale() evicts BotTaskDimensions at expiration but keeps
  TaskDimensions around until _KEEP_DEAD has also elapsed.

  NOTE(review): another test with this exact name (built around _ADVANCE)
  appears elsewhere in this source; if both live in the same class, one
  definition silently shadows the other — confirm they belong to different
  test files.
  """
  now = datetime.datetime(2010, 1, 2, 3, 4, 5)
  self.mock_now(now)
  self.assertEqual(0, _assert_bot())
  request = self._assert_task()
  exp = (request.expiration_ts - request.created_ts +
         task_queues._EXTEND_VALIDITY)
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  bot_root_key = bot_management.get_root_key(u'bot1')
  self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

  # No-op.
  task_queues.cron_tidy_stale()
  self.assert_count(1, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([2980491642], task_queues.get_queues(bot_root_key))

  # TaskDimension expired but is still kept; get_queues() doesn't return it
  # anymore even if still in the DB. BotTaskDimensions was evicted.
  self.mock_now(now, exp.total_seconds() + 1)
  task_queues.cron_tidy_stale()
  self.assert_count(0, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([], task_queues.get_queues(bot_root_key))

  # Just before _KEEP_DEAD.
  self.mock_now(now, (exp + task_queues._KEEP_DEAD).total_seconds())
  task_queues.cron_tidy_stale()
  self.assert_count(0, task_queues.BotTaskDimensions)
  self.assert_count(1, task_queues.TaskDimensions)
  self.assertEqual([], task_queues.get_queues(bot_root_key))

  # TaskDimension expired, after KEEP_DEAD.
  self.mock_now(now, (exp + task_queues._KEEP_DEAD).total_seconds() + 1)
  task_queues.cron_tidy_stale()
  self.assert_count(0, task_queues.BotTaskDimensions)
  self.assert_count(0, task_queues.TaskDimensions)
  self.assertEqual([], task_queues.get_queues(bot_root_key))
def get_queues(bot_root_key):
  """Returns the sorted list of dimensions_hash task queues this bot can run.

  Arguments:
    bot_root_key: ndb.Key to the bot's BotRoot entity. Every caller already
        passes a root key; the previous signature took (and asserted) a
        unicode bot id, which disagreed with all call sites. The memcache
        key remains the bot id (derived via string_id()) so existing cache
        entries and tests keep working.

  Returns:
    Sorted list of integer dimensions_hash values for queues that were
    actually triggered in the past and are still valid.
  """
  bot_id = bot_root_key.string_id()
  data = memcache.get(bot_id, namespace='task_queues')
  if data is not None:
    logging.debug('get_queues(%s): can run from %d queues (memcache)\n%s',
                  bot_id, len(data), data)
    return data

  # Retrieve all the dimensions_hash that this bot could run that have
  # actually been triggered in the past. Since this is under a root entity,
  # this should be fast.
  now = utils.utcnow()
  data = sorted(
      obj.key.integer_id()
      for obj in BotTaskDimensions.query(ancestor=bot_root_key)
      if obj.valid_until_ts >= now)
  memcache.set(bot_id, data, namespace='task_queues')
  logging.info('get_queues(%s): Query in %.3fs: can run from %d queues\n%s',
               bot_id, (utils.utcnow() - now).total_seconds(), len(data),
               data)
  return data
def test_get_root_key(self):
  """get_root_key() builds a BotRoot key keyed on the bot id."""
  key = bot_management.get_root_key('foo')
  self.assertEqual(ndb.Key(bot_management.BotRoot, 'foo'), key)
def _process(self):
  """Fetches bot info and settings, does authorization and quarantine checks.

  Returns:
    _ProcessResult instance, see its fields for more info.

  Raises:
    auth.AuthorizationError if bot's credentials are invalid.
  """
  request = self.parse_body()
  version = request.get('version', None)

  dimensions = request.get('dimensions') or {}
  state = request.get('state') or {}
  bot_id = None
  if dimensions.get('id'):
    dimension_id = dimensions['id']
    # 'id' must be a list of exactly one unicode string to be usable.
    if (isinstance(dimension_id, list) and len(dimension_id) == 1
        and isinstance(dimension_id[0], unicode)):
      bot_id = dimensions['id'][0]

  if bot_id:
    logging.debug('Fetching bot settings for bot id: %s', bot_id)
    bot_settings = bot_management.get_settings_key(bot_id).get()

  # Make sure bot self-reported ID matches the authentication token. Raises
  # auth.AuthorizationError if not.
  bot_group_cfg = bot_auth.validate_bot_id_and_fetch_config(bot_id)

  # The server side dimensions from bot_group_cfg override bot-provided ones.
  # If both server side config and bot report some dimension, server side
  # config wins. We still emit an warning if bot tries to supply the dimension
  # and it disagrees with the server defined one. Note that this may happen
  # on a first poll after server side config for a bot has changed. The bot
  # doesn't know about new server-assigned dimensions yet in this case. Also
  # don't report ['default'], bot sends it in the handshake before it knows
  # anything at all.
  for dim_key, from_cfg in bot_group_cfg.dimensions.iteritems():
    from_bot = sorted(dimensions.get(dim_key) or [])
    from_cfg = sorted(from_cfg)
    if from_bot and from_bot != ['default'] and from_bot != from_cfg:
      logging.warning(
          'Dimensions in bots.cfg don\'t match ones provided by the bot\n'
          'bot_id: "%s", key: "%s", from_bot: %s, from_cfg: %s',
          bot_id, dim_key, from_bot, from_cfg)
    dimensions[dim_key] = from_cfg

  # Fill in all result fields except 'quarantined_msg'.
  result = _ProcessResult(
      request=request,
      bot_id=bot_id,
      version=version,
      state=state,
      dimensions=dimensions,
      bot_group_cfg=bot_group_cfg,
      maintenance_msg=state.get('maintenance'))

  # The bot may decide to "self-quarantine" itself. Accept both via
  # dimensions or via state. See bot_management._BotCommon.quarantined for
  # more details.
  if (bool(dimensions.get('quarantined')) or
      bool(state.get('quarantined'))):
    result.quarantined_msg = 'Bot self-quarantined'
    return result

  quarantined_msg = None
  # Use a dummy 'for' to be able to break early from the block.
  for _ in [0]:
    quarantined_msg = has_unexpected_keys(
        self.EXPECTED_KEYS, request, 'keys')
    if quarantined_msg:
      break

    quarantined_msg = has_missing_keys(
        self.REQUIRED_STATE_KEYS, state, 'state')
    if quarantined_msg:
      break

    if not bot_id:
      quarantined_msg = 'Missing bot id'
      break
    if not dimensions.get('pool'):
      quarantined_msg = 'Missing \'pool\' dimension'
      break

    # Validate every dimension key/value pair the bot reported.
    for key, values in sorted(dimensions.items()):
      if not config.validate_dimension_key(key):
        quarantined_msg = "Invalid dimension key: %r" % key
        break
      if not isinstance(values, list):
        quarantined_msg = "Key %s has non-list value: %s" % (key, values)
        break
      if len(values) != len(set(values)):
        quarantined_msg = "Key %s has duplicate values: %s" % (key, values)
        break
      for value in sorted(values):
        if not config.validate_dimension_value(value):
          quarantined_msg = "Key %s has invalid value: %r" % (key, value)
          break
      if quarantined_msg:
        break

  if quarantined_msg:
    line = 'Quarantined Bot\nhttps://%s/restricted/bot/%s\n%s' % (
        app_identity.get_default_version_hostname(), bot_id,
        quarantined_msg)
    ereporter2.log_request(self.request, source='bot', message=line)
    result.quarantined_msg = quarantined_msg
    return result

  # Look for admin enforced quarantine.
  # NOTE(review): bot_settings is only assigned when bot_id is truthy; the
  # 'Missing bot id' quarantine path above returns before this line, so no
  # NameError should be reachable — confirm.
  if bool(bot_settings and bot_settings.quarantined):
    result.quarantined_msg = 'Quarantined by admin'
    return result

  # TODO(maruel): Parallelise.
  bot_root_key = bot_management.get_root_key(bot_id)
  task_queues.assert_bot_async(bot_root_key, dimensions).get_result()
  return result
def test_rebuild_task_cache_async(self):
  """Exercises rebuild_task_cache_async(), including TaskDimensions
  expiration, _KEEP_DEAD retention and the memcache interaction."""
  # Assert that expiration works.
  now = datetime.datetime(2010, 1, 2, 3, 4, 5)
  self.mock_now(now)

  # We want _yield_BotTaskDimensions_keys() to return multiple
  # BotTaskDimensions ndb.Key to confirm that the inner loops work. This
  # requires a few bots.
  _assert_bot(bot_id=u'bot1')
  _assert_bot(bot_id=u'bot2')
  _assert_bot(bot_id=u'bot3')
  bot_root_key = bot_management.get_root_key(u'bot1')
  self.assertEqual(0, task_queues.BotTaskDimensions.query().count())
  self.assertEqual(0, task_queues.TaskDimensions.query().count())

  # Intentionally force the code to throttle the number of concurrent RPCs,
  # otherwise the inner loops wouldn't be reached with less than 50 bots, and
  # testing with 50 bots, would make the unit test slow.
  self.mock(task_queues, '_CAP_FUTURES_LIMIT', 1)

  payloads = self._mock_enqueue_task_async_for_rebuild_task_cache()

  # The equivalent of self._assert_task(tasks=1) except that we snapshot the
  # payload.
  # Trigger multiple task queues to go deeper in the code.
  request_1 = _gen_request(properties=_gen_properties(dimensions={
      u'cpu': [u'x86-64'],
      u'pool': [u'default'],
  }))
  task_queues.assert_task_async(request_1).get_result()
  self.assertEqual(1, len(payloads))
  f = task_queues.rebuild_task_cache_async(payloads[-1])
  self.assertEqual(True, f.get_result())
  # One BotTaskDimensions per bot.
  self.assertEqual(3, task_queues.BotTaskDimensions.query().count())
  self.assertEqual(1, task_queues.TaskDimensions.query().count())
  self.assertEqual(60, request_1.expiration_secs)
  expected = now + task_queues._EXTEND_VALIDITY + datetime.timedelta(
      seconds=request_1.expiration_secs)
  self.assertEqual(
      expected, task_queues.TaskDimensions.query().get().valid_until_ts)

  request_2 = _gen_request(properties=_gen_properties(
      dimensions={
          u'os': [u'Ubuntu-16.04'],
          u'pool': [u'default'],
      }))
  task_queues.assert_task_async(request_2).get_result()
  self.assertEqual(2, len(payloads))
  f = task_queues.rebuild_task_cache_async(payloads[-1])
  self.assertEqual(True, f.get_result())
  self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
  self.assertEqual(2, task_queues.TaskDimensions.query().count())
  self.assertEqual([227177418, 1843498234],
                   task_queues.get_queues(bot_root_key))
  memcache.flush_all()
  self.assertEqual([227177418, 1843498234],
                   task_queues.get_queues(bot_root_key))

  # Now expire the two TaskDimensions, one at a time, and rebuild the task
  # queue.
  offset = (task_queues._EXTEND_VALIDITY + datetime.timedelta(
      seconds=request_1.expiration_secs)).total_seconds() + 1
  self.mock_now(now, offset)
  f = task_queues.rebuild_task_cache_async(payloads[0])
  self.assertEqual(True, f.get_result())
  self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
  self.assertEqual(2, task_queues.TaskDimensions.query().count())
  # Still served from memcache until flushed.
  self.assertEqual([227177418, 1843498234],
                   task_queues.get_queues(bot_root_key))

  # Observe the effect of memcache. See comment in get_queues().
  memcache.flush_all()
  self.assertEqual([], task_queues.get_queues(bot_root_key))

  # Re-running still do not delete TaskDimensions because they are kept until
  # _KEEP_DEAD.
  f = task_queues.rebuild_task_cache_async(payloads[1])
  self.assertEqual(True, f.get_result())
  self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
  self.assertEqual(2, task_queues.TaskDimensions.query().count())
  self.assertEqual([], task_queues.get_queues(bot_root_key))

  # Get past _KEEP_DEAD.
  offset = (task_queues._EXTEND_VALIDITY + task_queues._KEEP_DEAD +
            datetime.timedelta(seconds=request_1.expiration_secs)
           ).total_seconds() + 1
  self.mock_now(now, offset)
  self.assertEqual([], task_queues.get_queues(bot_root_key))
  f = task_queues.rebuild_task_cache_async(payloads[0])
  self.assertEqual(True, f.get_result())
  self.assertEqual(6, task_queues.BotTaskDimensions.query().count())
  # The first TaskDimensions is now deleted.
  self.assertEqual(1, task_queues.TaskDimensions.query().count())
  self.assertEqual([], task_queues.get_queues(bot_root_key))
def post(self, task_id=None):
  """Handles a task update from a bot: output chunks, stats and completion.

  Responds with {'must_stop': bool, 'ok': True}; must_stop tells the bot to
  kill the task it is running.
  """
  # Unlike handshake and poll, we do not accept invalid keys here. This code
  # path is much more strict.
  request = self.parse_body()
  msg = log_unexpected_subset_keys(
      self.ACCEPTED_KEYS, self.REQUIRED_KEYS, request, self.request, 'bot',
      'keys')
  if msg:
    self.abort_with_error(400, error=msg)

  bot_id = request['id']
  # NOTE(review): the task_id URL parameter is overwritten by the body's
  # value — the body is authoritative here.
  task_id = request['task_id']

  # Make sure bot self-reported ID matches the authentication token. Raises
  # auth.AuthorizationError if not.
  bot_auth.validate_bot_id_and_fetch_config(bot_id)

  bot_overhead = request.get('bot_overhead')
  cipd_pins = request.get('cipd_pins')
  cipd_stats = request.get('cipd_stats')
  cost_usd = request.get('cost_usd', 0)
  duration = request.get('duration')
  exit_code = request.get('exit_code')
  hard_timeout = request.get('hard_timeout')
  io_timeout = request.get('io_timeout')
  isolated_stats = request.get('isolated_stats')
  output = request.get('output')
  output_chunk_start = request.get('output_chunk_start')
  outputs_ref = request.get('outputs_ref')

  if (isolated_stats or cipd_stats) and bot_overhead is None:
    ereporter2.log_request(
        request=self.request,
        source='server',
        category='task_failure',
        message='Failed to update task: %s' % task_id)
    self.abort_with_error(
        400,
        error=
        'isolated_stats and cipd_stats require bot_overhead to be set'
        '\nbot_overhead: %s\nisolate_stats: %s' %
        (bot_overhead, isolated_stats))

  run_result_key = task_pack.unpack_run_result_key(task_id)
  performance_stats = None
  if bot_overhead is not None:
    performance_stats = task_result.PerformanceStats(
        bot_overhead=bot_overhead)
    if isolated_stats:
      download = isolated_stats.get('download') or {}
      upload = isolated_stats.get('upload') or {}

      # Returns the b64-decoded value of d[k], or None when absent/empty.
      def unpack_base64(d, k):
        x = d.get(k)
        if x:
          return base64.b64decode(x)

      performance_stats.isolated_download = task_result.OperationStats(
          duration=download.get('duration'),
          initial_number_items=download.get('initial_number_items'),
          initial_size=download.get('initial_size'),
          items_cold=unpack_base64(download, 'items_cold'),
          items_hot=unpack_base64(download, 'items_hot'))
      performance_stats.isolated_upload = task_result.OperationStats(
          duration=upload.get('duration'),
          items_cold=unpack_base64(upload, 'items_cold'),
          items_hot=unpack_base64(upload, 'items_hot'))
    if cipd_stats:
      performance_stats.package_installation = task_result.OperationStats(
          duration=cipd_stats.get('duration'))

  if output is not None:
    try:
      output = base64.b64decode(output)
    except UnicodeEncodeError as e:
      logging.error('Failed to decode output\n%s\n%r', e, output)
      output = output.encode('ascii', 'replace')
    except TypeError as e:
      # Save the output as-is instead. The error will be logged in ereporter2
      # and returning a HTTP 500 would only force the bot to stay in a retry
      # loop.
      logging.error('Failed to decode output\n%s\n%r', e, output)
  if outputs_ref:
    outputs_ref = task_request.FilesRef(**outputs_ref)

  if cipd_pins:
    cipd_pins = task_result.CipdPins(
        client_package=task_request.CipdPackage(
            **cipd_pins['client_package']),
        packages=[
            task_request.CipdPackage(**args)
            for args in cipd_pins['packages']
        ])

  # Tell the task queues management engine that the bot is still alive, and
  # it shall refresh the task queues.
  bot_root_key = bot_management.get_root_key(bot_id)
  task_queues.get_queues(bot_root_key)

  try:
    state = task_scheduler.bot_update_task(
        run_result_key=run_result_key,
        bot_id=bot_id,
        output=output,
        output_chunk_start=output_chunk_start,
        exit_code=exit_code,
        duration=duration,
        hard_timeout=hard_timeout,
        io_timeout=io_timeout,
        cost_usd=cost_usd,
        outputs_ref=outputs_ref,
        cipd_pins=cipd_pins,
        performance_stats=performance_stats)
    if not state:
      logging.info('Failed to update, please retry')
      self.abort_with_error(500, error='Failed to update, please retry')

    # Map the resulting task state to the bot event to record.
    if state in (task_result.State.COMPLETED, task_result.State.TIMED_OUT):
      action = 'task_completed'
    elif state == task_result.State.KILLED:
      action = 'task_killed'
    else:
      assert state in (task_result.State.BOT_DIED,
                       task_result.State.RUNNING), state
      action = 'task_update'
    bot_management.bot_event(
        event_type=action,
        bot_id=bot_id,
        external_ip=self.request.remote_addr,
        authenticated_as=auth.get_peer_identity().to_bytes(),
        dimensions=None,
        state=None,
        version=None,
        quarantined=None,
        maintenance_msg=None,
        task_id=task_id,
        task_name=None)
  except ValueError as e:
    ereporter2.log_request(
        request=self.request,
        source='server',
        category='task_failure',
        message='Failed to update task: %s' % e)
    self.abort_with_error(400, error=str(e))
  except webob.exc.HTTPException:
    raise
  except Exception as e:
    logging.exception('Internal error: %s', e)
    self.abort_with_error(500, error=str(e))

  # - BOT_DIED will occur when the following conditions are true:
  #   - The bot polled correctly, but then stopped updating for at least
  #     task_result.BOT_PING_TOLERANCE. (It can occur if the host went to
  #     sleep, or the OS was overwhelmed).
  #   - /internal/cron/abort_bot_died runs, detects the bot is MIA, kills
  #     the task.
  #   - Bot wakes up, starts sending updates again.
  # - KILLED is when the client uses the kill API to forcibly stop a running
  #   task.
  must_stop = state in (task_result.State.BOT_DIED,
                        task_result.State.KILLED)
  if must_stop:
    logging.info('asking bot to kill the task')
  self.send_response({'must_stop': must_stop, 'ok': True})
def test_get_root_key(self):
  """The root key for a bot is a BotRoot key with the bot id as its id."""
  expected = ndb.Key(bot_management.BotRoot, 'foo')
  self.assertEqual(expected, bot_management.get_root_key('foo'))
def _yield_potential_tasks(bot_id):
  """Queries all the known task queues in parallel and yields the task in
  order of priority.

  The ordering is opportunistic, not strict. There's a risk of not returning
  exactly in the priority order depending on index staleness and query
  execution latency. The number of queries is unbounded.

  Yields:
    TaskToRun entities, trying to yield the highest priority one first. To
    have finite execution time, starts yielding results once one of these
    conditions are met:
    - 1 second elapsed; in this case, continue iterating in the background
    - First page of every query returned
    - All queries exhausted
  """
  bot_root_key = bot_management.get_root_key(bot_id)
  potential_dimensions_hashes = task_queues.get_queues(bot_root_key)
  # Note that the default ndb.EVENTUAL_CONSISTENCY is used so stale items may
  # be returned. It's handled specifically by consumers of this function.
  start = time.time()
  queries = [_get_task_to_run_query(d) for d in potential_dimensions_hashes]
  # One paged async yielder per queue, 10 entities per page.
  yielders = [_yield_pages_async(q, 10) for q in queries]
  # We do care about the first page of each query so we cannot merge all the
  # results of every query insensibly.
  futures = []

  try:
    for y in yielders:
      futures.append(next(y, None))

    # Pump the ndb event loop for up to 1 second, or until every first-page
    # future has resolved, whichever comes first.
    while (time.time() - start) < 1 and not all(
        f.done() for f in futures if f):
      r = ndb.eventloop.run0()
      if r is None:
        break
      time.sleep(r)
    logging.debug(
        '_yield_potential_tasks(%s): waited %.3fs for %d items from %d Futures',
        bot_id, time.time() - start,
        sum(len(f.get_result()) for f in futures if f.done()), len(futures))

    # items is a list of TaskToRun. The entities are needed because property
    # queue_number is used to sort according to each task's priority.
    items = []
    for i, f in enumerate(futures):
      if f and f.done():
        # The ndb.Future returns a list of up to 10 TaskToRun entities.
        r = f.get_result()
        if r:
          # The ndb.Query ask for a valid queue_number but under load, it
          # happens the value is not valid anymore.
          # Note: the generator expression's 'i' has its own scope and does
          # not clobber the enumerate index, though the name reuse is
          # confusing.
          items.extend(i for i in r if i.queue_number)
        # Prime the next page, in case.
        futures[i] = next(yielders[i], None)

    # That's going to be our search space for now.
    items.sort(key=_queue_number_fifo_priority)

    # It is possible that there is no items yet, in case all futures are
    # taking more than 1 second.
    # It is possible that all futures are done if every queue has less than
    # 10 task pending.
    while any(futures) or items:
      if items:
        yield items[0]
        items = items[1:]
      else:
        # Let activity happen.
        ndb.eventloop.run1()
      changed = False
      for i, f in enumerate(futures):
        if f and f.done():
          # See loop above for explanation.
          items.extend(i for i in f.get_result() if i.queue_number)
          futures[i] = next(yielders[i], None)
          changed = True
      if changed:
        items.sort(key=_queue_number_fifo_priority)
  except apiproxy_errors.DeadlineExceededError as e:
    # This is normally due to: "The API call datastore_v3.RunQuery() took too
    # long to respond and was cancelled."
    # At that point, the Cloud DB index is not able to sustain the load. So
    # the best thing to do is to back off a bit and not return any task to
    # the bot for this poll.
    logging.error(
        'Failed to yield a task due to an RPC timeout. Returning no '
        'task to the bot: %s', e)