def config_for_task(request):
  """Finds the external scheduler config that applies to a task request.

  Arguments:
    request: a task_request.TaskRequest instance.

  Returns:
    pools_config.ExternalSchedulerConfig for the external scheduler to use for
    this task, if it exists, or None otherwise. None is also returned when the
    first task slice has no pool, or when the pool has no config or no
    external schedulers.
  """
  first_slice = request.task_slice(0)
  pool = first_slice.properties.pool
  if not pool:
    return None

  pool_cfg = pools_config.get_pool_config(pool)
  if not (pool_cfg and pool_cfg.external_schedulers):
    return None

  # Keep only the flat dimensions that every task slice has in common.
  shared = set(
      task_queues.bot_dimensions_to_flat(first_slice.properties.dimensions))
  for index in range(1, request.num_task_slices):
    slice_props = request.task_slice(index).properties
    shared.intersection_update(
        task_queues.bot_dimensions_to_flat(slice_props.dimensions))
  return _config_for_dimensions(pool_cfg, shared)
def match_dimensions(request_dimensions, bot_dimensions):
  """Tells whether the bot dimensions satisfy the request dimensions.

  Arguments:
    request_dimensions: dict of the dimensions requested.
    bot_dimensions: dict of the dimensions of the bot.

  Returns:
    True if at least one flat expansion of the request dimensions is fully
    contained in the bot's flat dimensions, False otherwise.
  """
  assert isinstance(request_dimensions, dict), request_dimensions
  assert isinstance(bot_dimensions, dict), bot_dimensions

  # Cheap rejection first: every requested key must also be a bot key.
  if not all(key in bot_dimensions for key in request_dimensions):
    return False

  available = frozenset(task_queues.bot_dimensions_to_flat(bot_dimensions))
  for flat in task_queues.expand_dimensions_to_flats(request_dimensions):
    if available.issuperset(flat):
      return True
  return False
def _gen_bot_info(key_id, last_seen_ts, **kwargs):
  """Creates a bot_management.BotInfo filled with default values.

  Arguments:
    key_id: bot id; used in the entity key and as the 'bot_id' dimension.
    last_seen_ts: value stored as last_seen_ts.
    kwargs: overrides for the defaults. A 'dimensions' dict, when given,
        replaces the default dimensions before they are flattened.

  Returns:
    bot_management.BotInfo whose dimensions_flat is computed from the
    (possibly overridden) dimensions.
  """
  fields = dict(
      key=ndb.Key('BotRoot', key_id, 'BotInfo', 'info'),
      last_seen_ts=last_seen_ts,
      dimensions={
          'os': ['Linux', 'Ubuntu'],
          'bot_id': [key_id],
      },
      state={},
  )
  fields.update(kwargs)
  # BotInfo is given the flattened form, not the dimensions dict itself.
  dimensions = fields.pop('dimensions')
  fields['dimensions_flat'] = task_queues.bot_dimensions_to_flat(dimensions)
  return bot_management.BotInfo(**fields)
def config_for_bot(bot_dimensions):
  """Finds the external scheduler config that applies to a bot.

  Arguments:
  - bot_dimensions: The dimensions of the bot as a dictionary in
        {string key: list of string values} format.

  Returns:
    pools_config.ExternalSchedulerConfig for external scheduler to use for
    this bot, if it exists, or None otherwise.
  """
  pool_cfg = _bot_pool_cfg(bot_dimensions)
  flat = task_queues.bot_dimensions_to_flat(bot_dimensions)
  return _config_for_dimensions(pool_cfg, set(flat))
def test_dimensions_to_flat_long_unicode(self):
  """A 257 character unicode value is cut to 256 characters plus u'…'."""
  name = u'a' * 64
  fits = u'⌛' * 256
  too_long = u'⛔' * 257
  flat = task_queues.bot_dimensions_to_flat({name: [fits, too_long]})
  self.assertEqual(
      [
          # Kept as is.
          name + u':' + fits,
          # Truncated.
          name + u':' + u'⛔' * 256 + u'…',
      ],
      flat)
def test_dimensions_to_flat_long_ascii(self):
  """A 257 character ASCII value is cut to 256 characters plus u'…'."""
  name = u'a' * 64
  too_long = u'b' * 257
  fits = u'c' * 256
  flat = task_queues.bot_dimensions_to_flat({name: [too_long, fits]})
  self.assertEqual(
      [
          # Truncated.
          name + u':' + u'b' * 256 + u'…',
          # Kept as is.
          name + u':' + fits,
      ],
      flat)
def test_dimensions_to_flat_long_unicode_non_BMP(self):
  """Non-BMP values over the limit are cut to the limit plus u'…'."""
  # For non-BMP characters, the length is effectively halved for now.
  name = u'a' * 64
  # Python considers emoji in the supplemental plane to have length 2 on UCS2
  # builds, and length 1 on UCS4 builds.
  if sys.maxunicode < 65536:
    limit = 128
  else:
    limit = 256
  too_long = u'💥' * (limit + 1)
  fits = u'😬' * limit
  flat = task_queues.bot_dimensions_to_flat({name: [too_long, fits]})
  self.assertEqual(
      [
          # Truncated.
          name + u':' + u'💥' * limit + u'…',
          # Kept as is.
          name + u':' + fits,
      ],
      flat)
def assign_task(es_cfg, bot_dimensions):
  """Asks the external scheduler for a task for one idle bot.

  Arguments:
    es_cfg: pools_config.ExternalSchedulerConfig instance.
    bot_dimensions: dimensions {string key: list of string values}. The first
        value of the u'id' key is used as the bot id.

  Returns:
    (Task id string, slice number) tuple or (None, None) if no task to assign.
  """
  bot_id = bot_dimensions[u'id'][0]
  logging.debug('Using external scheduler address: %s id: %s for bot %s',
                es_cfg.address, es_cfg.id, bot_id)

  assign_req = plugin_pb2.AssignTasksRequest()
  assign_req.scheduler_id = es_cfg.id
  bot_pb = assign_req.idle_bots.add()
  bot_pb.bot_id = bot_id
  bot_pb.dimensions.extend(task_queues.bot_dimensions_to_flat(bot_dimensions))
  assign_req.time.GetCurrentTime()

  client = _get_client(es_cfg.address)
  # TODO(akeshet): Catch or handle errors appropriately.
  response = client.AssignTasks(assign_req, credentials=_creds())
  if not (response and response.assignments):
    return None, None

  # Exactly one bot was offered, so exactly one assignment for it is expected.
  assert len(response.assignments) == 1
  assignment = response.assignments[0]
  assert assignment.bot_id == bot_id
  return assignment.task_id, assignment.slice_number
def notify_requests(es_cfg, requests, use_tq, is_callback, batch_mode=False):
  """Notifies the external scheduler of the state of a set of tasks.

  Arguments:
  - es_cfg: pools_config.ExternalSchedulerConfig for external scheduler to
        notify.
  - requests: A list of (task_request.TaskRequest,
        task_result.TaskResultSummary or task_result.TaskRunResult) tuples.
  - use_tq: If true, make this call on a task queue (within the current
        datastore transaction). If false, the notification is sent right away
        through notify_request_now().
  - is_callback: If true, indicates that this notification was in response
        to a external-scheduler-requested callback. It is copied into the
        is_callback field of the request.
  - batch_mode: If true, the notifications will be sent in a batched mode
        along with others, to reduce traffic to external scheduler. Only valid
        when use_tq and global config's enable_batch_es_notifications are
        true.

  Returns:
    Nothing.

  Raises:
    datastore_utils.CommitError if the notification task can't be enqueued.
  """
  task_ids = [request.task_id for request, _ in requests]
  logging.debug(
      'notify_requests(es_cfg=(%s,%s), requests=%s, use_tq=%s, '
      'is_callback=%s, batch_mode=%s)', es_cfg.address, es_cfg.id, task_ids,
      use_tq, is_callback, batch_mode)

  notify_req = plugin_pb2.NotifyTasksRequest()
  notify_req.is_callback = is_callback
  notify_req.scheduler_id = es_cfg.id

  for request, result_summary in requests:
    notification = notify_req.notifications.add()
    # TODO(akeshet): This time should possibly come from the read time from
    # datastore, rather than the local server clock.
    notification.time.FromDatetime(utils.utcnow())
    task_pb = notification.task
    task_pb.id = request.task_id
    task_pb.tags.extend(request.tags)
    task_pb.enqueued_time.FromDatetime(request.created_ts)
    # One slice message per task slice, carrying its flat dimensions.
    for index in range(request.num_task_slices):
      slice_props = request.task_slice(index).properties
      flat = task_queues.bot_dimensions_to_flat(slice_props.dimensions)
      task_pb.slices.add().dimensions.extend(flat)

    # The state is taken from the result's proto conversion.
    result_pb = swarming_pb2.TaskResult()
    result_summary.to_proto(result_pb)
    task_pb.state = result_pb.state
    if result_summary.bot_id:
      # TODO(akeshet): We should only actually set this is state is running.
      task_pb.bot_id = result_summary.bot_id

  if not use_tq:
    # Ignore return value, the response proto is empty.
    notify_request_now(es_cfg.address, notify_req)
    return

  request_json = json_format.MessageToJson(notify_req)
  # If enable_batch_es_notifications is true, the notifications will be sent in
  # a batched mode along with others, to reduce traffic to external scheduler.
  batched = batch_mode and config.settings().enable_batch_es_notifications
  if not batched:
    enqueued = utils.enqueue_task(
        '/internal/taskqueue/important/external_scheduler/notify-tasks',
        'es-notify-tasks',
        params={
            'es_host': es_cfg.address,
            'request_json': request_json
        },
        transactional=ndb.in_transaction())
    if not enqueued:
      raise datastore_utils.CommitError('Failed to enqueue task')
    return

  payload = {'es_host': es_cfg.address, 'request_json': request_json}
  pull_task = taskqueue.Task(payload=json.dumps(payload), method='PULL')
  added = pull_task.add(
      queue_name='es-notify-tasks-batch', transactional=ndb.in_transaction())
  if not added:
    raise datastore_utils.CommitError('Failed to enqueue task')

  kick_stats = taskqueue.QueueStatistics.fetch('es-notify-kick')
  # Add a kicker task if there are fewer than 10 minutes worth.
  if kick_stats.tasks < 600:
    kicked = utils.enqueue_task(
        '/internal/taskqueue/important/external_scheduler/notify-kick',
        'es-notify-kick',
        transactional=ndb.in_transaction())
    if not kicked:
      # Only logged: the notification itself is already enqueued.
      logging.info('Failed to add a notify-kick for request.')
def test_dimensions_to_flat_duplicate_value(self):
  """A value listed twice for a key shows up once in the flat list."""
  dimensions = {u'a': [u'c', u'c']}
  flat = task_queues.bot_dimensions_to_flat(dimensions)
  self.assertEqual([u'a:c'], flat)
def test_bot_dimensions_to_flat(self):
  """Dimensions are flattened to u'key:value' strings in sorted order."""
  dimensions = {
      u'a': [u'c', u'bee'],
      u'cee': [u'zee'],
  }
  flat = task_queues.bot_dimensions_to_flat(dimensions)
  self.assertEqual([u'a:bee', u'a:c', u'cee:zee'], flat)
def bot_event(event_type, bot_id, external_ip, authenticated_as, dimensions,
              state, version, quarantined, maintenance_msg, task_id, task_name,
              register_dimensions, **kwargs):
  """Records when a bot has queried for work.

  This event happening usually means the bot is alive (not dead), except for
  'bot_missing' event which is created by server. It may be quarantined, and
  in this case, it will be evicted from the task queues.

  If it's declaring maintenance, it will not be evicted from the task queues,
  as maintenance is supposed to be temporary and expected to complete within a
  reasonable time frame.

  The BotInfo entity is updated on every call with a non-empty bot_id. A
  BotEvent entity is additionally stored unless the event is a
  'request_sleep' or 'task_update' with no dimensions update and no
  quarantine. In both cases the event is also appended to a per-minute
  memcache entry in the 'BotEvents' namespace.

  Arguments:
  - event_type: event type, one of BotEvent.ALLOWED_EVENTS.
  - bot_id: bot id. If empty, nothing is done.
  - external_ip: IP address as seen by the HTTP handler.
  - authenticated_as: bot identity as seen by the HTTP handler.
  - dimensions: Bot's dimensions as self-reported. If not provided, keep
        previous value.
  - state: ephemeral state of the bot. It is expected to change constantly. If
        not provided, keep previous value.
  - version: swarming_bot.zip version as self-reported. Used to spot if a bot
        failed to update promptly. If not provided, keep previous value.
  - quarantined: bool to determine if the bot was declared quarantined. If
        None, keep previous value.
  - maintenance_msg: string describing why the bot is in maintenance.
  - task_id: packed task id if relevant. Set to '' to zap the stored value.
  - task_name: task name if relevant. Zapped when task_id is zapped.
  - register_dimensions: bool to specify whether to register dimensions to
        BotInfo.
  - kwargs: optional values to add to BotEvent relevant to event_type.

  Returns:
    ndb.Key to BotEvent entity if one was added. None if bot_id is empty or if
    only BotInfo was stored.
  """
  if not bot_id:
    return

  # Retrieve the previous BotInfo and update it.
  info_key = get_info_key(bot_id)
  bot_info = info_key.get()
  if not bot_info:
    bot_info = BotInfo(key=info_key)
    # Register only id and pool dimensions at the first handshake.
    dimensions_flat = task_queues.bot_dimensions_to_flat(dimensions)
    bot_info.dimensions_flat = [
        d for d in dimensions_flat
        if d.startswith('id:') or d.startswith('pool:')
    ]

  now = utils.utcnow()
  # bot_missing event is created by a server, not a bot.
  # So it shouldn't update last_seen_ts, external_ip, authenticated_as,
  # maintenance_msg.
  # If the last_seen_ts gets updated, it would change the bot composite
  # to alive. And if it clears maintenance_msg, it would change the composite
  # to NOT_IN_MAINTENANCE and lose the message.
  if event_type != 'bot_missing':
    bot_info.last_seen_ts = now
    bot_info.external_ip = external_ip
    bot_info.authenticated_as = authenticated_as
    bot_info.maintenance_msg = maintenance_msg
  dimensions_updated = False
  # Stays empty when the caller gave no dimensions; the BotEvent then falls
  # back to the dimensions stored on BotInfo.
  dimensions_flat = []
  if dimensions:
    dimensions_flat = task_queues.bot_dimensions_to_flat(dimensions)
    if register_dimensions and bot_info.dimensions_flat != dimensions_flat:
      logging.debug('bot_event: Updating dimensions. from: %s, to: %s',
                    bot_info.dimensions_flat, dimensions_flat)
      bot_info.dimensions_flat = dimensions_flat
      dimensions_updated = True
  if state:
    bot_info.state = state
  if quarantined is not None:
    bot_info.quarantined = quarantined
  if task_id is not None:
    bot_info.task_id = task_id
  # Remove the task from the BotInfo summary in the following cases
  # 1) When the task finishes (event_type=task_XXX)
  #    In these cases, the BotEvent shall have the task
  #    since the event still refers to it
  # 2) When the bot is polling (event_type=request_sleep)
  #    The bot has already finished the previous task.
  #    But it could have forgotten to remove the task from the BotInfo.
  #    So ensure the task is removed.
  # 3) When the bot is missing
  #    We assume it can't process assigned task anymore.
  if event_type in ('task_completed', 'task_error', 'task_killed',
                    'request_sleep', 'bot_missing'):
    bot_info.task_id = None
    bot_info.task_name = None
  # Applied after the reset above, so a task_name given by the caller is kept
  # even for the event types listed there.
  if task_name:
    bot_info.task_name = task_name
  if version is not None:
    bot_info.version = version

  if quarantined:
    # Make sure it is not in the queue since it can't reap anything.
    task_queues.cleanup_after_bot(info_key.parent())

  try:
    # Decide whether saving the event.
    # It's not much of an event worth saving a BotEvent for but it's worth
    # updating BotInfo. The only reason BotInfo is GET is to keep first_seen_ts.
    # It's not necessary to use a transaction here since no BotEvent is being
    # added, only last_seen_ts is really updated.
    # crbug.com/1015365: It's useful saving BotEvent when dimensions updates.
    # crbug.com/952984: It needs to save BotEvent when quarantined.
    skip_save_event = (not dimensions_updated and not quarantined and
                       event_type in ('request_sleep', 'task_update'))
    if skip_save_event:
      bot_info.put()
      return

    # When it's a 'bot_*' or 'request_*' event, use the dimensions provided
    # by the bot.
    # When it's a 'task_*' event, use BotInfo.dimensions_flat since dimensions
    # aren't provided by the bot.
    event_dimensions_flat = dimensions_flat or bot_info.dimensions_flat

    # The event uses the caller's task_id when set, so a finishing task is
    # still recorded on the event even though BotInfo.task_id was reset above.
    event = BotEvent(parent=get_root_key(bot_id),
                     event_type=event_type,
                     external_ip=external_ip,
                     authenticated_as=authenticated_as,
                     dimensions_flat=event_dimensions_flat,
                     quarantined=bot_info.quarantined,
                     maintenance_msg=bot_info.maintenance_msg,
                     state=bot_info.state,
                     task_id=task_id or bot_info.task_id,
                     version=bot_info.version,
                     **kwargs)

    datastore_utils.store_new_version(event, BotRoot, [bot_info])
    return event.key
  finally:
    # Runs on every exit from the try block: the skipped-event return, the
    # normal return and any exception raised while storing.
    # Store the event in memcache to accelerate monitoring.
    # key is at minute resolution, because that's the monitoring precision.
    key = '%s:%s' % (bot_id, now.strftime('%Y-%m-%dT%H:%M'))
    m = memcache.Client()
    # Retry until either the entry is created with this event, or this event
    # is appended to the existing entry with a successful compare-and-set.
    while True:
      data = [event_type, now.second]
      if m.add(key, data, time=3600, namespace='BotEvents'):
        break
      prev_val = m.get(key, for_cas=True, namespace='BotEvents')
      if prev_val is None:
        # Nothing to append to; start over with add().
        continue
      data = prev_val + [event_type, now.second]
      # Keep the data for one hour. If the cron job cannot reap it within 1h,
      # it's probably broken.
      if m.cas(key, data, time=3600, namespace='BotEvents'):
        break