Example #1
0
def config_for_task(request):
    """Looks up the external scheduler configuration for a task request.

    The pool is read from the first task slice. The dimensions used for the
    lookup are the flat dimensions shared by every slice of the request.

    Arguments:
      request: a task_request.TaskRequest instance.

    Returns:
      Whatever _config_for_dimensions() yields for the pool config and the
      common flat dimensions, or None when the first slice has no pool, the
      pool has no config, or the pool config lists no external schedulers.
    """
    first_slice = request.task_slice(0)
    pool_name = first_slice.properties.pool
    # Only query the pool config when a pool is actually set.
    pool_cfg = pools_config.get_pool_config(pool_name) if pool_name else None
    if not (pool_cfg and pool_cfg.external_schedulers):
        return None

    # Start from the first slice and keep only what every other slice shares.
    shared = set(
        task_queues.bot_dimensions_to_flat(first_slice.properties.dimensions))
    for index in range(1, request.num_task_slices):
        slice_flat = task_queues.bot_dimensions_to_flat(
            request.task_slice(index).properties.dimensions)
        shared &= set(slice_flat)

    return _config_for_dimensions(pool_cfg, shared)
Example #2
0
def match_dimensions(request_dimensions, bot_dimensions):
    """Tells whether a bot's dimensions satisfy the requested dimensions.

    Arguments:
      request_dimensions: dict of the dimensions asked for by the request.
      bot_dimensions: dict of the dimensions reported by the bot.

    Returns:
      False as soon as a requested key is absent from the bot. Otherwise True
      if at least one flat expansion of the request is fully contained in the
      bot's flat dimensions, else False.
    """
    assert isinstance(request_dimensions, dict), request_dimensions
    assert isinstance(bot_dimensions, dict), bot_dimensions

    # Cheap rejection: every requested key must exist on the bot.
    for dimension_key in request_dimensions:
        if dimension_key not in bot_dimensions:
            return False

    available = frozenset(task_queues.bot_dimensions_to_flat(bot_dimensions))
    for candidate in task_queues.expand_dimensions_to_flats(
            request_dimensions):
        if available.issuperset(candidate):
            return True
    return False
Example #3
0
def _gen_bot_info(key_id, last_seen_ts, **kwargs):
    """Builds a bot_management.BotInfo entity with default test values.

    Arguments:
      key_id: bot id, used for the entity key and the default 'bot_id'
          dimension.
      last_seen_ts: value stored as last_seen_ts.
      kwargs: overrides for the defaults; a 'dimensions' override is flattened
          into 'dimensions_flat' like the default one.

    Returns:
      The bot_management.BotInfo instance.
    """
    fields = dict(
        key=ndb.Key('BotRoot', key_id, 'BotInfo', 'info'),
        last_seen_ts=last_seen_ts,
        dimensions={
            'os': ['Linux', 'Ubuntu'],
            'bot_id': [key_id],
        },
        state={},
    )
    fields.update(kwargs)
    # BotInfo takes the flat form, so swap 'dimensions' for 'dimensions_flat'.
    nested_dimensions = fields.pop('dimensions')
    fields['dimensions_flat'] = task_queues.bot_dimensions_to_flat(
        nested_dimensions)
    return bot_management.BotInfo(**fields)
Example #4
0
def config_for_bot(bot_dimensions):
    """Looks up the external scheduler configuration for a bot.

    Arguments:
    - bot_dimensions: The dimensions of the bot as a dictionary in
          {string key: list of string values} format.

    Returns:
      Whatever _config_for_dimensions() yields for the bot's pool config and
      the set of its flat dimensions.
    """
    # Resolve the pool config first, then flatten the dimensions for the lookup.
    cfg = _bot_pool_cfg(bot_dimensions)
    flat = task_queues.bot_dimensions_to_flat(bot_dimensions)
    return _config_for_dimensions(cfg, set(flat))
 def test_dimensions_to_flat_long_unicode(self):
     """Checks flattening of BMP unicode values at and past 256 characters.

     The 256-character value is expected unchanged; the 257-character value is
     expected cut to 256 characters followed by an ellipsis.
     """
     long_key = u'a' * 64
     fits = u'⌛' * 256
     too_long = u'⛔' * 257
     flattened = task_queues.bot_dimensions_to_flat(
         {long_key: [fits, too_long]})
     prefix = long_key + u':'
     self.assertEqual(
         [
             prefix + fits,
             prefix + u'⛔' * 256 + u'…',
         ],
         flattened)
 def test_dimensions_to_flat_long_ascii(self):
     """Checks flattening of ASCII values at and past 256 characters.

     The 257-character value is expected cut to 256 characters followed by an
     ellipsis; the 256-character value is expected unchanged.
     """
     long_key = u'a' * 64
     too_long = u'b' * 257
     fits = u'c' * 256
     flattened = task_queues.bot_dimensions_to_flat(
         {long_key: [too_long, fits]})
     prefix = long_key + u':'
     self.assertEqual(
         [
             prefix + u'b' * 256 + u'…',
             prefix + fits,
         ],
         flattened)
 def test_dimensions_to_flat_long_unicode_non_BMP(self):
     """Checks flattening of values made of supplementary-plane characters.

     For non-BMP characters, the length is effectively halved for now.
     """
     long_key = u'a' * 64
     # Python counts a supplementary-plane emoji as 2 characters on UCS2
     # builds and as 1 character on UCS4 builds, so the limit in emoji differs.
     limit = 256 if sys.maxunicode >= 65536 else 128
     too_long = u'💥' * (limit + 1)
     fits = u'😬' * limit
     flattened = task_queues.bot_dimensions_to_flat(
         {long_key: [too_long, fits]})
     prefix = long_key + u':'
     self.assertEqual(
         [
             prefix + u'💥' * limit + u'…',
             prefix + fits,
         ],
         flattened)
Example #8
0
def assign_task(es_cfg, bot_dimensions):
    """Asks the external scheduler for a task for one idle bot.

    Arguments:
      es_cfg: pools_config.ExternalSchedulerConfig instance.
      bot_dimensions: dimensions {string key: list of string values}. The
          first value of the u'id' key is used as the bot id.

    Returns:
      (Task id string, slice number) tuple or (None, None) if no task
      to assign.
    """
    idle_bot_id = bot_dimensions[u'id'][0]
    logging.debug('Using external scheduler address: %s id: %s for bot %s',
                  es_cfg.address, es_cfg.id, idle_bot_id)

    # Build a request describing exactly one idle bot.
    assign_request = plugin_pb2.AssignTasksRequest()
    bot_pb = assign_request.idle_bots.add()
    bot_pb.bot_id = idle_bot_id
    bot_pb.dimensions.extend(
        task_queues.bot_dimensions_to_flat(bot_dimensions))
    assign_request.scheduler_id = es_cfg.id
    assign_request.time.GetCurrentTime()

    client = _get_client(es_cfg.address)

    # TODO(akeshet): Catch or handle errors appropriately.
    response = client.AssignTasks(assign_request, credentials=_creds())

    if not (response and response.assignments):
        return None, None

    # One bot was offered, so exactly one assignment for it is expected.
    assert len(response.assignments) == 1
    assignment = response.assignments[0]
    assert assignment.bot_id == idle_bot_id

    return assignment.task_id, assignment.slice_number
Example #9
0
def notify_requests(es_cfg, requests, use_tq, is_callback, batch_mode=False):
    """Notifies the external scheduler about the state of a set of tasks.

    Arguments:
      - es_cfg: pools_config.ExternalSchedulerConfig for external scheduler to
          notify.
      - requests:
        A list of (task_request.TaskRequest,
                   task_result.TaskResultSummary or task_result.TaskRunResult)
        tuples.
      - use_tq: If true, make this call on a task queue (within the current
                datastore transaction). If false, the notification is sent
                immediately through notify_request_now().
      - is_callback: If true, indicates that this notification was in response
                     to a external-scheduler-requested callback. The value is
                     copied to the request's is_callback field.
      - batch_mode: If true, the notifications will be sent in a batched mode
                    along with others, to reduce traffic to external scheduler.
                    Only valid when use_tq and global config's
                    enable_batch_es_notifications are true.

    Returns: Nothing.

    Raises:
      datastore_utils.CommitError: if the notification task could not be
          enqueued.
    """
    logging.debug(
        'notify_requests(es_cfg=(%s,%s), requests=%s, use_tq=%s, '
        'is_callback=%s, batch_mode=%s)', es_cfg.address, es_cfg.id,
        [task_request.task_id for task_request, _ in requests], use_tq,
        is_callback, batch_mode)

    notify_pb = plugin_pb2.NotifyTasksRequest()
    notify_pb.is_callback = is_callback

    for task_request, task_result in requests:
        notification = notify_pb.notifications.add()
        # TODO(akeshet): This time should possibly come from the read time from
        # datastore, rather than the local server clock.
        notification.time.FromDatetime(utils.utcnow())

        task_pb = notification.task
        task_pb.id = task_request.task_id
        task_pb.tags.extend(task_request.tags)
        task_pb.enqueued_time.FromDatetime(task_request.created_ts)
        # One slice message per task slice, carrying its flat dimensions.
        for index in range(task_request.num_task_slices):
            task_slice = task_request.task_slice(index)
            slice_dimensions = task_queues.bot_dimensions_to_flat(
                task_slice.properties.dimensions)
            slice_pb = task_pb.slices.add()
            slice_pb.dimensions.extend(slice_dimensions)

        # Convert the result to its proto form only to read the state out.
        result_pb = swarming_pb2.TaskResult()
        task_result.to_proto(result_pb)
        task_pb.state = result_pb.state
        if task_result.bot_id:
            # TODO(akeshet): We should only actually set this is state is
            # running.
            task_pb.bot_id = task_result.bot_id

    notify_pb.scheduler_id = es_cfg.id

    if not use_tq:
        # Ignore return value, the response proto is empty.
        notify_request_now(es_cfg.address, notify_pb)
        return

    request_json = json_format.MessageToJson(notify_pb)

    # Non-batched path: one push task per notification request.
    if not (batch_mode and config.settings().enable_batch_es_notifications):
        enqueued = utils.enqueue_task(
            '/internal/taskqueue/important/external_scheduler/notify-tasks',
            'es-notify-tasks',
            params={
                'es_host': es_cfg.address,
                'request_json': request_json
            },
            transactional=ndb.in_transaction())
        if not enqueued:
            raise datastore_utils.CommitError('Failed to enqueue task')
        return

    # Batched path: park the notification on a pull queue so that it is sent
    # along with others, to reduce traffic to external scheduler.
    payload = {'es_host': es_cfg.address, 'request_json': request_json}
    pull_task = taskqueue.Task(payload=json.dumps(payload), method='PULL')
    added = pull_task.add(queue_name='es-notify-tasks-batch',
                          transactional=ndb.in_transaction())
    if not added:
        raise datastore_utils.CommitError('Failed to enqueue task')

    kick_stats = taskqueue.QueueStatistics.fetch('es-notify-kick')
    # Add a kicker task if there are fewer than 10 minutes worth.
    if kick_stats.tasks < 600:
        kick_enqueued = utils.enqueue_task(
            '/internal/taskqueue/important/external_scheduler/notify-kick',
            'es-notify-kick',
            transactional=ndb.in_transaction())
        if not kick_enqueued:
            logging.info('Failed to add a notify-kick for request.')
 def test_dimensions_to_flat_duplicate_value(self):
     """Checks that a value repeated under one key is flattened only once."""
     dimensions = {u'a': [u'c', u'c']}
     flattened = task_queues.bot_dimensions_to_flat(dimensions)
     self.assertEqual([u'a:c'], flattened)
 def test_bot_dimensions_to_flat(self):
     """Checks that dimensions flatten to ordered u'key:value' strings."""
     dimensions = {
         u'a': [u'c', u'bee'],
         u'cee': [u'zee'],
     }
     flattened = task_queues.bot_dimensions_to_flat(dimensions)
     wanted = [u'a:bee', u'a:c', u'cee:zee']
     self.assertEqual(wanted, flattened)
Example #12
0
def bot_event(event_type, bot_id, external_ip, authenticated_as, dimensions,
              state, version, quarantined, maintenance_msg, task_id, task_name,
              register_dimensions, **kwargs):
    """Records when a bot has queried for work.

    This event happening usually means the bot is alive (not dead), except for
    'bot_missing' event which is created by server. It may be quarantined, and
    in this case, it will be evicted from the task queues.

    If it's declaring maintenance, it will not be evicted from the task queues,
    as maintenance is supposed to be temporary and expected to complete within
    a reasonable time frame.

    The function updates the bot's BotInfo entity, optionally stores a new
    BotEvent entity, and finally appends [event_type, second] to a per-minute
    memcache entry in the 'BotEvents' namespace.

    Arguments:
    - event_type: event type, one of BotEvent.ALLOWED_EVENTS.
    - bot_id: bot id. When empty, nothing is done at all.
    - external_ip: IP address as seen by the HTTP handler.
    - authenticated_as: bot identity as seen by the HTTP handler.
    - dimensions: Bot's dimensions as self-reported. If not provided, keep
          previous value.
    - state: ephemeral state of the bot. It is expected to change constantly.
          If not provided (or empty), keep previous value.
    - version: swarming_bot.zip version as self-reported. Used to spot if a bot
          failed to update promptly. If not provided, keep previous value.
    - quarantined: bool to determine if the bot was declared quarantined. None
          keeps the previous value.
    - maintenance_msg: string describing why the bot is in maintenance.
    - task_id: packed task id if relevant. Set to '' to zap the stored value.
    - task_name: task name if relevant. Zapped when task_id is zapped.
    - register_dimensions: bool to specify whether to register dimensions to
      BotInfo.
    - kwargs: optional values to add to BotEvent relevant to event_type.

    Returns:
      ndb.Key to BotEvent entity if one was added. None when bot_id is empty,
      or when only BotInfo was saved (a 'request_sleep' or 'task_update' event
      without a dimensions update and without quarantine).
    """
    # Nothing can be recorded without a bot id; note that this returns before
    # the try/finally below, so memcache is not touched either.
    if not bot_id:
        return

    # Retrieve the previous BotInfo and update it.
    info_key = get_info_key(bot_id)
    bot_info = info_key.get()
    if not bot_info:
        bot_info = BotInfo(key=info_key)
        # Register only id and pool dimensions at the first handshake.
        # NOTE(review): dimensions is passed here without the `if dimensions`
        # guard used further down; confirm bot_dimensions_to_flat() copes with
        # a missing value when the first event for a bot carries none.
        dimensions_flat = task_queues.bot_dimensions_to_flat(dimensions)
        bot_info.dimensions_flat = [
            d for d in dimensions_flat
            if d.startswith('id:') or d.startswith('pool:')
        ]

    now = utils.utcnow()
    # bot_missing event is created by a server, not a bot.
    # So it shouldn't update last_seen_ts, external_ip, authenticated_as,
    # maintenance_msg.
    # If the last_seen_ts gets updated, it would change the bot composite
    # to alive. And if it clears maintenance_msg, it would change the composite
    # to NOT_IN_MAINTENANCE and lose the message.
    if event_type != 'bot_missing':
        bot_info.last_seen_ts = now
        bot_info.external_ip = external_ip
        bot_info.authenticated_as = authenticated_as
        bot_info.maintenance_msg = maintenance_msg
    # dimensions_flat stays empty when the caller gave no dimensions; the
    # BotEvent below then falls back to the value stored on BotInfo.
    dimensions_updated = False
    dimensions_flat = []
    if dimensions:
        dimensions_flat = task_queues.bot_dimensions_to_flat(dimensions)
        # BotInfo is only overwritten when the caller asked for registration
        # and the flat dimensions actually differ.
        if register_dimensions and bot_info.dimensions_flat != dimensions_flat:
            logging.debug('bot_event: Updating dimensions. from: %s, to: %s',
                          bot_info.dimensions_flat, dimensions_flat)
            bot_info.dimensions_flat = dimensions_flat
            dimensions_updated = True
    if state:
        bot_info.state = state
    if quarantined is not None:
        bot_info.quarantined = quarantined
    if task_id is not None:
        bot_info.task_id = task_id
    # Remove the task from the BotInfo summary in the following cases
    # 1) When the task finishes (event_type=task_XXX)
    #    In these cases, the BotEvent shall have the task
    #    since the event still refers to it
    # 2) When the bot is polling (event_type=request_sleep)
    #    The bot has already finished the previous task.
    #    But it could have forgotten to remove the task from the BotInfo.
    #    So ensure the task is removed.
    # 3) When the bot is missing
    #    We assume it can't process assigned task anymore.
    if event_type in ('task_completed', 'task_error', 'task_killed',
                      'request_sleep', 'bot_missing'):
        bot_info.task_id = None
        bot_info.task_name = None
    # A provided task_name is applied after the zapping above, so it wins even
    # for the event types listed there.
    if task_name:
        bot_info.task_name = task_name
    if version is not None:
        bot_info.version = version

    if quarantined:
        # Make sure it is not in the queue since it can't reap anything.
        task_queues.cleanup_after_bot(info_key.parent())

    try:
        # Decide whether saving the event.
        # It's not much of an event worth saving a BotEvent for but it's worth
        # updating BotInfo. The only reason BotInfo is GET is to keep
        # first_seen_ts.
        # It's not necessary to use a transaction here since no BotEvent is
        # being added, only last_seen_ts is really updated.
        # crbug.com/1015365: It's useful saving BotEvent when dimensions
        # updates.
        # crbug.com/952984: It needs to save BotEvent when quarantined.
        skip_save_event = (not dimensions_updated and not quarantined
                           and event_type in ('request_sleep', 'task_update'))
        if skip_save_event:
            bot_info.put()
            return

        # When it's a 'bot_*' or 'request_*' event, use the dimensions provided
        # by the bot.
        # When it's a 'task_*' event, use BotInfo.dimensions_flat since
        # dimensions aren't provided by the bot.
        event_dimensions_flat = dimensions_flat or bot_info.dimensions_flat

        # The event keeps the task_id argument even when BotInfo.task_id was
        # zapped above, so a finishing task is still referenced by its event.
        event = BotEvent(parent=get_root_key(bot_id),
                         event_type=event_type,
                         external_ip=external_ip,
                         authenticated_as=authenticated_as,
                         dimensions_flat=event_dimensions_flat,
                         quarantined=bot_info.quarantined,
                         maintenance_msg=bot_info.maintenance_msg,
                         state=bot_info.state,
                         task_id=task_id or bot_info.task_id,
                         version=bot_info.version,
                         **kwargs)

        # bot_info is handed to store_new_version() together with the event.
        # NOTE(review): presumably both are written in one transaction;
        # confirm against datastore_utils.store_new_version.
        datastore_utils.store_new_version(event, BotRoot, [bot_info])
        return event.key
    finally:
        # This runs on every exit from the try block above: both returns and
        # any exception raised while saving.
        # Store the event in memcache to accelerate monitoring.
        # key is at minute resolution, because that's the monitoring precision.
        key = '%s:%s' % (bot_id, now.strftime('%Y-%m-%dT%H:%M'))
        m = memcache.Client()
        # Retry until the entry is created or the compare-and-set succeeds.
        # NOTE(review): there is no retry limit, so this loops for as long as
        # add(), get() and cas() keep failing.
        while True:
            data = [event_type, now.second]
            # First event of this minute: create the entry.
            if m.add(key, data, time=3600, namespace='BotEvents'):
                break
            prev_val = m.get(key, for_cas=True, namespace='BotEvents')
            if prev_val is None:
                # Nothing to append to (add() failed yet get() found no
                # value); start over with add().
                continue
            data = prev_val + [event_type, now.second]
            # Keep the data for one hour. If the cron job cannot reap it within
            # 1h, it's probably broken.
            if m.cas(key, data, time=3600, namespace='BotEvents'):
                break