Ejemplo n.º 1
0
def _update_stats(run_result, bot_id, request, completed):
    """Records stats entries following a task update notification from a bot.

    A non-completing update adds a single 'run_updated' run entry. A completing
    update adds a 'run_completed' run entry carrying the runtime and a
    'task_completed' task entry carrying the pending time.

    Args:
      run_result: run result entity; its key, duration_total and started_ts are
          read.
      bot_id: ID of the bot that sent the update.
      request: request entity of the task; its key, created_ts, user and
          properties.dimensions are read.
      completed: truthy when this update completes the task.
    """
    if not completed:
        stats.add_run_entry(
            "run_updated",
            run_result.key,
            bot_id=bot_id,
            dimensions=request.properties.dimensions,
        )
        return

    # Both durations fall back to 0 when the value they derive from is unset.
    total = run_result.duration_total
    runtime_ms = _secs_to_ms(total.total_seconds()) if total else 0
    started = run_result.started_ts
    pending_ms = _secs_to_ms((started - request.created_ts).total_seconds()) if started else 0

    stats.add_run_entry(
        "run_completed",
        run_result.key,
        bot_id=bot_id,
        dimensions=request.properties.dimensions,
        runtime_ms=runtime_ms,
        user=request.user,
    )
    summary_key = task_pack.request_key_to_result_summary_key(request.key)
    stats.add_task_entry(
        "task_completed",
        summary_key,
        dimensions=request.properties.dimensions,
        pending_ms=pending_ms,
        user=request.user,
    )
Ejemplo n.º 2
0
def bot_reap_task(dimensions, bot_id, bot_version):
    """Reaps a TaskToRun for the bot, if one is available.

    Walks the candidates yielded by
    task_to_run.yield_next_available_task_to_dispatch() and tries to reap each
    one via _reap_task() until one succeeds. On success a 'run_started' stats
    entry carrying the pending time is recorded.

    Args:
      dimensions: dimensions of the bot, used to select candidate tasks.
      bot_id: ID of the bot; must be truthy.
      bot_version: version of the bot, forwarded to _reap_task().

    Returns:
      tuple of (TaskRequest, TaskRunResult) for the task that was reaped, or
      (None, None) when nothing could be reaped. The TaskToRun involved is not
      returned.
    """
    assert bot_id
    candidates = task_to_run.yield_next_available_task_to_dispatch(dimensions)
    # When a large number of bots try to reap hundreds of tasks simultaneously,
    # they keep failing to reap because other bots preempt them. So randomly
    # jump farther in the queue when the number of failures grows.
    failed = 0
    skipped = 0
    pending_skips = 0
    for request, to_run in candidates:
        if pending_skips:
            # Still jumping ahead: pass over this candidate without trying it.
            pending_skips -= 1
            skipped += 1
            continue

        run_result = _reap_task(to_run.key, request, bot_id, bot_version, dimensions)
        if run_result:
            # Try to optimize these values but do not add as formal stats (yet).
            logging.info("failed %d, skipped %d", failed, skipped)

            waited = run_result.started_ts - request.created_ts
            stats.add_run_entry(
                "run_started",
                run_result.key,
                bot_id=bot_id,
                dimensions=request.properties.dimensions,
                pending_ms=_secs_to_ms(waited.total_seconds()),
                user=request.user,
            )
            return request, run_result

        failed += 1
        # Every 3 failures, starting on the very first one, jump randomly ahead
        # of the pack. This reduces the contention where hundreds of bots fight
        # for exactly the same task while many others are waiting in the queue.
        if failed % 3 == 1:
            # TODO(maruel): Choose the curve that makes the most sense. The
            # tricky part is guessing the load without much information. A high
            # failure count means many bots are reaping, so there is a good
            # flow of tasks; on the other hand, skipping too much is useless.
            # So it should have an initial bump, then slow down on skipping.
            pending_skips = min(int(round(random.gammavariate(3, 1))), 30)

    if failed:
        logging.info("Chose nothing (failed %d, skipped %d)", failed, skipped)
    return None, None
Ejemplo n.º 3
0
def bot_kill_task(run_result_key, bot_id):
    """Marks a running task as BOT_DIED, flagged as an internal failure.

    The run result and its result summary are updated in a single transaction.
    A 'run_bot_died' stats entry is added only when the transaction actually
    changed the task.

    Args:
      run_result_key: key of the run result entity to terminate.
      bot_id: ID of the bot requesting the kill. When truthy it must match the
          bot_id stored on the run result; a falsy value skips that check.

    Returns:
      str with an error message when the request came from a different bot or
      the transaction failed to commit; None otherwise, including when the task
      was already in BOT_DIED state.
    """
    summary_key = task_pack.run_result_key_to_result_summary_key(
        run_result_key)
    request = task_pack.result_summary_key_to_request_key(summary_key).get()
    server_version = utils.get_app_version()
    now = utils.utcnow()
    packed = task_pack.pack_run_result_key(run_result_key)

    def txn():
        """Returns tuple(updated run_result or None, error message or None)."""
        run_result, result_summary = ndb.get_multi(
            (run_result_key, summary_key))
        if bot_id and run_result.bot_id != bot_id:
            error = 'Bot %s sent task kill for task %s owned by bot %s' % (
                bot_id, packed, run_result.bot_id)
            return None, error

        if run_result.state == task_result.State.BOT_DIED:
            # Already marked dead; nothing to change and nothing to report.
            return None, None

        run_result.signal_server_version(server_version)
        run_result.state = task_result.State.BOT_DIED
        run_result.internal_failure = True
        run_result.abandoned_ts = now
        run_result.modified_ts = now
        result_summary.set_from_run_result(run_result, None)

        # Start the writes, issue the notification, then wait on the writes.
        pending_puts = ndb.put_multi_async((run_result, result_summary))
        _maybe_pubsub_notify_via_tq(result_summary, request)
        for put in pending_puts:
            put.check_success()

        return run_result, None

    try:
        run_result, msg = datastore_utils.transaction(txn)
    except datastore_utils.CommitError as e:
        # At worst, the task will be tagged as BOT_DIED after BOT_PING_TOLERANCE
        # seconds passed on the next cron_handle_bot_died cron job.
        return 'Failed killing task %s: %s' % (packed, e)

    if not run_result:
        return msg

    stats.add_run_entry('run_bot_died',
                        run_result.key,
                        bot_id=run_result.bot_id,
                        dimensions=request.properties.dimensions,
                        user=request.user)
    return msg
Ejemplo n.º 4
0
def bot_kill_task(run_result_key, bot_id):
    """Terminates a currently running task, recording it as an internal failure.

    Inside one transaction the run result is switched to BOT_DIED and the
    result summary is refreshed from it. When that update went through, a
    'run_bot_died' stats entry is recorded.

    Args:
      run_result_key: key of the run result entity to terminate.
      bot_id: ID of the bot asking for the kill. If truthy and different from
          the bot_id stored on the run result, the request is refused.

    Returns:
      str describing the error if the kill was refused or the transaction could
      not be committed, None otherwise (also when the run result was already
      BOT_DIED).
    """
    summary_key = task_pack.run_result_key_to_result_summary_key(run_result_key)
    request = task_pack.result_summary_key_to_request_key(summary_key).get()
    server_version = utils.get_app_version()
    now = utils.utcnow()
    packed = task_pack.pack_run_result_key(run_result_key)

    def mark_dead():
        """Returns tuple(updated run_result or None, error message or None)."""
        run_result, result_summary = ndb.get_multi((run_result_key, summary_key))
        owner = run_result.bot_id
        if bot_id and owner != bot_id:
            return None, "Bot %s sent task kill for task %s owned by bot %s" % (bot_id, packed, owner)

        if run_result.state == task_result.State.BOT_DIED:
            # Ignore this failure: the run is already flagged as dead.
            return None, None

        run_result.signal_server_version(server_version)
        run_result.state = task_result.State.BOT_DIED
        run_result.internal_failure = True
        run_result.abandoned_ts = now
        run_result.modified_ts = now
        result_summary.set_from_run_result(run_result, None)

        # The notification is issued while the asynchronous puts are in flight.
        puts = ndb.put_multi_async((run_result, result_summary))
        _maybe_pubsub_notify_via_tq(result_summary, request)
        for put in puts:
            put.check_success()

        return run_result, None

    try:
        run_result, msg = datastore_utils.transaction(mark_dead)
    except datastore_utils.CommitError as e:
        # At worst, the task will be tagged as BOT_DIED after BOT_PING_TOLERANCE
        # seconds passed on the next cron_handle_bot_died cron job.
        return "Failed killing task %s: %s" % (packed, e)

    if not run_result:
        return msg

    stats.add_run_entry(
        "run_bot_died",
        run_result.key,
        bot_id=run_result.bot_id,
        dimensions=request.properties.dimensions,
        user=request.user,
    )
    return msg
Ejemplo n.º 5
0
def _update_stats(run_result, bot_id, request, completed):
  """Records stats entries following a task update notification from a bot.

  A non-completing update adds a single 'run_updated' run entry. A completing
  update adds a 'run_completed' run entry carrying the run duration and a
  'task_completed' task entry whose pending_ms spans from the request's
  creation to the run's completion.

  Args:
    run_result: run result entity; its key, duration and completed_ts are read.
        Both duration and completed_ts are dereferenced unconditionally when
        completed is truthy.
    bot_id: ID of the bot that sent the update.
    request: request entity of the task; its key, created_ts, user and
        properties.dimensions are read.
    completed: truthy when this update completes the task.
  """
  if not completed:
    stats.add_run_entry(
        'run_updated',
        run_result.key,
        bot_id=bot_id,
        dimensions=request.properties.dimensions)
    return

  runtime = run_result.duration
  stats.add_run_entry(
      'run_completed',
      run_result.key,
      bot_id=bot_id,
      dimensions=request.properties.dimensions,
      runtime_ms=_secs_to_ms(runtime.total_seconds()),
      user=request.user)

  summary_key = task_pack.request_key_to_result_summary_key(request.key)
  elapsed = run_result.completed_ts - request.created_ts
  stats.add_task_entry(
      'task_completed',
      summary_key,
      dimensions=request.properties.dimensions,
      pending_ms=_secs_to_ms(elapsed.total_seconds()),
      user=request.user)
Ejemplo n.º 6
0
def _handle_dead_bot(run_result_key):
    """Handles a TaskRunResult whose bot has stopped showing signs of life.

    Transactionally updates the entities depending on the state of this task.
    The task may be retried automatically, canceled or left alone.

    Args:
      run_result_key: key of the run result entity whose bot is presumed dead.

    Returns:
      True if the task was retried, False if the task was killed, None
      otherwise: the run result was no longer RUNNING, the transaction failed
      to commit, or only a run result that is not the result summary's current
      try was canceled.
    """
    result_summary_key = task_pack.run_result_key_to_result_summary_key(run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
    # Fetch the request asynchronously while the other values are computed.
    request_future = request_key.get_async()
    now = utils.utcnow()
    server_version = utils.get_app_version()
    packed = task_pack.pack_run_result_key(run_result_key)
    request = request_future.get_result()
    to_run_key = task_to_run.request_to_task_to_run_key(request)

    def run():
        """Returns tuple(task_is_retried or None, bot_id)."""
        # Do one GET, one PUT at the end.
        run_result, result_summary, to_run = ndb.get_multi((run_result_key, result_summary_key, to_run_key))
        if run_result.state != task_result.State.RUNNING:
            # It was updated already or not updating last. Likely DB index was stale.
            return None, run_result.bot_id

        run_result.signal_server_version(server_version)
        run_result.modified_ts = now
        # Every remaining outcome flags this run as dead.
        run_result.state = task_result.State.BOT_DIED
        run_result.internal_failure = True
        run_result.abandoned_ts = now

        notify = False
        if result_summary.try_number != run_result.try_number:
            # Not updating correct run_result, cancel it without touching
            # result_summary.
            to_put = (run_result,)
            task_is_retried = None
        elif result_summary.try_number == 1 and now < request.expiration_ts:
            # Retry it: give the TaskToRun a queue number again.
            to_put = (run_result, result_summary, to_run)
            to_run.queue_number = task_to_run.gen_queue_number(request)
            # Do not sync data from run_result to result_summary, since the task is
            # being retried.
            result_summary.reset_to_pending()
            result_summary.modified_ts = now
            task_is_retried = True
        else:
            # Cancel it, there was more than one try or the task expired in the
            # meantime.
            to_put = (run_result, result_summary)
            result_summary.set_from_run_result(run_result, request)
            notify = True
            task_is_retried = False

        # Start the writes, notify if needed, then wait on the writes.
        futures = ndb.put_multi_async(to_put)
        if notify:
            _maybe_pubsub_notify_via_tq(result_summary, request)
        for f in futures:
            f.check_success()

        return task_is_retried, run_result.bot_id

    try:
        task_is_retried, bot_id = datastore_utils.transaction(run)
    except datastore_utils.CommitError:
        task_is_retried, bot_id = None, None

    if task_is_retried is None:
        logging.info("Ignored %s", packed)
        return None

    task_to_run.set_lookup_cache(to_run_key, task_is_retried)
    if task_is_retried:
        logging.info("Retried %s", packed)
    else:
        # bot_id is the scalar run_result.bot_id returned by run(); pass it
        # whole rather than indexing into it, which would keep only its first
        # character.
        stats.add_run_entry(
            "run_bot_died",
            run_result_key,
            bot_id=bot_id,
            dimensions=request.properties.dimensions,
            user=request.user,
        )
    return task_is_retried
Ejemplo n.º 7
0
def _handle_dead_bot(run_result_key):
    """Handles a TaskRunResult whose bot has stopped showing signs of life.

    Transactionally updates the entities depending on the state of this task.
    The task may be retried automatically, canceled or left alone.

    Args:
      run_result_key: key of the run result entity whose bot is presumed dead.

    Returns:
      True if the task was retried, False if the task was killed, None
      otherwise: the run result was no longer RUNNING, it was modified within
      BOT_PING_TOLERANCE, the transaction failed to commit, or only a run
      result that is not the result summary's current try was canceled.
    """
    result_summary_key = task_pack.run_result_key_to_result_summary_key(
        run_result_key)
    request_key = task_pack.result_summary_key_to_request_key(
        result_summary_key)
    # Fetch the request asynchronously while the other values are computed.
    request_future = request_key.get_async()
    now = utils.utcnow()
    server_version = utils.get_app_version()
    packed = task_pack.pack_run_result_key(run_result_key)
    request = request_future.get_result()
    to_run_key = task_to_run.request_to_task_to_run_key(request)

    def run():
        """Returns tuple(task_is_retried or None, bot_id)."""
        # Do one GET, one PUT at the end.
        run_result, result_summary, to_run = ndb.get_multi(
            (run_result_key, result_summary_key, to_run_key))
        if run_result.state != task_result.State.RUNNING:
            # It was updated already or not updating last. Likely DB index was stale.
            return None, run_result.bot_id
        if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:
            # The query index IS stale: the run was modified recently enough.
            return None, run_result.bot_id

        run_result.signal_server_version(server_version)
        run_result.modified_ts = now
        # Every remaining outcome flags this run as dead.
        run_result.state = task_result.State.BOT_DIED
        run_result.internal_failure = True
        run_result.abandoned_ts = now

        notify = False
        if result_summary.try_number != run_result.try_number:
            # Not updating correct run_result, cancel it without touching
            # result_summary.
            to_put = (run_result, )
            task_is_retried = None
        elif result_summary.try_number == 1 and now < request.expiration_ts:
            # Retry it: give the TaskToRun a queue number again.
            to_put = (run_result, result_summary, to_run)
            to_run.queue_number = task_to_run.gen_queue_number(request)
            # Do not sync data from run_result to result_summary, since the task is
            # being retried.
            result_summary.reset_to_pending()
            result_summary.modified_ts = now
            task_is_retried = True
        else:
            # Cancel it, there was more than one try or the task expired in the
            # meantime.
            to_put = (run_result, result_summary)
            result_summary.set_from_run_result(run_result, request)
            notify = True
            task_is_retried = False

        # Start the writes, notify if needed, then wait on the writes.
        futures = ndb.put_multi_async(to_put)
        if notify:
            _maybe_pubsub_notify_via_tq(result_summary, request)
        for f in futures:
            f.check_success()

        return task_is_retried, run_result.bot_id

    try:
        task_is_retried, bot_id = datastore_utils.transaction(run)
    except datastore_utils.CommitError:
        task_is_retried, bot_id = None, None

    if task_is_retried is None:
        logging.info('Ignored %s', packed)
        return None

    task_to_run.set_lookup_cache(to_run_key, task_is_retried)
    if task_is_retried:
        logging.info('Retried %s', packed)
    else:
        # bot_id is the scalar run_result.bot_id returned by run(); pass it
        # whole rather than indexing into it, which would keep only its first
        # character.
        stats.add_run_entry('run_bot_died',
                            run_result_key,
                            bot_id=bot_id,
                            dimensions=request.properties.dimensions,
                            user=request.user)
    return task_is_retried