def _update_stats(run_result, bot_id, request, completed):
  """Updates stats after a bot task update notification."""
  if completed:
    runtime_ms = 0
    if run_result.duration_total:
      runtime_ms = _secs_to_ms(run_result.duration_total.total_seconds())
    pending_ms = 0
    if run_result.started_ts:
      pending_ms = _secs_to_ms(
          (run_result.started_ts - request.created_ts).total_seconds())
    stats.add_run_entry(
        'run_completed', run_result.key,
        bot_id=bot_id,
        dimensions=request.properties.dimensions,
        runtime_ms=runtime_ms,
        user=request.user)
    stats.add_task_entry(
        'task_completed',
        task_pack.request_key_to_result_summary_key(request.key),
        dimensions=request.properties.dimensions,
        pending_ms=pending_ms,
        user=request.user)
  else:
    stats.add_run_entry(
        'run_updated', run_result.key,
        bot_id=bot_id,
        dimensions=request.properties.dimensions)

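# Illustrative sketch only, not part of the scheduler API: a worked example of
# the two latency figures recorded above, for a hypothetical task. pending_ms
# covers created_ts -> started_ts (time spent queued), while runtime_ms is the
# bot-reported execution time in duration_total. The inline '* 1000.' rounding
# mirrors what _secs_to_ms() is assumed to do.
def _example_latency_breakdown():
  import datetime
  created_ts = datetime.datetime(2015, 1, 1, 12, 0, 0)
  started_ts = created_ts + datetime.timedelta(seconds=4.2)  # Queued for 4.2s.
  duration_total = datetime.timedelta(seconds=31.5)          # Ran for 31.5s.
  pending_ms = int(round((started_ts - created_ts).total_seconds() * 1000.))
  runtime_ms = int(round(duration_total.total_seconds() * 1000.))
  return pending_ms, runtime_ms  # (4200, 31500)
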
def bot_reap_task(dimensions, bot_id, bot_version):
  """Reaps a TaskToRun if one is available.

  The process is to find a TaskToRun where its .queue_number is set, then
  create a TaskRunResult for it.

  Returns:
    tuple of (TaskRequest, TaskRunResult) for the task that was reaped. The
    TaskToRun involved is not returned.
  """
  assert bot_id
  q = task_to_run.yield_next_available_task_to_dispatch(dimensions)
  # When a large number of bots try to reap hundreds of tasks simultaneously,
  # they'll constantly fail to call reap_task_to_run() as they'll get preempted
  # by other bots. So randomly jump farther in the queue when the number of
  # failures is too large.
  failures = 0
  to_skip = 0
  total_skipped = 0
  for request, to_run in q:
    if to_skip:
      to_skip -= 1
      total_skipped += 1
      continue

    run_result = _reap_task(
        to_run.key, request, bot_id, bot_version, dimensions)
    if not run_result:
      failures += 1
      # Every 3 failures starting on the very first one, jump randomly ahead of
      # the pack. This reduces the contention where hundreds of bots fight for
      # exactly the same task while there's many ready to be run waiting in the
      # queue.
      if (failures % 3) == 1:
        # TODO(maruel): Choose curve that makes the most sense. The tricky part
        # is finding a good heuristic to guess the load without much
        # information available in this context. When 'failures' is high, this
        # means a lot of bots are reaping tasks like crazy, which means there
        # is a good flow of tasks going on. On the other hand, skipping too
        # much is useless. So it should have an initial bump but then slow down
        # on skipping.
        to_skip = min(int(round(random.gammavariate(3, 1))), 30)
      continue

    # Try to optimize these values but do not add as formal stats (yet).
    logging.info('failed %d, skipped %d', failures, total_skipped)
    pending_time = run_result.started_ts - request.created_ts
    stats.add_run_entry(
        'run_started', run_result.key,
        bot_id=bot_id,
        dimensions=request.properties.dimensions,
        pending_ms=_secs_to_ms(pending_time.total_seconds()),
        user=request.user)
    return request, run_result

  if failures:
    logging.info(
        'Chose nothing (failed %d, skipped %d)', failures, total_skipped)
  return None, None

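# Illustrative sketch only, not part of the scheduler API: samples the skip
# heuristic used above to make its shape visible. random.gammavariate(3, 1)
# has mean 3 and mode 2 with a long right tail, so a contended bot usually
# skips only a few queue entries but occasionally jumps much farther ahead
# (capped at 30), which is the "initial bump, then slow down" behavior the
# TODO above asks for. The imports are local only to keep the sketch
# self-contained.
def _example_skip_distribution(samples=10000):
  import collections
  import random
  counts = collections.Counter(
      min(int(round(random.gammavariate(3, 1))), 30) for _ in range(samples))
  return dict(counts)  # e.g. most of the mass on 1-4, a thin tail up to 30.
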
def bot_kill_task(run_result_key, bot_id):
  """Terminates a task that is currently running as an internal failure.

  Returns:
    An error message as a str, or None on success.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request = task_pack.result_summary_key_to_request_key(
      result_summary_key).get()
  server_version = utils.get_app_version()
  now = utils.utcnow()
  packed = task_pack.pack_run_result_key(run_result_key)

  def run():
    run_result, result_summary = ndb.get_multi(
        (run_result_key, result_summary_key))
    if bot_id and run_result.bot_id != bot_id:
      return None, 'Bot %s sent task kill for task %s owned by bot %s' % (
          bot_id, packed, run_result.bot_id)

    if run_result.state == task_result.State.BOT_DIED:
      # Ignore this failure.
      return None, None

    run_result.signal_server_version(server_version)
    run_result.state = task_result.State.BOT_DIED
    run_result.internal_failure = True
    run_result.abandoned_ts = now
    run_result.modified_ts = now
    result_summary.set_from_run_result(run_result, None)

    futures = ndb.put_multi_async((run_result, result_summary))
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return run_result, None

  try:
    run_result, msg = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    # At worst, the task will be tagged as BOT_DIED after BOT_PING_TOLERANCE
    # seconds passed on the next cron_handle_bot_died cron job.
    return 'Failed killing task %s: %s' % (packed, e)

  if run_result:
    stats.add_run_entry(
        'run_bot_died', run_result.key,
        bot_id=run_result.bot_id,
        dimensions=request.properties.dimensions,
        user=request.user)
  return msg

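# Illustrative sketch only, not part of the scheduler API: restates the guard
# logic inside bot_kill_task()'s transaction as a pure function, to make the
# three possible outcomes explicit. The argument names are hypothetical; in the
# real code the owner is run_result.bot_id and the state check is against
# task_result.State.BOT_DIED.
def _example_kill_decision(owner_bot_id, caller_bot_id, already_bot_died):
  if caller_bot_id and owner_bot_id != caller_bot_id:
    return 'reject'  # Another bot owns the task; an error message is returned.
  if already_bot_died:
    return 'ignore'  # Already BOT_DIED; the kill is a no-op.
  return 'kill'      # Mark BOT_DIED + internal_failure inside the transaction.
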
def _handle_dead_bot(run_result_key):
  """Handles a TaskRunResult whose bot has stopped showing signs of life.

  Transactionally updates the entities depending on the state of this task. The
  task may be retried automatically, canceled or left alone.

  Returns:
    True if the task was retried, False if the task was killed, None if no
    action was done.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  now = utils.utcnow()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()
  to_run_key = task_to_run.request_to_task_to_run_key(request)

  def run():
    """Returns tuple(task_is_retried or None, bot_id)."""
    # Do one GET, one PUT at the end.
    run_result, result_summary, to_run = ndb.get_multi(
        (run_result_key, result_summary_key, to_run_key))
    if run_result.state != task_result.State.RUNNING:
      # It was updated already or not updating last. Likely DB index was stale.
      return None, run_result.bot_id
    if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:
      # The query index IS stale.
      return None, run_result.bot_id

    run_result.signal_server_version(server_version)
    run_result.modified_ts = now

    notify = False
    if result_summary.try_number != run_result.try_number:
      # Not updating correct run_result, cancel it without touching
      # result_summary.
      to_put = (run_result,)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      task_is_retried = None
    elif result_summary.try_number == 1 and now < request.expiration_ts:
      # Retry it.
      to_put = (run_result, result_summary, to_run)
      to_run.queue_number = task_to_run.gen_queue_number(request)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      # Do not sync data from run_result to result_summary, since the task is
      # being retried.
      result_summary.reset_to_pending()
      result_summary.modified_ts = now
      task_is_retried = True
    else:
      # Cancel it, there was more than one try or the task expired in the
      # meantime.
      to_put = (run_result, result_summary)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      result_summary.set_from_run_result(run_result, request)
      notify = True
      task_is_retried = False

    futures = ndb.put_multi_async(to_put)
    if notify:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return task_is_retried, run_result.bot_id

  try:
    task_is_retried, bot_id = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    task_is_retried, bot_id = None, None
  if task_is_retried is not None:
    task_to_run.set_lookup_cache(to_run_key, task_is_retried)
    if not task_is_retried:
      stats.add_run_entry(
          'run_bot_died', run_result_key,
          bot_id=bot_id,
          dimensions=request.properties.dimensions,
          user=request.user)
    else:
      logging.info('Retried %s', packed)
  else:
    logging.info('Ignored %s', packed)
  return task_is_retried

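# Illustrative sketch only, not part of the scheduler API: restates the
# branching _handle_dead_bot() applies once the staleness checks have passed,
# as a pure function over hypothetical arguments. The return value matches the
# tri-state above: None (only the stale run_result is marked BOT_DIED), True
# (the task is re-enqueued for a retry) or False (the task is canceled and a
# PubSub notification is queued).
def _example_dead_bot_decision(
    summary_try_number, run_try_number, now, expiration_ts):
  if summary_try_number != run_try_number:
    return None   # Mismatched try: this run_result is stale.
  if summary_try_number == 1 and now < expiration_ts:
    return True   # First try and the request has not expired: retry.
  return False    # Second try or expired: cancel as BOT_DIED.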