def cancel_task(result_summary_key):
  """Cancels a task if possible."""
  request = task_pack.result_summary_key_to_request_key(
      result_summary_key).get()
  to_run_key = task_to_run.request_to_task_to_run_key(request)
  now = utils.utcnow()

  def run():
    to_run, result_summary = ndb.get_multi((to_run_key, result_summary_key))
    was_running = result_summary.state == task_result.State.RUNNING
    if not result_summary.can_be_canceled:
      return False, was_running
    to_run.queue_number = None
    result_summary.state = task_result.State.CANCELED
    result_summary.abandoned_ts = now
    result_summary.modified_ts = now
    futures = ndb.put_multi_async((to_run, result_summary))
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()
    return True, was_running

  try:
    ok, was_running = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    packed = task_pack.pack_result_summary_key(result_summary_key)
    return "Failed killing task %s: %s" % (packed, e)
  # Add it to the negative cache.
  task_to_run.set_lookup_cache(to_run_key, False)
  # TODO(maruel): Add stats.
  return ok, was_running
def cancel_task(result_summary_key):
  """Cancels a task if possible."""
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  to_run_key = task_to_run.request_to_task_to_run_key(request_key.get())
  now = utils.utcnow()

  def run():
    to_run, result_summary = ndb.get_multi((to_run_key, result_summary_key))
    was_running = result_summary.state == task_result.State.RUNNING
    if not result_summary.can_be_canceled:
      return False, was_running
    to_run.queue_number = None
    result_summary.state = task_result.State.CANCELED
    result_summary.abandoned_ts = now
    result_summary.modified_ts = now
    ndb.put_multi((to_run, result_summary))
    return True, was_running

  try:
    ok, was_running = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    packed = task_pack.pack_result_summary_key(result_summary_key)
    return 'Failed killing task %s: %s' % (packed, e)
  # Add it to the negative cache.
  task_to_run.set_lookup_cache(to_run_key, False)
  # TODO(maruel): Add stats.
  return ok, was_running
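# Hedged usage sketch, not part of the original module: how an HTTP endpoint
# might call the cancel_task() variants above and normalize their mixed
# return types (a (bool, bool) tuple normally, a preformatted error string
# when the transaction hit a CommitError). The handler name and the dict
# shape of the reply are hypothetical.
def _example_cancel_endpoint(result_summary_key):
  outcome = cancel_task(result_summary_key)
  if isinstance(outcome, str):
    # CommitError path: cancel_task() returned an error message.
    return {'ok': False, 'was_running': False, 'error': outcome}
  ok, was_running = outcome
  return {'ok': ok, 'was_running': was_running, 'error': None}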
def _expire_task(to_run_key, request):
  """Expires a TaskResultSummary and unschedules the TaskToRun.

  Returns:
    True on success.
  """
  # Check whether the TaskToRun is reapable once before doing the check inside
  # the transaction. This reduces the likelihood of failing this check inside
  # the transaction, which is an order of magnitude more costly.
  if not to_run_key.get().is_reapable:
    logging.info('Not reapable anymore')
    return None

  result_summary_key = task_pack.request_key_to_result_summary_key(
      request.key)
  now = utils.utcnow()

  def run():
    # 2 concurrent GETs, one PUT. Optionally with an additional serialized GET.
    to_run_future = to_run_key.get_async()
    result_summary_future = result_summary_key.get_async()
    to_run = to_run_future.get_result()
    if not to_run or not to_run.is_reapable:
      result_summary_future.wait()
      return False
    to_run.queue_number = None
    result_summary = result_summary_future.get_result()
    if result_summary.try_number:
      # It's a retry that is being expired. Keep the old state. That requires
      # an additional pipelined GET but that shouldn't be the common case.
      run_result = result_summary.run_result_key.get()
      result_summary.set_from_run_result(run_result, request)
    else:
      result_summary.state = task_result.State.EXPIRED
      result_summary.abandoned_ts = now
    result_summary.modified_ts = now
    futures = ndb.put_multi_async((to_run, result_summary))
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()
    return True

  # It'll be caught by next cron job execution in case of failure.
  try:
    success = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    success = False
  if success:
    task_to_run.set_lookup_cache(to_run_key, False)
    logging.info(
        'Expired %s', task_pack.pack_result_summary_key(result_summary_key))
  return success
def bot_kill_task(run_result_key, bot_id):
  """Terminates a task that is currently running as an internal failure.

  Returns:
    An error message (str) on failure, None on success.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request = task_pack.result_summary_key_to_request_key(
      result_summary_key).get()
  server_version = utils.get_app_version()
  now = utils.utcnow()
  packed = task_pack.pack_run_result_key(run_result_key)

  def run():
    run_result, result_summary = ndb.get_multi(
        (run_result_key, result_summary_key))
    if bot_id and run_result.bot_id != bot_id:
      return None, 'Bot %s sent task kill for task %s owned by bot %s' % (
          bot_id, packed, run_result.bot_id)

    if run_result.state == task_result.State.BOT_DIED:
      # Ignore this failure.
      return None, None

    run_result.signal_server_version(server_version)
    run_result.state = task_result.State.BOT_DIED
    run_result.internal_failure = True
    run_result.abandoned_ts = now
    run_result.modified_ts = now
    result_summary.set_from_run_result(run_result, None)
    futures = ndb.put_multi_async((run_result, result_summary))
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()
    return run_result, None

  try:
    run_result, msg = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    # At worst, the task will be tagged as BOT_DIED after BOT_PING_TOLERANCE
    # seconds passed on the next cron_handle_bot_died cron job.
    return 'Failed killing task %s: %s' % (packed, e)

  if run_result:
    stats.add_run_entry(
        'run_bot_died', run_result.key,
        bot_id=run_result.bot_id,
        dimensions=request.properties.dimensions,
        user=request.user)
  return msg
def _reap_task(to_run_key, request, bot_id, bot_version, bot_dimensions):
  """Reaps a task and inserts the results entity.

  Returns:
    TaskRunResult if successful, None otherwise.
  """
  assert bot_id, bot_id
  assert request.key == task_to_run.task_to_run_key_to_request_key(to_run_key)
  result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
  now = utils.utcnow()

  def run():
    # 2 GET, 1 PUT at the end.
    to_run_future = to_run_key.get_async()
    result_summary_future = result_summary_key.get_async()
    to_run = to_run_future.get_result()
    result_summary = result_summary_future.get_result()
    if not to_run:
      logging.error('Missing TaskToRun?\n%s', result_summary.task_id)
      return None
    if not to_run.is_reapable:
      logging.info('%s is not reapable', result_summary.task_id)
      return None
    if result_summary.bot_id == bot_id:
      # This means two things: it's a retry, and the first try failed while
      # the retry is being reaped by the same bot. Deny that, as the bot may
      # be deeply broken and could be in a killing spree.
      logging.warning(
          '%s can\'t retry its own internal failure task',
          result_summary.task_id)
      return None
    to_run.queue_number = None
    run_result = task_result.new_run_result(
        request, (result_summary.try_number or 0) + 1, bot_id, bot_version,
        bot_dimensions)
    run_result.modified_ts = now
    result_summary.set_from_run_result(run_result, request)
    ndb.put_multi([to_run, run_result, result_summary])
    return run_result

  # The bot will reap the next available task in case of failure, no big deal.
  try:
    run_result = datastore_utils.transaction(run, retries=0)
  except datastore_utils.CommitError:
    run_result = None
  if run_result:
    task_to_run.set_lookup_cache(to_run_key, False)
  return run_result
def _reap_task(to_run_key, request, bot_id, bot_version, bot_dimensions):
  """Reaps a task and inserts the results entity.

  Returns:
    TaskRunResult if successful, None otherwise.
  """
  assert bot_id, bot_id
  assert request.key == task_to_run.task_to_run_key_to_request_key(to_run_key)
  result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
  now = utils.utcnow()

  def run():
    # 2 GET, 1 PUT at the end.
    to_run_future = to_run_key.get_async()
    result_summary_future = result_summary_key.get_async()
    to_run = to_run_future.get_result()
    if not to_run or not to_run.is_reapable:
      result_summary_future.wait()
      return None
    result_summary = result_summary_future.get_result()
    if result_summary.bot_id == bot_id:
      # This means two things: it's a retry, and the first try failed while
      # the retry is being reaped by the same bot. Deny that, as the bot may
      # be deeply broken and could be in a killing spree.
      return None
    to_run.queue_number = None
    run_result = task_result.new_run_result(
        request, (result_summary.try_number or 0) + 1, bot_id, bot_version,
        bot_dimensions)
    run_result.modified_ts = now
    result_summary.set_from_run_result(run_result, request)
    ndb.put_multi([to_run, run_result, result_summary])
    return run_result

  # The bot will reap the next available task in case of failure, no big deal.
  try:
    run_result = datastore_utils.transaction(run, retries=0)
  except datastore_utils.CommitError:
    run_result = None
  if run_result:
    task_to_run.set_lookup_cache(to_run_key, False)
  return run_result
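# Hedged sketch, not part of the original module, of the caller side of
# _reap_task(): a bot poll loop that walks candidate TaskToRun entities and
# attempts to reap each one until a transaction wins.
# _yield_reapable_candidates() is a hypothetical stand-in for the real
# queue-scanning query; only the _reap_task() call reflects the code above.
def _example_bot_poll(bot_id, bot_version, bot_dimensions):
  for request, to_run_key in _yield_reapable_candidates(bot_dimensions):
    run_result = _reap_task(
        to_run_key, request, bot_id, bot_version, bot_dimensions)
    if run_result:
      # This bot won the transaction and now owns the task.
      return request, run_result
  # Nothing reapable right now; the bot sleeps and polls again.
  return None, None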
def cancel_task(request, result_key):
  """Cancels a task if possible.

  Ensures that the associated TaskToRun is canceled and updates the
  TaskResultSummary/TaskRunResult accordingly.

  Warning: ACL check must have been done before.
  """
  to_run_key = task_to_run.request_to_task_to_run_key(request)
  if result_key.kind() == 'TaskRunResult':
    result_key = task_pack.run_result_key_to_result_summary_key(result_key)
  now = utils.utcnow()

  def run():
    to_run, result_summary = ndb.get_multi((to_run_key, result_key))
    was_running = result_summary.state == task_result.State.RUNNING
    if not result_summary.can_be_canceled:
      return False, was_running
    to_run.queue_number = None
    result_summary.state = task_result.State.CANCELED
    result_summary.abandoned_ts = now
    result_summary.modified_ts = now
    futures = ndb.put_multi_async((to_run, result_summary))
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()
    return True, was_running

  try:
    ok, was_running = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    packed = task_pack.pack_result_summary_key(result_key)
    return 'Failed killing task %s: %s' % (packed, e)
  # Add it to the negative cache.
  task_to_run.set_lookup_cache(to_run_key, False)
  # TODO(maruel): Add stats.
  return ok, was_running
def cancel_task(request, result_key, kill_running):
  """Cancels a task if possible, setting it to either CANCELED or KILLED.

  Ensures that the associated TaskToRun is canceled (when pending) and updates
  the TaskResultSummary/TaskRunResult accordingly. The TaskRunResult.state is
  immediately set to KILLED for running tasks.

  Warning: ACL check must have been done before.

  Returns:
    tuple(bool, bool)
    - True if the cancelation succeeded. Either the task atomically changed
      from PENDING to CANCELED or it was RUNNING and the killing bit has been
      set.
    - True if the task was running while it was canceled.
  """
  if result_key.kind() == 'TaskRunResult':
    # Ignore the try number. A user may ask to cancel run result 1, but if it
    # BOT_DIED, it is accepted to cancel try number #2 since the task is still
    # "pending".
    result_key = task_pack.run_result_key_to_result_summary_key(result_key)
  now = utils.utcnow()

  def run():
    """1 DB GET, 1 memcache write, 2x DB PUTs, 1x task queue."""
    # Need to get the current try number to know which TaskToRun to fetch.
    result_summary = result_key.get()
    was_running = result_summary.state == task_result.State.RUNNING
    if not result_summary.can_be_canceled:
      return False, was_running

    entities = [result_summary]
    if not was_running:
      # PENDING.
      result_summary.state = task_result.State.CANCELED
      to_run_key = task_to_run.request_to_task_to_run_key(
          request,
          result_summary.try_number or 1,
          result_summary.current_task_slice or 0)
      to_run_future = to_run_key.get_async()

      # Add it to the negative cache.
      task_to_run.set_lookup_cache(to_run_key, False)

      to_run = to_run_future.get_result()
      entities.append(to_run)
      to_run.queue_number = None
    else:
      if not kill_running:
        # Deny canceling a task that started.
        return False, was_running
      # RUNNING.
      run_result = result_summary.run_result_key.get()
      entities.append(run_result)
      # Do not change state to KILLED yet. Instead, use a 2 phase commit:
      # - set killing to True
      # - on next bot report, tell it to kill the task
      # - once the bot reports the task as terminated, set state to KILLED
      run_result.killing = True
      run_result.abandoned_ts = now
      run_result.modified_ts = now
    result_summary.abandoned_ts = now
    result_summary.modified_ts = now

    futures = ndb.put_multi_async(entities)
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()
    return True, was_running

  try:
    ok, was_running = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    packed = task_pack.pack_result_summary_key(result_key)
    return 'Failed killing task %s: %s' % (packed, e)
  return ok, was_running
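# Hedged sketch, not part of the original module: the second phase of the
# 2 phase commit described in cancel_task() above. On the next bot report the
# server sees run_result.killing == True and instructs the bot to terminate;
# only once the bot reports termination does the state become KILLED. The
# reply dict shape is hypothetical; the killing flag semantics come from the
# comments above.
def _example_bot_report_reply(run_result):
  if run_result.killing:
    # Tell the bot to kill the task; the state flips to KILLED in a later
    # update, after the bot acknowledges termination.
    return {'must_kill': True}
  return {'must_kill': False}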
def _handle_dead_bot(run_result_key):
  """Handles TaskRunResult where its bot has stopped showing sign of life.

  Transactionally updates the entities depending on the state of this task.
  The task may be retried automatically, canceled or left alone.

  Returns:
    True if the task was retried, False if the task was killed, None if no
    action was done.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  now = utils.utcnow()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()

  def run():
    """Returns True if the task was retried, False if killed, None otherwise.

    2x GET, 2~3x PUT.
    """
    run_result = run_result_key.get()
    if run_result.state != task_result.State.RUNNING:
      # It was updated already or not updating last. Likely DB index was stale.
      return None
    if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:
      # The query index IS stale.
      return None

    current_task_slice = run_result.current_task_slice
    run_result.signal_server_version(server_version)
    old_modified = run_result.modified_ts
    run_result.modified_ts = now

    result_summary = result_summary_key.get()
    orig_summary_state = result_summary.state
    if result_summary.try_number != run_result.try_number:
      # Not updating correct run_result, cancel it without touching
      # result_summary.
      to_put = (run_result,)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      task_is_retried = None
    elif (result_summary.try_number == 1 and now < request.expiration_ts and
          (request.task_slice(current_task_slice).properties.idempotent or
           run_result.started_ts == old_modified)):
      # Retry it. It fits:
      # - first try
      # - not yet expired
      # - One of:
      #   - idempotent
      #   - task hadn't got any ping at all from task_runner.run_command()
      # TODO(maruel): Allow increasing the current_task_slice value.
      # Create a second TaskToRun with the same TaskSlice.
      to_run = task_to_run.new_task_to_run(request, 2, current_task_slice)
      to_put = (run_result, result_summary, to_run)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      # Do not sync data from run_result to result_summary, since the task is
      # being retried.
      result_summary.reset_to_pending()
      result_summary.modified_ts = now
      task_is_retried = True
    else:
      # Kill it as BOT_DIED, there was more than one try, the task expired in
      # the meantime or it wasn't idempotent.
      to_put = (run_result, result_summary)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      result_summary.set_from_run_result(run_result, request)
      task_is_retried = False

    futures = ndb.put_multi_async(to_put)
    if orig_summary_state != result_summary.state:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return task_is_retried

  try:
    task_is_retried = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    task_is_retried = None
  if task_is_retried:
    logging.info('Retried %s', packed)
  elif task_is_retried == False:
    logging.debug('Ignored %s', packed)
  return task_is_retried
def _expire_task(to_run_key, request):
  """Expires a TaskResultSummary and unschedules the TaskToRun.

  This function is only meant to process PENDING tasks.

  If a follow up TaskSlice is available, reenqueue a new TaskToRun instead of
  expiring the TaskResultSummary.

  Returns:
    tuple(TaskResultSummary, TaskToRun): the expired TaskResultSummary on
    success and the new TaskToRun if one was reenqueued due to a following
    TaskSlice; (None, None) on failure.
  """
  # Check whether the TaskToRun is reapable once before doing the check inside
  # the transaction. This reduces the likelihood of failing this check inside
  # the transaction, which is an order of magnitude more costly.
  if not to_run_key.get().is_reapable:
    logging.info('Not reapable anymore')
    return None, None

  result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
  now = utils.utcnow()

  def run():
    # 2 concurrent GETs, one PUT. Optionally with an additional serialized GET.
    to_run_future = to_run_key.get_async()
    result_summary_future = result_summary_key.get_async()
    to_run = to_run_future.get_result()
    if not to_run or not to_run.is_reapable:
      result_summary_future.get_result()
      return None, None

    # In any case, dequeue the TaskToRun.
    to_run.queue_number = None
    result_summary = result_summary_future.get_result()
    to_put = [to_run, result_summary]

    # Check if there's a TaskSlice fallback that could be reenqueued.
    new_to_run = None
    index = result_summary.current_task_slice + 1
    while index < request.num_task_slices:
      dimensions = request.task_slice(index).properties.dimensions
      if _has_capacity(dimensions):
        # Enqueue a new TaskToRun for this next TaskSlice, it has capacity!
        new_to_run = task_to_run.new_task_to_run(request, 1, index)
        result_summary.current_task_slice = index
        to_put.append(new_to_run)
        break
      index += 1

    if not new_to_run:
      # There's no fallback, giving up.
      if result_summary.try_number:
        # It's a retry that is being expired, i.e. the first try had BOT_DIED.
        # Keep the old state. That requires an additional pipelined GET but
        # that shouldn't be the common case.
        run_result = result_summary.run_result_key.get()
        result_summary.set_from_run_result(run_result, request)
      else:
        result_summary.state = task_result.State.EXPIRED
      result_summary.abandoned_ts = now

    result_summary.modified_ts = now
    futures = ndb.put_multi_async(to_put)
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()
    return result_summary, new_to_run

  # Add it to the negative cache *before* running the transaction. Either way
  # the task was already reaped or the task is correctly expired and not
  # reapable.
  task_to_run.set_lookup_cache(to_run_key, False)

  # It'll be caught by next cron job execution in case of failure.
  try:
    res, r = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    res = None
    r = None
  if res:
    logging.info(
        'Expired %s', task_pack.pack_result_summary_key(result_summary_key))
    ts_mon_metrics.on_task_completed(res)
  return res, r
def bot_update_task(
    run_result_key, bot_id, output, output_chunk_start, exit_code, duration,
    hard_timeout, io_timeout, cost_usd, outputs_ref):
  """Updates a TaskRunResult and TaskResultSummary, along with TaskOutput.

  Arguments:
  - run_result_key: ndb.Key to TaskRunResult.
  - bot_id: Self advertised bot id to ensure it's the one expected.
  - output: Data to append to this command output.
  - output_chunk_start: Index of output in the stdout stream.
  - exit_code: Mark that this command is terminated.
  - duration: Time spent in seconds for this command.
  - hard_timeout: Bool set if a hard timeout occurred.
  - io_timeout: Bool set if an I/O timeout occurred.
  - cost_usd: Cost in $USD of this task up to now.
  - outputs_ref: Serialized FilesRef instance or None.

  Invalid states, these are flat out refused:
  - A command is updated after it had an exit code assigned to it.

  Returns:
    tuple(bool, bool); first is if the update succeeded, second is if the task
    completed.
  """
  assert output_chunk_start is None or isinstance(output_chunk_start, int)
  assert output is None or isinstance(output, str)
  if cost_usd is not None and cost_usd < 0.0:
    raise ValueError("cost_usd must be None or greater than or equal to 0")

  packed = task_pack.pack_run_result_key(run_result_key)
  logging.debug(
      "bot_update_task(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
      packed, bot_id, len(output) if output else output, output_chunk_start,
      exit_code, duration, hard_timeout, io_timeout, cost_usd, outputs_ref)

  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  server_version = utils.get_app_version()
  request = request_future.get_result()
  now = utils.utcnow()

  def run():
    # 2 consecutive GETs, one PUT.
    run_result_future = run_result_key.get_async()
    result_summary_future = result_summary_key.get_async()
    run_result = run_result_future.get_result()
    if not run_result:
      result_summary_future.wait()
      return None, None, False, "is missing"

    if run_result.bot_id != bot_id:
      result_summary_future.wait()
      return None, None, False, (
          "expected bot (%s) but had update from bot %s" % (
              run_result.bot_id, bot_id))

    if not run_result.started_ts:
      return None, None, False, "TaskRunResult is broken; %s" % (
          run_result.to_dict())

    # This happens as an HTTP request is retried when the DB write succeeded
    # but it still returned HTTP 500.
    if run_result.exit_code is not None and exit_code is not None:
      if run_result.exit_code != exit_code:
        result_summary_future.wait()
        return None, None, False, "got 2 different exit_code; %s then %s" % (
            run_result.exit_code, exit_code)
    if run_result.durations and duration is not None:
      if run_result.durations[0] != duration:
        result_summary_future.wait()
        return None, None, False, "got 2 different durations; %s then %s" % (
            run_result.durations[0], duration)

    if (duration is None) != (exit_code is None):
      result_summary_future.wait()
      return None, None, False, (
          "had unexpected duration; expected iff a command completes\n"
          "duration: %s vs %s; exit: %s vs %s" % (
              run_result.durations, duration, run_result.exit_code, exit_code))

    # If the command completed, check that the values weren't set already.
    if duration is not None and not run_result.durations:
      run_result.durations.append(duration)
    if exit_code is not None and run_result.exit_code is None:
      run_result.exit_codes.append(exit_code)

    if outputs_ref:
      run_result.outputs_ref = task_request.FilesRef(**outputs_ref)

    task_completed = run_result.exit_code is not None
    if run_result.state in task_result.State.STATES_RUNNING:
      if hard_timeout or io_timeout:
        run_result.state = task_result.State.TIMED_OUT
        run_result.completed_ts = now
      elif task_completed:
        run_result.state = task_result.State.COMPLETED
        run_result.completed_ts = now

    run_result.signal_server_version(server_version)
    to_put = [run_result]
    if output:
      # This does 1 multi GET. This also modifies run_result in place.
      to_put.extend(
          run_result.append_output(0, output, output_chunk_start or 0))

    run_result.cost_usd = max(cost_usd, run_result.cost_usd or 0.0)
    run_result.modified_ts = now

    result_summary = result_summary_future.get_result()
    if (result_summary.try_number and
        result_summary.try_number > run_result.try_number):
      # The situation where a shard is retried but the bot running the
      # previous try somehow reappears and reports success, the result must
      # still show the last try's result. We still need to update cost_usd
      # manually.
      result_summary.costs_usd[run_result.try_number - 1] = run_result.cost_usd
      result_summary.modified_ts = now
    else:
      result_summary.set_from_run_result(run_result, request)

    to_put.append(result_summary)
    ndb.put_multi(to_put)
    return result_summary, run_result, task_completed, None

  try:
    smry, run_result, task_completed, error = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    logging.info("Got commit error: %s", e)
    # It is important that the caller correctly surface this error.
    return False, False
  if run_result:
    # Caller must retry if PubSub enqueue fails.
    if not _maybe_pubsub_notify_now(smry, request):
      return False, False
    _update_stats(run_result, bot_id, request, task_completed)
  if error:
    logging.error("Task %s %s", packed, error)
  return True, task_completed
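# Hedged sketch, not part of the original module: how a bot-facing update
# handler might map its decoded payload onto bot_update_task() above. The
# `body` field names and the retry signalling are hypothetical; the
# (ok, task_completed) contract is from the docstring above.
def _example_handle_task_update(run_result_key, bot_id, body):
  ok, task_completed = bot_update_task(
      run_result_key, bot_id,
      output=body.get('output'),
      output_chunk_start=body.get('output_chunk_start'),
      exit_code=body.get('exit_code'),
      duration=body.get('duration'),
      hard_timeout=bool(body.get('hard_timeout')),
      io_timeout=bool(body.get('io_timeout')),
      cost_usd=body.get('cost_usd'),
      outputs_ref=body.get('outputs_ref'))
  # ok == False means the transaction or the PubSub notification failed; the
  # bot must retry the exact same update (e.g. via an HTTP 500 reply).
  return {'ok': ok, 'task_completed': task_completed}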
def schedule_request(request, secret_bytes):
  """Creates and stores all the entities to schedule a new task request.

  Assumes ACL check has already happened (see 'check_schedule_request_acl').

  The number of entities created is ~4: TaskRequest, TaskToRun,
  TaskResultSummary and (optionally) SecretBytes. They are in a single entity
  group and saved in a single transaction.

  Arguments:
  - request: TaskRequest entity to be saved in the DB. Its key must not be set
    and the entity must not be saved in the DB yet.
  - secret_bytes: SecretBytes entity to be saved in the DB. Its key will be
    set and the entity will be stored by this function. None is allowed if
    there are no SecretBytes for this task.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
  assert isinstance(request, task_request.TaskRequest), request
  assert not request.key, request.key

  # This does a DB GET, occasionally triggers a task queue. May throw, which
  # is surfaced to the user but it is safe as the task request wasn't stored
  # yet.
  task_queues.assert_task(request)

  now = utils.utcnow()
  request.key = task_request.new_request_key()
  result_summary = task_result.new_result_summary(request)
  result_summary.modified_ts = now
  to_run = None
  if secret_bytes:
    secret_bytes.key = request.secret_bytes_key

  dupe_summary = None
  for i in xrange(request.num_task_slices):
    t = request.task_slice(i)
    if t.properties.idempotent:
      dupe_summary = _find_dupe_task(now, t.properties_hash())
      if dupe_summary:
        _dedupe_result_summary(dupe_summary, result_summary, i)
        # In this code path, there's not much to do as the task will not be
        # run, previous results are returned. We still need to store the
        # TaskRequest and TaskResultSummary.
        # Since the task is never scheduled, TaskToRun is not stored.
        # Since the has_secret_bytes property is already set for UI purposes,
        # and the task itself will never be run, we skip storing the
        # SecretBytes, as they would never be read and will just consume space
        # in the datastore (and the task we deduplicated with will have them
        # stored anyway, if we really want to get them again).
        secret_bytes = None
        break

  if not dupe_summary:
    # The task has to run. Make sure there's capacity.
    index = 0
    while index < request.num_task_slices:
      # This needs to be extremely fast.
      to_run = task_to_run.new_task_to_run(request, 1, index)
      if _has_capacity(request.task_slice(index).properties.dimensions):
        # It's pending at this index now.
        result_summary.current_task_slice = index
        break
      index += 1

    if index == request.num_task_slices:
      # Skip to_run since it's not enqueued.
      to_run = None
      # Same rationale as deduped task.
      secret_bytes = None
      # Instantaneously denied.
      result_summary.abandoned_ts = result_summary.created_ts
      result_summary.state = task_result.State.NO_RESOURCE

  # Storing these entities makes this task live. It is important at this point
  # that the HTTP handler returns as fast as possible, otherwise the task will
  # be run but the client will not know about it.
  _gen_key = lambda: _gen_new_keys(result_summary, to_run, secret_bytes)
  extra = filter(bool, [result_summary, to_run, secret_bytes])
  datastore_utils.insert(request, new_key_callback=_gen_key, extra=extra)
  if dupe_summary:
    logging.debug(
        'New request %s reusing %s', result_summary.task_id,
        dupe_summary.task_id)
  elif result_summary.state == task_result.State.NO_RESOURCE:
    logging.warning(
        'New request %s denied with NO_RESOURCE', result_summary.task_id)
    logging.debug('New request %s', result_summary.task_id)
  else:
    logging.debug('New request %s', result_summary.task_id)

  # Get parent task details if applicable.
  if request.parent_task_id:
    parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
    parent_task_keys = [
      parent_run_key,
      task_pack.run_result_key_to_result_summary_key(parent_run_key),
    ]

    def run_parent():
      # This one is slower.
      items = ndb.get_multi(parent_task_keys)
      k = result_summary.task_id
      for item in items:
        item.children_task_ids.append(k)
        item.modified_ts = now
      ndb.put_multi(items)

    # Raising will abort to the caller. There's a risk that for tasks with
    # parent tasks, the task will be lost due to this transaction.
    # TODO(maruel): An option is to update the parent task as part of a cron
    # job, which would remove this code from the critical path.
    datastore_utils.transaction(run_parent)

  ts_mon_metrics.on_task_requested(result_summary, bool(dupe_summary))
  return result_summary
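# Hedged sketch, not part of the original module: a typical call site for
# schedule_request() above. _build_task_request() and _build_secret_bytes()
# are hypothetical constructors; the only contract used is that the request
# is an unsaved TaskRequest without a key and that a TaskResultSummary is
# returned.
def _example_schedule(payload):
  request = _build_task_request(payload)       # unsaved, request.key is None
  secret_bytes = _build_secret_bytes(payload)  # may be None
  result_summary = schedule_request(request, secret_bytes)
  # task_id is the user-facing handle used to poll for results later.
  return result_summary.task_id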
def _handle_dead_bot(run_result_key):
  """Handles TaskRunResult where its bot has stopped showing sign of life.

  Transactionally updates the entities depending on the state of this task.
  The task may be retried automatically, canceled or left alone.

  Returns:
    True if the task was retried, False if the task was killed, None if no
    action was done.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  now = utils.utcnow()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()
  to_run_key = task_to_run.request_to_task_to_run_key(request)

  def run():
    """Returns tuple(task_is_retried or None, bot_id)."""
    # Do one GET, one PUT at the end.
    run_result, result_summary, to_run = ndb.get_multi(
        (run_result_key, result_summary_key, to_run_key))
    if run_result.state != task_result.State.RUNNING:
      # It was updated already or not updating last. Likely DB index was stale.
      return None, run_result.bot_id

    run_result.signal_server_version(server_version)
    run_result.modified_ts = now

    notify = False
    if result_summary.try_number != run_result.try_number:
      # Not updating correct run_result, cancel it without touching
      # result_summary.
      to_put = (run_result,)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      task_is_retried = None
    elif result_summary.try_number == 1 and now < request.expiration_ts:
      # Retry it.
      to_put = (run_result, result_summary, to_run)
      to_run.queue_number = task_to_run.gen_queue_number(request)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      # Do not sync data from run_result to result_summary, since the task is
      # being retried.
      result_summary.reset_to_pending()
      result_summary.modified_ts = now
      task_is_retried = True
    else:
      # Cancel it, there was more than one try or the task expired in the
      # meantime.
      to_put = (run_result, result_summary)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      result_summary.set_from_run_result(run_result, request)
      notify = True
      task_is_retried = False

    futures = ndb.put_multi_async(to_put)
    if notify:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return task_is_retried, run_result.bot_id

  try:
    task_is_retried, bot_id = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    task_is_retried, bot_id = None, None
  if task_is_retried is not None:
    task_to_run.set_lookup_cache(to_run_key, task_is_retried)
    if not task_is_retried:
      stats.add_run_entry(
          "run_bot_died", run_result_key,
          bot_id=bot_id,
          dimensions=request.properties.dimensions,
          user=request.user)
    else:
      logging.info("Retried %s", packed)
  else:
    logging.info("Ignored %s", packed)
  return task_is_retried
def bot_update_task(
    run_result_key, bot_id, output, output_chunk_start, exit_code, duration,
    hard_timeout, io_timeout, cost_usd, outputs_ref, cipd_pins,
    performance_stats):
  """Updates a TaskRunResult and TaskResultSummary, along with TaskOutputChunk.

  Arguments:
  - run_result_key: ndb.Key to TaskRunResult.
  - bot_id: Self advertised bot id to ensure it's the one expected.
  - output: Data to append to this command output.
  - output_chunk_start: Index of output in the stdout stream.
  - exit_code: Mark that this task completed.
  - duration: Time spent in seconds for this task, excluding overheads.
  - hard_timeout: Bool set if a hard timeout occurred.
  - io_timeout: Bool set if an I/O timeout occurred.
  - cost_usd: Cost in $USD of this task up to now.
  - outputs_ref: task_request.FilesRef instance or None.
  - cipd_pins: None or task_result.CipdPins
  - performance_stats: task_result.PerformanceStats instance or None. Can only
    be set when the task is completing.

  Invalid states, these are flat out refused:
  - A command is updated after it had an exit code assigned to it.

  Returns:
    TaskRunResult.state or None in case of failure.
  """
  assert output_chunk_start is None or isinstance(output_chunk_start, int)
  assert output is None or isinstance(output, str)
  if cost_usd is not None and cost_usd < 0.:
    raise ValueError('cost_usd must be None or greater than or equal to 0')
  if duration is not None and duration < 0.:
    raise ValueError('duration must be None or greater than or equal to 0')
  if (duration is None) != (exit_code is None):
    raise ValueError(
        'had unexpected duration; expected iff a command completes\n'
        'duration: %r; exit: %r' % (duration, exit_code))
  if performance_stats and duration is None:
    raise ValueError(
        'duration must be set when performance_stats is set\n'
        'duration: %s; performance_stats: %s' % (duration, performance_stats))

  packed = task_pack.pack_run_result_key(run_result_key)
  logging.debug(
      'bot_update_task(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
      packed, bot_id, len(output) if output else output, output_chunk_start,
      exit_code, duration, hard_timeout, io_timeout, cost_usd, outputs_ref,
      cipd_pins, performance_stats)

  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  server_version = utils.get_app_version()
  request = request_future.get_result()
  now = utils.utcnow()

  run = lambda: _bot_update_tx(
      run_result_key, bot_id, output, output_chunk_start, exit_code, duration,
      hard_timeout, io_timeout, cost_usd, outputs_ref, cipd_pins,
      performance_stats, now, result_summary_key, server_version, request)
  try:
    smry, run_result, error = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    logging.info('Got commit error: %s', e)
    # It is important that the caller correctly surface this error as the bot
    # will retry on HTTP 500.
    return None
  if smry and smry.state != task_result.State.RUNNING:
    # Take no chance and explicitly clear the ndb memcache entry. A very rare
    # race condition is observed where a stale version of the entities is kept
    # in memcache.
    ndb.get_context()._clear_memcache(
        [result_summary_key, run_result_key]).check_success()
  assert bool(error) != bool(run_result), (error, run_result)
  if error:
    logging.error('Task %s %s', packed, error)
    return None
  # Caller must retry if PubSub enqueue fails.
  if not _maybe_pubsub_notify_now(smry, request):
    return None
  if smry.state not in task_result.State.STATES_RUNNING:
    event_mon_metrics.send_task_event(smry)
    ts_mon_metrics.on_task_completed(smry)

  # Hack a bit to tell the bot what it needs to hear (see handler_bot.py).
  # It's kind of an ugly hack but the other option is to return the whole
  # run_result.
  if run_result.killing:
    return task_result.State.KILLED
  return run_result.state
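# Hedged sketch, not part of the original module: how a bot-facing handler
# might react to the TaskRunResult.state returned by the bot_update_task()
# variant above. Per the comment above, KILLED is surfaced as soon as
# run_result.killing is set so the bot knows to terminate the task. The reply
# shape is hypothetical.
def _example_update_reply(state):
  if state is None:
    # Transient failure (commit or PubSub); reply with HTTP 500 so the bot
    # retries the exact same update.
    return None
  # Any non-RUNNING state, KILLED included, means the bot must stop the task.
  return {'must_stop': state not in task_result.State.STATES_RUNNING}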
def bot_update_task(
    run_result_key, bot_id, output, output_chunk_start, exit_code, duration,
    hard_timeout, io_timeout, cost_usd, outputs_ref):
  """Updates a TaskRunResult and TaskResultSummary, along with TaskOutput.

  Arguments:
  - run_result_key: ndb.Key to TaskRunResult.
  - bot_id: Self advertised bot id to ensure it's the one expected.
  - output: Data to append to this command output.
  - output_chunk_start: Index of output in the stdout stream.
  - exit_code: Mark that this command is terminated.
  - duration: Time spent in seconds for this command.
  - hard_timeout: Bool set if a hard timeout occurred.
  - io_timeout: Bool set if an I/O timeout occurred.
  - cost_usd: Cost in $USD of this task up to now.
  - outputs_ref: Serialized FilesRef instance or None.

  Invalid states, these are flat out refused:
  - A command is updated after it had an exit code assigned to it.

  Returns:
    tuple(bool, bool); first is if the update succeeded, second is if the task
    completed.
  """
  assert output_chunk_start is None or isinstance(output_chunk_start, int)
  assert output is None or isinstance(output, str)
  if cost_usd is not None and cost_usd < 0.:
    raise ValueError('cost_usd must be None or greater than or equal to 0')

  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()
  now = utils.utcnow()

  def run():
    # 2 consecutive GETs, one PUT.
    run_result_future = run_result_key.get_async()
    result_summary_future = result_summary_key.get_async()
    run_result = run_result_future.get_result()
    if not run_result:
      result_summary_future.wait()
      return None, False, 'is missing'

    if run_result.bot_id != bot_id:
      result_summary_future.wait()
      return None, False, 'expected bot (%s) but had update from bot %s' % (
          run_result.bot_id, bot_id)

    # This happens as an HTTP request is retried when the DB write succeeded
    # but it still returned HTTP 500.
    if len(run_result.exit_codes) and exit_code is not None:
      if run_result.exit_codes[0] != exit_code:
        result_summary_future.wait()
        return None, False, 'got 2 different exit_codes; %d then %d' % (
            run_result.exit_codes[0], exit_code)

    if (duration is None) != (exit_code is None):
      result_summary_future.wait()
      return None, False, (
          'had unexpected duration; expected iff a command completes; index %d'
          % len(run_result.exit_codes))

    if exit_code is not None:
      # The command completed.
      run_result.durations.append(duration)
      run_result.exit_codes.append(exit_code)

    if outputs_ref:
      run_result.outputs_ref = task_request.FilesRef(**outputs_ref)

    task_completed = len(run_result.exit_codes) == 1
    if run_result.state in task_result.State.STATES_RUNNING:
      if hard_timeout or io_timeout:
        run_result.state = task_result.State.TIMED_OUT
        run_result.completed_ts = now
      elif task_completed:
        run_result.state = task_result.State.COMPLETED
        run_result.completed_ts = now

    run_result.signal_server_version(server_version)
    to_put = [run_result]
    if output:
      # This does 1 multi GET. This also modifies run_result in place.
      to_put.extend(
          run_result.append_output(0, output, output_chunk_start or 0))

    run_result.cost_usd = max(cost_usd, run_result.cost_usd or 0.)
    run_result.modified_ts = now

    result_summary = result_summary_future.get_result()
    if (result_summary.try_number and
        result_summary.try_number > run_result.try_number):
      # The situation where a shard is retried but the bot running the
      # previous try somehow reappears and reports success, the result must
      # still show the last try's result. We still need to update cost_usd
      # manually.
      result_summary.costs_usd[run_result.try_number-1] = run_result.cost_usd
      result_summary.modified_ts = now
    else:
      result_summary.set_from_run_result(run_result, request)

    to_put.append(result_summary)
    ndb.put_multi(to_put)
    return run_result, task_completed, None

  try:
    run_result, task_completed, error = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    # It is important that the caller correctly surface this error.
    return False, False
  if run_result:
    _update_stats(run_result, bot_id, request, task_completed)
  if error:
    logging.error('Task %s %s', packed, error)
  return True, task_completed
def schedule_request(request, check_acls=True):
  """Creates and stores all the entities to schedule a new task request.

  Checks ACLs first. Raises auth.AuthorizationError if the caller is not
  authorized to post this request.

  The number of entities created is 3: TaskRequest, TaskToRun and
  TaskResultSummary. All 3 entities in the same entity group (TaskRequest,
  TaskToRun, TaskResultSummary) are saved as a DB transaction.

  Arguments:
  - request: TaskRequest entity to be saved in the DB. Its key must not be set
    and the entity must not be saved in the DB yet.
  - check_acls: Whether the request should check ACLs.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
  assert isinstance(request, task_request.TaskRequest), request
  assert not request.key, request.key

  # Raises AuthorizationError with a helpful message if the request author
  # can't use some of the requested dimensions.
  if check_acls:
    _check_dimension_acls(request)

  now = utils.utcnow()
  request.key = task_request.new_request_key()
  task = task_to_run.new_task_to_run(request)
  result_summary = task_result.new_result_summary(request)
  result_summary.modified_ts = now

  def get_new_keys():
    # Warning: this assumes knowledge about the hierarchy of each entity.
    key = task_request.new_request_key()
    # ndb.Key instances are immutable, so rebuild each key with the new parent
    # instead of mutating it in place.
    task.key = ndb.Key(task.key.kind(), task.key.id(), parent=key)
    old = result_summary.task_id
    result_summary.key = ndb.Key(
        result_summary.key.kind(), result_summary.key.id(), parent=key)
    logging.info('%s conflicted, using %s', old, result_summary.task_id)
    return key

  deduped = False
  if request.properties.idempotent:
    dupe_summary = _find_dupe_task(now, request.properties_hash)
    if dupe_summary:
      # Setting task.queue_number to None removes it from the scheduling.
      task.queue_number = None
      _copy_summary(
          dupe_summary, result_summary,
          ('created_ts', 'modified_ts', 'name', 'user', 'tags'))
      # Zap irrelevant properties. PerformanceStats is also not copied over,
      # since it's not relevant.
      result_summary.properties_hash = None
      result_summary.try_number = 0
      result_summary.cost_saved_usd = result_summary.cost_usd
      # Only zap after.
      result_summary.costs_usd = []
      result_summary.deduped_from = task_pack.pack_run_result_key(
          dupe_summary.run_result_key)

      # In this code path, there's not much to do as the task will not be run,
      # previous results are returned. We still need to store all the entities
      # correctly.
      datastore_utils.insert(
          request, get_new_keys, extra=[task, result_summary])
      logging.debug(
          'New request %s reusing %s', result_summary.task_id,
          dupe_summary.task_id)
      deduped = True

  if not deduped:
    # Storing these entities makes this task live. It is important at this
    # point that the HTTP handler returns as fast as possible, otherwise the
    # task will be run but the client will not know about it.
    datastore_utils.insert(request, get_new_keys, extra=[task, result_summary])
    logging.debug('New request %s', result_summary.task_id)

  # Get parent task details if applicable.
  if request.parent_task_id:
    parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
    parent_task_keys = [
      parent_run_key,
      task_pack.run_result_key_to_result_summary_key(parent_run_key),
    ]

    def run_parent():
      # This one is slower.
      items = ndb.get_multi(parent_task_keys)
      k = result_summary.task_id
      for item in items:
        item.children_task_ids.append(k)
        item.modified_ts = now
      ndb.put_multi(items)

    # Raising will abort to the caller. There's a risk that for tasks with
    # parent tasks, the task will be lost due to this transaction.
    # TODO(maruel): An option is to update the parent task as part of a cron
    # job, which would remove this code from the critical path.
    datastore_utils.transaction(run_parent)

  stats.add_task_entry(
      'task_enqueued', result_summary.key,
      dimensions=request.properties.dimensions,
      user=request.user)
  ts_mon_metrics.update_jobs_requested_metrics(result_summary, deduped)
  return result_summary
def bot_update_task(
    run_result_key, bot_id, output, output_chunk_start, exit_code, duration,
    hard_timeout, io_timeout, cost_usd, outputs_ref, cipd_pins,
    performance_stats):
  """Updates a TaskRunResult and TaskResultSummary, along with TaskOutput.

  Arguments:
  - run_result_key: ndb.Key to TaskRunResult.
  - bot_id: Self advertised bot id to ensure it's the one expected.
  - output: Data to append to this command output.
  - output_chunk_start: Index of output in the stdout stream.
  - exit_code: Mark that this task completed.
  - duration: Time spent in seconds for this task, excluding overheads.
  - hard_timeout: Bool set if a hard timeout occurred.
  - io_timeout: Bool set if an I/O timeout occurred.
  - cost_usd: Cost in $USD of this task up to now.
  - outputs_ref: task_request.FilesRef instance or None.
  - cipd_pins: None or task_result.CipdPins
  - performance_stats: task_result.PerformanceStats instance or None. Can only
    be set when the task is completing.

  Invalid states, these are flat out refused:
  - A command is updated after it had an exit code assigned to it.

  Returns:
    TaskRunResult.state or None in case of failure.
  """
  assert output_chunk_start is None or isinstance(output_chunk_start, int)
  assert output is None or isinstance(output, str)
  if cost_usd is not None and cost_usd < 0.:
    raise ValueError('cost_usd must be None or greater than or equal to 0')
  if duration is not None and duration < 0.:
    raise ValueError('duration must be None or greater than or equal to 0')
  if (duration is None) != (exit_code is None):
    raise ValueError(
        'had unexpected duration; expected iff a command completes\n'
        'duration: %r; exit: %r' % (duration, exit_code))
  if performance_stats and duration is None:
    raise ValueError(
        'duration must be set when performance_stats is set\n'
        'duration: %s; performance_stats: %s' % (duration, performance_stats))

  packed = task_pack.pack_run_result_key(run_result_key)
  logging.debug(
      'bot_update_task(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)',
      packed, bot_id, len(output) if output else output, output_chunk_start,
      exit_code, duration, hard_timeout, io_timeout, cost_usd, outputs_ref,
      cipd_pins, performance_stats)

  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(result_summary_key)
  request_future = request_key.get_async()
  server_version = utils.get_app_version()
  request = request_future.get_result()
  now = utils.utcnow()

  def run():
    """Returns tuple(TaskResultSummary, TaskRunResult, str(error)).

    Any error is returned as a string to be passed to logging.error() instead
    of logging inside the transaction for performance.
    """
    # 2 consecutive GETs, one PUT.
    run_result_future = run_result_key.get_async()
    result_summary_future = result_summary_key.get_async()
    run_result = run_result_future.get_result()
    if not run_result:
      result_summary_future.wait()
      return None, None, 'is missing'

    if run_result.bot_id != bot_id:
      result_summary_future.wait()
      return None, None, (
          'expected bot (%s) but had update from bot %s' % (
              run_result.bot_id, bot_id))

    if not run_result.started_ts:
      return None, None, 'TaskRunResult is broken; %s' % (
          run_result.to_dict())

    # Assumptions:
    # - duration and exit_code are both set or not set.
    # - same for run_result.
    if exit_code is not None:
      if run_result.exit_code is not None:
        # This happens as an HTTP request is retried when the DB write
        # succeeded but it still returned HTTP 500.
        if run_result.exit_code != exit_code:
          result_summary_future.wait()
          return None, None, 'got 2 different exit_code; %s then %s' % (
              run_result.exit_code, exit_code)
        if run_result.duration != duration:
          result_summary_future.wait()
          return None, None, 'got 2 different durations; %s then %s' % (
              run_result.duration, duration)
      else:
        run_result.duration = duration
        run_result.exit_code = exit_code

    if outputs_ref:
      run_result.outputs_ref = outputs_ref

    if cipd_pins:
      run_result.cipd_pins = cipd_pins

    if run_result.state in task_result.State.STATES_RUNNING:
      if hard_timeout or io_timeout:
        run_result.state = task_result.State.TIMED_OUT
        run_result.completed_ts = now
      elif run_result.exit_code is not None:
        run_result.state = task_result.State.COMPLETED
        run_result.completed_ts = now

    run_result.signal_server_version(server_version)
    run_result.validate(request)
    to_put = [run_result]
    if output:
      # This does 1 multi GET. This also modifies run_result in place.
      to_put.extend(run_result.append_output(output, output_chunk_start or 0))
    if performance_stats:
      performance_stats.key = (
          task_pack.run_result_key_to_performance_stats_key(run_result.key))
      to_put.append(performance_stats)

    run_result.cost_usd = max(cost_usd, run_result.cost_usd or 0.)
    run_result.modified_ts = now

    result_summary = result_summary_future.get_result()
    if (result_summary.try_number and
        result_summary.try_number > run_result.try_number):
      # The situation where a shard is retried but the bot running the
      # previous try somehow reappears and reports success, the result must
      # still show the last try's result. We still need to update cost_usd
      # manually.
      result_summary.costs_usd[run_result.try_number - 1] = run_result.cost_usd
      result_summary.modified_ts = now
    else:
      result_summary.set_from_run_result(run_result, request)

    result_summary.validate(request)
    to_put.append(result_summary)
    ndb.put_multi(to_put)
    return result_summary, run_result, None

  try:
    smry, run_result, error = datastore_utils.transaction(run)
  except datastore_utils.CommitError as e:
    logging.info('Got commit error: %s', e)
    # It is important that the caller correctly surface this error.
    return None
  assert bool(error) != bool(run_result), (error, run_result)
  if error:
    logging.error('Task %s %s', packed, error)
    return None
  # Caller must retry if PubSub enqueue fails.
  task_completed = run_result.state != task_result.State.RUNNING
  if not _maybe_pubsub_notify_now(smry, request):
    return None
  _update_stats(run_result, bot_id, request, task_completed)
  if task_completed:
    ts_mon_metrics.update_jobs_completed_metrics(smry)
  return run_result.state
def _reap_task(bot_dimensions, bot_version, to_run_key, request):
  """Reaps a task and inserts the results entity.

  Returns:
    (TaskRunResult, SecretBytes) if successful, (None, None) otherwise.
  """
  assert request.key == task_to_run.task_to_run_key_to_request_key(to_run_key)
  result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
  bot_id = bot_dimensions[u'id'][0]

  now = utils.utcnow()
  # Log before the task id in case the function fails in a bad state where the
  # DB TX ran but the reply never comes to the bot. This is the worst case as
  # this leads to a task that results in BOT_DIED without ever starting. This
  # case is specifically handled in cron_handle_bot_died().
  logging.info(
      '_reap_task(%s)', task_pack.pack_result_summary_key(result_summary_key))

  def run():
    # 3 GET, 1 PUT at the end.
    to_run_future = to_run_key.get_async()
    result_summary_future = result_summary_key.get_async()
    to_run = to_run_future.get_result()
    t = request.task_slice(to_run.task_slice_index)
    if t.properties.has_secret_bytes:
      secret_bytes_future = request.secret_bytes_key.get_async()
    result_summary = result_summary_future.get_result()
    orig_summary_state = result_summary.state
    secret_bytes = None
    if t.properties.has_secret_bytes:
      secret_bytes = secret_bytes_future.get_result()
    if not to_run:
      logging.error('Missing TaskToRun?\n%s', result_summary.task_id)
      return None, None
    if not to_run.is_reapable:
      logging.info('%s is not reapable', result_summary.task_id)
      return None, None
    if result_summary.bot_id == bot_id:
      # This means two things: it's a retry, and the first try failed while
      # the retry is being reaped by the same bot. Deny that, as the bot may
      # be deeply broken and could be in a killing spree.
      # TODO(maruel): Allow retry for bot locked task using 'id' dimension.
      logging.warning(
          '%s can\'t retry its own internal failure task',
          result_summary.task_id)
      return None, None
    to_run.queue_number = None
    run_result = task_result.new_run_result(
        request, to_run, bot_id, bot_version, bot_dimensions)
    # Upon bot reap, both .started_ts and .modified_ts match. They differ on
    # the first ping.
    run_result.started_ts = now
    run_result.modified_ts = now
    result_summary.set_from_run_result(run_result, request)
    ndb.put_multi([to_run, run_result, result_summary])
    if result_summary.state != orig_summary_state:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    return run_result, secret_bytes

  # Add it to the negative cache *before* running the transaction. This will
  # inhibit concurrent readers from trying to reap this task. The downside is
  # that if this request fails in the middle of the transaction, the task may
  # stay unreapable for up to 15 seconds.
  if not task_to_run.set_lookup_cache(to_run_key, False):
    logging.debug('hit negative cache')
    return None, None

  try:
    run_result, secret_bytes = datastore_utils.transaction(run, retries=0)
  except datastore_utils.CommitError:
    # The challenge here is that the transaction may have failed because:
    # - The DB had a hiccup and the TaskToRun, TaskRunResult and
    #   TaskResultSummary haven't been updated.
    # - The entities had been updated by a concurrent transaction on another
    #   handler so it was not reapable anyway. This does cause exceptions as
    #   both GETs return TaskToRun.queue_number != None but only one succeeds
    #   at the PUT.
    #
    # In the first case, we may want to reset the negative cache, while we
    # don't want to in the latter case.
    #
    # The trade-offs are one of:
    # - the negative cache is incorrectly set, so the task is not reapable for
    #   15s
    # - resetting the negative cache would cause even more contention
    #
    # We chose the first one here for now, as when the DB starts misbehaving
    # and the index becomes stale, it means the DB is *already* not in good
    # shape, so it is preferable to not put more stress on it, and skipping a
    # few tasks for 15s may even actively help the DB to stabilize.
    logging.info('CommitError; reaping failed')
    # The bot will reap the next available task in case of failure, no big
    # deal.
    run_result = None
    secret_bytes = None
  return run_result, secret_bytes
def _handle_dead_bot(run_result_key):
  """Handles TaskRunResult where its bot has stopped showing sign of life.

  Transactionally updates the entities depending on the state of this task.
  The task may be retried automatically, canceled or left alone.

  Returns:
    True if the task was retried, False if the task was killed, None if no
    action was done.
  """
  result_summary_key = task_pack.run_result_key_to_result_summary_key(
      run_result_key)
  request_key = task_pack.result_summary_key_to_request_key(
      result_summary_key)
  request_future = request_key.get_async()
  now = utils.utcnow()
  server_version = utils.get_app_version()
  packed = task_pack.pack_run_result_key(run_result_key)
  request = request_future.get_result()
  to_run_key = task_to_run.request_to_task_to_run_key(request)

  def run():
    """Returns tuple(task_is_retried or None, bot_id)."""
    # Do one GET, one PUT at the end.
    run_result, result_summary, to_run = ndb.get_multi(
        (run_result_key, result_summary_key, to_run_key))
    if run_result.state != task_result.State.RUNNING:
      # It was updated already or not updating last. Likely DB index was stale.
      return None, run_result.bot_id
    if run_result.modified_ts > now - task_result.BOT_PING_TOLERANCE:
      # The query index IS stale.
      return None, run_result.bot_id

    run_result.signal_server_version(server_version)
    run_result.modified_ts = now

    notify = False
    if result_summary.try_number != run_result.try_number:
      # Not updating correct run_result, cancel it without touching
      # result_summary.
      to_put = (run_result,)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      task_is_retried = None
    elif result_summary.try_number == 1 and now < request.expiration_ts:
      # Retry it.
      to_put = (run_result, result_summary, to_run)
      to_run.queue_number = task_to_run.gen_queue_number(request)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      # Do not sync data from run_result to result_summary, since the task is
      # being retried.
      result_summary.reset_to_pending()
      result_summary.modified_ts = now
      task_is_retried = True
    else:
      # Cancel it, there was more than one try or the task expired in the
      # meantime.
      to_put = (run_result, result_summary)
      run_result.state = task_result.State.BOT_DIED
      run_result.internal_failure = True
      run_result.abandoned_ts = now
      result_summary.set_from_run_result(run_result, request)
      notify = True
      task_is_retried = False

    futures = ndb.put_multi_async(to_put)
    if notify:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return task_is_retried, run_result.bot_id

  try:
    task_is_retried, bot_id = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    task_is_retried, bot_id = None, None
  if task_is_retried is not None:
    task_to_run.set_lookup_cache(to_run_key, task_is_retried)
    if not task_is_retried:
      stats.add_run_entry(
          'run_bot_died', run_result_key,
          bot_id=bot_id,
          dimensions=request.properties.dimensions,
          user=request.user)
    else:
      logging.info('Retried %s', packed)
  else:
    logging.info('Ignored %s', packed)
  return task_is_retried
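# Hedged sketch, not part of the original module: the cron side that feeds
# _handle_dead_bot(). _yield_dead_run_result_keys() is a hypothetical stand-in
# for the query selecting RUNNING TaskRunResult entities whose modified_ts is
# older than BOT_PING_TOLERANCE; the tri-state return contract
# (True/False/None) is from the docstring above.
def _example_cron_handle_bot_died():
  retried, killed, ignored = 0, 0, 0
  for run_result_key in _yield_dead_run_result_keys():
    outcome = _handle_dead_bot(run_result_key)
    if outcome is True:
      retried += 1
    elif outcome is False:
      killed += 1
    else:
      ignored += 1
  logging.info('Retried %d, killed %d, ignored %d', retried, killed, ignored)
  return retried, killed, ignored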
def handle(self, namespace, hash_key):
  """Handles this request."""
  # Extract relevant request parameters.
  expiration_ts = self.request.get('x')
  item_size = self.request.get('s')
  is_isolated = self.request.get('i')
  uploaded_to_gs = self.request.get('g')
  signature = self.request.get('sig')

  # Build the correct signature.
  expected_sig = self.generate_signature(
      config.settings().global_secret, self.request.method, expiration_ts,
      namespace, hash_key, item_size, is_isolated, uploaded_to_gs)

  # Verify the signature is correct.
  if not utils.constant_time_equals(signature, expected_sig):
    return self.send_error('Incorrect signature.')

  # Convert parameters from strings back to something useful.
  # It can't fail since a matching signature means it was us who generated
  # these strings in the first place.
  expiration_ts = int(expiration_ts)
  item_size = int(item_size)
  is_isolated = bool(int(is_isolated))
  uploaded_to_gs = bool(int(uploaded_to_gs))

  # Verify the signature is not yet expired.
  if time.time() > expiration_ts:
    return self.send_error('Expired signature.')

  if uploaded_to_gs:
    # GS upload finalization uses an empty POST body.
    assert self.request.method == 'POST'
    if self.request.headers.get('content-length'):
      return self.send_error('Expecting empty POST.')
    content = None
  else:
    # Datastore upload uses PUT.
    assert self.request.method == 'PUT'
    if self.request.headers.get('content-length'):
      content = self.request.body
    else:
      content = ''

  # Info about the corresponding GS entry (if it exists).
  gs_bucket = config.settings().gs_bucket
  key = model.entry_key(namespace, hash_key)

  # Verify the data while at it since it's already in memory, but before
  # storing it in memcache and datastore.
  if content is not None:
    # Verify the advertised hash matches the data.
    try:
      hex_digest, expanded_size = hash_content(content, namespace)
      if hex_digest != hash_key:
        raise ValueError(
            'Hash and data do not match, '
            '%d bytes (%d bytes expanded)' % (len(content), expanded_size))
      if expanded_size != item_size:
        raise ValueError(
            'Advertised data length (%d) and actual data length (%d) '
            'do not match' % (item_size, expanded_size))
    except ValueError as err:
      return self.send_error('Inline verification failed.\n%s' % err)
    # Successfully verified!
    compressed_size = len(content)
    needs_verification = False
  else:
    # Fetch the size of the stored file.
    file_info = gcs.get_file_info(gs_bucket, key.id())
    if not file_info:
      # TODO(maruel): Do not fail yet. If the request got up to here, the
      # file is likely there but the service may have trouble fetching the
      # metadata from GS.
      return self.send_error(
          'File should be in Google Storage.\nFile: \'%s\' Size: %d.' %
          (key.id(), item_size))
    compressed_size = file_info.size
    needs_verification = True

  # Data is here and it's too large for DS, so put it in GS. It is likely
  # between MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS.
  if content is not None and len(content) >= MIN_SIZE_FOR_GS:
    if not gcs.write_file(gs_bucket, key.id(), [content]):
      # Returns 503 so the client automatically retries.
      return self.send_error(
          'Unable to save the content to GS.', http_code=503)
    # It's now in GS.
    uploaded_to_gs = True

  # Can create the entity now, everything appears to be legit.
  entry = model.new_content_entry(
      key=key,
      is_isolated=is_isolated,
      compressed_size=compressed_size,
      expanded_size=-1 if needs_verification else item_size,
      is_verified=not needs_verification)

  # If it's not in GS then put it inline.
  if not uploaded_to_gs:
    assert content is not None and len(content) < MIN_SIZE_FOR_GS
    entry.content = content

  # Start saving *.isolated into memcache iff its content is available and
  # it's not in Datastore: there's no point in saving inline blobs in
  # memcache because ndb already memcaches them.
  memcache_store_future = None
  if (content is not None and
      entry.content is None and
      entry.is_isolated and
      entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
    memcache_store_future = model.save_in_memcache(
        namespace, hash_key, content, async=True)

  try:
    # If the entry was already verified above (i.e. it is a small inline
    # entry), store it right away, possibly overriding an existing entity.
    # Most of the time it is a new entry anyway (since clients try to upload
    # only new entries).
    if not needs_verification:
      entry.put()
    else:
      # For large entries (that require expensive verification) be more
      # careful and check that it is indeed a new entity. No need to do it in
      # a transaction: a race condition would lead to a redundant
      # verification task being enqueued, no big deal.
      existing = entry.key.get()
      if existing:
        if existing.is_verified:
          logging.info('Entity exists and already verified')
        else:
          logging.info('Entity exists, but not yet verified')
      else:
        # New entity. Store it and enqueue the verification task,
        # transactionally.
        task_queue_host = utils.get_task_queue_host()
        def run():
          entry.put()
          taskqueue.add(
              url='/internal/taskqueue/verify/%s' % entry.key.id(),
              queue_name='verify',
              headers={'Host': task_queue_host},
              transactional=True)
        datastore_utils.transaction(run)

    # TODO(vadimsh): Fill in details about the entry, such as expiration
    # time.
    self.send_json({'entry': {}})

    # Log stats.
    where = (
        'inline' if entry.content is not None else 'GS; ' + entry.key.id())
    stats.add_entry(stats.STORE, entry.compressed_size, where)
  finally:
    # Do not keep dangling futures. Note that errors here are ignored;
    # memcache is just an optimization.
    if memcache_store_future:
      memcache_store_future.wait()
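# A minimal sketch of the signed-upload scheme used by handle() above,
# assuming generate_signature() is an HMAC over the request parameters. The
# field delimiter and the SHA-256 digest choice are assumptions; only the
# parameter list mirrors the handler above.
import hashlib
import hmac

def generate_signature(secret, method, expiration_ts, namespace, hash_key,
                       item_size, is_isolated, uploaded_to_gs):
  """Returns a hex HMAC binding all upload parameters to the secret."""
  mac = hmac.new(secret, digestmod=hashlib.sha256)
  for value in (method, expiration_ts, namespace, hash_key, item_size,
                is_isolated, uploaded_to_gs):
    mac.update(str(value))
    # A '\x00' separator ensures adjacent fields cannot collide when
    # concatenated.
    mac.update('\x00')
  return mac.hexdigest()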
def rebuild_task_cache(payload):
  """Rebuilds the TaskDimensions cache.

  This function is called in two cases:
  - A new kind of task request dimensions was never seen before.
  - The TaskDimensions.valid_until_ts expired.

  It is a cache miss: query all the bots and check which ones can run the
  task.

  Warning: There's a race condition where the TaskDimensions query could be
  missing some instances due to eventual consistency in the BotInfo query.
  This only happens when new request dimensions are set AND a bot that can
  run this task recently showed up.

  Runtime expectation: scales with the number of bots that can run the task,
  via BotInfo.dimensions_flat filtering. As there can be tens of thousands of
  bots that can run the task, it can take a long time to store all the
  entities on a new kind of request. As such, it must be called from the
  backend.

  Arguments:
  - payload: dict as created in assert_task() with:
    - 'dimensions': dict of task dimensions to refresh
    - 'dimensions_hash': precalculated hash for dimensions
    - 'valid_until_ts': expiration_ts + _ADVANCE for how long this cache is
      valid

  Returns:
    True if everything was processed, False if it needs to be retried.
  """
  data = json.loads(payload)
  logging.debug('rebuild_task_cache(%s)', data)
  dimensions = data[u'dimensions']
  dimensions_hash = int(data[u'dimensions_hash'])
  valid_until_ts = utils.parse_datetime(data[u'valid_until_ts'])
  dimensions_flat = []
  for k, values in dimensions.iteritems():
    for v in values:
      dimensions_flat.append(u'%s:%s' % (k, v))
  dimensions_flat.sort()

  now = utils.utcnow()
  updated = 0
  viable = 0
  try:
    pending = set()
    for bot_task_key in _yield_BotTaskDimensions_keys(
        dimensions_hash, dimensions_flat):
      viable += 1
      future = _refresh_BotTaskDimensions(
          bot_task_key, dimensions_flat, now, valid_until_ts)
      pending.add(future)
      updated += sum(1 for i in _cap_futures(pending) if i)
    updated += sum(1 for i in _flush_futures(pending) if i)

    # Done updating, now store the entity. Must use a transaction as there
    # could be other dimensions set in the entity.
    task_dims_key = _get_task_dims_key(dimensions_hash, dimensions)
    def run():
      obj = task_dims_key.get()
      if not obj:
        obj = TaskDimensions(key=task_dims_key)
      if obj.assert_request(now, valid_until_ts, dimensions_flat):
        obj.put()
      return obj
    try:
      # Retry often. This transaction tends to fail frequently, and this is
      # running from a task queue so it's fine if it takes more time; success
      # is more important.
      datastore_utils.transaction(run, retries=4)
    except datastore_utils.CommitError as e:
      # Still log an error, but no need for a stack trace in the logs. It is
      # important to surface that the call failed so the task queue is
      # retried later.
      logging.error('Failed updating TaskDimensions: %s', e)
      return False
  finally:
    # Any of the _refresh_BotTaskDimensions() calls above could throw. Still
    # log how far we went.
    logging.debug(
        'rebuild_task_cache(%d) in %.3fs. viable bots: %d; bots updated: %d\n'
        '%s',
        dimensions_hash, (utils.utcnow() - now).total_seconds(), viable,
        updated, '\n'.join('  ' + d for d in dimensions_flat))
  return True
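# Hedged sketch of the _cap_futures()/_flush_futures() helpers used above,
# assuming they bound the number of in-flight ndb futures and hand back the
# results of the completed ones. The cap value and exact semantics are
# assumptions, not the actual implementation.
_CAP = 50  # Assumed maximum number of concurrent futures.

def _cap_futures(futures):
  """Blocks until |futures| is under the cap; returns completed results.

  Mutates |futures| in place by discarding the completed ones.
  """
  out = []
  while len(futures) > _CAP:
    ndb.Future.wait_any(futures)
    done = [f for f in futures if f.done()]
    futures.difference_update(done)
    out.extend(f.get_result() for f in done)
  return out

def _flush_futures(futures):
  """Waits for all remaining futures and returns their results."""
  return [f.get_result() for f in futures]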