def cron_update_bot_info():
  """Refreshes BotInfo.composite for dead bots."""
  dt = datetime.timedelta(seconds=config.settings().bot_death_timeout_secs)
  cutoff = utils.utcnow() - dt

  @ndb.tasklet
  def run(bot_key):
    bot = yield bot_key.get_async()
    if (bot and bot.last_seen_ts <= cutoff and
        (BotInfo.ALIVE in bot.composite or BotInfo.DEAD not in bot.composite)):
      # Updating it recomputes composite.
      # TODO(maruel): BotEvent.
      yield bot.put_async()
      logging.info('DEAD: %s', bot.id)
      raise ndb.Return(1)
    raise ndb.Return(0)

  # The assumption here is that a cron job can churn through all the entities
  # fast enough. The number of dead bots is expected to be <10k. In practice
  # the average runtime is around 8 seconds.
  dead = 0
  seen = 0
  failed = 0
  try:
    futures = []
    for b in BotInfo.query(BotInfo.last_seen_ts <= cutoff):
      seen += 1
      if BotInfo.ALIVE in b.composite or BotInfo.DEAD not in b.composite:
        # Make sure the variable is not aliased.
        k = b.key
        # Unregister the bot from task queues since it can't reap anything.
        task_queues.cleanup_after_bot(k.parent())
        # Retry more often than the default 1. We do not want to throw too much
        # in the logs and there should be plenty of time to do the retries.
        f = datastore_utils.transaction_async(lambda: run(k), retries=5)
        futures.append(f)
        if len(futures) >= 5:
          ndb.Future.wait_any(futures)
          for i in xrange(len(futures) - 1, -1, -1):
            if futures[i].done():
              try:
                dead += futures.pop(i).get_result()
              except datastore_utils.CommitError:
                logging.warning('Failed to commit a Tx')
                failed += 1
    for f in futures:
      try:
        dead += f.get_result()
      except datastore_utils.CommitError:
        logging.warning('Failed to commit a Tx')
        failed += 1
  finally:
    logging.debug(
        'Seen %d bots, updated %d bots, failed %d tx', seen, dead, failed)
  return dead

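
# Illustrative sketch (not part of the original module): the "recomputes
# composite" comment above relies on the BotInfo model refreshing its composite
# flags when it is stored. A simplified model showing that idea could look like
# the following; the property names and the flag logic are assumptions for
# illustration only, not the real BotInfo definition.
class _ExampleBotInfo(ndb.Model):
  last_seen_ts = ndb.DateTimeProperty()
  composite = ndb.IntegerProperty(repeated=True)

  def _pre_put_hook(self):
    # Recompute the ALIVE/DEAD flag from the last ping time at every put().
    cutoff = utils.utcnow() - datetime.timedelta(
        seconds=config.settings().bot_death_timeout_secs)
    is_dead = self.last_seen_ts <= cutoff
    self.composite = [BotInfo.DEAD if is_dead else BotInfo.ALIVE]
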
@ndb.tasklet
def _remove_old_entity_async(key, now):
  """Removes a stale TaskDimensions or BotTaskDimensions instance.

  Returns:
    key if it was deleted.
  """
  obj = yield key.get_async()
  if not obj or obj.valid_until_ts >= now:
    raise ndb.Return(None)

  @ndb.tasklet
  def tx():
    obj = yield key.get_async()
    if obj and obj.valid_until_ts < now:
      yield key.delete_async()
      raise ndb.Return(key)

  res = yield datastore_utils.transaction_async(
      tx, propagation=ndb.TransactionOptions.INDEPENDENT)
  raise ndb.Return(res)

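
# Illustrative sketch (not part of the original module): how the tasklet above
# could be fanned out over expired TaskDimensions entities from a cleanup cron.
# The query, the TaskDimensions model reference and the 1000-entity batch size
# are assumptions for illustration only.
@ndb.tasklet
def _example_cleanup_old_task_dimensions_async(now):
  q = TaskDimensions.query(TaskDimensions.valid_until_ts < now)
  keys = yield q.fetch_async(1000, keys_only=True)
  # Each call returns a future; yielding the list waits for all of them.
  results = yield [_remove_old_entity_async(k, now) for k in keys]
  # Only entities that were actually deleted come back as non-None keys.
  raise ndb.Return(sum(1 for r in results if r))
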
def schedule_request(request):
  """Creates and stores all the entities to schedule a new task request.

  The number of entities created is 3: TaskRequest, TaskResultSummary and
  TaskToRun.

  The TaskRequest is saved first as a DB transaction, then TaskResultSummary
  and TaskToRun are saved as a single DB RPC. The Search index is also updated
  in-between.

  Arguments:
  - request: the TaskRequest entity, already saved in the DB.

  Returns:
    TaskResultSummary. TaskToRun is not returned.
  """
  dupe_future = None
  if request.properties.idempotent:
    # Find a previously run task that is also idempotent and completed. Start a
    # query to fetch items that can be used to dedupe the task. See the comment
    # for this property for more details.
    #
    # Do not use "cls.created_ts > oldest" here because this would require a
    # composite index. It's unnecessary because TaskRequest.key is mostly
    # equivalent to decreasing TaskRequest.created_ts, ordering by key works as
    # well and doesn't require a composite index.
    cls = task_result.TaskResultSummary
    h = request.properties.properties_hash
    dupe_future = cls.query(cls.properties_hash == h).order(cls.key).get_async()

  # At this point, the request is now in the DB but not yet in a mode where it
  # can be triggered or visible. Index it right away so it is searchable. If
  # any of the remaining calls in this function fail, the TaskRequest and
  # Search Document will simply point to an incomplete task, which will be
  # ignored.
  #
  # Creates the entities TaskToRun and TaskResultSummary but do not save them
  # yet. TaskRunResult will be created once a bot starts it.
  task = task_to_run.new_task_to_run(request)
  result_summary = task_result.new_result_summary(request)

  # Do not specify a doc_id, as they are guaranteed to be monotonically
  # increasing and searches are done in reverse order, which fits exactly the
  # created_ts ordering. This is useful because DateField is precise to the
  # date (!) and NumberField is signed 32 bits so the best it could do with
  # EPOCH is second resolution up to year 2038.
  index = search.Index(name='requests')
  packed = task_pack.pack_result_summary_key(result_summary.key)
  doc = search.Document(
      fields=[
        search.TextField(name='name', value=request.name),
        search.AtomField(name='id', value=packed),
      ])
  # Even if it fails here, we're still fine, as the task is not "alive" yet.
  search_future = index.put_async([doc])

  now = utils.utcnow()

  if dupe_future:
    # Reuse the results!
    dupe_summary = dupe_future.get_result()
    # Refuse tasks older than X days. This is due to the isolate server
    # dropping files.
    # https://code.google.com/p/swarming/issues/detail?id=197
    oldest = now - datetime.timedelta(
        seconds=config.settings().reusable_task_age_secs)
    if dupe_summary and dupe_summary.created_ts > oldest:
      # If there's a bug, commenting out this block is sufficient to disable
      # the functionality.
      # Setting task.queue_number to None removes it from the scheduling.
      task.queue_number = None
      _copy_entity(
          dupe_summary, result_summary, ('created_ts', 'name', 'user', 'tags'))
      result_summary.properties_hash = None
      result_summary.try_number = 0
      result_summary.cost_saved_usd = result_summary.cost_usd
      # Only zap after.
      result_summary.costs_usd = []
      result_summary.deduped_from = task_pack.pack_run_result_key(
          dupe_summary.run_result_key)

  # Get parent task details if applicable.
  parent_task_keys = None
  if request.parent_task_id:
    parent_run_key = task_pack.unpack_run_result_key(request.parent_task_id)
    parent_task_keys = [
      parent_run_key,
      task_pack.run_result_key_to_result_summary_key(parent_run_key),
    ]

  result_summary.modified_ts = now

  # Storing these entities makes this task live. It is important at this point
  # that the HTTP handler returns as fast as possible, otherwise the task will
  # be run but the client will not know about it.
  def run():
    ndb.put_multi([result_summary, task])

  def run_parent():
    # This one is slower.
    items = ndb.get_multi(parent_task_keys)
    k = result_summary.task_id
    for item in items:
      item.children_task_ids.append(k)
      item.modified_ts = now
    ndb.put_multi(items)

  # Raising will abort to the caller.
  futures = [datastore_utils.transaction_async(run)]
  if parent_task_keys:
    futures.append(datastore_utils.transaction_async(run_parent))

  try:
    search_future.get_result()
  except search.Error:
    # Do not abort the task, for now search is best effort.
    logging.exception('Put failed')

  for future in futures:
    # Check for failures, it would raise in this case, aborting the call.
    future.get_result()

  stats.add_task_entry(
      'task_enqueued', result_summary.key,
      dimensions=request.properties.dimensions,
      user=request.user)
  return result_summary

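
# Illustrative sketch (not part of the original module): a typical caller first
# builds and saves the TaskRequest, then hands it to schedule_request(). The
# make_request() helper and the returned packed id are assumptions for
# illustration only.
def _example_trigger_task(request_dict):
  request = task_request.make_request(request_dict)
  result_summary = schedule_request(request)
  # The packed TaskResultSummary key is what clients poll for status.
  return task_pack.pack_result_summary_key(result_summary.key)
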
def cron_update_bot_info():
  """Refreshes BotInfo.composite for dead bots."""

  @ndb.tasklet
  def run(bot_key):
    bot = yield bot_key.get_async()
    if bot and bot.should_be_dead and (bot.is_alive or not bot.is_dead):
      # The bot composite gets updated in _pre_put_hook.
      yield bot.put_async()
      logging.info('Changing Bot status to DEAD: %s', bot.id)
      raise ndb.Return(bot_key)
    raise ndb.Return(None)

  def tx_result(future, stats):
    try:
      bot_key = future.get_result()
      if bot_key:
        stats['dead'] += 1
        bot = bot_key.get()
        logging.info('Sending bot_missing event: %s', bot.id)
        bot_event(
            event_type='bot_missing',
            bot_id=bot.id,
            message=None,
            external_ip=None,
            authenticated_as=None,
            dimensions=None,
            state=None,
            version=None,
            quarantined=None,
            maintenance_msg=None,
            task_id=None,
            task_name=None,
            register_dimensions=False,
            last_seen_ts=bot.last_seen_ts)
    except datastore_utils.CommitError:
      logging.warning('Failed to commit a Tx')
      stats['failed'] += 1

  # The assumption here is that a cron job can churn through all the entities
  # fast enough. The number of dead bots is expected to be <10k. In practice
  # the average runtime is around 8 seconds.
  cron_stats = {
      'dead': 0,
      'seen': 0,
      'failed': 0,
  }
  try:
    futures = []
    for b in BotInfo.yield_dead_bots():
      cron_stats['seen'] += 1
      if b.is_alive or not b.is_dead:
        # Make sure the variable is not aliased.
        k = b.key
        # Unregister the bot from task queues since it can't reap anything.
        task_queues.cleanup_after_bot(k.parent())
        # Retry more often than the default 1. We do not want to throw too much
        # in the logs and there should be plenty of time to do the retries.
        f = datastore_utils.transaction_async(lambda: run(k), retries=5)
        futures.append(f)
        if len(futures) >= 5:
          ndb.Future.wait_any(futures)
          for i in range(len(futures) - 1, -1, -1):
            if futures[i].done():
              f = futures.pop(i)
              tx_result(f, cron_stats)
    for f in futures:
      tx_result(f, cron_stats)
  finally:
    logging.debug(
        'Seen %d bots, updated %d dead bots, failed %d tx',
        cron_stats['seen'], cron_stats['dead'], cron_stats['failed'])
  return cron_stats['dead']

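
# Illustrative sketch (not part of the original module): how the cron function
# above would typically be exposed to the App Engine cron service. The handler
# class name, route and response text are assumptions for illustration only.
import webapp2


class _ExampleCronUpdateBotInfoHandler(webapp2.RequestHandler):
  def get(self):
    # The cron service issues a GET; the function returns the number of bots
    # newly marked as dead.
    dead = cron_update_bot_info()
    self.response.write('Marked %d bot(s) as dead' % dead)
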