Exemple #1
0
 def test_request_key_to_result_summary_key(self):
     """Checks the result summary key derived from both request key styles."""
     # New style key: the expected key path starts at TaskRequest.
     new_style_key = task_pack.request_key_to_result_summary_key(
         task_pack.unpack_request_key('11'))
     self.assertEqual(
         ndb.Key('TaskRequest', 0x7fffffffffffffee, 'TaskResultSummary', 1),
         new_style_key)

     # Old style key: the expected key path starts at TaskRequestShard.
     old_style_key = task_pack.request_key_to_result_summary_key(
         task_pack.unpack_request_key('10'))
     self.assertEqual(
         ndb.Key('TaskRequestShard', 'f71849', 'TaskRequest', 256,
                 'TaskResultSummary', 1),
         old_style_key)
Exemple #2
0
def cron_abort_expired_task_to_run():
  """Expires every TaskToRun that waited too long to be picked up by a bot.

  A TaskToRun typically ends up expired for one of these reasons:
  - Task requests come in faster than they get completed, e.g. there are not
    enough bots for the current load. This is normal overflow and must be
    handled accordingly.
  - No connected bot satisfies the requested dimensions. This is trickier: it
    is either a typo in the dimensions, or all the bots died and the admins
    must reconnect them.
  - The server has internal failures preventing it from distributing the tasks
    or from properly receiving results from the bots.

  Returns:
    Number of tasks that were expired.
  """
  expired_count = 0
  skipped_count = 0
  try:
    for candidate in task_to_run.yield_expired_task_to_run():
      req = candidate.request_key.get()
      if not _expire_task(candidate.key, req):
        # It's not a big deal, the bot will continue running.
        skipped_count += 1
        continue
      expired_count += 1
      stats.add_task_entry(
          'task_request_expired',
          task_pack.request_key_to_result_summary_key(req.key),
          dimensions=req.properties.dimensions,
          user=req.user)
  finally:
    # Logged even when the loop above raises, so partial progress is visible.
    # TODO(maruel): Use stats_framework.
    logging.info('Killed %d task, skipped %d', expired_count, skipped_count)
  return expired_count
Exemple #3
0
def _update_stats(run_result, bot_id, request, completed):
    """Records stats entries following a bot task update notification.

    When `completed` is falsy a single 'run_updated' run entry is added.
    Otherwise a 'run_completed' run entry (with the runtime) and a
    'task_completed' task entry (with the pending time) are added.
    """
    if not completed:
        stats.add_run_entry(
            "run_updated",
            run_result.key,
            bot_id=bot_id,
            dimensions=request.properties.dimensions,
        )
        return

    # Both durations default to 0 when the underlying value is not set.
    runtime_ms = (
        _secs_to_ms(run_result.duration_total.total_seconds())
        if run_result.duration_total
        else 0
    )
    pending_ms = (
        _secs_to_ms((run_result.started_ts - request.created_ts).total_seconds())
        if run_result.started_ts
        else 0
    )

    stats.add_run_entry(
        "run_completed",
        run_result.key,
        bot_id=bot_id,
        dimensions=request.properties.dimensions,
        runtime_ms=runtime_ms,
        user=request.user,
    )
    stats.add_task_entry(
        "task_completed",
        task_pack.request_key_to_result_summary_key(request.key),
        dimensions=request.properties.dimensions,
        pending_ms=pending_ms,
        user=request.user,
    )
Exemple #4
0
 def test_result_summary_key_to_request_key(self):
     """Round-trips a request key through its result summary key."""
     original_key = task_pack.unpack_request_key('11')
     summary_key = task_pack.request_key_to_result_summary_key(original_key)
     self.assertEqual(
         original_key,
         task_pack.result_summary_key_to_request_key(summary_key))
Exemple #5
0
def _update_stats(run_result, bot_id, request, completed):
    """Adds the stats entries matching a task update reported by a bot.

    A non-completed update only produces a 'run_updated' run entry. A completed
    one produces a 'run_completed' run entry carrying the runtime and a
    'task_completed' task entry carrying the time spent pending.
    """
    if not completed:
        stats.add_run_entry(
            'run_updated',
            run_result.key,
            bot_id=bot_id,
            dimensions=request.properties.dimensions)
        return

    # Runtime as observed by the server; stays 0 when it is not available.
    runtime_ms = 0
    if run_result.duration_as_seen_by_server:
        elapsed = run_result.duration_as_seen_by_server.total_seconds()
        runtime_ms = _secs_to_ms(elapsed)

    # Time between request creation and the run start; 0 if never started.
    pending_ms = 0
    if run_result.started_ts:
        waited = run_result.started_ts - request.created_ts
        pending_ms = _secs_to_ms(waited.total_seconds())

    stats.add_run_entry(
        'run_completed',
        run_result.key,
        bot_id=bot_id,
        dimensions=request.properties.dimensions,
        runtime_ms=runtime_ms,
        user=request.user)
    stats.add_task_entry(
        'task_completed',
        task_pack.request_key_to_result_summary_key(request.key),
        dimensions=request.properties.dimensions,
        pending_ms=pending_ms,
        user=request.user)
 def test_request_key_to_result_summary_key(self):
   """Checks the summary key produced for each supported request key style."""
   # Each case is (packed request key, flat path of the expected summary key).
   cases = (
       # New style key.
       ('11', ('TaskRequest', 0x7fffffffffffffee, 'TaskResultSummary', 1)),
       # Old style key.
       ('10', ('TaskRequestShard', 'f71849', 'TaskRequest', 256,
               'TaskResultSummary', 1)),
   )
   for packed, flat_path in cases:
     request_key = task_pack.unpack_request_key(packed)
     actual = task_pack.request_key_to_result_summary_key(request_key)
     self.assertEqual(ndb.Key(*flat_path), actual)
Exemple #7
0
def cron_abort_expired_task_to_run():
  """Aborts the TaskToRun entities that expired before any bot ran them.

  There are three usual causes:
  - More task requests are coming in than are being completed, e.g. the fleet
    is too small for the current rate. That is normal overflow and must be
    handled accordingly.
  - No connected bot matches the requested dimensions. This one is trickier:
    either the dimensions contain a typo, or the bots all died and the admins
    must reconnect them.
  - Internal server failures prevent tasks from being distributed or results
    from being properly received from the bots.

  Returns:
    The count of tasks that were expired by this call.
  """
  counts = {'killed': 0, 'skipped': 0}
  try:
    for expired in task_to_run.yield_expired_task_to_run():
      task_req = expired.request_key.get()
      if _expire_task(expired.key, task_req):
        counts['killed'] += 1
        summary_key = task_pack.request_key_to_result_summary_key(task_req.key)
        stats.add_task_entry(
            'task_request_expired',
            summary_key,
            dimensions=task_req.properties.dimensions,
            user=task_req.user)
      else:
        # It's not a big deal, the bot will continue running.
        counts['skipped'] += 1
  finally:
    # TODO(maruel): Use stats_framework.
    logging.info(
        'Killed %d task, skipped %d', counts['killed'], counts['skipped'])
  return counts['killed']
  def make_task_request(
      self, service_account, service_account_token, try_number=1):
    """Stores a TaskRequest and returns the packed run result key for a try.

    The request is created with a single task slice, then the given service
    account and token are set on it before it is saved.

    Returns:
      Packed TaskRunResult key for try `try_number` of the stored request.
    """
    created = utils.utcnow()
    request = task_request.TaskRequest(
        created_ts=created,
        manual_tags=[u'tag:1'],
        name='Request with %s' % service_account,
        priority=50,
        task_slices=[
          task_request.TaskSlice(
              expiration_secs=60,
              properties=task_request.TaskProperties(
                  command=[u'command1'],
                  dimensions_data={u'pool': [u'default']},
                  execution_timeout_secs=24*60*60)),
        ],
        user='******')
    task_request.init_new_request(request, True)
    request.key = task_request.new_request_key()
    request.service_account = service_account
    request.service_account_token = service_account_token
    request.put()

    run_key = task_pack.result_summary_key_to_run_result_key(
        task_pack.request_key_to_result_summary_key(request.key), try_number)
    return task_pack.pack_run_result_key(run_key)
Exemple #9
0
 def run_result_key(self):
     """Returns the TaskRunResult ndb.Key that will be created for this
     TaskToRun once reaped.

     The key is derived from self.request_key and self.try_number.
     """
     return task_pack.result_summary_key_to_run_result_key(
         task_pack.request_key_to_result_summary_key(self.request_key),
         self.try_number)
Exemple #10
0
def get_tasks(task_name, task_tags, cursor_str, limit, sort, state):
    """Returns one page of TaskResultSummary entities for this query.

    This function is synchronous. The search mode is picked in this order: by
    tags if task_tags is set, else by name if task_name is set, else a normal
    listing.

    Arguments:
      task_name: search for task name whole word.
      task_tags: list of one or multiple task tags to search for.
      cursor_str: query-dependent string encoded cursor to continue a previous
          search.
      limit: Maximum number of items to return.
      sort: get_result_summary_query() argument. Only used if both task_name
          and task_tags are empty.
      state: get_result_summary_query() argument. Only used if both task_name
          and task_tags are empty.

    Returns:
      tuple(list of tasks, str encoded cursor, updated sort, updated state)
    """
    # TODO(vadimsh): Use tags with get_result_summary_query. Will require
    # existing entities to be updated first to include 'tags' fields (otherwise
    # they'll disappear from the search).
    if task_tags:
        # Tag based search. Override the flags.
        sort, state = 'created_ts', 'all'
        # Only the TaskRequest has the tags. So first query all the keys to
        # requests; then fetch the TaskResultSummary.
        ordering = _sort_property(sort)
        request_query = task_request.TaskRequest.query().order(ordering)
        # Work on a copy so the caller's list is left untouched.
        pending_tags = task_tags[:]
        combined = task_request.TaskRequest.tags == pending_tags.pop(0)
        while pending_tags:
            combined = ndb.AND(
                combined,
                task_request.TaskRequest.tags == pending_tags.pop(0))
        request_query = request_query.filter(combined)
        page_start = datastore_query.Cursor(urlsafe=cursor_str)
        request_keys, page_end, has_more = request_query.fetch_page(
            limit, start_cursor=page_start, keys_only=True)
        summaries = ndb.get_multi([
            task_pack.request_key_to_result_summary_key(request_key)
            for request_key in request_keys
        ])
        next_cursor_str = None
        if page_end and has_more:
            next_cursor_str = page_end.urlsafe()
        return summaries, next_cursor_str, sort, state

    if task_name:
        # Task name based word based search. Override the flags.
        sort, state = 'created_ts', 'all'
        summaries, next_cursor_str = search_by_name(
            task_name, cursor_str, limit)
        return summaries, next_cursor_str, sort, state

    # Normal listing.
    summary_query = get_result_summary_query(sort, state, None)
    page_start = datastore_query.Cursor(urlsafe=cursor_str)
    summaries, page_end, has_more = summary_query.fetch_page(
        limit, start_cursor=page_start)
    next_cursor_str = None
    if page_end and has_more:
        next_cursor_str = page_end.urlsafe()
    return summaries, next_cursor_str, sort, state
 def test_run_result_key_to_performance_stats_key(self):
   """Checks the kind of the key derived from a run result key."""
   summary_key = task_pack.request_key_to_result_summary_key(
       task_pack.unpack_request_key('11'))
   first_try_key = task_pack.result_summary_key_to_run_result_key(
       summary_key, 1)
   stats_key = task_pack.run_result_key_to_performance_stats_key(
       first_try_key)
   self.assertEqual('PerformanceStats', stats_key.kind())
Exemple #12
0
 def test_run_result_key_to_performance_stats_key(self):
     """The performance stats key must be of kind 'PerformanceStats'."""
     # Walk the key chain: request -> result summary -> run result (try 1).
     key = task_pack.unpack_request_key('11')
     key = task_pack.request_key_to_result_summary_key(key)
     key = task_pack.result_summary_key_to_run_result_key(key, 1)
     actual_kind = task_pack.run_result_key_to_performance_stats_key(
         key).kind()
     self.assertEqual('PerformanceStats', actual_kind)
Exemple #13
0
 def test_run_result_key_to_result_summary_key(self):
     """Round-trips a result summary key through a run result key."""
     summary_key = task_pack.request_key_to_result_summary_key(
         task_pack.unpack_request_key('11'))
     first_try_key = task_pack.result_summary_key_to_run_result_key(
         summary_key, 1)
     round_tripped = task_pack.run_result_key_to_result_summary_key(
         first_try_key)
     self.assertEqual(summary_key, round_tripped)
 def test_run_result_key_to_result_summary_key(self):
   """A run result key must map back to the summary key it came from."""
   expected = task_pack.request_key_to_result_summary_key(
       task_pack.unpack_request_key('11'))
   # Try number 1 of the task.
   run_key = task_pack.result_summary_key_to_run_result_key(expected, 1)
   actual = task_pack.run_result_key_to_result_summary_key(run_key)
   self.assertEqual(expected, actual)
  def test_pack_run_result_key(self):
    """Packs a run result key and rejects a result summary key."""
    summary_key = task_pack.request_key_to_result_summary_key(
        task_pack.unpack_request_key('11'))
    first_try_key = task_pack.result_summary_key_to_run_result_key(
        summary_key, 1)
    packed = task_pack.pack_run_result_key(first_try_key)
    self.assertEqual('111', packed)

    # A summary key is not a run result key; packing it must trip an assert.
    self.assertRaises(
        AssertionError, task_pack.pack_run_result_key, summary_key)
Exemple #16
0
def new_result_summary(request):
  """Builds the one and only TaskResultSummary of a TaskRequest.

  The entity is not stored here; the caller must save it in the DB.
  """
  summary_key = task_pack.request_key_to_result_summary_key(request.key)
  return TaskResultSummary(
      key=summary_key,
      created_ts=request.created_ts,
      name=request.name,
      user=request.user)
Exemple #17
0
def new_result_summary(request):
    """Creates the single TaskResultSummary that goes with a TaskRequest.

    Nothing is written to the DB; saving the entity is up to the caller.
    """
    # created_ts, name and user are copied over from the request as-is.
    summary_key = task_pack.request_key_to_result_summary_key(request.key)
    return TaskResultSummary(
        key=summary_key,
        created_ts=request.created_ts,
        name=request.name,
        user=request.user,
    )
Exemple #18
0
    def test_pack_run_result_key(self):
        """Only run result keys can be packed by pack_run_result_key()."""
        req_key = task_pack.unpack_request_key('11')
        summary_key = task_pack.request_key_to_result_summary_key(req_key)
        run_key = task_pack.result_summary_key_to_run_result_key(
            summary_key, 1)
        self.assertEqual('111', task_pack.pack_run_result_key(run_key))

        # Feeding a result summary key instead is expected to assert.
        self.assertRaises(
            AssertionError, task_pack.pack_run_result_key, summary_key)
Exemple #19
0
def get_tasks(task_name, task_tags, cursor_str, limit, sort, state):
  """Fetches a page of TaskResultSummary entities matching the query.

  This function is synchronous. A tag search takes precedence over a name
  search; a plain listing is done only when neither is requested.

  Arguments:
    task_name: search for task name whole word.
    task_tags: list of one or multiple task tags to search for.
    cursor_str: query-dependent string encoded cursor to continue a previous
        search.
    limit: Maximum number of items to return.
    sort: get_result_summary_query() argument. Only used if both task_name and
        task_tags are empty.
    state: get_result_summary_query() argument. Only used if both task_name and
        task_tags are empty.

  Returns:
    tuple(list of tasks, str encoded cursor, updated sort, updated state)
  """
  # TODO(vadimsh): Use tags with get_result_summary_query. Will require existing
  # entities to be updated first to include 'tags' fields (otherwise they'll
  # disappear from the search).
  if not task_tags and not task_name:
    # Normal listing.
    listing = get_result_summary_query(sort, state, None)
    start = datastore_query.Cursor(urlsafe=cursor_str)
    found, end, has_more = listing.fetch_page(limit, start_cursor=start)
    next_cursor = end.urlsafe() if end and has_more else None
  elif not task_tags:
    # Task name based word based search. Override the flags.
    sort = 'created_ts'
    state = 'all'
    found, next_cursor = search_by_name(task_name, cursor_str, limit)
  else:
    # Tag based search. Override the flags.
    sort = 'created_ts'
    state = 'all'
    # Only the TaskRequest has the tags. So first query all the keys to
    # requests; then fetch the TaskResultSummary.
    ordering = _sort_property(sort)
    by_tags = task_request.TaskRequest.query().order(ordering)
    # Consume a copy so the list passed by the caller is not modified.
    remaining = task_tags[:]
    condition = task_request.TaskRequest.tags == remaining.pop(0)
    while remaining:
      next_condition = task_request.TaskRequest.tags == remaining.pop(0)
      condition = ndb.AND(condition, next_condition)
    by_tags = by_tags.filter(condition)
    start = datastore_query.Cursor(urlsafe=cursor_str)
    request_keys, end, has_more = by_tags.fetch_page(
        limit, start_cursor=start, keys_only=True)
    summary_keys = [
        task_pack.request_key_to_result_summary_key(request_key)
        for request_key in request_keys
    ]
    found = ndb.get_multi(summary_keys)
    next_cursor = end.urlsafe() if end and has_more else None

  return found, next_cursor, sort, state
Exemple #20
0
def _expire_task(to_run_key, request):
    """Expires a TaskResultSummary and unschedules the TaskToRun.

    Arguments:
      to_run_key: key of the TaskToRun entity to unschedule.
      request: the request entity; its key is used to derive the
          TaskResultSummary key, and it is passed along when updating the
          summary and when notifying.

    Returns:
      - None if the TaskToRun was already not reapable before the transaction
        was started.
      - False if, inside the transaction, the TaskToRun was missing or not
        reapable anymore, or if the transaction raised CommitError.
      - Otherwise whatever datastore_utils.transaction(run) returns; run()
        itself returns True once both entities were written successfully.
    """
    # Look if the TaskToRun is reapable once before doing the check inside the
    # transaction. This reduces the likelihood of failing this check inside the
    # transaction, which is an order of magnitude more costly.
    # NOTE(review): unlike the check inside run(), this one does not handle
    # to_run_key.get() returning None -- confirm the entity always exists here.
    if not to_run_key.get().is_reapable:
        logging.info('Not reapable anymore')
        return None

    result_summary_key = task_pack.request_key_to_result_summary_key(
        request.key)
    # Captured once so both timestamps set in run() share the same value.
    now = utils.utcnow()

    def run():
        """Transaction body: marks the summary and unschedules the TaskToRun."""
        # 2 concurrent GET, one PUT. Optionally with an additional serialized GET.
        to_run_future = to_run_key.get_async()
        result_summary_future = result_summary_key.get_async()
        to_run = to_run_future.get_result()
        if not to_run or not to_run.is_reapable:
            # Let the second outstanding GET finish before bailing out.
            result_summary_future.wait()
            return False

        # Presumably clearing queue_number is what takes the TaskToRun out of
        # the queue -- TODO confirm against TaskToRun.is_reapable.
        to_run.queue_number = None
        result_summary = result_summary_future.get_result()
        if result_summary.try_number:
            # It's a retry that is being expired. Keep the old state. That requires an
            # additional pipelined GET but that shouldn't be the common case.
            run_result = result_summary.run_result_key.get()
            result_summary.set_from_run_result(run_result, request)
        else:
            result_summary.state = task_result.State.EXPIRED
        result_summary.abandoned_ts = now
        result_summary.modified_ts = now

        # Start the writes, do the notification call while they are in flight,
        # then verify that every write succeeded.
        futures = ndb.put_multi_async((to_run, result_summary))
        _maybe_pubsub_notify_via_tq(result_summary, request)
        for f in futures:
            f.check_success()

        return True

    # It'll be caught by next cron job execution in case of failure.
    try:
        success = datastore_utils.transaction(run)
    except datastore_utils.CommitError:
        success = False
    if success:
        # Only update the lookup cache and log once the expiration went through.
        task_to_run.set_lookup_cache(to_run_key, False)
        logging.info('Expired %s',
                     task_pack.pack_result_summary_key(result_summary_key))
    return success
Exemple #21
0
def new_result_summary(request):
    """Builds the one and only TaskResultSummary for a TaskRequest.

    created_ts, name, user and tags are copied from the request, and the
    current app version is recorded as the sole entry of server_versions. The
    entity is not stored; the caller must save it in the DB.
    """
    summary_key = task_pack.request_key_to_result_summary_key(request.key)
    return TaskResultSummary(
        key=summary_key,
        created_ts=request.created_ts,
        name=request.name,
        server_versions=[utils.get_app_version()],
        user=request.user,
        tags=request.tags,
    )
Exemple #22
0
def _expire_task(to_run_key, request):
    """Expires a TaskResultSummary and unschedules the TaskToRun.

    Arguments:
      to_run_key: key of the TaskToRun entity to unschedule.
      request: the request entity; its key is used to derive the
          TaskResultSummary key, and it is passed along when updating the
          summary and when notifying.

    Returns:
      - None if the TaskToRun was already not reapable before the transaction
        was started.
      - False if, inside the transaction, the TaskToRun was missing or not
        reapable anymore, or if the transaction raised CommitError.
      - Otherwise whatever datastore_utils.transaction(run) returns; run()
        itself returns True once both entities were written successfully.
    """
    # Look if the TaskToRun is reapable once before doing the check inside the
    # transaction. This reduces the likelihood of failing this check inside the
    # transaction, which is an order of magnitude more costly.
    # NOTE(review): unlike the check inside run(), this one does not handle
    # to_run_key.get() returning None -- confirm the entity always exists here.
    if not to_run_key.get().is_reapable:
        logging.info("Not reapable anymore")
        return None

    result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
    # Captured once so both timestamps set in run() share the same value.
    now = utils.utcnow()

    def run():
        """Transaction body: marks the summary and unschedules the TaskToRun."""
        # 2 concurrent GET, one PUT. Optionally with an additional serialized GET.
        to_run_future = to_run_key.get_async()
        result_summary_future = result_summary_key.get_async()
        to_run = to_run_future.get_result()
        if not to_run or not to_run.is_reapable:
            # Let the second outstanding GET finish before bailing out.
            result_summary_future.wait()
            return False

        # Presumably clearing queue_number is what takes the TaskToRun out of
        # the queue -- TODO confirm against TaskToRun.is_reapable.
        to_run.queue_number = None
        result_summary = result_summary_future.get_result()
        if result_summary.try_number:
            # It's a retry that is being expired. Keep the old state. That requires an
            # additional pipelined GET but that shouldn't be the common case.
            run_result = result_summary.run_result_key.get()
            result_summary.set_from_run_result(run_result, request)
        else:
            result_summary.state = task_result.State.EXPIRED
        result_summary.abandoned_ts = now
        result_summary.modified_ts = now

        # Start the writes, do the notification call while they are in flight,
        # then verify that every write succeeded.
        futures = ndb.put_multi_async((to_run, result_summary))
        _maybe_pubsub_notify_via_tq(result_summary, request)
        for f in futures:
            f.check_success()

        return True

    # It'll be caught by next cron job execution in case of failure.
    try:
        success = datastore_utils.transaction(run)
    except datastore_utils.CommitError:
        success = False
    if success:
        # Only update the lookup cache and log once the expiration went through.
        task_to_run.set_lookup_cache(to_run_key, False)
        logging.info("Expired %s", task_pack.pack_result_summary_key(result_summary_key))
    return success
Exemple #23
0
def new_run_result(request, try_number, bot_id, bot_version):
    """Builds a new TaskRunResult for try `try_number` of a TaskRequest.

    started_ts is set to the current time and server_versions to the current
    app version. The entity is not stored; the caller must save it in the DB.
    """
    assert isinstance(request, task_request.TaskRequest)
    run_key = task_pack.result_summary_key_to_run_result_key(
        task_pack.request_key_to_result_summary_key(request.key), try_number)
    return TaskRunResult(
        key=run_key,
        bot_id=bot_id,
        started_ts=utils.utcnow(),
        bot_version=bot_version,
        server_versions=[utils.get_app_version()],
    )
Exemple #24
0
def _reap_task(to_run_key, request, bot_id, bot_version, bot_dimensions):
    """Reaps a task and insert the results entity.

    Arguments:
      to_run_key: key of the TaskToRun entity to reap; it must map to
          request.key (asserted below).
      request: the request entity the TaskToRun belongs to.
      bot_id: non-empty id of the bot reaping the task (asserted below).
      bot_version: forwarded as-is to task_result.new_run_result().
      bot_dimensions: forwarded as-is to task_result.new_run_result().

    Returns:
      TaskRunResult if successful, None otherwise.
    """
    assert bot_id, bot_id
    assert request.key == task_to_run.task_to_run_key_to_request_key(
        to_run_key)
    result_summary_key = task_pack.request_key_to_result_summary_key(
        request.key)

    # Captured before the transaction; used as the run result's modified_ts.
    now = utils.utcnow()

    def run():
        """Transaction body: returns the new run result, or None if denied."""
        # 2 GET, 1 PUT at the end.
        to_run_future = to_run_key.get_async()
        result_summary_future = result_summary_key.get_async()
        to_run = to_run_future.get_result()
        result_summary = result_summary_future.get_result()
        if not to_run:
            logging.error('Missing TaskToRun?\n%s', result_summary.task_id)
            return None
        if not to_run.is_reapable:
            logging.info('%s is not reapable', result_summary.task_id)
            return None
        if result_summary.bot_id == bot_id:
            # This means two things, first it's a retry, second it's that the first
            # try failed and the retry is being reaped by the same bot. Deny that, as
            # the bot may be deeply broken and could be in a killing spree.
            logging.warning('%s can\'t retry its own internal failure task',
                            result_summary.task_id)
            return None
        # Presumably clearing queue_number is what takes the TaskToRun out of
        # the queue -- TODO confirm against TaskToRun.is_reapable.
        to_run.queue_number = None
        # The new try number is the previous one (0 if unset) plus one.
        run_result = task_result.new_run_result(
            request, (result_summary.try_number or 0) + 1, bot_id, bot_version,
            bot_dimensions)
        run_result.modified_ts = now
        result_summary.set_from_run_result(run_result, request)
        ndb.put_multi([to_run, run_result, result_summary])
        return run_result

    # The bot will reap the next available task in case of failure, no big deal.
    try:
        run_result = datastore_utils.transaction(run, retries=0)
    except datastore_utils.CommitError:
        run_result = None
    if run_result:
        # Only update the lookup cache once the task was actually reaped.
        task_to_run.set_lookup_cache(to_run_key, False)
    return run_result
Exemple #25
0
def new_run_result(request, try_number, bot_id, bot_version):
  """Creates the TaskRunResult for the given try of a TaskRequest.

  The run starts now (started_ts) and records the current app version. Nothing
  is written to the DB; the caller must save the returned entity.
  """
  assert isinstance(request, task_request.TaskRequest)
  parent_key = task_pack.request_key_to_result_summary_key(request.key)
  entity_key = task_pack.result_summary_key_to_run_result_key(
      parent_key, try_number)
  return TaskRunResult(
      key=entity_key,
      bot_id=bot_id,
      started_ts=utils.utcnow(),
      bot_version=bot_version,
      server_versions=[utils.get_app_version()])
Exemple #26
0
def get_results(request_key):
  """Fetches all task results for a specified TaskRequest ndb.Key.

  Returns:
    tuple(TaskResultSummary, list of TaskRunResult that exist).
  """
  summary_key = task_pack.request_key_to_result_summary_key(request_key)
  summary = summary_key.get()
  # Two approaches are possible: a DB query, or fetching by key every entity
  # that could exist (at most 255). There are usually fewer than 3 entities so
  # fetching by key would be faster, but this function is exclusively used in
  # unit tests so it's not performance critical.
  run_results = task_result.TaskRunResult.query(
      ancestor=summary_key).order(task_result.TaskRunResult.key).fetch()
  return summary, run_results
def get_results(request_key):
  """Loads the summary and every existing run result of a TaskRequest key.

  Returns:
    tuple(TaskResultSummary, list of TaskRunResult that exist).
  """
  parent_key = task_pack.request_key_to_result_summary_key(request_key)
  summary = parent_key.get()
  # A DB query is used here. Fetching by key all the entities that could exist
  # (at most 255) would generally be faster since there are usually fewer than
  # 3 of them, but this function is exclusively used in unit tests so it's not
  # performance critical.
  run_result_cls = task_result.TaskRunResult
  ordered_runs = run_result_cls.query(ancestor=parent_key)
  ordered_runs = ordered_runs.order(run_result_cls.key)
  return summary, ordered_runs.fetch()
Exemple #28
0
def get_result_summaries(
    task_tags, cursor_str, start, end, state, batch_size):
  """Returns one page of TaskResultSummary entities for this query.

  This is a slight modification of get_tasks: it removes support for "limit",
  "name" and "sort" and adds support for date ranges.

  Arguments:
    task_tags: list of one or multiple task tags to search for.
    cursor_str: query-dependent string encoded cursor to continue a previous
        search.
    start: earliest creation date of retrieved tasks.
    end: most recent creation date of retrieved tasks.
    state: not used by the query; returned unchanged as the last tuple item.
    batch_size: Maximum number of items to return; must be within 1..1000.

  Returns:
    tuple(list of tasks, str encoded cursor, state)

  Raises:
    ValueError: if batch_size is outside of the accepted range.
  """
  if not 0 < batch_size <= 1000:
    raise ValueError('Inappropriate value for batch_size.')

  request_cls = task_request.TaskRequest
  requests_query = request_cls.query()
  # Inequalities are <= and >= because keys are in reverse chronological order.
  newest_allowed = _datetime_to_key(start)
  if newest_allowed:
    requests_query = requests_query.filter(request_cls.key <= newest_allowed)
  oldest_allowed = _datetime_to_key(end)
  if oldest_allowed:
    requests_query = requests_query.filter(request_cls.key >= oldest_allowed)
  requests_query = requests_query.order(request_cls.key)

  # Filter by one or more tags.
  for wanted_tag in task_tags:
    requests_query = requests_query.filter(request_cls.tags == wanted_tag)

  # Fetch and return.
  page_start = datastore_query.Cursor(urlsafe=cursor_str)
  request_keys, page_end, has_more = requests_query.fetch_page(
      batch_size, start_cursor=page_start, keys_only=True)
  summary_keys = [
      task_pack.request_key_to_result_summary_key(request_key)
      for request_key in request_keys
  ]
  # The TaskResultSummary may be missing for a corresponding TaskRequest. This
  # may happen because the TaskResultSummary is added as a follow up transaction
  # to the transaction that adds TaskRequest and the second one may fail (this
  # should be changed). In this case, ignore the request.
  summaries = [
      summary for summary in ndb.get_multi(summary_keys) if summary
  ]
  next_cursor_str = None
  if page_end and has_more:
    next_cursor_str = page_end.urlsafe()
  return summaries, next_cursor_str, state
Exemple #29
0
def new_run_result(request, to_run, bot_id, bot_version, bot_dimensions):
    """Builds a new TaskRunResult for a TaskRequest.

    Only the immutable parts are initialized. The try number and the current
    task slice index are taken from `to_run`. The entity is not stored; the
    caller must save it in the DB.
    """
    assert isinstance(request, task_request.TaskRequest)
    run_key = task_pack.result_summary_key_to_run_result_key(
        task_pack.request_key_to_result_summary_key(request.key),
        to_run.try_number)
    return TaskRunResult(
        key=run_key,
        bot_dimensions=bot_dimensions,
        bot_id=bot_id,
        bot_version=bot_version,
        current_task_slice=to_run.task_slice_index,
        server_versions=[utils.get_app_version()],
    )
Exemple #30
0
def cron_abort_expired_task_to_run(host):
    """Expires every TaskToRun that waited too long to be picked up by a bot.

    A TaskToRun typically ends up expired for one of these reasons:
    - Task requests come in faster than they get completed, e.g. there are not
      enough bots for the current load. This is normal overflow and must be
      handled accordingly.
    - No connected bot satisfies the requested dimensions. This is trickier: it
      is either a typo in the dimensions, or all the bots died and the admins
      must reconnect them.
    - The server has internal failures preventing it from distributing the
      tasks or from properly receiving results from the bots.

    Arguments:
      host: prefix used to build the task URLs written to the log.

    Returns:
      Packed tasks ids of aborted tasks.
    """
    expired_requests = []
    skipped_count = 0
    try:
        for candidate in task_to_run.yield_expired_task_to_run():
            req = candidate.request_key.get()
            if not _expire_task(candidate.key, req):
                # It's not a big deal, the bot will continue running.
                skipped_count += 1
                continue
            # TODO(maruel): Know which try it is.
            expired_requests.append(req)
            ts_mon_metrics.tasks_expired.increment(
                fields=ts_mon_metrics.extract_job_fields(req.tags))
            stats.add_task_entry(
                'task_request_expired',
                task_pack.request_key_to_result_summary_key(req.key),
                dimensions=req.properties.dimensions,
                user=req.user)
    finally:
        # Reported even when the loop above raises, so that whatever was
        # already expired still shows up in the logs.
        if expired_requests:
            logging.warning(
                'EXPIRED!\n%d tasks:\n%s',
                len(expired_requests),
                '\n'.join(
                    '  %s/user/task/%s  %s' % (
                        host, expired.task_id, expired.properties.dimensions)
                    for expired in expired_requests))
        # TODO(maruel): Use stats_framework.
        logging.info(
            'Killed %d task, skipped %d', len(expired_requests), skipped_count)
    return [expired.task_id for expired in expired_requests]
Exemple #31
0
def cron_abort_expired_task_to_run(host):
    """Expires the TaskToRun entities that were never picked up by a bot in time.

    A task request can reach its expiration without being scheduled when:
    - Task requests come in faster than they are completed, e.g. there are not
      enough bots for the incoming rate. That is normal overflow and must be
      handled accordingly.
    - No connected bot satisfies the requested dimensions. Either the
      dimensions contain a typo, or the bots all died and the admins must
      reconnect them.
    - The server has internal failures and either fails to distribute the
      tasks or fails to properly receive results from the bots.

    Arguments:
      host: host name used to build the 'https://<host>/user/task/<task_id>'
          links in the error log listing the expired tasks.

    Returns:
      Packed tasks ids of aborted tasks.
    """
    expired_ids = []
    not_expired = 0
    try:
        for to_run in task_to_run.yield_expired_task_to_run():
            request = to_run.request_key.get()
            if not _expire_task(to_run.key, request):
                # It's not a big deal, the bot will continue running.
                not_expired += 1
                continue
            # TODO(maruel): Know which try it is.
            expired_ids.append(request.task_id)
            summary_key = task_pack.request_key_to_result_summary_key(
                request.key)
            stats.add_task_entry(
                "task_request_expired",
                summary_key,
                dimensions=request.properties.dimensions,
                user=request.user,
            )
    finally:
        # Runs even when the loop raised, so whatever was expired so far is
        # still reported.
        if expired_ids:
            links = [
                "  https://%s/user/task/%s" % (host, task_id)
                for task_id in expired_ids
            ]
            logging.error(
                "EXPIRED!\n%d tasks:\n%s", len(expired_ids), "\n".join(links))
        # TODO(maruel): Use stats_framework.
        logging.info(
            "Killed %d task, skipped %d", len(expired_ids), not_expired)
    return expired_ids
Exemple #32
0
def get_result_summaries(task_tags, cursor_str, start, end, state, batch_size):
    """Fetches one page of TaskResultSummary entities matching tags and dates.

    The search is done over TaskRequest keys; the matching request keys are
    then converted to their TaskResultSummary keys and fetched in one batch.

    Arguments:
      task_tags: list of tags; a request must carry every one of them.
      cursor_str: query-dependent string encoded cursor to continue a previous
          search.
      start: earliest creation date of retrieved tasks.
      end: most recent creation date of retrieved tasks.
      state: returned untouched as the third item of the result.
      batch_size: maximum number of items to return, in the range ]0, 1000].

    Returns:
      tuple(list of tasks, str encoded cursor, state). The cursor is None when
      there is no further page.

    Raises:
      ValueError: batch_size is outside the accepted range.
    """
    if not (0 < batch_size <= 1000):
        raise ValueError('Inappropriate value for batch_size.')

    request_cls = task_request.TaskRequest
    query = request_cls.query()
    # Inequalities are <= and >= because keys are in reverse chronological
    # order.
    start_bound = _datetime_to_key(start)
    if start_bound:
        query = query.filter(request_cls.key <= start_bound)
    end_bound = _datetime_to_key(end)
    if end_bound:
        query = query.filter(request_cls.key >= end_bound)
    query = query.order(request_cls.key)

    # Each tag adds one more equality filter.
    for wanted_tag in task_tags:
        query = query.filter(request_cls.tags == wanted_tag)

    # Page through request keys only; the summaries are fetched in one batch.
    request_keys, next_cursor, more = query.fetch_page(
        batch_size,
        start_cursor=datastore_query.Cursor(urlsafe=cursor_str),
        keys_only=True)
    summary_keys = [
        task_pack.request_key_to_result_summary_key(request_key)
        for request_key in request_keys
    ]
    tasks = ndb.get_multi(summary_keys)

    next_cursor_str = None
    if next_cursor and more:
        next_cursor_str = next_cursor.urlsafe()
    return tasks, next_cursor_str, state
Exemple #33
0
def delete_old(entity):
  """Yields delete operations for an obsolete entity and its descendants.

  An entity is obsolete when its key has a parent (the parent key is then the
  one being deleted), or when no TaskResultSummary can be loaded for it with
  caching disabled. Nothing is yielded otherwise.
  """
  # A parent means it is a TaskRequestShard, it is very old.
  doomed_key = entity.key.parent()
  if not doomed_key:
    summary_key = task_pack.request_key_to_result_summary_key(entity.key)
    if not summary_key.get(use_cache=False, use_memcache=False):
      # There's a TaskRequest without TaskResultSummary, delete it.
      doomed_key = entity.key

  if not doomed_key:
    return

  logging.info('Deleting %s: %s', entity.task_id, doomed_key)
  keys_only = ndb.QueryOptions(keys_only=True)
  descendants = 0
  for descendant in ndb.Query(default_options=keys_only, ancestor=doomed_key):
    yield operation.db.Delete(descendant)
    descendants += 1
  yield operation.db.Delete(doomed_key)
  # The count includes the final delete of doomed_key itself.
  logging.info('Deleted %d entities', descendants + 1)
Exemple #34
0
def _reap_task(to_run_key, request, bot_id, bot_version, bot_dimensions):
    """Assigns a pending task to a bot and stores the resulting entities.

    The TaskToRun is dequeued and a new TaskRunResult is created, mirrored
    into the TaskResultSummary, all inside a single transaction that is not
    retried.

    Returns:
      TaskRunResult if successful, None otherwise.
    """
    assert bot_id, bot_id
    assert request.key == task_to_run.task_to_run_key_to_request_key(to_run_key)
    summary_key = task_pack.request_key_to_result_summary_key(request.key)
    reaped_ts = utils.utcnow()

    def run():
        # 2 GET, 1 PUT at the end.
        pending_to_run = to_run_key.get_async()
        pending_summary = summary_key.get_async()
        to_run = pending_to_run.get_result()
        if not (to_run and to_run.is_reapable):
            # Let the second GET complete before leaving.
            pending_summary.wait()
            return None
        summary = pending_summary.get_result()
        if summary.bot_id == bot_id:
            # This means two things, first it's a retry, second it's that the
            # first try failed and the retry is being reaped by the same bot.
            # Deny that, as the bot may be deeply broken and could be in a
            # killing spree.
            return None
        to_run.queue_number = None
        try_number = (summary.try_number or 0) + 1
        new_result = task_result.new_run_result(
            request, try_number, bot_id, bot_version, bot_dimensions)
        new_result.modified_ts = reaped_ts
        summary.set_from_run_result(new_result, request)
        ndb.put_multi([to_run, new_result, summary])
        return new_result

    # The bot will reap the next available task in case of failure, no big
    # deal.
    try:
        reaped = datastore_utils.transaction(run, retries=0)
    except datastore_utils.CommitError:
        return None
    if reaped:
        task_to_run.set_lookup_cache(to_run_key, False)
    return reaped
Exemple #35
0
def delete_old(entity):
    """Yields delete operations for an obsolete entity and its descendants.

    An entity is obsolete when its key has a parent (the parent key is then
    the one being deleted), or when no TaskResultSummary can be loaded for it
    with caching disabled. Nothing is yielded otherwise.
    """
    # A parent means it is a TaskRequestShard, it is very old.
    doomed_key = entity.key.parent()
    if not doomed_key:
        summary_key = task_pack.request_key_to_result_summary_key(entity.key)
        if not summary_key.get(use_cache=False, use_memcache=False):
            # There's a TaskRequest without TaskResultSummary, delete it.
            doomed_key = entity.key

    if not doomed_key:
        return

    logging.info('Deleting %s: %s', entity.task_id, doomed_key)
    keys_only = ndb.QueryOptions(keys_only=True)
    descendants = 0
    for descendant in ndb.Query(default_options=keys_only,
                                ancestor=doomed_key):
        yield operation.db.Delete(descendant)
        descendants += 1
    yield operation.db.Delete(doomed_key)
    # The count includes the final delete of doomed_key itself.
    logging.info('Deleted %d entities', descendants + 1)
  def test_result_summary_key_to_run_result_key(self):
    """Run result keys exist for tries 1 and 2; 0 and 3 raise ValueError."""
    summary_key = task_pack.request_key_to_result_summary_key(
        task_pack.unpack_request_key('11'))

    for try_number in (1, 2):
      actual = task_pack.result_summary_key_to_run_result_key(
          summary_key, try_number)
      expected = ndb.Key(
          'TaskRequest', 0x7fffffffffffffee, 'TaskResultSummary', 1,
          'TaskRunResult', try_number)
      self.assertEqual(expected, actual)

    for rejected_try in (0, 3):
      with self.assertRaises(ValueError):
        task_pack.result_summary_key_to_run_result_key(
            summary_key, rejected_try)
  def test_result_summary_key_to_run_result_key(self):
    """Run result keys exist for tries 1 and 2; 0 and 3 are rejected.

    Try number 0 raises ValueError while 3 raises NotImplementedError.
    """
    summary_key = task_pack.request_key_to_result_summary_key(
        task_pack.unpack_request_key('11'))

    for try_number in (1, 2):
      actual = task_pack.result_summary_key_to_run_result_key(
          summary_key, try_number)
      expected = ndb.Key(
          'TaskRequest', 0x7fffffffffffffee, 'TaskResultSummary', 1,
          'TaskRunResult', try_number)
      self.assertEqual(expected, actual)

    rejected = ((0, ValueError), (3, NotImplementedError))
    for try_number, error_cls in rejected:
      with self.assertRaises(error_cls):
        task_pack.result_summary_key_to_run_result_key(summary_key, try_number)
Exemple #38
0
def _reap_task(bot_dimensions, bot_version, to_run_key, request):
  """Reaps a task and insert the results entity.

  Arguments:
    bot_dimensions: dimensions of the bot reaping the task; the bot id is read
        from the first item of its u'id' entry.
    bot_version: forwarded to task_result.new_run_result().
    to_run_key: key of the TaskToRun to reap; it must belong to request.
    request: the TaskRequest the TaskToRun was created for.

  Returns:
    (TaskRunResult, SecretBytes) if successful, (None, None) otherwise.
  """
  assert request.key == task_to_run.task_to_run_key_to_request_key(to_run_key)
  result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
  bot_id = bot_dimensions[u'id'][0]

  now = utils.utcnow()
  # Log before the task id in case the function fails in a bad state where the
  # DB TX ran but the reply never comes to the bot. This is the worst case as
  # this leads to a task that results in BOT_DIED without ever starting. This
  # case is specifically handled in cron_handle_bot_died().
  logging.info(
      '_reap_task(%s)', task_pack.pack_result_summary_key(result_summary_key))

  def run():
    # 3 GET, 1 PUT at the end.
    to_run_future = to_run_key.get_async()
    result_summary_future = result_summary_key.get_async()
    to_run = to_run_future.get_result()
    if not to_run:
      # This check must happen before to_run is dereferenced below, otherwise
      # a missing entity raises AttributeError instead of being reported. The
      # pending GET is still resolved before leaving the transaction.
      result_summary = result_summary_future.get_result()
      logging.error('Missing TaskToRun?\n%s', result_summary.task_id)
      return None, None
    t = request.task_slice(to_run.task_slice_index)
    if t.properties.has_secret_bytes:
      # Started before waiting on the summary so both GETs overlap.
      secret_bytes_future = request.secret_bytes_key.get_async()
    result_summary = result_summary_future.get_result()
    orig_summary_state = result_summary.state
    secret_bytes = None
    if t.properties.has_secret_bytes:
      secret_bytes = secret_bytes_future.get_result()
    if not to_run.is_reapable:
      logging.info('%s is not reapable', result_summary.task_id)
      return None, None
    if result_summary.bot_id == bot_id:
      # This means two things, first it's a retry, second it's that the first
      # try failed and the retry is being reaped by the same bot. Deny that, as
      # the bot may be deeply broken and could be in a killing spree.
      # TODO(maruel): Allow retry for bot locked task using 'id' dimension.
      logging.warning(
          '%s can\'t retry its own internal failure task',
          result_summary.task_id)
      return None, None
    to_run.queue_number = None
    run_result = task_result.new_run_result(
        request, to_run, bot_id, bot_version, bot_dimensions)
    # Upon bot reap, both .started_ts and .modified_ts matches. They differ on
    # the first ping.
    run_result.started_ts = now
    run_result.modified_ts = now
    result_summary.set_from_run_result(run_result, request)
    ndb.put_multi([to_run, run_result, result_summary])
    if result_summary.state != orig_summary_state:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    return run_result, secret_bytes

  # Add it to the negative cache *before* running the transaction. This will
  # inhibit concurrently readers to try to reap this task. The downside is if
  # this request fails in the middle of the transaction, the task may stay
  # unreapable for up to 15 seconds.
  if not task_to_run.set_lookup_cache(to_run_key, False):
    logging.debug('hit negative cache')
    return None, None

  try:
    run_result, secret_bytes = datastore_utils.transaction(run, retries=0)
  except datastore_utils.CommitError:
    # The challenge here is that the transaction may have failed because:
    # - The DB had an hickup and the TaskToRun, TaskRunResult and
    #   TaskResultSummary haven't been updated.
    # - The entities had been updated by a concurrent transaction on another
    #   handler so it was not reapable anyway. This does cause exceptions as
    #   both GET returns the TaskToRun.queue_number != None but only one succeed
    #   at the PUT.
    #
    # In the first case, we may want to reset the negative cache, while we don't
    # want to in the later case. The trade off are one of:
    # - negative cache is incorrectly set, so the task is not reapable for 15s
    # - resetting the negative cache would cause even more contention
    #
    # We chose the first one here for now, as the when the DB starts misbehaving
    # and the index becomes stale, it means the DB is *already* not in good
    # shape, so it is preferable to not put more stress on it, and skipping a
    # few tasks for 15s may even actively help the DB to stabilize.
    logging.info('CommitError; reaping failed')
    # The bot will reap the next available task in case of failure, no big deal.
    run_result = None
    secret_bytes = None
  return run_result, secret_bytes
Exemple #39
0
def _expire_task(to_run_key, request):
  """Expires a TaskResultSummary and unschedules the TaskToRun.

  This function is only meant to process PENDING tasks.

  If a follow up TaskSlice is available, reenqueue a new TaskToRun instead of
  expiring the TaskResultSummary.

  Arguments:
    to_run_key: key of the TaskToRun to unschedule.
    request: the TaskRequest the TaskToRun belongs to.

  Returns:
    tuple(TaskResultSummary, TaskToRun):
    - (summary, None) when the task was expired;
    - (summary, new TaskToRun) when it was reenqueued on a following
      TaskSlice;
    - (None, None) when the TaskToRun is missing or not reapable anymore, or
      when the transaction failed with CommitError.
  """
  # Look if the TaskToRun is reapable once before doing the check inside the
  # transaction. This reduces the likelihood of failing this check inside the
  # transaction, which is an order of magnitude more costly.
  # A missing entity is handled like a non reapable one, exactly as the check
  # inside the transaction does, instead of raising AttributeError.
  current = to_run_key.get()
  if not current or not current.is_reapable:
    logging.info('Not reapable anymore')
    return None, None

  result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
  now = utils.utcnow()

  def run():
    # 2 concurrent GET, one PUT. Optionally with an additional serialized GET.
    to_run_future = to_run_key.get_async()
    result_summary_future = result_summary_key.get_async()
    to_run = to_run_future.get_result()
    if not to_run or not to_run.is_reapable:
      result_summary_future.get_result()
      return None, None

    # In any case, dequeue the TaskToRun.
    to_run.queue_number = None
    result_summary = result_summary_future.get_result()
    to_put = [to_run, result_summary]
    # Check if there's a TaskSlice fallback that could be reenqueued.
    new_to_run = None
    index = result_summary.current_task_slice+1
    while index < request.num_task_slices:
      dimensions = request.task_slice(index).properties.dimensions
      if _has_capacity(dimensions):
        # Enqueue a new TasktoRun for this next TaskSlice, it has capacity!
        new_to_run = task_to_run.new_task_to_run(request, 1, index)
        result_summary.current_task_slice = index
        to_put.append(new_to_run)
        break
      index += 1

    if not new_to_run:
      # There's no fallback, giving up.
      if result_summary.try_number:
        # It's a retry that is being expired, i.e. the first try had BOT_DIED.
        # Keep the old state. That requires an additional pipelined GET but that
        # shouldn't be the common case.
        run_result = result_summary.run_result_key.get()
        result_summary.set_from_run_result(run_result, request)
      else:
        result_summary.state = task_result.State.EXPIRED
      result_summary.abandoned_ts = now
    result_summary.modified_ts = now

    # Start the writes, enqueue the notification while they are in flight,
    # then make sure every write succeeded.
    futures = ndb.put_multi_async(to_put)
    _maybe_pubsub_notify_via_tq(result_summary, request)
    for f in futures:
      f.check_success()

    return result_summary, new_to_run

  # Add it to the negative cache *before* running the transaction. Either way
  # the task was already reaped or the task is correctly expired and not
  # reapable.
  task_to_run.set_lookup_cache(to_run_key, False)

  # It'll be caught by next cron job execution in case of failure.
  try:
    res, r = datastore_utils.transaction(run)
  except datastore_utils.CommitError:
    res = None
    r = None
  if res:
    # NOTE(review): this also runs when the task was reenqueued on a following
    # TaskSlice (r is then set) -- confirm that is intended.
    logging.info(
        'Expired %s', task_pack.pack_result_summary_key(result_summary_key))
    ts_mon_metrics.on_task_completed(res)
  return res, r
Exemple #40
0
 def test_request_key_to_result_summary_key(self):
     """The summary key is the 'TaskResultSummary' child 1 of the request."""
     expected = ndb.Key(
         'TaskRequest', 0x7fffffffffffffee, 'TaskResultSummary', 1)
     actual = task_pack.request_key_to_result_summary_key(
         task_pack.unpack_request_key('11'))
     self.assertEqual(expected, actual)
 def test_result_summary_key_to_request_key(self):
   """Converting to a summary key and back returns the request key."""
   original_key = task_pack.unpack_request_key('11')
   summary_key = task_pack.request_key_to_result_summary_key(original_key)
   round_tripped = task_pack.result_summary_key_to_request_key(summary_key)
   self.assertEqual(original_key, round_tripped)
Exemple #42
0
 def task_id(self):
   """Returns the TaskResultSummary packed id, not the task request key."""
   summary_key = task_pack.request_key_to_result_summary_key(self.key)
   return task_pack.pack_result_summary_key(summary_key)