Example #1
0
def _reap_task(to_run_key, request, bot_id, bot_version, bot_dimensions):
    """Claims a pending task for a bot and stores the resulting entities.

    The TaskToRun and the TaskResultSummary are loaded inside a transaction
    that is attempted once (retries=0). When the task can be handed to the
    bot, a new TaskRunResult is created and all three entities are written
    together.

    Returns:
      TaskRunResult if successful, None otherwise.
    """
    assert bot_id, bot_id
    assert request.key == task_to_run.task_to_run_key_to_request_key(
        to_run_key)
    summary_key = task_pack.request_key_to_result_summary_key(request.key)

    now = utils.utcnow()

    def run():
        # Both GETs are started before either is awaited; a single PUT closes
        # the transaction.
        pending_entry = to_run_key.get_async()
        pending_summary = summary_key.get_async()
        entry = pending_entry.get_result()
        summary = pending_summary.get_result()

        if not entry:
            logging.error('Missing TaskToRun?\n%s', summary.task_id)
            return None

        if not entry.is_reapable:
            logging.info('%s is not reapable', summary.task_id)
            return None

        if summary.bot_id == bot_id:
            # Same bot as the one recorded on the summary: this is a retry of a
            # try that failed on this very bot. Refuse it, since the bot may be
            # deeply broken and could be on a killing spree.
            logging.warning('%s can\'t retry its own internal failure task',
                            summary.task_id)
            return None

        # Clearing the queue number is what is persisted on the TaskToRun.
        entry.queue_number = None
        next_try = (summary.try_number or 0) + 1
        new_result = task_result.new_run_result(
            request, next_try, bot_id, bot_version, bot_dimensions)
        new_result.modified_ts = now
        summary.set_from_run_result(new_result, request)
        ndb.put_multi([entry, new_result, summary])
        return new_result

    try:
        reaped = datastore_utils.transaction(run, retries=0)
    except datastore_utils.CommitError:
        # The bot will reap the next available task in case of failure, no big
        # deal.
        return None
    if reaped:
        task_to_run.set_lookup_cache(to_run_key, False)
    return reaped
Example #2
0
def _reap_task(to_run_key, request, bot_id, bot_version, bot_dimensions):
    """Hands a pending task to a bot and records the new run result.

    The work happens in a transaction that is attempted once (retries=0).
    The TaskToRun is checked first; the TaskResultSummary is only read once
    the TaskToRun is known to be reapable.

    Returns:
      TaskRunResult if successful, None otherwise.
    """
    assert bot_id, bot_id
    assert request.key == task_to_run.task_to_run_key_to_request_key(to_run_key)
    summary_key = task_pack.request_key_to_result_summary_key(request.key)

    now = utils.utcnow()

    def run():
        # Two GETs issued up front, one PUT at the end.
        entry_future = to_run_key.get_async()
        summary_future = summary_key.get_async()

        entry = entry_future.get_result()
        if not (entry and entry.is_reapable):
            # Nothing to reap; still wait for the outstanding summary lookup
            # before leaving.
            summary_future.wait()
            return None

        summary = summary_future.get_result()
        if summary.bot_id == bot_id:
            # The summary already names this bot, so this is a retry of a try
            # that failed on it. Refuse: the bot may be deeply broken and
            # could be on a killing spree.
            return None

        entry.queue_number = None
        try_number = (summary.try_number or 0) + 1
        new_result = task_result.new_run_result(
            request, try_number, bot_id, bot_version, bot_dimensions
        )
        new_result.modified_ts = now
        summary.set_from_run_result(new_result, request)
        ndb.put_multi([entry, new_result, summary])
        return new_result

    try:
        reaped = datastore_utils.transaction(run, retries=0)
    except datastore_utils.CommitError:
        # The bot will reap the next available task in case of failure, no big
        # deal.
        return None
    if reaped:
        task_to_run.set_lookup_cache(to_run_key, False)
    return reaped
 def test_task_to_run_key_to_request_key(self):
   """Checks that a TaskToRun key maps back to the key of its request."""
   req = task_request.make_request(_gen_request(), True)
   to_run_key = task_to_run.request_to_task_to_run_key(req)
   round_tripped = task_to_run.task_to_run_key_to_request_key(to_run_key)
   self.assertEqual(req.key, round_tripped)
Example #4
0
 def test_task_to_run_key_to_request_key(self):
   """Checks that a TaskToRun key maps back to the key of its request."""
   req = self.mkreq(1, _gen_request())
   to_run_key = task_to_run.request_to_task_to_run_key(req, 1, 0)
   round_tripped = task_to_run.task_to_run_key_to_request_key(to_run_key)
   self.assertEqual(req.key, round_tripped)
Example #5
0
 def test_task_to_run_key_to_request_key(self):
     """Checks that a TaskToRun key maps back to the key of its request."""
     req = task_request.make_request(_gen_request_data())
     to_run_key = task_to_run.request_to_task_to_run_key(req)
     round_tripped = task_to_run.task_to_run_key_to_request_key(to_run_key)
     self.assertEqual(req.key, round_tripped)
Example #6
0
def _reap_task(bot_dimensions, bot_version, to_run_key, request):
  """Reaps a task and insert the results entity.

  The TaskToRun is first added to the negative lookup cache, then a single
  transaction (retries=0) clears its queue number, creates the TaskRunResult
  and updates the TaskResultSummary.

  Args:
    bot_dimensions: bot dimensions; bot_dimensions[u'id'][0] is used as the bot
        id.
    bot_version: forwarded to task_result.new_run_result().
    to_run_key: key of the TaskToRun to reap; it must map back to request.key.
    request: the request the TaskToRun belongs to.

  Returns:
    (TaskRunResult, SecretBytes) if successful, (None, None) otherwise.
  """
  assert request.key == task_to_run.task_to_run_key_to_request_key(to_run_key)
  result_summary_key = task_pack.request_key_to_result_summary_key(request.key)
  bot_id = bot_dimensions[u'id'][0]

  now = utils.utcnow()
  # Log before the task id in case the function fails in a bad state where the
  # DB TX ran but the reply never comes to the bot. This is the worst case as
  # this leads to a task that results in BOT_DIED without ever starting. This
  # case is specifically handled in cron_handle_bot_died().
  logging.info(
      '_reap_task(%s)', task_pack.pack_result_summary_key(result_summary_key))

  def run():
    # 3 GET, 1 PUT at the end.
    to_run_future = to_run_key.get_async()
    result_summary_future = result_summary_key.get_async()
    to_run = to_run_future.get_result()
    # Only inspect the task slice when the TaskToRun exists. Dereferencing a
    # missing entity here would raise before the 'Missing TaskToRun?' check
    # below gets a chance to run.
    secret_bytes_future = None
    if to_run:
      t = request.task_slice(to_run.task_slice_index)
      if t.properties.has_secret_bytes:
        # Started before waiting on the summary so both lookups overlap.
        secret_bytes_future = request.secret_bytes_key.get_async()
    result_summary = result_summary_future.get_result()
    orig_summary_state = result_summary.state
    secret_bytes = None
    if secret_bytes_future is not None:
      secret_bytes = secret_bytes_future.get_result()
    if not to_run:
      logging.error('Missing TaskToRun?\n%s', result_summary.task_id)
      return None, None
    if not to_run.is_reapable:
      logging.info('%s is not reapable', result_summary.task_id)
      return None, None
    if result_summary.bot_id == bot_id:
      # This means two things, first it's a retry, second it's that the first
      # try failed and the retry is being reaped by the same bot. Deny that, as
      # the bot may be deeply broken and could be in a killing spree.
      # TODO(maruel): Allow retry for bot locked task using 'id' dimension.
      logging.warning(
          '%s can\'t retry its own internal failure task',
          result_summary.task_id)
      return None, None
    to_run.queue_number = None
    run_result = task_result.new_run_result(
        request, to_run, bot_id, bot_version, bot_dimensions)
    # Upon bot reap, both .started_ts and .modified_ts matches. They differ on
    # the first ping.
    run_result.started_ts = now
    run_result.modified_ts = now
    result_summary.set_from_run_result(run_result, request)
    ndb.put_multi([to_run, run_result, result_summary])
    # Only notify when the summary state actually changed.
    if result_summary.state != orig_summary_state:
      _maybe_pubsub_notify_via_tq(result_summary, request)
    return run_result, secret_bytes

  # Add it to the negative cache *before* running the transaction. This will
  # inhibit concurrently readers to try to reap this task. The downside is if
  # this request fails in the middle of the transaction, the task may stay
  # unreapable for up to 15 seconds.
  if not task_to_run.set_lookup_cache(to_run_key, False):
    logging.debug('hit negative cache')
    return None, None

  try:
    run_result, secret_bytes = datastore_utils.transaction(run, retries=0)
  except datastore_utils.CommitError:
    # The challenge here is that the transaction may have failed because:
    # - The DB had an hickup and the TaskToRun, TaskRunResult and
    #   TaskResultSummary haven't been updated.
    # - The entities had been updated by a concurrent transaction on another
    #   handler so it was not reapable anyway. This does cause exceptions as
    #   both GET returns the TaskToRun.queue_number != None but only one succeed
    #   at the PUT.
    #
    # In the first case, we may want to reset the negative cache, while we don't
    # want to in the later case. The trade off are one of:
    # - negative cache is incorrectly set, so the task is not reapable for 15s
    # - resetting the negative cache would cause even more contention
    #
    # We chose the first one here for now, as the when the DB starts misbehaving
    # and the index becomes stale, it means the DB is *already* not in good
    # shape, so it is preferable to not put more stress on it, and skipping a
    # few tasks for 15s may even actively help the DB to stabilize.
    logging.info('CommitError; reaping failed')
    # The bot will reap the next available task in case of failure, no big deal.
    run_result = None
    secret_bytes = None
  return run_result, secret_bytes