Beispiel #1
0
def recreate_virtual_device():
  """Recreate the virtual device backing this bot from its base image.

  Retries up to FAIL_RETRIES times, waiting a random interval between
  attempts.

  Returns:
    True if the instance was recreated and a remote connection was
    re-established; False after exhausting all retries.
  """
  device_name = environment.get_value('BOT_NAME')
  failure_wait_interval = environment.get_value('FAIL_WAIT')
  project = environment.get_value('GCE_PROJECT')
  retry_limit = environment.get_value('FAIL_RETRIES')
  zone = environment.get_value('GCE_ZONE')

  # This is needed to populate the initial /data partition. We use a separate
  # disk for /data since the one in the provided images is only 2GB.
  preimage_metadata_value = environment.get_value('GCE_DATA_PREIMAGE_METADATA')
  if preimage_metadata_value:
    additional_metadata = {GCE_PREIMAGE_METADATA_KEY: preimage_metadata_value}
  else:
    additional_metadata = None

  # range, not the Python 2-only xrange, for consistency with the rest of
  # this codebase and Python 3 compatibility.
  for _ in range(retry_limit):
    if compute_engine.recreate_instance_with_disks(
        device_name,
        project,
        zone,
        additional_metadata=additional_metadata,
        wait_for_completion=True):
      # Instance recreation succeeded. Try reconnecting after some wait.
      time.sleep(REMOTE_RECREATE_TIMEOUT)

      if connect_remote(reconnect=True, num_retries=REMOTE_CONNECT_RETRIES * 2):
        # We were able to successfully reconnect to device after recreation.
        return True

    # Back off for a random interval before the next attempt.
    time.sleep(utils.random_number(1, failure_wait_interval))

  logs.log_error('Failed to reimage device.')
  return False
Beispiel #2
0
def task_loop():
    """Executes tasks indefinitely.

    Loops forever polling for tasks and executing them; terminates only after
    an iteration in which an exception occurred (other than
    commands.AlreadyRunningError, which is treated as benign).

    Returns:
        A (stacktrace, clean_exit, task_payload) tuple describing the
        terminating iteration. task_payload is None if no task was held.
    """
    # Defer heavy task imports to prevent issues with multiprocessing.Process
    from bot.tasks import commands

    clean_exit = False
    while True:
        stacktrace = ''
        exception_occurred = False
        task = None
        # This caches the current environment on first run. Don't move this.
        environment.reset_environment()
        try:
            # Run regular updates.
            update_task.run()
            update_task.track_revision()

            task = tasks.get_task()
            if not task:
                # Nothing queued; poll again.
                continue

            with _Monitor(task):
                with task.lease():
                    # Execute the command and delete the task.
                    commands.process_command(task)
        except SystemExit as e:
            exception_occurred = True
            clean_exit = (e.code == 0)
            # NOTE(review): the isinstance check implies untrusted.HostException
            # subclasses SystemExit — confirm against the untrusted module.
            if not clean_exit and not isinstance(e, untrusted.HostException):
                logs.log_error('SystemExit occurred while working on task.')

            stacktrace = traceback.format_exc()
        except commands.AlreadyRunningError:
            # Another instance is already running this task; not an error, so
            # keep looping rather than breaking out below.
            exception_occurred = False
        except Exception:
            logs.log_error('Error occurred while working on task.')
            exception_occurred = True
            stacktrace = traceback.format_exc()

        if exception_occurred:
            # Prevent looping too quickly. See: crbug.com/644830
            failure_wait_interval = environment.get_value('FAIL_WAIT')
            time.sleep(utils.random_number(1, failure_wait_interval))
            break

    task_payload = task.payload() if task else None
    return stacktrace, clean_exit, task_payload
Beispiel #3
0
def needs_update(revision_file, revision):
    """Check a revision file against the provided revision to see if an
    update is required.

    Args:
        revision_file: Path to the file holding the currently-deployed
            revision.
        revision: Candidate revision to compare against.

    Returns:
        True if an update is needed (file missing after retries, or numeric
        candidate is newer, or non-numeric revisions differ); False if the
        deployed revision is current, or if the file exists but could not be
        read despite several retries.
    """
    failure_wait_interval = environment.get_value('FAIL_WAIT')
    file_exists = False
    retry_limit = environment.get_value('FAIL_RETRIES')

    for _ in range(retry_limit):
        # NFS can sometimes return a wrong result on file existence, so redo
        # this check a couple of times to be sure.
        if not os.path.exists(revision_file):
            file_exists = False
            time.sleep(15)
            continue

        # Found the file, now try to read its contents.
        file_exists = True

        try:
            # Context manager guarantees the handle is closed even if read()
            # raises. Catch Exception rather than using a bare except, which
            # would also swallow KeyboardInterrupt/SystemExit.
            with open(revision_file, 'r') as file_handle:
                current_revision = file_handle.read()
        except Exception:
            logs.log_error('Error occurred while reading revision file %s.' %
                           revision_file)
            time.sleep(utils.random_number(1, failure_wait_interval))
            continue

        # Numeric revisions compare as integers: update only if strictly
        # newer. Non-numeric revisions (e.g. hashes) update on any change.
        if current_revision.isdigit():
            return int(revision) > int(current_revision)

        return str(revision) != str(current_revision)

    # If there is no revision file or if we have lost track of its revision,
    # then we do need to update the data bundle.
    if not file_exists:
        return True

    # An error has occurred and we have failed to read revision file
    # despite several retries. So, don't bother updating the data
    # bundle as it will probably fail as well.
    logs.log_error('Failed to read revision file, exiting.')
    return False
Beispiel #4
0
def update_task_status(task_name, status, expiry_interval=None):
  """Updates status for a task. Used to ensure that a single instance of a task
  is running at any given time.

  Args:
    task_name: Name of the task whose status entity to update.
    status: New status value (a data_types.TaskState member).
    expiry_interval: Lease expiry in seconds. Defaults to the
        TASK_LEASE_SECONDS environment value.

  Returns:
    True if the status was updated; False if another bot already holds an
    unexpired STARTED lease on this task.

  Raises:
    ValueError: If no expiry interval is given and TASK_LEASE_SECONDS is
        unset.
  """
  bot_name = environment.get_value('BOT_NAME')
  failure_wait_interval = environment.get_value('FAIL_WAIT')

  # If we didn't get an expiry interval, default to our task lease interval.
  if expiry_interval is None:
    expiry_interval = environment.get_value('TASK_LEASE_SECONDS')
    if expiry_interval is None:
      # Fail fast. Previously this only logged; `expiry_interval - 1` then
      # raised TypeError inside the transaction, which the broad retry loop
      # below caught and retried forever.
      logs.log_error('expiry_interval is None and TASK_LEASE_SECONDS not set.')
      raise ValueError(
          'expiry_interval is None and TASK_LEASE_SECONDS not set.')

  def _try_update_status():
    """Try update metadata."""
    task_status = get_task_status(task_name, create_if_needed=True)

    # If another bot is already working on this task, bail out with error.
    if (status == data_types.TaskState.STARTED and
        task_status.status == data_types.TaskState.STARTED and
        not dates.time_has_expired(
            task_status.time, seconds=expiry_interval - 1)):
      return False

    task_status.bot_name = bot_name
    task_status.status = status
    task_status.time = utils.utcnow()
    task_status.put()
    return True

  # It is important that we do not continue until the metadata is updated.
  # This can lead to task loss, or can cause issues with multiple bots
  # attempting to run the task at the same time.
  while True:
    try:
      return ndb.transaction(_try_update_status, retries=0)
    except Exception:
      # We need to update the status under all circumstances.
      # Failing to update 'completed' status causes another bot
      # that picked up this job to bail out.
      logs.log_error('Unable to update %s task metadata. Retrying.' % task_name)
      time.sleep(utils.random_number(1, failure_wait_interval))
Beispiel #5
0
def process_command(task):
    """Figures out what to do with the given task and executes the command.

    Validates the task's job and platform configuration, applies the job's
    environment, and finally dispatches to run_command(). May re-queue the
    task and return early when the bot's queue/platform does not match.

    Args:
        task: A task object exposing command, argument, job, payload(),
            high_end and is_command_override (project type).

    Raises:
        errors.BadStateError: If the job exists but has no platform set.
    """
    logs.log("Executing command '%s'" % task.payload())
    if not task.payload().strip():
        logs.log_error('Empty task received.')
        return

    # Parse task payload.
    task_name = task.command
    task_argument = task.argument
    job_name = task.job

    # Expose the task details to the rest of the bot via the environment.
    environment.set_value('TASK_NAME', task_name)
    environment.set_value('TASK_ARGUMENT', task_argument)
    environment.set_value('JOB_NAME', job_name)
    if job_name != 'none':
        job = data_types.Job.query(data_types.Job.name == job_name).get()
        # Job might be removed. In that case, we don't want an exception
        # raised and causing this task to be retried by another bot.
        if not job:
            logs.log_error("Job '%s' not found." % job_name)
            return

        if not job.platform:
            error_string = "No platform set for job '%s'" % job_name
            logs.log_error(error_string)
            raise errors.BadStateError(error_string)

        # A misconfiguration led to this point. Clean up the job if necessary.
        job_queue_suffix = tasks.queue_suffix_for_platform(job.platform)
        bot_queue_suffix = tasks.default_queue_suffix()

        if job_queue_suffix != bot_queue_suffix:
            # This happens rarely, store this as a hard exception.
            logs.log_error(
                'Wrong platform for job %s: job queue [%s], bot queue [%s].' %
                (job_name, job_queue_suffix, bot_queue_suffix))

            # Try to recreate the job in the correct task queue.
            new_queue = (tasks.high_end_queue()
                         if task.high_end else tasks.regular_queue())
            new_queue += job_queue_suffix

            # Command override is continuously run by a bot. If we keep failing
            # and recreating the task, it will just DoS the entire task queue.
            # So, we don't create any new tasks in that case since it needs
            # manual intervention to fix the override anyway.
            if not task.is_command_override:
                try:
                    tasks.add_task(task_name, task_argument, job_name,
                                   new_queue)
                except Exception:
                    # This can happen on trying to publish on a non-existent topic, e.g.
                    # a topic for a high-end bot on another platform. In this case, just
                    # give up.
                    logs.log_error('Failed to fix platform and re-add task.')

            # Add a wait interval to avoid overflowing task creation.
            failure_wait_interval = environment.get_value('FAIL_WAIT')
            time.sleep(failure_wait_interval)
            return

        if task_name != 'fuzz':
            # Make sure that our platform id matches that of the testcase (for
            # non-fuzz tasks).
            testcase = data_handler.get_entity_by_type_and_id(
                data_types.Testcase, task_argument)
            if testcase:
                current_platform_id = environment.get_platform_id()
                testcase_platform_id = testcase.platform_id

                # This indicates we are trying to run this job on the wrong platform.
                # This can happen when you have different type of devices (e.g
                # android) on the same platform group. In this case, we just recreate
                # the task.
                if (task_name != 'variant'
                        and testcase_platform_id and not utils.fields_match(
                            testcase_platform_id, current_platform_id)):
                    logs.log(
                        'Testcase %d platform (%s) does not match with ours (%s), exiting'
                        % (testcase.key.id(), testcase_platform_id,
                           current_platform_id))
                    tasks.add_task(task_name,
                                   task_argument,
                                   job_name,
                                   wait_time=utils.random_number(
                                       1, TASK_RETRY_WAIT_LIMIT))
                    return

        # Some fuzzers contain additional environment variables that should be
        # set for them. Append these for tests generated by these fuzzers and for
        # the fuzz command itself.
        fuzzer_name = None
        # NOTE: `testcase` is only bound in the task_name != 'fuzz' branch
        # above; the elif is safe because it is skipped when task_name is
        # 'fuzz'.
        if task_name == 'fuzz':
            fuzzer_name = task_argument
        elif testcase:
            fuzzer_name = testcase.fuzzer_name

        # Get job's environment string.
        environment_string = job.get_environment_string()

        if task_name == 'minimize':
            # Let jobs specify a different job and fuzzer to minimize with.
            job_environment = job.get_environment()
            minimize_job_override = job_environment.get(
                'MINIMIZE_JOB_OVERRIDE')
            if minimize_job_override:
                minimize_job = data_types.Job.query(
                    data_types.Job.name == minimize_job_override).get()
                if minimize_job:
                    environment.set_value('JOB_NAME', minimize_job_override)
                    environment_string = minimize_job.get_environment_string()
                    environment_string += '\nORIGINAL_JOB_NAME = %s\n' % job_name
                    job_name = minimize_job_override
                else:
                    logs.log_error('Job for minimization not found: %s.' %
                                   minimize_job_override)
                    # Fallback to using own job for minimization.

            minimize_fuzzer_override = job_environment.get(
                'MINIMIZE_FUZZER_OVERRIDE')
            fuzzer_name = minimize_fuzzer_override or fuzzer_name

        if fuzzer_name and not environment.is_engine_fuzzer_job(job_name):
            fuzzer = data_types.Fuzzer.query(
                data_types.Fuzzer.name == fuzzer_name).get()
            additional_default_variables = ''
            additional_variables_for_job = ''
            if (fuzzer and hasattr(fuzzer, 'additional_environment_string')
                    and fuzzer.additional_environment_string):
                for line in fuzzer.additional_environment_string.splitlines():
                    # Job specific values may be defined in fuzzer additional
                    # environment variable name strings in the form
                    # job_name:VAR_NAME = VALUE.
                    if '=' in line and ':' in line.split('=', 1)[0]:
                        fuzzer_job_name, environment_definition = line.split(
                            ':', 1)
                        if fuzzer_job_name == job_name:
                            additional_variables_for_job += '\n%s' % environment_definition
                        continue

                    additional_default_variables += '\n%s' % line

            # Default variables first, then job-specific ones so the latter
            # win on conflicts.
            environment_string += additional_default_variables
            environment_string += additional_variables_for_job

        # Update environment for the job.
        update_environment_for_job(environment_string)

    # Match the cpu architecture with the ones required in the job definition.
    # If they don't match, then bail out and recreate task.
    if not is_supported_cpu_arch_for_job():
        logs.log(
            'Unsupported cpu architecture specified in job definition, exiting.'
        )
        tasks.add_task(task_name,
                       task_argument,
                       job_name,
                       wait_time=utils.random_number(1, TASK_RETRY_WAIT_LIMIT))
        return

    # Initial cleanup.
    cleanup_task_state()

    start_web_server_if_needed()

    try:
        run_command(task_name, task_argument, job_name)
    finally:
        # Final clean up.
        cleanup_task_state()