Example #1
def run_on_client(host, client_results_dir, cleanup_only=False):
    """Run result utils on the given host.

    @param host: Host on which to run the result utils.
    @param client_results_dir: Path to the results directory on the client.
    @param cleanup_only: True to delete all existing directory summary files in
            the given directory.
    @return: True if the command runs on the client without error;
             False if the command fails due to an error in result throttling.
    """
    success = False
    with metrics.SecondsTimer(
            'chromeos/autotest/job/dir_summary_collection_duration',
            fields={'dut_host_name': host.hostname}) as fields:
        try:
            _deploy_result_tools(host)

            if cleanup_only:
                logging.debug('Cleaning up directory summary in %s',
                              client_results_dir)
                cmd = (_CLEANUP_DIR_SUMMARY_CMD %
                       (DEFAULT_AUTOTEST_DIR, client_results_dir))
                host.run(cmd,
                         ignore_status=False,
                         timeout=_CLEANUP_DIR_SUMMARY_TIMEOUT)
            else:
                logging.debug('Getting directory summary for %s',
                              client_results_dir)
                throttle_option = ''
                if ENABLE_RESULT_THROTTLING:
                    try:
                        throttle_option = (_THROTTLE_OPTION_FMT %
                                           host.job.max_result_size_KB)
                    except AttributeError:
                        # In case host job is not set, skip throttling.
                        logging.warn(
                            'host object does not have job attribute, '
                            'skipping result throttling.')
                cmd = (_BUILD_DIR_SUMMARY_CMD %
                       (DEFAULT_AUTOTEST_DIR, client_results_dir,
                        throttle_option))
                host.run(cmd,
                         ignore_status=False,
                         timeout=_BUILD_DIR_SUMMARY_TIMEOUT)
                success = True
            fields['success'] = True
        except error.AutoservRunError:
            action = 'cleanup' if cleanup_only else 'create'
            logging.exception(
                'Non-critical failure: Failed to %s directory summary for '
                '%s.', action, client_results_dir)
            fields['success'] = False

    return success
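All of these examples share one pattern: metrics.SecondsTimer is a context
manager that times its body and reports the elapsed time to the named ts_mon
metric, and the object it yields is a mutable dict of metric fields; whatever
values the dict holds when the block exits are attached to the reported
sample (Examples #7, #9, #10 and #12 below test exactly this). A minimal
sketch of the pattern, with an illustrative metric name and field, assuming
ts_mon global state has already been set up:

def timed_operation(do_work):
    """Time do_work() and record whether it raised."""
    with metrics.SecondsTimer(
            'chromeos/autotest/example/operation_duration',
            fields={'success': False}) as fields:
        do_work()
        # Reached only if do_work() did not raise; the updated field
        # value is emitted along with the elapsed time.
        fields['success'] = True

Setting fields['success'] as the last statement of the block is the idiom
used throughout these examples: an exception skips the assignment, so the
failing sample is recorded with success=False.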
Example #2
    def _push_media(self, CTS_URI):
        """Downloads, caches and pushes media files to the DUT."""
        media = self._install_bundle(CTS_URI['media'])
        base = os.path.splitext(os.path.basename(CTS_URI['media']))[0]
        cts_media = os.path.join(media, base)
        # TODO(ihf): this really should measure throughput in Bytes/s.
        m = 'chromeos/autotest/infra_benchmark/cheets/push_media/duration'
        fields = {'success': False, 'dut_host_name': self._host.hostname}
        with metrics.SecondsTimer(m, fields=fields) as c:
            self._copy_media(cts_media)
            c['success'] = True
        if not self._verify_media(cts_media):
            raise error.TestFail('Error: saw corruption pushing media files.')
Example #3
def main():
    """Main script."""
    options = parse_options()
    log_config = logging_config.LoggingConfig()
    if options.logfile:
        log_config.add_file_handler(file_path=os.path.abspath(options.logfile),
                                    level=logging.DEBUG)

    with ts_mon_config.SetupTsMonGlobalState(service_name='cleanup_tko_db',
                                             indirect=True):
        server = CONFIG.get_config_value('AUTOTEST_WEB',
                                         'global_db_host',
                                         default=CONFIG.get_config_value(
                                             'AUTOTEST_WEB', 'host'))
        user = CONFIG.get_config_value('AUTOTEST_WEB',
                                       'global_db_user',
                                       default=CONFIG.get_config_value(
                                           'AUTOTEST_WEB', 'user'))
        password = CONFIG.get_config_value('AUTOTEST_WEB',
                                           'global_db_password',
                                           default=CONFIG.get_config_value(
                                               'AUTOTEST_WEB', 'password'))
        database = CONFIG.get_config_value('AUTOTEST_WEB',
                                           'global_db_database',
                                           default=CONFIG.get_config_value(
                                               'AUTOTEST_WEB', 'database'))

        logging.info(
            'Starting cleanup of old records in TKO database %s on '
            'server %s.', database, server)

        start_time = time.time()
        try:
            with metrics.SecondsTimer(CLEANUP_METRIC,
                                      fields={'success': False}) as fields:
                utils.run_sql_cmd(server, user, password, CLEANUP_TKO_CMD,
                                  database)
                fields['success'] = True
        except:
            logging.exception('Cleanup failed with exception.')
        finally:
            duration = time.time() - start_time
            logging.info('Cleanup attempt finished in %s seconds.', duration)
Example #4
def main(argv):
  parser = commandline.ArgumentParser(description=__doc__)
  parser.add_argument('swarming_server', action='store',
                      help='Swarming server to send no-op requests to.')
  options = parser.parse_args(argv)

  m_timer = 'chromeos/autotest/swarming_proxy/no_op_durations'
  m_count = 'chromeos/autotest/swarming_proxy/no_op_attempts'
  command = commands.RUN_SUITE_PATH
  fields = {'success': False, 'swarming_server': options.swarming_server}
  with ts_mon_config.SetupTsMonGlobalState('swarm_mon', indirect=True):
    while True:
      with metrics.SecondsTimer(m_timer, fields=fields) as f:
        try:
          with metrics.SuccessCounter(m_count):
            swarming_lib.RunSwarmingCommand([command, '--do_nothing'],
                                            options.swarming_server,
                                            dimensions=[('pool', 'default')],
                                            timeout_secs=120)
          f['success'] = True
        except (cros_build_lib.RunCommandError, timeout_util.TimeoutError):
          pass
      time.sleep(60)
Example #5
    def trigger_refresh(self):
        """Triggers a drone manager refresh.

        @raises DroneManagerError: If a drone has un-executed calls,
            since they would get clobbered when we queue refresh calls.
        """
        self._reset()
        self._drop_old_pidfiles()
        pidfile_paths = [pidfile_id.path
                         for pidfile_id in self._registered_pidfile_info]
        drones = list(self.get_drones())
        for drone in drones:
            calls = drone.get_calls()
            if calls:
                raise DroneManagerError('Drone %s has un-executed calls: %s '
                                        'which might get corrupted through '
                                        'this invocation' %
                                        (drone, [str(call) for call in calls]))
            drone.queue_call('refresh', pidfile_paths)
        logging.info("Invoking drone refresh.")
        with metrics.SecondsTimer(
                'chromeos/autotest/drone_manager/trigger_refresh_duration'):
            self._refresh_task_queue.execute(drones, wait=False)
Example #6
    def run_cmd(self, cmd, expected=None):
        """Runs rpc command and log metrics

        @param cmd: string of rpc command to send
        @param expected: expected result of rpc
        """
        metric_fields = self._metric_fields.copy()
        metric_fields['command'] = cmd
        metric_fields['success'] = True
        metric_fields['failure_reason'] = ''

        with metrics.SecondsTimer(METRIC_RPC_CALL_DURATIONS,
                                  fields=dict(metric_fields),
                                  scale=0.001) as f:

            msg_str = "%s:%s" % (self._hostname, cmd)

            try:
                result = self._afe.run(cmd)
                logging.debug("%s result = %s", msg_str, result)
                if expected is not None and expected != result:
                    _failed(f, msg_str, 'IncorrectResponse')

            except urllib2.HTTPError as e:
                _failed(f, msg_str, 'HTTPError:%d' % e.code)

            except Exception as e:
                _failed(f,
                        msg_str,
                        FAILURE_REASONS.get(type(e), 'Unknown'),
                        err=e)

                if type(e) not in FAILURE_REASONS:
                    raise

            if f['success']:
                logging.info("%s success", msg_str)
Example #7
  def testContextManagerIgnoresInvalidField(self):
    """Test that we ignore fields that are set with no default."""
    with metrics.SecondsTimer('fooname', fields={'foo': 'bar'}) as c:
      c['qux'] = 'qwert'
    self._mockMetric.add.assert_called_with(mock.ANY, fields={'foo': 'bar'})
Example #8
def _main(argv):
    """main method of script.

    Args:
      argv: All command line arguments to pass as list of strings.

    Returns:
      Return code of cbuildbot as an integer.
    """
    options = PreParseArguments(argv)

    branchname = options.branch or 'master'
    root = options.buildroot
    buildroot = os.path.join(root, 'repository')
    depot_tools_path = os.path.join(buildroot, constants.DEPOT_TOOLS_SUBPATH)

    metrics_fields = {
        'branch_name': branchname,
        'build_config': options.build_config_name,
        'tryjob': options.remote_trybot,
    }

    # Does the entire build pass or fail.
    with metrics.Presence(METRIC_ACTIVE, metrics_fields), \
         metrics.SuccessCounter(METRIC_COMPLETED, metrics_fields) as s_fields:

        # Preliminary set, mostly command line parsing.
        with metrics.SuccessCounter(METRIC_INVOKED, metrics_fields):
            if options.enable_buildbot_tags:
                logging.EnableBuildbotMarkers()
            ConfigureGlobalEnvironment()

        # Prepare the buildroot with source for the build.
        with metrics.SuccessCounter(METRIC_PREP, metrics_fields):
            site_config = config_lib.GetConfig()
            manifest_url = site_config.params['MANIFEST_INT_URL']
            repo = repository.RepoRepository(
                manifest_url,
                buildroot,
                branch=branchname,
                git_cache_dir=options.git_cache_dir)
            previous_build_state = GetLastBuildState(root)

            # Clean up the buildroot to a safe state.
            with metrics.SecondsTimer(METRIC_CLEAN, fields=metrics_fields):
                build_state = GetCurrentBuildState(options, branchname)
                CleanBuildRoot(root, repo, metrics_fields, build_state)

            # Get a checkout close enough to the branch that cbuildbot can handle it.
            if options.sync:
                with metrics.SecondsTimer(METRIC_INITIAL,
                                          fields=metrics_fields):
                    InitialCheckout(repo)

            # Ensure depot_tools is bootstrapped so the build can use it.
            with metrics.SecondsTimer(METRIC_DEPOT_TOOLS,
                                      fields=metrics_fields):
                DepotToolsEnsureBootstrap(depot_tools_path)

        # Run cbuildbot inside the full ChromeOS checkout, on the specified branch.
        with metrics.SecondsTimer(METRIC_CBUILDBOT, fields=metrics_fields):
            if previous_build_state.is_valid():
                argv.append('--previous-build-state')
                argv.append(base64.b64encode(previous_build_state.to_json()))

            result = Cbuildbot(buildroot, depot_tools_path, argv)
            s_fields['success'] = (result == 0)

            build_state.status = (constants.BUILDER_STATUS_PASSED
                                  if result == 0
                                  else constants.BUILDER_STATUS_FAILED)
            SetLastBuildState(root, build_state)

            CleanupChroot(buildroot)
            return result
Example #9
  def testContextManagerWithUpdate(self):
    """Tests that timing context manager with a field update emits metric."""
    with metrics.SecondsTimer('fooname', fields={'foo': 'bar'}) as c:
      c['foo'] = 'qux'
    self._mockMetric.add.assert_called_with(mock.ANY, fields={'foo': 'qux'})
Example #10
  def testContextManagerWithoutUpdate(self):
    """Tests that the default value for fields is used when not updated."""
    # pylint: disable=unused-variable
    with metrics.SecondsTimer('fooname', fields={'foo': 'bar'}) as c:
      pass
    self._mockMetric.add.assert_called_with(mock.ANY, fields={'foo': 'bar'})
Example #11
def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp):
    """Run server job with given options.

    @param pid_file_manager: PidFileManager used to monitor the autoserv process
    @param results: Folder to store results.
    @param parser: Parser for the command line arguments.
    @param ssp_url: URL to the server-side package.
    @param use_ssp: Set to True to run with server-side packaging.
    """
    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group if the process is not a process group
    # leader. This allows the autoserv process to keep running after the
    # caller process (drone manager call) exits.
    if os.getpid() != os.getpgid(0):
        os.setsid()

    # Container name is predefined so the container can be destroyed in
    # handle_sigterm.
    job_or_task_id = job_directories.get_job_id_or_task_id(
        parser.options.results)
    container_id = lxc.ContainerId(job_or_task_id, time.time(), os.getpid())

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')

        # Update results folder's file permission. This needs to be done ASAP
        # before the parsing process tries to access the log.
        if use_ssp and results:
            correct_results_folder_permission(results)

        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        if use_ssp:
            logging.debug(
                'Destroy container %s before aborting the autoserv '
                'process.', container_id)
            try:
                bucket = lxc.ContainerBucket()
                container = bucket.get_container(container_id)
                if container:
                    container.destroy()
                else:
                    logging.debug('Container %s is not found.', container_id)
            except:
                # Handle any exception so the autoserv process can be aborted.
                logging.exception('Failed to destroy container %s.',
                                  container_id)
            # Try to correct the result file permission again after the
            # container is destroyed, as the container might have created some
            # new files in the result folder.
            if results:
                correct_results_folder_permission(results)

        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        sys.exc_clear()

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being
    # set, but depending on how you launch your autotest scheduler it may not
    # be set.
    os.environ['USER'] = getpass.getuser()

    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    no_tee = parser.options.no_tee
    execution_tag = parser.options.execution_tag
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options
    no_use_packaging = parser.options.no_use_packaging
    in_lab = bool(parser.options.lab)

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error(
            "Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo
                       or provision or reset)
    use_client_trampoline = False
    if parser.options.control_name:
        if use_ssp:
            # When use_ssp is True, autoserv will be re-executed inside a
            # container, preserving the --control-name argument. The control
            # file will be staged inside the re-executed autoserv.
            control = None
        else:
            try:
                control = _stage_control_file(parser.options.control_name,
                                              results)
            except error.AutoservError as e:
                logging.info("Using client trampoline because of: %s", e)
                control = parser.options.control_name
                use_client_trampoline = True

    elif parser.args:
        control = parser.args[0]
    else:
        if not is_special_task:
            parser.parser.error("Missing argument: control file")
        control = None

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    machines = _get_machines(parser)
    if group_name and len(machines) < 2:
        parser.parser.error('-G %r may only be supplied with more than one '
                            'machine.' % group_name)

    job_kwargs = {
        'control': control,
        'args': parser.args[1:],
        'resultdir': results,
        'label': label,
        'user': user,
        'machines': machines,
        'machine_dict_list': server_job.get_machine_dicts(
            machine_names=machines,
            store_dir=os.path.join(results, parser.options.host_info_subdir),
            in_lab=in_lab,
            use_shadow_store=not parser.options.local_only_host_info,
            host_attributes=parser.options.host_attributes,
        ),
        'client': client,
        'ssh_user': ssh_user,
        'ssh_port': ssh_port,
        'ssh_pass': ssh_pass,
        'ssh_verbosity_flag': ssh_verbosity_flag,
        'ssh_options': ssh_options,
        'group_name': group_name,
        'tag': execution_tag,
        'disable_sysinfo': parser.options.disable_sysinfo,
        'in_lab': in_lab,
        'use_client_trampoline': use_client_trampoline,
    }
    if parser.options.parent_job_id:
        job_kwargs['parent_job_id'] = int(parser.options.parent_job_id)
    if control_filename:
        job_kwargs['control_filename'] = control_filename
    job = server_job.server_job(**job_kwargs)

    job.logging.start_logging()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    auto_start_servod = _CONFIG.get_config_value('AUTOSERV',
                                                 'auto_start_servod',
                                                 type=bool,
                                                 default=False)

    site_utils.SetupTsMonGlobalState('autoserv',
                                     indirect=False,
                                     short_lived=True)
    try:
        try:
            if repair:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                job.repair(job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                if use_ssp:
                    try:
                        _run_with_ssp(job, container_id, job_or_task_id,
                                      results, parser, ssp_url, machines)
                    finally:
                        # Update the ownership of files in result folder.
                        correct_results_folder_permission(results)
                else:
                    if collect_crashinfo:
                        # Update the ownership of files in result folder. If the
                        # job to collect crashinfo was running inside container
                        # (SSP) and crashed before correcting folder permission,
                        # the result folder might have wrong permission setting.
                        try:
                            correct_results_folder_permission(results)
                        except:
                            # Ignore any error as the user may not have root
                            # permission to run sudo command.
                            pass
                    metric_name = ('chromeos/autotest/experimental/'
                                   'autoserv_job_run_duration')
                    f = {
                        'in_container': utils.is_in_container(),
                        'success': False
                    }
                    with metrics.SecondsTimer(metric_name, fields=f) as c:
                        job.run(verify_job_repo_url=verify_job_repo_url,
                                only_collect_crashinfo=collect_crashinfo,
                                skip_crash_collection=skip_crash_collection,
                                job_labels=job_labels,
                                use_packaging=(not no_use_packaging))
                        c['success'] = True

        finally:
            job.close()
            # Special task doesn't run parse, so result summary needs to be
            # built here.
            if results and (repair or verify or reset or cleanup or provision):
                # Throttle the result on the server side.
                try:
                    result_utils.execute(
                        results, control_data.DEFAULT_MAX_RESULT_SIZE_KB)
                except:
                    logging.exception(
                        'Non-critical failure: Failed to throttle results '
                        'in directory %s.', results)
                # Build result view and report metrics for result sizes.
                site_utils.collect_result_sizes(results)
    except:
        exit_code = 1
        traceback.print_exc()
    finally:
        metrics.Flush()

    sys.exit(exit_code)
Example #12
  def testContextManager(self):
    """Test that timing context manager emits a metric."""
    with metrics.SecondsTimer('fooname'):
      pass
    self.assertEqual(metrics.CumulativeSecondsDistribution.call_count, 1)
    self.assertEqual(self._mockMetric.add.call_count, 1)
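The testContextManager* examples (#7, #9, #10 and #12) rely on fixture state
the excerpts do not show: self._mockMetric and a patched
metrics.CumulativeSecondsDistribution. A hypothetical setUp that would make
them runnable; the module path and the patching approach here are
assumptions, not the project's actual fixture:

import unittest

import mock

from chromite.lib import metrics


class SecondsTimerTest(unittest.TestCase):
  """Tests for the metrics.SecondsTimer context manager."""

  def setUp(self):
    # Replace the distribution class so SecondsTimer records into a mock
    # instead of a real ts_mon metric. The patched class's return_value is
    # the instance SecondsTimer constructs, so tests can assert on its
    # add() calls via self._mockMetric.
    patcher = mock.patch.object(metrics, 'CumulativeSecondsDistribution')
    patcher.start()
    self.addCleanup(patcher.stop)
    self._mockMetric = metrics.CumulativeSecondsDistribution.return_value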
Example #13
def runtest(job,
            url,
            tag,
            args,
            dargs,
            local_namespace={},
            global_namespace={},
            before_test_hook=None,
            after_test_hook=None,
            before_iteration_hook=None,
            after_iteration_hook=None):
    local_namespace = local_namespace.copy()
    global_namespace = global_namespace.copy()
    # if this is not a plain test name then download and install the
    # specified test
    if url.endswith('.tar.bz2'):
        (testgroup, testname) = _installtest(job, url)
        bindir = os.path.join(job.testdir, 'download', testgroup, testname)
        importdir = os.path.join(job.testdir, 'download')
        modulename = '%s.%s' % (re.sub('/', '.', testgroup), testname)
        classname = '%s.%s' % (modulename, testname)
        path = testname
    else:
        # If the test is local, it may be under either testdir or site_testdir.
        # Tests in site_testdir override tests defined in testdir
        testname = path = url
        testgroup = ''
        path = re.sub(':', '/', testname)
        modulename = os.path.basename(path)
        classname = '%s.%s' % (modulename, modulename)

        # Try installing the test package
        # The job object may be either a server side job or a client side job.
        # 'install_pkg' method will be present only if it's a client side job.
        if hasattr(job, 'install_pkg'):
            try:
                bindir = os.path.join(job.testdir, testname)
                job.install_pkg(testname, 'test', bindir)
            except error.PackageInstallError:
                # continue as a fall back mechanism and see if the test code
                # already exists on the machine
                pass

        bindir = None
        for dir in [job.testdir, getattr(job, 'site_testdir', None)]:
            if dir is not None and os.path.exists(os.path.join(dir, path)):
                importdir = bindir = os.path.join(dir, path)
        if not bindir:
            raise error.TestError(testname + ': test does not exist')

    subdir = os.path.join(dargs.pop('master_testpath', ""), testname)
    outputdir = os.path.join(job.resultdir, subdir)
    if tag:
        outputdir += '.' + tag

    local_namespace['job'] = job
    local_namespace['bindir'] = bindir
    local_namespace['outputdir'] = outputdir

    sys.path.insert(0, importdir)
    try:
        exec('import %s' % modulename, local_namespace, global_namespace)
        exec("mytest = %s(job, bindir, outputdir)" % classname,
             local_namespace, global_namespace)
    finally:
        sys.path.pop(0)

    pwd = os.getcwd()
    os.chdir(outputdir)

    try:
        mytest = global_namespace['mytest']
        mytest.success = False
        if not job.fast and before_test_hook:
            logging.info('Starting before_hook for %s', mytest.tagged_testname)
            with metrics.SecondsTimer(
                    'chromeos/autotest/job/before_hook_duration'):
                before_test_hook(mytest)
            logging.info('before_hook completed')

        # Use the register_*_iteration_hook methods to register the
        # passed-in hooks.
        if before_iteration_hook:
            mytest.register_before_iteration_hook(before_iteration_hook)
        if after_iteration_hook:
            mytest.register_after_iteration_hook(after_iteration_hook)
        mytest._exec(args, dargs)
        mytest.success = True
    finally:
        os.chdir(pwd)
        if after_test_hook and (not mytest.success or not job.fast):
            logging.info('Starting after_hook for %s', mytest.tagged_testname)
            with metrics.SecondsTimer(
                    'chromeos/autotest/job/after_hook_duration'):
                after_test_hook(mytest)
            logging.info('after_hook completed')

        shutil.rmtree(mytest.tmpdir, ignore_errors=True)
Example #14
    def refresh(self):
        """Refresh all drones."""
        with metrics.SecondsTimer(
                'chromeos/autotest/drone_manager/refresh_duration'):
            self.trigger_refresh()
            self.sync_refresh()
Example #15
def _main(options, argv):
    """main method of script.

    Args:
      options: preparsed options object for the build.
      argv: All command line arguments to pass as list of strings.

    Returns:
      Return code of cbuildbot as an integer.
    """
    branchname = options.branch or 'master'
    root = options.buildroot
    buildroot = os.path.join(root, 'repository')
    workspace = os.path.join(root, 'workspace')
    depot_tools_path = os.path.join(buildroot, constants.DEPOT_TOOLS_SUBPATH)

    # Does the entire build pass or fail.
    with metrics.Presence(METRIC_ACTIVE), \
         metrics.SuccessCounter(METRIC_COMPLETED) as s_fields:

        # Preliminary set, mostly command line parsing.
        with metrics.SuccessCounter(METRIC_INVOKED):
            if options.enable_buildbot_tags:
                logging.EnableBuildbotMarkers()
            ConfigureGlobalEnvironment()

        # Prepare the buildroot with source for the build.
        with metrics.SuccessCounter(METRIC_PREP):
            manifest_url = config_lib.GetSiteParams().MANIFEST_INT_URL
            repo = repository.RepoRepository(
                manifest_url,
                buildroot,
                branch=branchname,
                git_cache_dir=options.git_cache_dir)
            previous_build_state = GetLastBuildState(root)

            # Clean up the buildroot to a safe state.
            with metrics.SecondsTimer(METRIC_CLEAN):
                build_state = GetCurrentBuildState(options, branchname)
                CleanBuildRoot(root, repo, options.cache_dir, build_state)

            # Get a checkout close enough to the branch that cbuildbot can handle it.
            if options.sync:
                with metrics.SecondsTimer(METRIC_INITIAL):
                    InitialCheckout(repo)

        # Run cbuildbot inside the full ChromeOS checkout, on the specified branch.
        with metrics.SecondsTimer(METRIC_CBUILDBOT), \
             metrics.SecondsInstanceTimer(METRIC_CBUILDBOT_INSTANCE):
            if previous_build_state.is_valid():
                argv.append('--previous-build-state')
                argv.append(
                    base64.b64encode(previous_build_state.to_json().encode(
                        'utf-8')).decode('utf-8'))
            argv.extend(['--workspace', workspace])

            if not options.cache_dir_specified:
                argv.extend(['--cache-dir', options.cache_dir])

            result = Cbuildbot(buildroot, depot_tools_path, argv)
            s_fields['success'] = (result == 0)

            build_state.status = (constants.BUILDER_STATUS_PASSED
                                  if result == 0
                                  else constants.BUILDER_STATUS_FAILED)
            SetLastBuildState(root, build_state)

            with metrics.SecondsTimer(METRIC_CHROOT_CLEANUP):
                CleanupChroot(buildroot)

            return result
Example #16
    def run_very_slowly(self,
                        command,
                        timeout=3600,
                        ignore_status=False,
                        stdout_tee=utils.TEE_TO_LOGS,
                        stderr_tee=utils.TEE_TO_LOGS,
                        connect_timeout=30,
                        options='',
                        stdin=None,
                        verbose=True,
                        args=(),
                        ignore_timeout=False,
                        ssh_failure_retry_ok=False):
        """
        Run a command on the remote host.
        This RPC call has an overhead of at least 40ms and up to 400ms on
        servers (crbug.com/734887). Each time a run_very_slowly call is added
        for every job, a server core dies in the lab.
        @see common_lib.hosts.host.run()

        @param timeout: command execution timeout
        @param connect_timeout: ssh connection timeout (in seconds)
        @param options: string with additional ssh command options
        @param verbose: log the commands
        @param ignore_timeout: bool True if SSH command timeouts should be
                ignored.  Will return None on command timeout.
        @param ssh_failure_retry_ok: True if the command may be retried on
                probable ssh failure (error 255 or timeout).  When true,
                the command may be executed up to three times, the second
                time after restarting the ssh master connection.  Use only for
                commands that are idempotent, because when a "probable
                ssh failure" occurs, we cannot tell if the command executed
                or not.

        @raises AutoservRunError: if the command failed
        @raises AutoservSSHTimeout: ssh connection has timed out
        """
        with metrics.SecondsTimer('chromeos/autotest/ssh/master_ssh_time',
                                  scale=0.001):
            if verbose:
                stack = self._get_server_stack_state(lowest_frames=1,
                                                     highest_frames=7)
                logging.debug("Running (ssh) '%s' from '%s'", command, stack)
                command = self._verbose_logger_command(command)

            # Start a master SSH connection if necessary.
            self.start_master_ssh()

            env = " ".join("=".join(pair) for pair in self.env.iteritems())
            try:
                return self._run(command, timeout, ignore_status, stdout_tee,
                                 stderr_tee, connect_timeout, env, options,
                                 stdin, args, ignore_timeout,
                                 ssh_failure_retry_ok)
            except error.CmdError as cmderr:
                # We get a CmdError here only if the command timed out. Wrap
                # it in an AutoservRunError and raise that instead.
                timeout_message = ('Timeout encountered: %s' % cmderr.args[0])
                raise error.AutoservRunError(timeout_message, cmderr.args[1])
Example #17
def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp):
    """Run server job with given options.

    @param pid_file_manager: PidFileManager used to monitor the autoserv process
    @param results: Folder to store results.
    @param parser: Parser for the command line arguments.
    @param ssp_url: URL to the server-side package.
    @param use_ssp: Set to True to run with server-side packaging.
    """
    if parser.options.warn_no_ssp:
        # Post a warning in the log.
        logging.warn('Autoserv is required to run with server-side packaging. '
                     'However, no drone is found to support server-side '
                     'packaging. The test will be executed in a drone without '
                     'server-side packaging support.')

    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group if the process is not a process group
    # leader. This allows the autoserv process to keep running after the
    # caller process (drone manager call) exits.
    if os.getpid() != os.getpgid(0):
        os.setsid()

    # Container name is predefined so the container can be destroyed in
    # handle_sigterm.
    job_or_task_id = job_directories.get_job_id_or_task_id(
        parser.options.results)
    container_name = (lxc.TEST_CONTAINER_NAME_FMT %
                      (job_or_task_id, time.time(), os.getpid()))
    job_folder = job_directories.get_job_folder_name(parser.options.results)

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')

        # Update results folder's file permission. This needs to be done ASAP
        # before the parsing process tries to access the log.
        if use_ssp and results:
            correct_results_folder_permission(results)

        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        if use_ssp:
            logging.debug(
                'Destroy container %s before aborting the autoserv '
                'process.', container_name)
            metadata = {
                'drone': socket.gethostname(),
                'job_id': job_or_task_id,
                'container_name': container_name,
                'action': 'abort',
                'success': True
            }
            try:
                bucket = lxc.ContainerBucket()
                container = bucket.get(container_name)
                if container:
                    container.destroy()
                else:
                    metadata['success'] = False
                    metadata['error'] = 'container not found'
                    logging.debug('Container %s is not found.', container_name)
            except:
                metadata['success'] = False
                metadata['error'] = 'Exception: %s' % str(sys.exc_info())
                # Handle any exception so the autoserv process can be aborted.
                logging.exception('Failed to destroy container %s.',
                                  container_name)
            autotest_es.post(use_http=True,
                             type_str=lxc.CONTAINER_RUN_TEST_METADB_TYPE,
                             metadata=metadata)
            # Try to correct the result file permission again after the
            # container is destroyed, as the container might have created some
            # new files in the result folder.
            if results:
                correct_results_folder_permission(results)

        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        sys.exc_clear()

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being
    # set, but depending on how you launch your autotest scheduler it may not
    # be set.
    os.environ['USER'] = getpass.getuser()

    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    install_before = parser.options.install_before
    install_after = parser.options.install_after
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    no_tee = parser.options.no_tee
    parse_job = parser.options.parse_job
    execution_tag = parser.options.execution_tag
    if not execution_tag:
        execution_tag = parse_job
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    test_retry = parser.options.test_retry
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options
    no_use_packaging = parser.options.no_use_packaging
    host_attributes = parser.options.host_attributes
    in_lab = bool(parser.options.lab)

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error(
            "Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo
                       or provision or reset)
    if len(parser.args) < 1 and not is_special_task:
        parser.parser.error("Missing argument: control file")

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    # We have a control file unless it's just a verify/repair/cleanup job
    if len(parser.args) > 0:
        control = parser.args[0]
    else:
        control = None

    machines = _get_machines(parser)
    if group_name and len(machines) < 2:
        parser.parser.error('-G %r may only be supplied with more than one '
                            'machine.' % group_name)

    kwargs = {
        'group_name': group_name,
        'tag': execution_tag,
        'disable_sysinfo': parser.options.disable_sysinfo
    }
    if parser.options.parent_job_id:
        kwargs['parent_job_id'] = int(parser.options.parent_job_id)
    if control_filename:
        kwargs['control_filename'] = control_filename
    if host_attributes:
        kwargs['host_attributes'] = host_attributes
    kwargs['in_lab'] = in_lab
    job = server_job.server_job(control, parser.args[1:], results, label, user,
                                machines, client, parse_job, ssh_user,
                                ssh_port, ssh_pass, ssh_verbosity_flag,
                                ssh_options, test_retry, **kwargs)

    job.logging.start_logging()
    job.init_parser()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    auto_start_servod = _CONFIG.get_config_value('AUTOSERV',
                                                 'auto_start_servod',
                                                 type=bool,
                                                 default=False)

    site_utils.SetupTsMonGlobalState('autoserv',
                                     indirect=False,
                                     short_lived=True)
    try:
        try:
            if repair:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                job.repair(job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                if use_ssp:
                    try:
                        _run_with_ssp(job, container_name, job_or_task_id,
                                      results, parser, ssp_url, job_folder,
                                      machines)
                    finally:
                        # Update the ownership of files in result folder.
                        correct_results_folder_permission(results)
                else:
                    if collect_crashinfo:
                        # Update the ownership of files in result folder. If the
                        # job to collect crashinfo was running inside container
                        # (SSP) and crashed before correcting folder permission,
                        # the result folder might have wrong permission setting.
                        try:
                            correct_results_folder_permission(results)
                        except:
                            # Ignore any error as the user may not have root
                            # permission to run sudo command.
                            pass
                    metric_name = ('chromeos/autotest/experimental/'
                                   'autoserv_job_run_duration')
                    f = {
                        'in_container': utils.is_in_container(),
                        'success': False
                    }
                    with metrics.SecondsTimer(metric_name, fields=f) as c:
                        job.run(install_before,
                                install_after,
                                verify_job_repo_url=verify_job_repo_url,
                                only_collect_crashinfo=collect_crashinfo,
                                skip_crash_collection=skip_crash_collection,
                                job_labels=job_labels,
                                use_packaging=(not no_use_packaging))
                        c['success'] = True

        finally:
            while job.hosts:
                host = job.hosts.pop()
                host.close()
    except:
        exit_code = 1
        traceback.print_exc()
    finally:
        metrics.Flush()

    if pid_file_manager:
        pid_file_manager.num_tests_failed = job.num_tests_failed
        pid_file_manager.close_file(exit_code)
    job.cleanup_parser()

    sys.exit(exit_code)