def run_on_client(host, client_results_dir, cleanup_only=False):
    """Run result utils on the given host.

    Deploys the result tools to the host, then either builds the directory
    summary for `client_results_dir` or, when `cleanup_only` is set, deletes
    the existing directory summary files there. The elapsed time is reported
    through the dir_summary_collection_duration metric together with a
    `success` field.

    @param host: Host to run the result utils.
    @param client_results_dir: Path to the results directory on the client.
    @param cleanup_only: True to delete all existing directory summary files in
            the given directory.
    @return: True: If the command runs on client without error.
             False: If the command failed with error in result throttling.
    """
    success = False
    # 'success' has to be part of the initial field set: the timer only
    # reports fields that were given a default when it was created, so a key
    # first assigned inside the block would be dropped from the metric.
    with metrics.SecondsTimer(
            'chromeos/autotest/job/dir_summary_collection_duration',
            fields={'dut_host_name': host.hostname,
                    'success': False}) as fields:
        try:
            _deploy_result_tools(host)

            if cleanup_only:
                logging.debug('Cleaning up directory summary in %s',
                              client_results_dir)
                cmd = (_CLEANUP_DIR_SUMMARY_CMD %
                       (DEFAULT_AUTOTEST_DIR, client_results_dir))
                host.run(cmd, ignore_status=False,
                         timeout=_CLEANUP_DIR_SUMMARY_TIMEOUT)
            else:
                logging.debug('Getting directory summary for %s',
                              client_results_dir)
                throttle_option = ''
                if ENABLE_RESULT_THROTTLING:
                    try:
                        throttle_option = (_THROTTLE_OPTION_FMT %
                                           host.job.max_result_size_KB)
                    except AttributeError:
                        # In case host job is not set, skip throttling.
                        logging.warning(
                                'host object does not have job attribute, '
                                'skipping result throttling.')
                cmd = (_BUILD_DIR_SUMMARY_CMD %
                       (DEFAULT_AUTOTEST_DIR, client_results_dir,
                        throttle_option))
                host.run(cmd, ignore_status=False,
                         timeout=_BUILD_DIR_SUMMARY_TIMEOUT)
            # Reached only when the selected command ran without raising.
            success = True
            fields['success'] = True
        except error.AutoservRunError:
            # The summary is best effort: log the failure and report it
            # through the return value instead of propagating it.
            action = 'cleanup' if cleanup_only else 'create'
            logging.exception(
                    'Non-critical failure: Failed to %s directory summary for '
                    '%s.', action, client_results_dir)
            fields['success'] = False

    return success
def _push_media(self, CTS_URI):
    """Downloads, caches and pushes media files to the DUT.

    The copy step is timed and reported with a `success` field; afterwards
    the pushed files are verified.

    @param CTS_URI: Mapping whose 'media' entry locates the media bundle.
    @raises error.TestFail: If verification of the pushed media fails.
    """
    media_uri = CTS_URI['media']
    bundle_dir = self._install_bundle(media_uri)
    bundle_name, _ = os.path.splitext(os.path.basename(media_uri))
    cts_media = os.path.join(bundle_dir, bundle_name)

    # TODO(ihf): this really should measure throughput in Bytes/s.
    metric_name = 'chromeos/autotest/infra_benchmark/cheets/push_media/duration'
    timer_fields = {
        'success': False,
        'dut_host_name': self._host.hostname,
    }
    with metrics.SecondsTimer(metric_name, fields=timer_fields) as reported:
        self._copy_media(cts_media)
        # Only reached when the copy did not raise.
        reported['success'] = True

    if self._verify_media(cts_media):
        return
    raise error.TestFail('Error: saw corruption pushing media files.')
def main():
    """Main script.

    Reads the TKO database connection settings from the AUTOTEST_WEB config
    section (each global_db_* value falls back to its non-global
    counterpart), runs CLEANUP_TKO_CMD against that database and reports the
    duration together with a `success` field. A failing cleanup is logged
    and not re-raised.
    """
    options = parse_options()
    log_config = logging_config.LoggingConfig()
    if options.logfile:
        log_config.add_file_handler(file_path=os.path.abspath(options.logfile),
                                    level=logging.DEBUG)

    def _global_db_setting(name):
        """Return the global_db_<name> setting, defaulting to plain <name>."""
        return CONFIG.get_config_value(
                'AUTOTEST_WEB', 'global_db_' + name,
                default=CONFIG.get_config_value('AUTOTEST_WEB', name))

    with ts_mon_config.SetupTsMonGlobalState(service_name='cleanup_tko_db',
                                             indirect=True):
        server = _global_db_setting('host')
        user = _global_db_setting('user')
        password = _global_db_setting('password')
        database = _global_db_setting('database')

        logging.info(
                'Starting cleaning up old records in TKO database %s on '
                'server %s.', database, server)

        start_time = time.time()
        try:
            with metrics.SecondsTimer(CLEANUP_METRIC,
                                      fields={'success': False}) as fields:
                utils.run_sql_cmd(server, user, password, CLEANUP_TKO_CMD,
                                  database)
                fields['success'] = True
        except Exception:
            # Only real errors are swallowed here; KeyboardInterrupt and
            # SystemExit are allowed to propagate so the script can be stopped.
            logging.exception('Cleanup failed with exception.')
        finally:
            duration = time.time() - start_time
            logging.info('Cleanup attempt finished in %s seconds.', duration)
def main(argv):
    """Send a no-op request to a swarming server once a minute, forever.

    Each attempt is timed and counted. Only RunCommandError and TimeoutError
    from the request are tolerated; any other exception ends the loop.

    @param argv: Command line arguments; the first positional argument is the
            swarming server to send the no-op requests to.
    """
    parser = commandline.ArgumentParser(description=__doc__)
    parser.add_argument('swarming_server', action='store',
                        help='Swarming server to send no-op requests to.')
    options = parser.parse_args(argv)

    m_timer = 'chromeos/autotest/swarming_proxy/no_op_durations'
    m_count = 'chromeos/autotest/swarming_proxy/no_op_attempts'
    command = commands.RUN_SUITE_PATH
    fields = {'success': False, 'swarming_server': options.swarming_server}
    with ts_mon_config.SetupTsMonGlobalState('swarm_mon', indirect=True):
        while True:
            # Give every attempt its own copy of the defaults so that a
            # 'success' set to True in one iteration can never carry over
            # into the next one through a shared dict.
            with metrics.SecondsTimer(m_timer, fields=dict(fields)) as f:
                try:
                    with metrics.SuccessCounter(m_count):
                        swarming_lib.RunSwarmingCommand(
                                [command, '--do_nothing'],
                                options.swarming_server,
                                dimensions=[('pool', 'default')],
                                timeout_secs=120)
                    f['success'] = True
                except (cros_build_lib.RunCommandError,
                        timeout_util.TimeoutError):
                    # The failure is already visible through the metric
                    # fields; keep probing.
                    pass
            time.sleep(60)
def trigger_refresh(self):
    """Queue a refresh call on every drone and start executing them.

    The refresh task queue is executed with wait=False, and that execute
    call is timed.

    @raises DroneManagerError: If a drone has un-executed calls. Since they
            will get clobbered when we queue refresh calls.
    """
    self._reset()
    self._drop_old_pidfiles()

    paths_to_refresh = []
    for registered_id in self._registered_pidfile_info:
        paths_to_refresh.append(registered_id.path)

    all_drones = list(self.get_drones())
    for drone in all_drones:
        pending_calls = drone.get_calls()
        if not pending_calls:
            drone.queue_call('refresh', paths_to_refresh)
            continue
        pending_names = [str(call) for call in pending_calls]
        raise DroneManagerError('Drone %s has un-executed calls: %s '
                                'which might get corrupted through '
                                'this invocation' %
                                (drone, pending_names))

    logging.info("Invoking drone refresh.")
    timer_name = 'chromeos/autotest/drone_manager/trigger_refresh_duration'
    with metrics.SecondsTimer(timer_name):
        self._refresh_task_queue.execute(all_drones, wait=False)
def run_cmd(self, cmd, expected=None):
    """Run an rpc command against the AFE and record timing metrics.

    The call is reported as successful unless the result differs from
    `expected` or an exception is raised. Exceptions whose exact type is not
    listed in FAILURE_REASONS are re-raised after being recorded.

    @param cmd: string of rpc command to send
    @param expected: expected result of rpc; None disables the comparison
    """
    metric_fields = self._metric_fields.copy()
    metric_fields.update({
        'command': cmd,
        'success': True,
        'failure_reason': '',
    })

    with metrics.SecondsTimer(METRIC_RPC_CALL_DURATIONS,
                              fields=dict(metric_fields),
                              scale=0.001) as f:
        msg_str = "%s:%s" % (self._hostname, cmd)

        try:
            result = self._afe.run(cmd)
            logging.debug("%s result = %s", msg_str, result)
            if expected is not None:
                if expected != result:
                    _failed(f, msg_str, 'IncorrectResponse')
        except urllib2.HTTPError as e:
            http_reason = 'HTTPError:%d' % e.code
            _failed(f, msg_str, http_reason)
        except Exception as e:
            exc_type = type(e)
            reason = FAILURE_REASONS.get(exc_type, 'Unknown')
            _failed(f, msg_str, reason, err=e)
            # Unknown failure types are recorded above and then propagated.
            if exc_type not in FAILURE_REASONS:
                raise

        if f['success']:
            logging.info("%s success", msg_str)
def testContextManagerIgnoresInvalidField(self):
    """A field first set inside the block (no default) is not emitted."""
    initial_fields = {'foo': 'bar'}
    with metrics.SecondsTimer('fooname', fields=initial_fields) as timer_fields:
        # 'qux' has no default in the initial fields.
        timer_fields['qux'] = 'qwert'
    self._mockMetric.add.assert_called_with(mock.ANY,
                                            fields={'foo': 'bar'})
def _main(argv):
    """main method of script.

    Pre-parses the arguments, prepares and cleans the buildroot, optionally
    performs the initial checkout, bootstraps depot_tools and finally runs
    cbuildbot. Each phase is wrapped in its own metric, all sharing the same
    branch/config/tryjob fields.

    Args:
      argv: All command line arguments to pass as list of strings. The list
        is appended to in place when the previous build state is valid.

    Returns:
      Return code of cbuildbot as an integer.
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source; confirm against the original layout.
    options = PreParseArguments(argv)

    branchname = options.branch or 'master'
    root = options.buildroot
    buildroot = os.path.join(root, 'repository')
    depot_tools_path = os.path.join(buildroot, constants.DEPOT_TOOLS_SUBPATH)

    # Fields attached to every metric emitted by this function.
    metrics_fields = {
        'branch_name': branchname,
        'build_config': options.build_config_name,
        'tryjob': options.remote_trybot,
    }

    # Does the entire build pass or fail.
    with metrics.Presence(METRIC_ACTIVE, metrics_fields), \
            metrics.SuccessCounter(METRIC_COMPLETED, metrics_fields) as s_fields:

        # Preliminary set, mostly command line parsing.
        with metrics.SuccessCounter(METRIC_INVOKED, metrics_fields):
            if options.enable_buildbot_tags:
                logging.EnableBuildbotMarkers()
            ConfigureGlobalEnvironment()

        # Prepare the buildroot with source for the build.
        with metrics.SuccessCounter(METRIC_PREP, metrics_fields):
            site_config = config_lib.GetConfig()
            manifest_url = site_config.params['MANIFEST_INT_URL']
            repo = repository.RepoRepository(
                manifest_url, buildroot, branch=branchname,
                git_cache_dir=options.git_cache_dir)
            previous_build_state = GetLastBuildState(root)

            # Clean up the buildroot to a safe state.
            with metrics.SecondsTimer(METRIC_CLEAN, fields=metrics_fields):
                build_state = GetCurrentBuildState(options, branchname)
                CleanBuildRoot(root, repo, metrics_fields, build_state)

            # Get a checkout close enough to the branch that cbuildbot can
            # handle it.
            if options.sync:
                with metrics.SecondsTimer(METRIC_INITIAL, fields=metrics_fields):
                    InitialCheckout(repo)

            # Make sure depot_tools is bootstrapped before cbuildbot runs.
            with metrics.SecondsTimer(METRIC_DEPOT_TOOLS, fields=metrics_fields):
                DepotToolsEnsureBootstrap(depot_tools_path)

        # Run cbuildbot inside the full ChromeOS checkout, on the specified
        # branch.
        with metrics.SecondsTimer(METRIC_CBUILDBOT, fields=metrics_fields):
            if previous_build_state.is_valid():
                # Hand the previous build state to cbuildbot as base64 JSON.
                argv.append('--previous-build-state')
                argv.append(base64.b64encode(previous_build_state.to_json()))

            result = Cbuildbot(buildroot, depot_tools_path, argv)
            # A zero return code from cbuildbot counts as success.
            s_fields['success'] = (result == 0)

            build_state.status = (constants.BUILDER_STATUS_PASSED
                                  if result == 0 else
                                  constants.BUILDER_STATUS_FAILED)
            # Persist the state so the next run can read it back via
            # GetLastBuildState().
            SetLastBuildState(root, build_state)

            CleanupChroot(buildroot)
            return result
def testContextManagerWithUpdate(self):
    """A field overwritten inside the timing block is emitted updated."""
    with metrics.SecondsTimer('fooname',
                              fields={'foo': 'bar'}) as timer_fields:
        timer_fields['foo'] = 'qux'
    expected_fields = {'foo': 'qux'}
    self._mockMetric.add.assert_called_with(mock.ANY, fields=expected_fields)
def testContextManagerWithoutUpdate(self):
    """The initial field values are emitted when the block changes nothing."""
    defaults = {'foo': 'bar'}
    # pylint: disable=unused-variable
    with metrics.SecondsTimer('fooname', fields=defaults) as untouched:
        pass
    self._mockMetric.add.assert_called_with(mock.ANY,
                                            fields={'foo': 'bar'})
def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp):
    """Run server job with given options.

    Installs the signal handlers, builds a server_job from the parsed
    command line options, runs either the requested special task (repair,
    verify, provision, reset, cleanup) or the test job itself, and finally
    terminates the process through sys.exit(): exit code 0 on success, 1 if
    any exception was raised while running the job.

    @param pid_file_manager: PidFileManager used to monitor the autoserv process
    @param results: Folder to store results.
    @param parser: Parser for the command line arguments.
    @param ssp_url: Url to server-side package.
    @param use_ssp: Set to True to run with server-side packaging.
    """
    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group if the process is not a process group
    # leader. This allows autoserv process to keep running after the caller
    # process (drone manager call) exits.
    if os.getpid() != os.getpgid(0):
        os.setsid()

    # Container name is predefined so the container can be destroyed in
    # handle_sigterm.
    job_or_task_id = job_directories.get_job_id_or_task_id(
            parser.options.results)
    container_id = lxc.ContainerId(job_or_task_id, time.time(), os.getpid())

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        # Closes the pid file, fixes result permissions, destroys the SSP
        # container (if any) and then SIGKILLs the whole process group.
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')

        # Update results folder's file permission. This needs to be done ASAP
        # before the parsing process tries to access the log.
        if use_ssp and results:
            correct_results_folder_permission(results)

        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        if use_ssp:
            logging.debug(
                    'Destroy container %s before aborting the autoserv '
                    'process.', container_id)
            try:
                bucket = lxc.ContainerBucket()
                container = bucket.get_container(container_id)
                if container:
                    container.destroy()
                else:
                    logging.debug('Container %s is not found.', container_id)
            except:
                # Handle any exception so the autoserv process can be aborted.
                logging.exception('Failed to destroy container %s.',
                                  container_id)
            # Try to correct the result file permission again after the
            # container is destroyed, as the container might have created some
            # new files in the result folder.
            if results:
                correct_results_folder_permission(results)

        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        sys.exc_clear()

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being set
    # but depending on how you launch your autotest scheduler it may not be set.
    os.environ['USER'] = getpass.getuser()

    # Pull the parsed command line options into locals.
    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    no_tee = parser.options.no_tee
    execution_tag = parser.options.execution_tag
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options
    no_use_packaging = parser.options.no_use_packaging
    in_lab = bool(parser.options.lab)

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error(
                "Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo or
                       provision or reset)
    use_client_trampoline = False
    # Work out the control file: --control-name first, then the first
    # positional argument; special tasks may run without one.
    if parser.options.control_name:
        if use_ssp:
            # When use_ssp is True, autoserv will be re-executed inside a
            # container preserving the --control-name argument. Control file
            # will be staged inside the rexecuted autoserv.
            control = None
        else:
            try:
                control = _stage_control_file(parser.options.control_name,
                                              results)
            except error.AutoservError as e:
                logging.info("Using client trampoline because of: %s", e)
                control = parser.options.control_name
                use_client_trampoline = True

    elif parser.args:
        control = parser.args[0]
    else:
        if not is_special_task:
            parser.parser.error("Missing argument: control file")
        control = None

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    machines = _get_machines(parser)
    if group_name and len(machines) < 2:
        parser.parser.error('-G %r may only be supplied with more than one '
                            'machine.' % group_name)

    job_kwargs = {
            'control': control,
            'args': parser.args[1:],
            'resultdir': results,
            'label': label,
            'user': user,
            'machines': machines,
            'machine_dict_list': server_job.get_machine_dicts(
                    machine_names=machines,
                    store_dir=os.path.join(results,
                                           parser.options.host_info_subdir),
                    in_lab=in_lab,
                    use_shadow_store=not parser.options.local_only_host_info,
                    host_attributes=parser.options.host_attributes,
            ),
            'client': client,
            'ssh_user': ssh_user,
            'ssh_port': ssh_port,
            'ssh_pass': ssh_pass,
            'ssh_verbosity_flag': ssh_verbosity_flag,
            'ssh_options': ssh_options,
            'group_name': group_name,
            'tag': execution_tag,
            'disable_sysinfo': parser.options.disable_sysinfo,
            'in_lab': in_lab,
            'use_client_trampoline': use_client_trampoline,
    }
    # Optional keyword arguments are only passed when they were supplied.
    if parser.options.parent_job_id:
        job_kwargs['parent_job_id'] = int(parser.options.parent_job_id)
    if control_filename:
        job_kwargs['control_filename'] = control_filename
    job = server_job.server_job(**job_kwargs)

    job.logging.start_logging()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    auto_start_servod = _CONFIG.get_config_value(
            'AUTOSERV', 'auto_start_servod', type=bool, default=False)

    site_utils.SetupTsMonGlobalState('autoserv', indirect=False,
                                     short_lived=True)
    try:
        try:
            # Exactly one branch runs: a special task, or the job itself.
            if repair:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                job.repair(job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                if use_ssp:
                    try:
                        _run_with_ssp(job, container_id, job_or_task_id,
                                      results, parser, ssp_url, machines)
                    finally:
                        # Update the ownership of files in result folder.
                        correct_results_folder_permission(results)
                else:
                    if collect_crashinfo:
                        # Update the ownership of files in result folder. If the
                        # job to collect crashinfo was running inside container
                        # (SSP) and crashed before correcting folder permission,
                        # the result folder might have wrong permission setting.
                        try:
                            correct_results_folder_permission(results)
                        except:
                            # Ignore any error as the user may not have root
                            # permission to run sudo command.
                            pass
                    metric_name = ('chromeos/autotest/experimental/'
                                   'autoserv_job_run_duration')
                    f = {
                            'in_container': utils.is_in_container(),
                            'success': False
                    }
                    with metrics.SecondsTimer(metric_name, fields=f) as c:
                        job.run(verify_job_repo_url=verify_job_repo_url,
                                only_collect_crashinfo=collect_crashinfo,
                                skip_crash_collection=skip_crash_collection,
                                job_labels=job_labels,
                                use_packaging=(not no_use_packaging))
                        # Only reached when job.run() did not raise.
                        c['success'] = True

        finally:
            job.close()
            # Special task doesn't run parse, so result summary needs to be
            # built here.
            if results and (repair or verify or reset or cleanup or provision):
                # Throttle the result on the server side.
                try:
                    result_utils.execute(
                            results, control_data.DEFAULT_MAX_RESULT_SIZE_KB)
                except:
                    logging.exception(
                            'Non-critical failure: Failed to throttle results '
                            'in directory %s.', results)
                # Build result view and report metrics for result sizes.
                site_utils.collect_result_sizes(results)
    except:
        # Any failure is reported through the traceback and exit code 1.
        exit_code = 1
        traceback.print_exc()
    finally:
        metrics.Flush()

    sys.exit(exit_code)
def testContextManager(self):
    """The timing context manager creates one distribution and adds once."""
    with metrics.SecondsTimer('fooname'):
        pass
    distribution_calls = metrics.CumulativeSecondsDistribution.call_count
    self.assertEqual(distribution_calls, 1)
    add_calls = self._mockMetric.add.call_count
    self.assertEqual(add_calls, 1)
def runtest(job, url, tag, args, dargs,
            local_namespace={}, global_namespace={},
            before_test_hook=None, after_test_hook=None,
            before_iteration_hook=None, after_iteration_hook=None):
    """Locate (or download) a test, instantiate it and execute it.

    @param job: Job object; its testdir, resultdir and fast attributes are
            read here, plus install_pkg / site_testdir when present.
    @param url: Either a '.tar.bz2' url of a test to download and install, or
            the name of a local test (':' is treated as a path separator).
    @param tag: Optional tag appended to the output directory as '.<tag>'.
    @param args: Positional arguments handed to the test's _exec().
    @param dargs: Keyword arguments handed to the test's _exec(); the
            'master_testpath' entry is popped from it here.
    @param local_namespace: Mapping used when exec'ing the import and the
            test construction; copied before use.
    @param global_namespace: Mapping used when exec'ing; copied before use.
            The constructed test is read back from it as 'mytest'.
    @param before_test_hook: Optional callable run with the test before it
            executes, unless job.fast is set.
    @param after_test_hook: Optional callable run with the test afterwards,
            when the test failed or job.fast is not set.
    @param before_iteration_hook: Optional hook registered on the test.
    @param after_iteration_hook: Optional hook registered on the test.
    @raises error.TestError: If a local test cannot be found.
    """
    # The default dicts are never mutated: only copies are modified below.
    local_namespace = local_namespace.copy()
    global_namespace = global_namespace.copy()
    # if this is not a plain test name then download and install the
    # specified test
    if url.endswith('.tar.bz2'):
        (testgroup, testname) = _installtest(job, url)
        bindir = os.path.join(job.testdir, 'download', testgroup, testname)
        importdir = os.path.join(job.testdir, 'download')
        modulename = '%s.%s' % (re.sub('/', '.', testgroup), testname)
        classname = '%s.%s' % (modulename, testname)
        path = testname
    else:
        # If the test is local, it may be under either testdir or site_testdir.
        # Tests in site_testdir override tests defined in testdir
        testname = path = url
        testgroup = ''
        path = re.sub(':', '/', testname)
        modulename = os.path.basename(path)
        classname = '%s.%s' % (modulename, modulename)

        # Try installing the test package
        # The job object may be either a server side job or a client side job.
        # 'install_pkg' method will be present only if it's a client side job.
        if hasattr(job, 'install_pkg'):
            try:
                bindir = os.path.join(job.testdir, testname)
                job.install_pkg(testname, 'test', bindir)
            except error.PackageInstallError:
                # continue as a fall back mechanism and see if the test code
                # already exists on the machine
                pass

        bindir = None
        # The last existing directory wins, so site_testdir takes precedence.
        for dir in [job.testdir, getattr(job, 'site_testdir', None)]:
            if dir is not None and os.path.exists(os.path.join(dir, path)):
                importdir = bindir = os.path.join(dir, path)
        if not bindir:
            raise error.TestError(testname + ': test does not exist')

    subdir = os.path.join(dargs.pop('master_testpath', ""), testname)
    outputdir = os.path.join(job.resultdir, subdir)
    if tag:
        outputdir += '.' + tag

    local_namespace['job'] = job
    local_namespace['bindir'] = bindir
    local_namespace['outputdir'] = outputdir

    # Make the test module importable only for the duration of the exec calls.
    sys.path.insert(0, importdir)
    try:
        # exec() takes (code, globals, locals): local_namespace is passed as
        # the globals mapping and global_namespace as the locals mapping, so
        # the 'mytest' binding ends up in global_namespace.
        exec('import %s' % modulename, local_namespace, global_namespace)
        exec("mytest = %s(job, bindir, outputdir)" % classname,
             local_namespace, global_namespace)
    finally:
        sys.path.pop(0)

    pwd = os.getcwd()
    os.chdir(outputdir)

    try:
        mytest = global_namespace['mytest']
        mytest.success = False
        if not job.fast and before_test_hook:
            logging.info('Starting before_hook for %s', mytest.tagged_testname)
            with metrics.SecondsTimer(
                    'chromeos/autotest/job/before_hook_duration'):
                before_test_hook(mytest)
            logging.info('before_hook completed')

        # we use the register iteration hooks methods to register the passed
        # in hooks
        if before_iteration_hook:
            mytest.register_before_iteration_hook(before_iteration_hook)
        if after_iteration_hook:
            mytest.register_after_iteration_hook(after_iteration_hook)
        mytest._exec(args, dargs)
        # Only reached when _exec() did not raise.
        mytest.success = True
    finally:
        # Restore the working directory before running the after hook.
        os.chdir(pwd)
        if after_test_hook and (not mytest.success or not job.fast):
            logging.info('Starting after_hook for %s', mytest.tagged_testname)
            with metrics.SecondsTimer(
                    'chromeos/autotest/job/after_hook_duration'):
                after_test_hook(mytest)
            logging.info('after_hook completed')

        shutil.rmtree(mytest.tmpdir, ignore_errors=True)
def refresh(self):
    """Trigger a refresh on all drones, then sync it, timing both steps."""
    duration_metric = 'chromeos/autotest/drone_manager/refresh_duration'
    with metrics.SecondsTimer(duration_metric):
        self.trigger_refresh()
        self.sync_refresh()
def _main(options, argv):
    """main method of script.

    Prepares and cleans the buildroot, optionally performs the initial
    checkout, runs cbuildbot, records the resulting build state and cleans
    up the chroot. Each phase is wrapped in its own metric.

    Args:
      options: preparsed options object for the build.
      argv: All command line arguments to pass as list of strings. The list
        is extended in place with the extra flags passed on to cbuildbot.

    Returns:
      Return code of cbuildbot as an integer.
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source; confirm against the original layout.
    branchname = options.branch or 'master'
    root = options.buildroot
    buildroot = os.path.join(root, 'repository')
    workspace = os.path.join(root, 'workspace')
    depot_tools_path = os.path.join(buildroot, constants.DEPOT_TOOLS_SUBPATH)

    # Does the entire build pass or fail.
    with metrics.Presence(METRIC_ACTIVE), \
            metrics.SuccessCounter(METRIC_COMPLETED) as s_fields:

        # Preliminary set, mostly command line parsing.
        with metrics.SuccessCounter(METRIC_INVOKED):
            if options.enable_buildbot_tags:
                logging.EnableBuildbotMarkers()
            ConfigureGlobalEnvironment()

        # Prepare the buildroot with source for the build.
        with metrics.SuccessCounter(METRIC_PREP):
            manifest_url = config_lib.GetSiteParams().MANIFEST_INT_URL
            repo = repository.RepoRepository(
                manifest_url, buildroot, branch=branchname,
                git_cache_dir=options.git_cache_dir)
            previous_build_state = GetLastBuildState(root)

            # Clean up the buildroot to a safe state.
            with metrics.SecondsTimer(METRIC_CLEAN):
                build_state = GetCurrentBuildState(options, branchname)
                CleanBuildRoot(root, repo, options.cache_dir, build_state)

            # Get a checkout close enough to the branch that cbuildbot can
            # handle it.
            if options.sync:
                with metrics.SecondsTimer(METRIC_INITIAL):
                    InitialCheckout(repo)

        # Run cbuildbot inside the full ChromeOS checkout, on the specified
        # branch.
        with metrics.SecondsTimer(METRIC_CBUILDBOT), \
                metrics.SecondsInstanceTimer(METRIC_CBUILDBOT_INSTANCE):
            if previous_build_state.is_valid():
                # The state is passed as base64 of its UTF-8 encoded JSON,
                # decoded back to text so argv stays a list of strings.
                argv.append('--previous-build-state')
                argv.append(
                    base64.b64encode(previous_build_state.to_json().encode(
                        'utf-8')).decode('utf-8'))
            argv.extend(['--workspace', workspace])

            if not options.cache_dir_specified:
                argv.extend(['--cache-dir', options.cache_dir])

            result = Cbuildbot(buildroot, depot_tools_path, argv)

        # A zero return code from cbuildbot counts as success.
        s_fields['success'] = (result == 0)

        build_state.status = (constants.BUILDER_STATUS_PASSED
                              if result == 0 else
                              constants.BUILDER_STATUS_FAILED)
        # Persist the state so the next run can read it back via
        # GetLastBuildState().
        SetLastBuildState(root, build_state)

        with metrics.SecondsTimer(METRIC_CHROOT_CLEANUP):
            CleanupChroot(buildroot)

        return result
def run_very_slowly(self, command, timeout=3600, ignore_status=False,
                    stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS,
                    connect_timeout=30, options='', stdin=None, verbose=True,
                    args=(), ignore_timeout=False,
                    ssh_failure_retry_ok=False):
    """
    Run a command on the remote host.

    This RPC call has an overhead of minimum 40ms and up to 400ms on
    servers (crbug.com/734887). Each time a run_very_slowly is added for
    every job - a server core dies in the lab.
    @see common_lib.hosts.host.run()

    @param timeout: command execution timeout
    @param connect_timeout: ssh connection timeout (in seconds)
    @param options: string with additional ssh command options
    @param verbose: log the commands
    @param ignore_timeout: bool True if SSH command timeouts should be
            ignored.  Will return None on command timeout.
    @param ssh_failure_retry_ok: True if the command may be retried on
            probable ssh failure (error 255 or timeout).  When true,
            the command may be executed up to three times, the second
            time after restarting the ssh master connection.  Use only for
            commands that are idempotent, because when a "probable
            ssh failure" occurs, we cannot tell if the command executed
            or not.

    @raises AutoservRunError: if the command failed
    @raises AutoservSSHTimeout: ssh connection has timed out
    """
    with metrics.SecondsTimer('chromeos/autotest/ssh/master_ssh_time',
                              scale=0.001):
        if verbose:
            caller_stack = self._get_server_stack_state(lowest_frames=1,
                                                        highest_frames=7)
            logging.debug("Running (ssh) '%s' from '%s'", command,
                          caller_stack)
            command = self._verbose_logger_command(command)

        # Make sure the master SSH connection is up before running.
        self.start_master_ssh()

        # Flatten the environment mapping into "NAME=value NAME=value ...".
        assignments = ["=".join(item) for item in self.env.iteritems()]
        env = " ".join(assignments)
        try:
            return self._run(command, timeout, ignore_status,
                             stdout_tee, stderr_tee, connect_timeout, env,
                             options, stdin, args, ignore_timeout,
                             ssh_failure_retry_ok)
        except error.CmdError as cmderr:
            # A CmdError only reaches this point when the command timed
            # out; repackage it as an AutoservRunError for the caller.
            timeout_message = str('Timeout encountered: %s' %
                                  cmderr.args[0])
            raise error.AutoservRunError(timeout_message, cmderr.args[1])
def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp):
    """Run server job with given options.

    Installs the signal handlers, builds a server_job from the parsed
    command line options, runs either the requested special task (repair,
    verify, provision, reset, cleanup) or the test job itself, records the
    outcome in the pid file and finally terminates the process through
    sys.exit(): exit code 0 on success, 1 if any exception was raised while
    running the job.

    @param pid_file_manager: PidFileManager used to monitor the autoserv process
    @param results: Folder to store results.
    @param parser: Parser for the command line arguments.
    @param ssp_url: Url to server-side package.
    @param use_ssp: Set to True to run with server-side packaging.
    """
    if parser.options.warn_no_ssp:
        # Post a warning in the log.
        logging.warn('Autoserv is required to run with server-side packaging. '
                     'However, no drone is found to support server-side '
                     'packaging. The test will be executed in a drone without '
                     'server-side packaging supported.')

    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group if the process is not a process group
    # leader. This allows autoserv process to keep running after the caller
    # process (drone manager call) exits.
    if os.getpid() != os.getpgid(0):
        os.setsid()

    # Container name is predefined so the container can be destroyed in
    # handle_sigterm.
    job_or_task_id = job_directories.get_job_id_or_task_id(
            parser.options.results)
    container_name = (lxc.TEST_CONTAINER_NAME_FMT %
                      (job_or_task_id, time.time(), os.getpid()))
    job_folder = job_directories.get_job_folder_name(parser.options.results)

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        # Closes the pid file, fixes result permissions, destroys the SSP
        # container (if any), posts the abort metadata and then SIGKILLs the
        # whole process group.
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')

        # Update results folder's file permission. This needs to be done ASAP
        # before the parsing process tries to access the log.
        if use_ssp and results:
            correct_results_folder_permission(results)

        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        if use_ssp:
            logging.debug(
                    'Destroy container %s before aborting the autoserv '
                    'process.', container_name)
            # Record of the abort; 'success' is flipped to False below when
            # the container is missing or destroying it raises.
            metadata = {
                    'drone': socket.gethostname(),
                    'job_id': job_or_task_id,
                    'container_name': container_name,
                    'action': 'abort',
                    'success': True
            }
            try:
                bucket = lxc.ContainerBucket()
                container = bucket.get(container_name)
                if container:
                    container.destroy()
                else:
                    metadata['success'] = False
                    metadata['error'] = 'container not found'
                    logging.debug('Container %s is not found.', container_name)
            except:
                metadata['success'] = False
                metadata['error'] = 'Exception: %s' % str(sys.exc_info())
                # Handle any exception so the autoserv process can be aborted.
                logging.exception('Failed to destroy container %s.',
                                  container_name)
            autotest_es.post(use_http=True,
                             type_str=lxc.CONTAINER_RUN_TEST_METADB_TYPE,
                             metadata=metadata)
            # Try to correct the result file permission again after the
            # container is destroyed, as the container might have created some
            # new files in the result folder.
            if results:
                correct_results_folder_permission(results)

        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        sys.exc_clear()

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being set
    # but depending on how you launch your autotest scheduler it may not be set.
    os.environ['USER'] = getpass.getuser()

    # Pull the parsed command line options into locals.
    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    install_before = parser.options.install_before
    install_after = parser.options.install_after
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    no_tee = parser.options.no_tee
    parse_job = parser.options.parse_job
    execution_tag = parser.options.execution_tag
    # Fall back to the parse_job value when no execution tag was given.
    if not execution_tag:
        execution_tag = parse_job
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    test_retry = parser.options.test_retry
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options
    no_use_packaging = parser.options.no_use_packaging
    host_attributes = parser.options.host_attributes
    in_lab = bool(parser.options.lab)

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error(
                "Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo or
                       provision or reset)
    if len(parser.args) < 1 and not is_special_task:
        parser.parser.error("Missing argument: control file")

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    # We have a control file unless it's just a verify/repair/cleanup job
    if len(parser.args) > 0:
        control = parser.args[0]
    else:
        control = None

    machines = _get_machines(parser)
    if group_name and len(machines) < 2:
        parser.parser.error('-G %r may only be supplied with more than one '
                            'machine.' % group_name)

    kwargs = {
            'group_name': group_name,
            'tag': execution_tag,
            'disable_sysinfo': parser.options.disable_sysinfo
    }
    # Optional keyword arguments are only passed when they were supplied.
    if parser.options.parent_job_id:
        kwargs['parent_job_id'] = int(parser.options.parent_job_id)
    if control_filename:
        kwargs['control_filename'] = control_filename
    if host_attributes:
        kwargs['host_attributes'] = host_attributes
    kwargs['in_lab'] = in_lab
    job = server_job.server_job(control, parser.args[1:], results, label,
                                user, machines, client, parse_job,
                                ssh_user, ssh_port, ssh_pass,
                                ssh_verbosity_flag, ssh_options,
                                test_retry, **kwargs)

    job.logging.start_logging()
    job.init_parser()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    auto_start_servod = _CONFIG.get_config_value(
            'AUTOSERV', 'auto_start_servod', type=bool, default=False)

    site_utils.SetupTsMonGlobalState('autoserv', indirect=False,
                                     short_lived=True)
    try:
        try:
            # Exactly one branch runs: a special task, or the job itself.
            if repair:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                job.repair(job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                if use_ssp:
                    try:
                        _run_with_ssp(job, container_name, job_or_task_id,
                                      results, parser, ssp_url, job_folder,
                                      machines)
                    finally:
                        # Update the ownership of files in result folder.
                        correct_results_folder_permission(results)
                else:
                    if collect_crashinfo:
                        # Update the ownership of files in result folder. If the
                        # job to collect crashinfo was running inside container
                        # (SSP) and crashed before correcting folder permission,
                        # the result folder might have wrong permission setting.
                        try:
                            correct_results_folder_permission(results)
                        except:
                            # Ignore any error as the user may not have root
                            # permission to run sudo command.
                            pass
                    metric_name = ('chromeos/autotest/experimental/'
                                   'autoserv_job_run_duration')
                    f = {
                            'in_container': utils.is_in_container(),
                            'success': False
                    }
                    with metrics.SecondsTimer(metric_name, fields=f) as c:
                        job.run(install_before, install_after,
                                verify_job_repo_url=verify_job_repo_url,
                                only_collect_crashinfo=collect_crashinfo,
                                skip_crash_collection=skip_crash_collection,
                                job_labels=job_labels,
                                use_packaging=(not no_use_packaging))
                        # Only reached when job.run() did not raise.
                        c['success'] = True

        finally:
            # Close every host the job still holds, whatever the outcome.
            while job.hosts:
                host = job.hosts.pop()
                host.close()
    except:
        # Any failure is reported through the traceback and exit code 1.
        exit_code = 1
        traceback.print_exc()
    finally:
        metrics.Flush()

    if pid_file_manager:
        pid_file_manager.num_tests_failed = job.num_tests_failed
        pid_file_manager.close_file(exit_code)
    job.cleanup_parser()

    sys.exit(exit_code)