def counters_inc(counter_name, failure_name):
    """Bump the ssh metrics counter selected by `counter_name`.

    @param counter_name: 'call' selects the per-ssh-invocation counter,
            'run' selects the per-_run() counter.  Any other value makes
            this function a no-op.
    @param failure_name: string identifying an error; a falsy value is
            reported as 'success'.
    """
    if counter_name == 'call':
        # Outcome of each ssh invocation made inside _run(), exceptions
        # included.
        metric_name = 'chromeos/autotest/ssh/calls'
    elif counter_name == 'run':
        # One data point per call to _run(), recorded when _run() exits
        # (also when it exits with an exception), together with how many
        # tries were made.
        metric_name = 'chromeos/autotest/ssh/runs'
    else:
        return

    counter = metrics.Counter(metric_name)
    # `ssh_call_count` is not defined in this function; it is looked up
    # in the surrounding scope at call time.
    counter.increment(fields={
            'error': failure_name or 'success',
            'attempt': ssh_call_count,
    })
def main(argv):
    """Entry point: call `_main` repeatedly until shutdown is requested.

    Every completed iteration emits a successful 'tick'; any exception
    leaving the loop emits a failed 'tick' and is re-raised.

    @param argv: Not referenced here; options come from parse_options().
    """
    logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    parser, options, args = parse_options()
    if not verify_options_and_args(options, args):
        parser.print_help()
        sys.exit(1)

    with ts_mon_config.SetupTsMonGlobalState(service_name='sync_server_db',
                                             indirect=True):
        tick_metric = _METRICS_PREFIX + '/tick'
        try:
            metrics.Counter(_METRICS_PREFIX + '/start').increment()
            logging.info("Setting signal handler")
            for signum in (signal.SIGINT, signal.SIGTERM):
                signal.signal(signum, handle_signal)

            while not _shutdown:
                _main(options)
                metrics.Counter(tick_metric).increment(
                        fields={'success': True})
                time.sleep(options.sleep)
        except:
            # Deliberately catches everything: the failure is recorded
            # and the exception is re-raised untouched.
            metrics.Counter(tick_metric).increment(fields={'success': False})
            raise
def main():
    """Loop forever killing slow db queries and reporting what was killed.

    @return: 1 when the command line options fail verification.  The
            polling loop itself only ends by raising.
    """
    # Tear down and reload the logging module so that the basicConfig
    # call below takes effect.
    logging.shutdown()
    reload(logging)
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.DEBUG)

    with ts_mon_config.SetupTsMonGlobalState(service_name='kill_slow_queries',
                                             indirect=True):
        parser, options, args = parse_options()
        if not verify_options_and_args(options, args):
            parser.print_help()
            return 1

        try:
            while True:
                killed_log, killed_count = kill_slow_queries(
                        options.user, options.password, options.timeout)
                if killed_log:
                    gmail_lib.send_email(
                            options.mail,
                            'Successfully killed slow autotest db queries',
                            'Below are killed queries:\n%s' % killed_log)
                    metrics.Counter(
                            'chromeos/autotest/afe_db/killed_slow_queries'
                    ).increment_by(killed_count)
                time.sleep(options.timeout)
        except Exception as err:
            # Count the failure and log it before propagating.
            metrics.Counter(
                    'chromeos/autotest/afe_db/failed_to_kill_query'
            ).increment()
            logging.error('Failed to kill slow db queries.\n%s', err)
            raise
def __init__(self, tag, dependencies, triggers, host_class):
    """Set up a repair action and its failure metrics counters.

    @param tag: Forwarded to the base class constructor.
    @param dependencies: Forwarded to the base class constructor.
    @param triggers: Stored as this action's trigger list.
    @param host_class: Stored on the instance as `host_class`.
    """
    super(RepairAction, self).__init__(tag, 'repair', dependencies)
    self.host_class = host_class
    self._trigger_list = triggers
    # Counters are created once here and kept on the instance.
    self._failure_modes_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_modes')
    self._failure_detail_counter = metrics.Counter(
            'chromeos/autotest/repair/failure_detail')
def _verify_store_infos(self, primary_info, shadow_info):
    """Compare the two infos, report the result, and resync on mismatch.

    @param primary_info: Info obtained from the primary store.
    @param shadow_info: Info obtained from the shadow store.
    """
    if primary_info == shadow_info:
        metrics.Counter(_REFRESH_METRIC_NAME).increment(
                fields={'validation_result': 'success'})
        return

    # Mismatch: notify, count it, then overwrite the shadow store with
    # the primary's view.
    self._mismatch_callback(primary_info, shadow_info)
    metrics.Counter(_REFRESH_METRIC_NAME).increment(
            fields={'validation_result': 'fail_mismatch'})
    self._shadow_store.commit(primary_info)
def _commit_to_shadow_store(self, info):
    """Commit `info` to the shadow store and report the outcome.

    A StoreError from the shadow store is logged and counted, but not
    propagated.

    @param info: The info object to commit.
    """
    outcome = 'success'
    try:
        self._shadow_store.commit(info)
    except host_info.StoreError:
        logger.exception('shadow commit failed. '
                         'Expect primary / shadow desync in the future.')
        outcome = 'fail'
    metrics.Counter(_COMMIT_METRIC_NAME).increment(
            fields={'file_commit_result': outcome})
def __init__(self, verifier_data, repair_data, host_class):
    """
    Construct a `RepairStrategy` from simplified DAG data.

    `verifier_data` describes how to construct the verify nodes and the
    dependencies that relate them.  `repair_data` describes how to
    construct the repair actions with their dependencies and triggers.

    @param verifier_data  Iterable of (constructor, tag, deps) entries
                          for the verification DAG.
    @param repair_data    Iterable of (constructor, tag, deps, triggers)
                          entries for the repair action list.
    @param host_class     A string identifying the class of host this
                          repair strategy targets; stored on the
                          instance and passed to every repair action
                          constructor.
    """
    # Metrics - 'actions' is reported for every repair action we
    # execute; 'strategy' is reported for every complete repair
    # operation.
    self._strategy_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_strategy_v2')
    self._actions_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_actions')
    self.host_class = host_class

    # `ordered_tags` keeps the caller's ordering, so the root verifier
    # receives its dependencies in the order they were provided.
    verifiers_by_tag = {}
    ordered_tags = []
    depended_on = set()
    for constructor, tag, deps in verifier_data:
        self._add_verifier(verifiers_by_tag, constructor, tag, deps)
        depended_on.update(deps)
        ordered_tags.append(tag)

    # Verifiers that nothing depends on hang off the root verifier.
    root_tags = [tag for tag in ordered_tags if tag not in depended_on]
    self._add_verifier(verifiers_by_tag, _RootVerifier,
                       self.ROOT_TAG, root_tags)
    self._verify_root = verifiers_by_tag[self.ROOT_TAG]

    # Tags in the repair data are resolved to verifier objects here.
    self._repair_actions = [
            constructor(tag,
                        [verifiers_by_tag[d] for d in deps],
                        [verifiers_by_tag[t] for t in triggers],
                        self.host_class)
            for constructor, tag, deps, triggers in repair_data
    ]
def main():
    """Run the host scheduler tick loop until shutdown or lifetime expiry.

    Queued emails are always sent and the db connection (if any) is
    always closed on the way out.  Exceptions are re-raised; all but
    ServerActionError are counted as uncaught first.
    """
    if _monitor_db_host_acquisition:
        logging.info('Please set inline_host_acquisition=False in the shadow '
                     'config before starting the host scheduler.')
        sys.exit(0)

    try:
        options = parse_arguments(sys.argv[1:])
        scheduler_lib.check_production_settings(options)

        # When the server database is enabled, this server must hold the
        # `host_scheduler` role; otherwise an exception is raised and the
        # host scheduler does not continue.
        if server_manager_utils.use_server_db():
            server_manager_utils.confirm_server_has_role(
                    hostname='localhost', role='host_scheduler')

        initialize(options.testing)

        with ts_mon_config.SetupTsMonGlobalState(
                'autotest_host_scheduler',
                indirect=True,
                debug_file=options.metrics_file,
        ):
            metrics.Counter('%s/start' % _METRICS_PREFIX).increment()
            process_start_time = time.time()
            host_scheduler = HostScheduler()
            minimum_tick_sec = global_config.global_config.get_config_value(
                    'SCHEDULER', 'host_scheduler_minimum_tick_sec',
                    type=float)

            while (not _shutdown and
                   not _lifetime_expired(options.lifetime_hours,
                                         process_start_time)):
                tick_started = time.time()
                host_scheduler.tick()
                tick_duration = time.time() - tick_started
                # Pad short ticks up to the configured minimum; otherwise
                # sleep only briefly.
                if minimum_tick_sec > tick_duration:
                    pause = minimum_tick_sec - tick_duration
                else:
                    pause = 0.0001
                time.sleep(pause)
            logging.info('Shutdown request recieved. Bye! Bye!')
    except server_manager_utils.ServerActionError:
        # Expected when the server is not in primary status for the
        # host-scheduler role, so it is not counted as uncaught.
        raise
    except Exception:
        metrics.Counter('%s/uncaught_exception' % _METRICS_PREFIX).increment()
        raise
    finally:
        email_manager.manager.send_queued_emails()
        if _db_manager:
            _db_manager.disconnect()
def _emit_offload_metrics(dirpath):
    """Report one offloaded job and the size of its directory.

    @param dirpath: Offloaded directory path.
    """
    size_kib = file_utils.get_directory_size_kibibytes(dirpath)
    fields = _get_metrics_fields(dirpath)

    jobs_counter = metrics.Counter(
            'chromeos/autotest/gs_offloader/jobs_offloaded')
    jobs_counter.increment(fields=fields)

    size_counter = metrics.Counter(
            'chromeos/autotest/gs_offloader/kilobytes_transferred')
    size_counter.increment_by(size_kib, fields=fields)
def main():
    """Run the shard client, reporting start and uncaught exceptions.

    Any exception is counted, logged with its stack trace and re-raised.
    Queued emails are sent in every case.
    """
    ts_mon_config.SetupTsMonGlobalState('shard_client')

    try:
        metrics.Counter('chromeos/autotest/shard_client/start').increment()
        main_without_exception_handling()
    except Exception:
        metrics.Counter(
                'chromeos/autotest/shard_client/uncaught_exception'
        ).increment()

        message = 'Uncaught exception. Terminating shard_client.'
        email_manager.manager.log_stacktrace(message)
        logging.exception(message)
        raise
    finally:
        email_manager.manager.send_queued_emails()
def start(self, pool_size=constants.DEFAULT_CONTAINER_POOL_SIZE):
    """Starts the service and blocks in its event loop until stopped.

    @param pool_size: The desired size of the container pool.  Only used
                      when no pool is already set on the service.
    """
    self._running = True

    # Build the container pool unless one is already in place.
    if self._pool is None:
        zygote_factory = container_factory.ContainerFactory(
                base_container=base_image.BaseImage().get(),
                container_class=zygote.Zygote)
        self._pool = pool.Pool(factory=zygote_factory, size=pool_size)

    # Begin listening asynchronously for incoming connections.
    self._connection_listener.start()

    # Event loop: runs for as long as no stop event has been set.
    logging.debug('Start event loop.')
    while self._stop_event is None:
        self._handle_incoming_connections()
        self._cleanup_closed_connections()
        # TODO(kenobi): Poll for and log errors from pool.
        metrics.Counter(METRICS_PREFIX + '/tick').increment()
        time.sleep(_MIN_POLLING_PERIOD)
    logging.debug('Exit event loop.')

    # Shutdown sequence: stop the client threads, stop listening, then
    # signal that shutdown is complete.
    for client_thread in self._client_threads:
        client_thread.stop()

    try:
        self._connection_listener.close()
    except Exception as e:
        logging.error('Error stopping pool service: %r', e)
        raise
    finally:
        # Runs whether or not closing the listener failed.
        self._pool.cleanup()
        # Leave the service in a consistent "stopped" state.
        self._stop_event.set()
        self._stop_event = None
        self._running = False
        metrics.Counter(METRICS_PREFIX + '/service_stopped').increment()
        logging.debug('Container pool stopped.')
def _RepoSelfupdate(self):
    """Execute the 'repo selfupdate' command.

    'repo selfupdate' would clean up the .repo/repo dir on certain
    exceptions and warnings; it must be followed by 'repo init', which
    recovers .repo/repo in that circumstance.  When the selfupdate fails
    here, the failure is counted and .repo/repo is removed.
    """
    selfupdate_failed = False
    try:
        result = cros_build_lib.run([self.repo_cmd, 'selfupdate'],
                                    cwd=self.directory,
                                    capture_output=True,
                                    log_output=True,
                                    encoding='utf-8')
    except cros_build_lib.RunCommandError as e:
        logging.warning('repo selfupdate failed with exception: %s', e)
        selfupdate_failed = True
    else:
        # The command ran, but its error output may still carry the
        # known selfupdate warning.
        error_output = result.error
        if (error_output is not None and
                SELFUPDATE_WARNING_RE.search(error_output)):
            logging.warning('Unable to selfupdate because of warning "%s"',
                            SELFUPDATE_WARNING)
            selfupdate_failed = True

    if not selfupdate_failed:
        return

    metrics.Counter(constants.MON_REPO_SELFUPDATE_FAILURE_COUNT).increment()
    logging.warning('Failed to selfupdate repo, cleaning .repo/repo in %s',
                    self.directory)
    repo_dir = os.path.join(self.directory, '.repo', 'repo')
    osutils.RmDir(repo_dir, ignore_missing=True)
def poll_rpc_servers(servers, servers_lock, shards=None, period=60,
                     stop_event=None):
    """Blocking function that polls all servers and shards.

    Runs until `stop_event` is set (forever when no event is given).
    The worker pool created here is torn down when the function exits,
    whether normally or through an exception.

    @param servers: list of servers to poll
    @param servers_lock: lock to be used when accessing servers or shards
    @param shards: list of shards to poll; None is treated as no shards
    @param period: time between polls
    @param stop_event: Event that can be set to stop polling
    """
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() * 4)
    try:
        while not stop_event or not stop_event.is_set():
            start_time = time.time()
            with servers_lock:
                # Snapshot under the lock.  `shards` defaults to None,
                # which set.union() cannot iterate, so substitute an
                # empty tuple.
                all_servers = set(servers).union(shards or ())

            logging.debug('Starting Server Polling: %s',
                          ', '.join(all_servers))
            pool.map(afe_rpc_call, all_servers)
            logging.debug('Finished Server Polling')

            metrics.Counter(METRIC_TICK).increment()

            # Sleep only for whatever is left of this polling period.
            wait_time = (start_time + period) - time.time()
            if wait_time > 0:
                time.sleep(wait_time)
    finally:
        # pool.map() is synchronous, so no work is pending here; release
        # the worker processes instead of leaking them.
        pool.terminate()
        pool.join()
def is_server_in_prod(server_name, afe):
    """Validate server's role and status.

    @param server_name: the server name to be validated.
    @param afe: the afe server to get role & status info in server_db.

    @return: A boolean value, True when the server_name is in prod, False
             otherwise, or if RPC fails.
    """
    logging.info('Validating server: %s', server_name)
    # From here on `afe` is the retrying RPC wrapper, not the server name.
    afe = frontend_wrappers.RetryingAFE(timeout_min=5, delay_sec=10,
                                        server=afe)
    is_prod_proxy_server = False
    try:
        matching_servers = afe.run('get_servers', hostname=server_name,
                                   status='primary', role='golo_proxy')
        is_prod_proxy_server = bool(matching_servers)
    except urllib2.URLError as e:
        logging.warning('RPC get_servers failed on afe %s: %s', afe, str(e))
    finally:
        metrics.Counter(metrics_template % 'server_in_prod_check').increment(
                fields={'success': is_prod_proxy_server})
        # NOTE(review): returning from `finally` discards any exception
        # still propagating at this point - confirm this is intended.
        return is_prod_proxy_server
def _deserialize_many(self, serialized_list, djmodel, message):
    """Deserialize data in JSON format to database.

    Each entry is handed to `djmodel.deserialize` inside its own
    transaction.  A failing entry is logged and counted, then the loop
    moves on to the next one.

    @param serialized_list: A list of JSON-formatted data or python dict
                            literals.
    @param djmodel: Django model type.
    @param message: A string to be used in a logging message.
    """
    logging.info('Deserializing %s %ss', len(serialized_list), message)
    for position, serialized in enumerate(serialized_list, 1):
        # Progress is logged at entries 1, 101, 201, ...
        if position % 100 == 1:
            logging.info('Progress: at entry %s', position)
        with transaction.commit_on_success():
            try:
                djmodel.deserialize(serialized)
            except Exception as e:
                logging.error('Deserializing a %s fails: %s, Error: %s',
                              message, serialized, e)
                metrics.Counter(
                        'chromeos/autotest/shard_client/deserialization_failed'
                ).increment()
    logging.info('Done deserializing %ss', message)
def __init__(self, container_path=constants.DEFAULT_CONTAINER_PATH,
             container_factory=None):
    """Initialize a ContainerBucket.

    When no factory is supplied, the base image is fetched to build one,
    and a 'base_image' metric records whether that fetch raised a
    ContainerError.

    @param container_path: Path to the directory used to store containers.
                           Default is set to AUTOSERV/container_path in
                           global config.
    @param container_factory: A factory for creating Containers.

    @raises error.ContainerError: re-raised unchanged when fetching the
                                  base image fails.
    """
    self.container_path = os.path.realpath(container_path)
    if container_factory is not None:
        self._factory = container_factory
    else:
        # Pass in the container path so that the bucket is hermetic (i.e.
        # so that if the container path is customized, the base image
        # doesn't fall back to using the default container path).
        base_image_ok = True
        try:
            container = BaseImage(self.container_path).get()
        except error.ContainerError:
            base_image_ok = False
            # Bare `raise` re-raises the active exception with its
            # original traceback.
            raise
        finally:
            # Emitted on both the success and the failure path.
            metrics.Counter(METRICS_PREFIX + '/base_image',
                            field_spec=[ts_mon.BooleanField('corrupted')]
                            ).increment(
                                fields={'corrupted': not base_image_ok})
        self._factory = ContainerFactory(
                base_container=container,
                lxc_path=self.container_path)
    self.container_cache = {}
def _emit_gs_returncode_metric(returncode):
    """Increment the gs_returncode counter based on |returncode|.

    Values outside 0..255 (after int conversion) are reported as -1.
    """
    code = int(returncode)
    if not 0 <= code <= 255:
        code = -1
    counter = metrics.Counter('chromeos/autotest/gs_offloader/gs_returncode')
    counter.increment(fields={'return_code': code})
def main():
    """Loop forever killing slow db queries; email on kills and on failure.

    @return: 1 when the command line options fail verification.  The
            polling loop itself only ends by raising.
    """
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.DEBUG)

    with ts_mon_config.SetupTsMonGlobalState(service_name='kill_slow_queries',
                                             indirect=True):
        parser, options, args = parse_options()
        if not verify_options_and_args(options, args):
            parser.print_help()
            return 1

        try:
            while True:
                killed_log, killed_count = kill_slow_queries(
                        options.user, options.password, options.timeout)
                if killed_log:
                    gmail_lib.send_email(
                            options.mail,
                            'Successfully killed slow autotest db queries',
                            'Below are killed queries:\n%s' % killed_log)
                    metrics.Counter(
                            'chromeos/autotest/afe_db/killed_slow_queries'
                    ).increment_by(killed_count)
                time.sleep(options.timeout)
        except Exception as err:
            logging.error('Failed to kill slow db queries.\n%s', err)
            # Send the failure by email as well before propagating.
            failure_body = (
                    'Error occurred during killing slow db queries:\n%s\n'
                    'Detailed logs can be found in /var/log/slow_queries.log '
                    'on db backup server.\nTo avoid db crash, please check '
                    'ASAP.') % err
            gmail_lib.send_email(
                    options.mail,
                    'Failed to kill slow autotest db queries.',
                    failure_body)
            raise
def _MaybeCleanDistfiles(repo, distfiles_ts, metrics_fields):
    """Removes the distfiles cache once it is older than the expiry.

    Args:
      repo: repository.RepoRepository instance; its `directory` locates
        the cache.
      distfiles_ts: Timestamp of the last distfiles cleanup, compared
        numerically against time.time(). May be None.
      metrics_fields: Dictionary of fields to include in metrics.

    Returns:
      The new distfiles_ts to persist in state: the input when it is None
      or not yet expired, None after a cleanup.
    """
    if distfiles_ts is None:
        return None

    age_hours = (time.time() - distfiles_ts) / 3600.0
    if age_hours < _DISTFILES_CACHE_EXPIRY_HOURS:
        return distfiles_ts

    logging.info('Remove old distfiles cache (cache expiry %d hours)',
                 _DISTFILES_CACHE_EXPIRY_HOURS)
    distfiles_dir = os.path.join(repo.directory, '.cache', 'distfiles')
    osutils.RmDir(distfiles_dir, ignore_missing=True, sudo=True)
    cleanup_fields = field(metrics_fields, reason='cache_expired')
    metrics.Counter(METRIC_DISTFILES_CLEANUP).increment(cleanup_fields)

    # The cache was just cleaned, so the timestamp is reset.
    return None
def main():
    """Runs a db backup: dump, upload to Google Storage, then clean up.

    The whole run is timed, and a 'completed' counter recording success
    or failure is emitted even when a step raises.
    """
    options = parse_options()
    logging_manager.configure_logging(
            test_importer.TestImporterLoggingConfig(),
            verbose=options.verbose)
    backup_succeeded = False

    ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
            service_name='mysql_db_backup', indirect=True)
    with ts_mon_state, metrics.SecondsTimer(
            'chromeos/autotest/afe_db/backup/durations',
            fields={'type': options.type}):
        try:
            logging.debug('Start db backup: %s', options.type)
            archiver = MySqlArchiver(
                    options.type, options.keep, options.gs_bucket)
            dump_file = archiver.dump()
            logging.debug('Uploading backup: %s', options.type)
            archiver.upload_to_google_storage(dump_file)
            archiver.cleanup()
            logging.debug('Db backup completed: %s', options.type)
            backup_succeeded = True
        finally:
            # The flag is still False here if any step above raised.
            completion_fields = {
                    'success': backup_succeeded,
                    'type': options.type,
            }
            metrics.Counter(
                    'chromeos/autotest/db/db_backup/completed'
            ).increment(fields=completion_fields)
def cleanup(self, timeout=0):
    """Cleans up the container pool.

    Stops the monitor, then destroys every Container still in the Pool.

    @param timeout: For testing.  If this is non-zero, it specifies the
                    number of seconds to wait for each worker to shut
                    down.  An error is raised if shutdown has not
                    occurred by then.  If zero (the default), don't wait
                    for worker threads to shut down, just return
                    immediately.
    """
    logging.info('Pool.cleanup called.')
    # The monitor is stopped first, then the pool is drained.
    self._monitor.stop(timeout)

    destroyed = 0
    try:
        logging.debug('Emptying container pool')
        # Non-blocking gets; Queue.Empty marks the end of the drain.
        while True:
            container = self._pool.get(block=False)
            destroyed += 1
            container.destroy()
    except Queue.Empty:
        pass
    finally:
        metrics.Counter(METRICS_PREFIX +
                        '/containers_cleaned_up').increment_by(destroyed)
        logging.debug('Done. Destroyed %d containers', destroyed)
def __init__(self, primary_store, shadow_store, mismatch_callback=None):
    """Wire up both stores and seed the shadow from the primary.

    @param primary_store: A CachingHostInfoStore to be used as the
            primary store.
    @param shadow_store: A CachingHostInfoStore to be used to shadow the
            primary store.
    @param mismatch_callback: A callback used to notify whenever we
            notice a mismatch between primary_store and shadow_store.
            The signature of the callback must match:
                callback(primary_info, shadow_info)
            where primary_info and shadow_info are HostInfo objects
            obtained from the two stores respectively.
            Mostly used by unittests.  Actual users don't know / nor care
            that they're using a ShadowingStore.  Defaults to
            _log_info_mismatch.
    """
    super(ShadowingStore, self).__init__()
    self._primary_store = primary_store
    self._shadow_store = shadow_store

    if mismatch_callback is None:
        mismatch_callback = _log_info_mismatch
    self._mismatch_callback = mismatch_callback

    # A StoreError during the initial sync is counted and logged, but
    # does not abort construction.
    try:
        self._shadow_store.commit(self._primary_store.get())
    except host_info.StoreError:
        metrics.Counter(
                _METRICS_PREFIX + 'initialization_fail_count').increment()
        logger.exception(
                'Failed to initialize shadow store. '
                'Expect primary / shadow desync in the future.')
def _FinishBuildStageInCIDBAndMonarch(self, status, elapsed_time_seconds=0):
    """Mark the stage as finished in cidb and emit its metrics.

    The cidb update happens only when both a stage id and a db handle are
    available; the metrics are emitted regardless.

    Args:
      status: The finish status of the build. Enum type
        constants.BUILDER_COMPLETED_STATUSES
      elapsed_time_seconds: (optional) Elapsed time in stage, in seconds.
    """
    _, db = self._run.GetCIDBHandle()
    stage_id = self._build_stage_id
    if stage_id is not None and db is not None:
        db.FinishBuildStage(stage_id, status)

    run_config = self._run.config
    fields = {
        'status': status,
        'name': self.name,
        'build_config': run_config.name,
        'important': run_config.important,
    }

    duration_metric = metrics.SecondsDistribution(constants.MON_STAGE_DURATION)
    duration_metric.add(elapsed_time_seconds, fields=fields)
    completion_counter = metrics.Counter(constants.MON_STAGE_COMP_COUNT)
    completion_counter.increment(fields=fields)
def QueryAndEmit(baselines, cursor):
    """Queries MySQL for important stats and emits Monarch metrics.

    @param baselines: A dict containing the initial values for the
                      cumulative metrics.
    @param cursor: The mysql command line.
    """
    # Cumulative statuses are reported relative to their baseline.
    for status in EMITTED_STATUSES_COUNTERS:
        since_baseline = GetStatus(cursor, status) - baselines[status]
        counter_name = 'chromeos/autotest/afe_db/%s' % status.lower()
        metrics.Counter(counter_name).set(since_baseline)

    # Gauge statuses are reported as read.
    for status in EMITTED_STATUS_GAUGES:
        gauge_name = 'chromeos/autotest/afe_db/%s' % status.lower()
        metrics.Gauge(gauge_name).set(GetStatus(cursor, status))

    # Buffer pool pages go to one metric, split by the 'used' field.
    free_pages = GetStatus(cursor, 'Innodb_buffer_pool_pages_free')
    total_pages = GetStatus(cursor, 'Innodb_buffer_pool_pages_total')
    used_pages = total_pages - free_pages

    metrics.Gauge('chromeos/autotest/afe_db/buffer_pool_pages').set(
            free_pages, fields={'used': False})
    metrics.Gauge('chromeos/autotest/afe_db/buffer_pool_pages').set(
            used_pages, fields={'used': True})
def _get(self, id, timeout):
    """Gets a container from the pool.

    @param id: A ContainerId to assign to the new container.
    @param timeout: A timeout (in seconds) to wait for the pool.  If a
                    container is not available from the pool within the
                    given period, None will be returned.

    @return: A container from the pool.
    """
    logging.debug('Received get request (id=%s)', id)
    container = self._pool.get(timeout)
    got_container = container is not None

    if got_container:
        # The ID is assigned as soon as the container leaves the pool;
        # this associates it with the process it will be handed off to.
        logging.debug('Assigning container (name=%s, id=%s)',
                      container.name, id)
        container.id = id
    else:
        logging.debug('No container (id=%s)', id)

    request_counter = metrics.Counter(
            METRICS_PREFIX + '/container_requests',
            field_spec=[ts_mon.BooleanField('success')])
    request_counter.increment(fields={'success': got_container})
    return container
def _commit_to_primary_store(self, info):
    """Commit `info` to the primary store.

    A StoreError is counted as a 'skipped' commit and then re-raised.

    @param info: The info object to commit.
    """
    try:
        self._primary_store.commit(info)
    except host_info.StoreError:
        skipped_fields = {'file_commit_result': 'skipped'}
        metrics.Counter(_COMMIT_METRIC_NAME).increment(fields=skipped_fields)
        raise
def CancelBuilds(buildbucket_ids, buildbucket_client, debug=True,
                 config=None):
    """Cancel Buildbucket builds in a set.

    Does nothing when buildbucket_ids is empty. Per-build errors in the
    cancel response are logged as warnings.

    Args:
      buildbucket_ids: A list of build_ids (strings).
      buildbucket_client: Instance of buildbucket_lib.buildbucket_client.
      debug: Boolean indicating whether it's a dry run. Default to True.
      config: Instance of config_lib.BuildConfig. Config dict for the
        master build initiating the cancel. Optional.
    """
    if not buildbucket_ids:
        return

    logging.info('Canceling buildbucket_ids: %s', buildbucket_ids)

    # The batch-cancel metric is emitted only for real (non-dry-run)
    # cancels that have a config.
    if (not debug) and config:
        fields = {'build_type': config.build_type,
                  'build_name': config.name}
        metrics.Counter(constants.MON_BB_CANCEL_BATCH_BUILDS_COUNT).increment(
            fields=fields)

    cancel_results = buildbucket_client.CancelBatchBuildsRequest(
        buildbucket_ids, dryrun=debug)
    result_map = buildbucket_lib.GetResultMap(cancel_results)
    for buildbucket_id, result in result_map.items():
        has_error = buildbucket_lib.GetNestedAttr(result, ['error'])
        if not has_error:
            continue
        # TODO(nxia): Get build url and log url in the warnings.
        logging.warning('Error cancelling build %s with reason: %s. '
                        'Please check the status of the build.',
                        buildbucket_id,
                        buildbucket_lib.GetErrorReason(result))
def _refresh_from_shadow_store(self):
    """Return a force-refreshed info from the shadow store.

    A StoreError is counted as a failed shadow refresh and re-raised.

    @return: Whatever the shadow store's get(force_refresh=True) returns.
    """
    try:
        refreshed = self._shadow_store.get(force_refresh=True)
    except host_info.StoreError:
        failure_fields = {'validation_result': 'fail_shadow_store_refresh'}
        metrics.Counter(_REFRESH_METRIC_NAME).increment(fields=failure_fields)
        raise
    return refreshed
def send_email(bug, bug_template):
    """Send email to the owner and cc's to notify the TestBug.

    Recipients are the union of the bug's and the template's owner and
    cc entries.  Nothing is sent when that union is empty.

    @param bug: TestBug instance.
    @param bug_template: A template dictionary specifying the default bug
                         filing options for failures in this suite.
    """
    to_set = set(bug.cc or ())
    if bug.owner:
        to_set.add(bug.owner)

    template_cc = bug_template.get('cc')
    if template_cc:
        to_set.update(template_cc)
    template_owner = bug_template.get('owner')
    if template_owner:
        to_set.add(template_owner)

    recipients = ', '.join(to_set)
    if not recipients:
        logging.warning('No owner/cc found. Will skip sending a mail.')
        return

    success = False
    try:
        gmail_lib.send_email(
                recipients, bug.title(), bug.summary(), retry=False,
                creds_path=site_utils.get_creds_abspath(EMAIL_CREDS_FILE))
        success = True
    finally:
        # Reported whether or not sending raised.
        outcome_counter = metrics.Counter(
                'chromeos/autotest/errors/send_bug_email')
        outcome_counter.increment(fields={'success': success})
def _MaybeCleanDistfiles(cache_dir, distfiles_ts):
    """Removes the distfiles cache once it is older than the expiry.

    Args:
      cache_dir: Directory of the cache, as a string.
      distfiles_ts: Timestamp of the last distfiles cleanup, compared
        numerically against time.time(). May be None.

    Returns:
      The new distfiles_ts to persist in state: the current time for a
      None input or after a cleanup, otherwise the unchanged input.
    """
    # A None timestamp means a fresh environment, which counts as clean.
    if distfiles_ts is None:
        return time.time()

    age_hours = (time.time() - distfiles_ts) / 3600.0
    if age_hours < _DISTFILES_CACHE_EXPIRY_HOURS:
        return distfiles_ts

    logging.info('Remove old distfiles cache (cache expiry %d hours)',
                 _DISTFILES_CACHE_EXPIRY_HOURS)
    distfiles_dir = os.path.join(cache_dir, 'distfiles')
    osutils.RmDir(distfiles_dir, ignore_missing=True, sudo=True)
    cleanup_fields = field({}, reason='cache_expired')
    metrics.Counter(METRIC_DISTFILES_CLEANUP).increment(fields=cleanup_fields)

    # The cache was just cleaned, so the timestamp restarts from now.
    return time.time()