def main(args):
    parser = commandline.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--interval', default=60, type=int,
        help='time (in seconds) between sampling system metrics')
    opts = parser.parse_args(args)
    opts.Freeze()

    # This returns a 0 value the first time it's called. Call it now and
    # discard the return value.
    psutil.cpu_times_percent()

    # Wait a random amount of time before starting the loop in case sysmon
    # is started at exactly the same time on all machines.
    time.sleep(random.uniform(0, opts.interval))

    # This call returns a context manager that doesn't do anything, so we
    # ignore the return value.
    ts_mon_config.SetupTsMonGlobalState('sysmon')
    # The default prefix is '/chrome/infra/'.
    interface.state.metric_name_prefix = (interface.state.metric_name_prefix
                                          + 'chromeos/sysmon/')

    mainloop = _MainLoop(opts.interval)
    mainloop.loop_forever()
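# `_MainLoop` is referenced above but not included in this collection. A
# minimal sketch of what such a helper might look like, assuming it only
# needs to run a sampling step on a fixed cadence (the class name and
# constructor argument come from the caller above; the body, including the
# hypothetical `callback` parameter, is an assumption):

import time


class _MainLoop(object):
    """Sketch: repeatedly run a sampling callback every `interval` seconds."""

    def __init__(self, interval, callback=None):
        self._interval = interval
        # Hypothetical hook; the real loop presumably samples psutil metrics
        # and reports them through ts_mon.
        self._callback = callback or (lambda: None)

    def loop_forever(self):
        while True:
            self._callback()
            time.sleep(self._interval)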
def _SetupConnections(options, build_config):
    """Set up CIDB connections using the appropriate Setup call.

    Args:
        options: Command line options structure.
        build_config: Config object for this build.
    """
    # Outline:
    # 1) Based on options and build_config, decide whether we are a production
    #    run, debug run, or standalone run.
    # 2) Set up cidb instance accordingly.
    # 3) Update topology info from cidb, so that any other service set up can
    #    use topology.
    # 4) Set up any other services.
    run_type = _GetRunEnvironment(options, build_config)

    if run_type == _ENVIRONMENT_PROD:
        cidb.CIDBConnectionFactory.SetupProdCidb()
        context = ts_mon_config.SetupTsMonGlobalState(
            'cbuildbot', indirect=True, task_num=options.ts_mon_task_num)
    elif run_type == _ENVIRONMENT_DEBUG:
        cidb.CIDBConnectionFactory.SetupDebugCidb()
        context = ts_mon_config.TrivialContextManager()
    else:
        cidb.CIDBConnectionFactory.SetupNoCidb()
        context = ts_mon_config.TrivialContextManager()

    db = cidb.CIDBConnectionFactory.GetCIDBConnectionForBuilder()
    topology.FetchTopologyFromCIDB(db)

    return context
def main(argv): """Standard main() for command line processing. @param argv Command line arguments (normally sys.argv). """ parser = GetParser() options = parser.parse_args(argv[1:]) with ts_mon_config.SetupTsMonGlobalState('dump_suite_report'): afe = frontend_wrappers.RetryingAFE(timeout_min=5, delay_sec=10, server=options.afe) tko = frontend_wrappers.RetryingTKO(timeout_min=5, delay_sec=10) # Look up and generate entries for all jobs. entries = [] for suite_job_id in options.job_ids: logging.debug('Suite job %s:' % suite_job_id) suite_entries = suite_report.generate_suite_report(suite_job_id, afe=afe, tko=tko) logging.debug('... generated %d entries' % len(suite_entries)) entries.extend(suite_entries) # Write all entries as JSON. if options.output: with open(options.output, 'w') as f: suite_report.dump_entries_as_json(entries, f) else: suite_report.dump_entries_as_json(entries, sys.stdout)
def main(): """Main entry.""" logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.DEBUG) with ts_mon_config.SetupTsMonGlobalState(service_name='kill_slow_queries', indirect=True): count = 0 parser, options, args = parse_options() if not verify_options_and_args(options, args): parser.print_help() return 1 try: while True: result_log_strs, count = kill_slow_queries( options.user, options.password, options.timeout) if result_log_strs: gmail_lib.send_email( options.mail, 'Successfully killed slow autotest db queries', 'Below are killed queries:\n%s' % result_log_strs) m = 'chromeos/autotest/afe_db/killed_slow_queries' metrics.Counter(m).increment_by(count) time.sleep(options.timeout) except Exception as e: logging.error('Failed to kill slow db queries.\n%s', e) gmail_lib.send_email( options.mail, 'Failed to kill slow autotest db queries.', ('Error occurred during killing slow db queries:\n%s\n' 'Detailed logs can be found in /var/log/slow_queries.log on db' ' backup server.\nTo avoid db crash, please check ASAP.') % e) raise
def main():
    parser = commandline.ArgumentParser(description=__doc__,
                                        default_log_level='DEBUG')
    parser.add_argument(
        '--interval', default=60, type=int,
        help='time (in seconds) between sampling system metrics')
    parser.add_argument(
        '--collect-prod-hosts', action='store_true',
        help='[DEPRECATED. Use --collect-host-manifest instead.] '
             'Enable collection of prod host metrics, like roles')
    parser.add_argument(
        '--collect-host-manifest', default=None,
        choices=['prod', 'staging'],
        help='Enable collection of server metrics (e.g. roles) for servers in '
             'the given lab environment.')
    opts = parser.parse_args()
    opts.Freeze()

    # This call returns a context manager that doesn't do anything, so we
    # ignore the return value.
    ts_mon_config.SetupTsMonGlobalState('sysmon', auto_flush=False)
    # The default prefix is '/chrome/infra/'.
    interface.state.metric_name_prefix = (interface.state.metric_name_prefix
                                          + 'chromeos/sysmon/')

    # Transitional, while we migrate users off of |collect_prod_hosts|.
    if opts.collect_host_manifest is not None:
        opts.collect_prod_hosts = True
    collector = _MetricCollector(collect_prod_hosts=opts.collect_prod_hosts)
    loop.SleepLoop(callback=collector, interval=opts.interval).loop_forever()
def main(): """Main entry.""" # Clear all loggers to make sure the following basicConfig take effect. logging.shutdown() reload(logging) logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.DEBUG) with ts_mon_config.SetupTsMonGlobalState(service_name='kill_slow_queries', indirect=True): count = 0 parser, options, args = parse_options() if not verify_options_and_args(options, args): parser.print_help() return 1 try: while True: result_log_strs, count = kill_slow_queries( options.user, options.password, options.timeout) if result_log_strs: gmail_lib.send_email( options.mail, 'Successfully killed slow autotest db queries', 'Below are killed queries:\n%s' % result_log_strs) m = 'chromeos/autotest/afe_db/killed_slow_queries' metrics.Counter(m).increment_by(count) time.sleep(options.timeout) except Exception as e: m = 'chromeos/autotest/afe_db/failed_to_kill_query' metrics.Counter(m).increment() logging.error('Failed to kill slow db queries.\n%s', e) raise
def main(args): """Main func. @args: A list of system arguments. """ args = _parse_args(args) swarming_bots.setup_logging(args.verbose, args.log_file) if not args.swarming_proxy: logging.error('No swarming proxy instance specified. ' 'Specify swarming_proxy in [CROS] in shadow_config, ' 'or use --swarming_proxy') return 1 if not args.swarming_proxy.startswith('https://'): swarming_proxy = 'https://' + args.swarming_proxy else: swarming_proxy = args.swarming_proxy global _shut_down logging.info("Setting signal handler.") signal.signal(signal.SIGINT, handle_signal) signal.signal(signal.SIGTERM, handle_signal) bot_manager = swarming_bots.BotManager(swarming_bots.parse_range( args.id_range), args.working_dir, args.swarming_proxy, specify_bot_id=args.specify_bot_id) is_prod = False retryable = True with ts_mon_config.SetupTsMonGlobalState('swarming_bots', indirect=True): while not _shut_down: tick(args.afe, bot_manager) time.sleep(CHECK_INTERVAL)
def main(argv):
    parser = commandline.ArgumentParser(description=__doc__)
    parser.add_argument('swarming_server', action='store',
                        help='Swarming server to send no-op requests to.')
    options = parser.parse_args(argv)

    m_timer = 'chromeos/autotest/swarming_proxy/no_op_durations'
    m_count = 'chromeos/autotest/swarming_proxy/no_op_attempts'
    command = commands.RUN_SUITE_PATH
    fields = {'success': False, 'swarming_server': options.swarming_server}

    with ts_mon_config.SetupTsMonGlobalState('swarm_mon', indirect=True):
        while True:
            with metrics.SecondsTimer(m_timer, fields=fields) as f:
                try:
                    with metrics.SuccessCounter(m_count):
                        swarming_lib.RunSwarmingCommand(
                            [command, '--do_nothing'],
                            options.swarming_server,
                            dimensions=[('pool', 'default')],
                            timeout_secs=120)
                    f['success'] = True
                except (cros_build_lib.RunCommandError,
                        timeout_util.TimeoutError):
                    pass
            time.sleep(60)
def main(argv):
    options = _ParseArguments(argv)
    manager = tasks.ProcessPoolTaskManager(options.max_tasks,
                                           _GetTaskHandler(options),
                                           options.interval)
    queue = service.WorkQueueServer(options.spool)
    try:
        # The ordering of logging setup here matters (alas):  the `ts_mon`
        # setup starts a subprocess that makes logging calls, and
        # TimedRotatingFileHandler isn't multiprocess safe.  So the `ts_mon`
        # child and this process need to write to different logs.  The gory
        # details are in crbug.com/774597.
        #
        # This is a hack, really.  If you're studying this comment because
        # you have to clean up my mess, I'm truly and profoundly sorry.
        # But still I wouldn't change a thing...
        # https://www.youtube.com/watch?v=fFtGfyruroU
        with ts_mon_config.SetupTsMonGlobalState('provision_workqueue',
                                                 indirect=True):
            _SetupLogging(options)
            logging.info('Work queue service starts')
            logging.info('  Spool dir is %s', options.spool)
            logging.info('  Maximum of %d concurrent tasks',
                         options.max_tasks)
            logging.info('  Time per tick is %.3f seconds', options.interval)
            queue.ProcessRequests(manager)
    except KeyboardInterrupt:
        pass
    finally:
        manager.Close()
def main(argv): """Entry point.""" logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - " + "%(levelname)s - %(message)s") parser, options, args = parse_options() if not verify_options_and_args(options, args): parser.print_help() sys.exit(1) with ts_mon_config.SetupTsMonGlobalState(service_name='sync_server_db', indirect=True): try: metrics.Counter(_METRICS_PREFIX + '/start').increment() logging.info("Setting signal handler") signal.signal(signal.SIGINT, handle_signal) signal.signal(signal.SIGTERM, handle_signal) while not _shutdown: _main(options) metrics.Counter(_METRICS_PREFIX + '/tick').increment(fields={'success': True}) time.sleep(options.sleep) except: metrics.Counter(_METRICS_PREFIX + '/tick').increment(fields={'success': False}) raise
def main(): """Runs the program.""" options = parse_options() logging_manager.configure_logging( test_importer.TestImporterLoggingConfig(), verbose=options.verbose) backup_succeeded = False with ts_mon_config.SetupTsMonGlobalState(service_name='mysql_db_backup', indirect=True): with metrics.SecondsTimer('chromeos/autotest/afe_db/backup/durations', fields={'type': options.type}): try: logging.debug('Start db backup: %s', options.type) archiver = MySqlArchiver(options.type, options.keep, options.gs_bucket) dump_file = archiver.dump() logging.debug('Uploading backup: %s', options.type) archiver.upload_to_google_storage(dump_file) archiver.cleanup() logging.debug('Db backup completed: %s', options.type) backup_succeeded = True finally: metrics.Counter('chromeos/autotest/db/db_backup/completed' ).increment(fields={ 'success': backup_succeeded, 'type': options.type })
def SetupTsMonGlobalState(*args, **kwargs):
    """Import-safe wrapper around chromite.lib.ts_mon_config's setup function.

    @param *args: Args to pass through.
    @param **kwargs: Kwargs to pass through.
    """
    try:
        # TODO(crbug.com/739466) This module import is delayed because it adds
        # 1-2 seconds to the module import time and most users of site_utils
        # don't need it. The correct fix is to break apart site_utils into
        # more meaningful chunks.
        from chromite.lib import ts_mon_config
    except ImportError:
        logging.warning('Unable to import chromite. Monarch is disabled.')
        return TrivialContextManager()

    try:
        context = ts_mon_config.SetupTsMonGlobalState(*args, **kwargs)
        if hasattr(context, '__exit__'):
            return context
    except Exception as e:
        logging.warning('Caught an exception trying to setup ts_mon, '
                        'monitoring is disabled: %s', e, exc_info=True)
    return TrivialContextManager()
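# `TrivialContextManager` is the no-op fallback returned above when chromite
# cannot be imported or ts_mon setup fails. A minimal sketch of such a
# helper, assuming it only needs to satisfy the `with` protocol and accept
# (and ignore) arbitrary arguments:

import contextlib


@contextlib.contextmanager
def TrivialContextManager(*args, **kwargs):
    """Sketch: context manager that does nothing."""
    yield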
def testTaskNumWithIndirect(self):
    """The task_num argument should propagate to the flushing subprocess."""
    create_flushing_process = self.PatchObject(
        ts_mon_config, '_CreateTsMonFlushingProcess')
    ts_mon_config.SetupTsMonGlobalState('unittest', indirect=True,
                                        task_num=42)
    options = ts_mon_config._GenerateTsMonArgparseOptions(
        'unittest', False, False, None, 42)
    create_flushing_process.assert_called_once_with(options)
def main(): """Sets up ts_mon and repeatedly queries MySQL stats""" logging.basicConfig(stream=sys.stdout, level=logging.INFO) conn = MySQLConnection('localhost', DEFAULT_USER, DEFAULT_PASSWD) conn.Connect() with ts_mon_config.SetupTsMonGlobalState('mysql_stats', indirect=True): QueryLoop(conn)
def testShortLived(self):
    """Tests that configuring ts-mon to use short-lived processes works."""
    self.patchTime()
    with tempfile.NamedTemporaryFile(dir='/var/tmp') as out:
        with ts_mon_config.SetupTsMonGlobalState('metrics_unittest',
                                                 short_lived=True,
                                                 debug_file=out.name):
            # pylint: disable=protected-access
            self.assertTrue(ts_mon_config._WasSetup)
def main(argv):
    parser = GetParser()
    options = parser.parse_args(argv)
    creds_file = options.service_acct_json
    project_id = options.project_id
    client = _Client(creds_path=creds_file)

    with ts_mon_config.SetupTsMonGlobalState('export_to_cloud_trace'):
        _WatchAndSendSpans(project_id, client)
def main(): """Sets up ts_mon and repeatedly queries MySQL stats""" logging.basicConfig(stream=sys.stdout, level=logging.INFO) conn = RetryingConnection('localhost', DEFAULT_USER, DEFAULT_PASSWD) conn.Connect() # TODO(crbug.com/803566) Use indirect=False to mitigate orphan mysql_stats # processes overwhelming shards. with ts_mon_config.SetupTsMonGlobalState('mysql_stats', indirect=False): QueryLoop(conn)
def main(): """Main method of gs_offloader.""" options = parse_options() if options.process_all: offloader_type = 'all' elif options.process_hosts_only: offloader_type = 'hosts' else: offloader_type = 'jobs' log_timestamp = time.strftime(LOG_TIMESTAMP_FORMAT) if options.log_size > 0: log_timestamp = '' log_basename = LOG_FILENAME_FORMAT % (offloader_type, log_timestamp) log_filename = os.path.join(LOG_LOCATION, log_basename) log_formatter = logging.Formatter(LOGGING_FORMAT) # Replace the default logging handler with a RotatingFileHandler. If # options.log_size is 0, the file size will not be limited. Keeps # one backup just in case. handler = logging.handlers.RotatingFileHandler( log_filename, maxBytes=1024 * options.log_size, backupCount=1) handler.setFormatter(log_formatter) logger = logging.getLogger() logger.setLevel(logging.DEBUG) logger.addHandler(handler) # Nice our process (carried to subprocesses) so we don't overload # the system. if not options.normal_priority: logging.debug('Set process to nice value: %d', NICENESS) os.nice(NICENESS) if psutil: proc = psutil.Process() logging.debug('Set process to ionice IDLE') proc.ionice(psutil.IOPRIO_CLASS_IDLE) # os.listdir returns relative paths, so change to where we need to # be to avoid an os.path.join on each loop. logging.debug('Offloading Autotest results in %s', RESULTS_DIR) os.chdir(RESULTS_DIR) signal.signal(signal.SIGALRM, timeout_handler) with ts_mon_config.SetupTsMonGlobalState('gs_offloader', indirect=True, short_lived=False): offloader = Offloader(options) if not options.delete_only: wait_for_gs_write_access(offloader.gs_uri) while True: offloader.offload_once() if options.offload_once: break time.sleep(SLEEP_TIME_SECS)
def main(): """Entry point.""" arguments = parse_arguments() with ts_mon_config.SetupTsMonGlobalState(service_name='test_push', indirect=True): test_push_success = False try: _main(arguments) test_push_success = True finally: metrics.Counter('chromeos/autotest/test_push/completed').increment( fields={'success': test_push_success})
def main():
    ts_mon_config.SetupTsMonGlobalState('shard_client')
    try:
        metrics.Counter('chromeos/autotest/shard_client/start').increment()
        main_without_exception_handling()
    except Exception as e:
        message = 'Uncaught exception. Terminating shard_client.'
        email_manager.manager.log_stacktrace(message)
        logging.exception(message)
        raise
    finally:
        email_manager.manager.send_queued_emails()
def main(argv):
    options = PreParseArguments(argv)

    metric_fields = {
        'branch_name': options.branch or 'master',
        'build_config': options.build_config_name,
        'tryjob': options.remote_trybot,
    }

    # Enable Monarch metrics gathering.
    with ts_mon_config.SetupTsMonGlobalState(
            'cbuildbot_launch',
            common_metric_fields=metric_fields,
            indirect=True):
        return _main(options, argv)
def Main(): """Sets up logging and runs matchers against stdin.""" args = ParseArgs() log_daemon_common.SetupLogging(args) # Set up metrics sending and go. ts_mon_args = {} if args.debug_metrics_file: ts_mon_args['debug_file'] = args.debug_metrics_file with ts_mon_config.SetupTsMonGlobalState('apache_access_log_metrics', **ts_mon_args): log_daemon_common.RunMatchers(sys.stdin, MATCHERS)
def main():
    if _monitor_db_host_acquisition:
        logging.info('Please set inline_host_acquisition=False in the shadow '
                     'config before starting the host scheduler.')
        sys.exit(0)
    try:
        options = parse_arguments(sys.argv[1:])
        scheduler_lib.check_production_settings(options)

        # If server database is enabled, check if the server has the role
        # `host_scheduler`. If the server does not have the host_scheduler
        # role, an exception will be raised and the host scheduler will not
        # continue to run.
        if server_manager_utils.use_server_db():
            server_manager_utils.confirm_server_has_role(
                hostname='localhost', role='host_scheduler')

        initialize(options.testing)

        with ts_mon_config.SetupTsMonGlobalState(
                'autotest_host_scheduler',
                indirect=True,
                debug_file=options.metrics_file,
        ):
            metrics.Counter('%s/start' % _METRICS_PREFIX).increment()
            process_start_time = time.time()
            host_scheduler = HostScheduler()
            minimum_tick_sec = global_config.global_config.get_config_value(
                'SCHEDULER', 'host_scheduler_minimum_tick_sec', type=float)
            while not _shutdown:
                if _lifetime_expired(options.lifetime_hours,
                                     process_start_time):
                    break
                start = time.time()
                host_scheduler.tick()
                curr_tick_sec = time.time() - start
                if minimum_tick_sec > curr_tick_sec:
                    time.sleep(minimum_tick_sec - curr_tick_sec)
                else:
                    time.sleep(0.0001)
            logging.info('Shutdown request received. Bye! Bye!')
    except server_manager_utils.ServerActionError:
        # This error is expected when the server is not in primary status
        # for the host-scheduler role, so do not send email for it.
        raise
    except Exception:
        metrics.Counter('%s/uncaught_exception' % _METRICS_PREFIX).increment()
        raise
    finally:
        email_manager.manager.send_queued_emails()
        if _db_manager:
            _db_manager.disconnect()
def testResetAfter(self):
    """Tests that the reset_after flag works to send metrics only once."""
    # By mocking out its "time" module, the forked flushing process will
    # think it should call Flush() whenever we send a metric.
    self.patchTime()
    with tempfile.NamedTemporaryFile(dir='/var/tmp') as out:
        # * The indirect=True flag is required for reset_after to work.
        # * Using debug_file, we send metrics to the temporary file instead
        #   of sending metrics to production via PubSub.
        with ts_mon_config.SetupTsMonGlobalState('metrics_unittest',
                                                 indirect=True,
                                                 debug_file=out.name):
            def MetricName(i, flushed):
                return 'test/metric/name/%d/%s' % (i, flushed)

            # Each of these .set() calls will result in a Flush() call.
            for i in range(7):
                # Any extra streams with different fields and
                # reset_after=False will be cleared only if the below metric
                # is cleared.
                metrics.Boolean(MetricName(i, True), reset_after=False).set(
                    True, fields={'original': False})
                metrics.Boolean(MetricName(i, True), reset_after=True).set(
                    True, fields={'original': True})
            for i in range(7):
                metrics.Boolean(MetricName(i, False),
                                reset_after=False).set(True)

        # By leaving the context, we .join() the flushing process.
        with open(out.name, 'r') as fh:
            content = fh.read()

        # The flushed metrics should be sent only three times, because:
        # * original=False is sent twice.
        # * original=True is sent once.
        for i in range(7):
            self.assertEqual(content.count(MetricName(i, True)), 3)

        # The nonflushed metrics are sent once per flush.  There are 7 of
        # these metrics:
        # * The 0th is sent 7 times,
        # * The 1st is sent 6 times,
        #   ...
        # * The 6th is sent 1 time.
        # So the "i"th metric is sent (7-i) times.
        for i in range(7):
            self.assertEqual(content.count(MetricName(i, False)), 7 - i)
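# A usage sketch of the reset_after behavior exercised by the test above: a
# one-shot metric in a long-lived or indirect process is cleared after its
# first flush instead of being re-sent on every flush cycle. The service and
# metric names here are hypothetical; as the test notes, indirect=True is
# required for reset_after to take effect.

from chromite.lib import metrics
from chromite.lib import ts_mon_config

with ts_mon_config.SetupTsMonGlobalState('example_tool', indirect=True):
    # Sent once on the next flush, then reset rather than repeated.
    metrics.Boolean('chromeos/example/run_completed',
                    reset_after=True).set(True)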
def main():
    if _monitor_db_host_acquisition:
        logging.info('Please set inline_host_acquisition=False in the shadow '
                     'config before starting the host scheduler.')
        # The upstart job for the host scheduler understands exit(0) to mean
        # "don't respawn". This is desirable when the job scheduler is
        # acquiring hosts inline.
        sys.exit(0)
    try:
        options = parse_arguments(sys.argv[1:])
        scheduler_lib.check_production_settings(options)

        # If server database is enabled, check if the server has the role
        # `host_scheduler`. If the server does not have the host_scheduler
        # role, an exception will be raised and the host scheduler will not
        # continue to run.
        if server_manager_utils.use_server_db():
            server_manager_utils.confirm_server_has_role(
                hostname='localhost', role='host_scheduler')

        initialize(options.testing)

        # Start the thread to report metadata.
        metadata_reporter.start()

        ts_mon_config.SetupTsMonGlobalState('autotest_host_scheduler')

        host_scheduler = HostScheduler()
        minimum_tick_sec = global_config.global_config.get_config_value(
            'SCHEDULER', 'minimum_tick_sec', type=float)
        while not _shutdown:
            start = time.time()
            host_scheduler.tick()
            curr_tick_sec = time.time() - start
            if minimum_tick_sec > curr_tick_sec:
                time.sleep(minimum_tick_sec - curr_tick_sec)
            else:
                time.sleep(0.0001)
    except server_manager_utils.ServerActionError:
        # This error is expected when the server is not in primary status
        # for the host-scheduler role, so do not send email for it.
        raise
    except Exception:
        email_manager.manager.log_stacktrace(
            'Uncaught exception; terminating host_scheduler.')
        raise
    finally:
        email_manager.manager.send_queued_emails()
        if _db_manager:
            _db_manager.disconnect()
        metadata_reporter.abort()
def main(): """Main script.""" options = parse_options() log_config = logging_config.LoggingConfig() if options.logfile: log_config.add_file_handler(file_path=os.path.abspath(options.logfile), level=logging.DEBUG) with ts_mon_config.SetupTsMonGlobalState(service_name='cleanup_tko_db', indirect=True): server = CONFIG.get_config_value('AUTOTEST_WEB', 'global_db_host', default=CONFIG.get_config_value( 'AUTOTEST_WEB', 'host')) user = CONFIG.get_config_value('AUTOTEST_WEB', 'global_db_user', default=CONFIG.get_config_value( 'AUTOTEST_WEB', 'user')) password = CONFIG.get_config_value('AUTOTEST_WEB', 'global_db_password', default=CONFIG.get_config_value( 'AUTOTEST_WEB', 'password')) database = CONFIG.get_config_value('AUTOTEST_WEB', 'global_db_database', default=CONFIG.get_config_value( 'AUTOTEST_WEB', 'database')) logging.info( 'Starting cleaning up old records in TKO database %s on ' 'server %s.', database, server) start_time = time.time() try: if options.recreate_test_attributes: with metrics.SecondsTimer(RECREATE_TEST_ATTRIBUTES_METRIC, fields={'success': False}) as fields: _recreate_test_attributes(server, user, password, database) fields['success'] = True else: with metrics.SecondsTimer(CLEANUP_METRIC, fields={'success': False}) as fields: utils.run_sql_cmd(server, user, password, CLEANUP_TKO_CMD, database) fields['success'] = True except: logging.exception('Cleanup failed with exception.') finally: duration = time.time() - start_time logging.info('Cleanup attempt finished in %s seconds.', duration)
def main(): """Main method of gs_offloader.""" options = parse_options() if options.process_all: offloader_type = 'all' elif options.process_hosts_only: offloader_type = 'hosts' else: offloader_type = 'jobs' _setup_logging(options, offloader_type) if options.enable_timestamp_cache: # Extend the cache expiry time by another 1% so the timstamps # are available as the results are purged. job_timestamp_cache.setup(options.age_to_delete * 1.01) # Nice our process (carried to subprocesses) so we don't overload # the system. if not options.normal_priority: logging.debug('Set process to nice value: %d', NICENESS) os.nice(NICENESS) if psutil: proc = psutil.Process() logging.debug('Set process to ionice IDLE') proc.ionice(psutil.IOPRIO_CLASS_IDLE) # os.listdir returns relative paths, so change to where we need to # be to avoid an os.path.join on each loop. logging.debug('Offloading Autotest results in %s', RESULTS_DIR) os.chdir(RESULTS_DIR) service_name = 'gs_offloader(%s)' % offloader_type with ts_mon_config.SetupTsMonGlobalState(service_name, indirect=True, short_lived=False, debug_file=options.metrics_file): with metrics.SuccessCounter('chromeos/autotest/gs_offloader/exit'): offloader = Offloader(options) if not options.delete_only: wait_for_gs_write_access(offloader.gs_uri) while True: offloader.offload_once() if options.offload_once: break time.sleep(SLEEP_TIME_SECS)
def main(): """main script. """ parser = argparse.ArgumentParser() parser.add_argument('--span', type=int, dest='span', default=1, help=('Number of hours that stats should be collected. ' 'If it is set to 24, the end time of stats being ' 'collected will set to the mid of the night. ' 'Default is set to 1 hour.')) parser.add_argument('-e', '--email', dest='email', default=None, help='Email any errors to the given email address.') options = parser.parse_args() boards = host_label_utils.get_all_boards() pools = ['bvt', 'suites', 'cq'] if options.span == 24: today = datetime.combine(date.today(), datetime.min.time()) end_time = time_utils.to_epoch_time(today) else: now = datetime.now() end_time = datetime(year=now.year, month=now.month, day=now.day, hour=now.hour) end_time = time_utils.to_epoch_time(end_time) start_time = end_time - timedelta(hours=options.span).total_seconds() print ('Collecting host stats from %s to %s...' % (time_utils.epoch_time_to_date_string(start_time), time_utils.epoch_time_to_date_string(end_time))) ts_mon_config.SetupTsMonGlobalState('collect_host_stats') errors = [] if not boards: errors.append('Error! No board found in metadb.') for board in boards: for pool in pools: error = report_stats(board, pool, start_time, end_time, options.span) if error: errors.append(error) if options.email and errors: gmail_lib.send_email(options.email, 'Error occured when collecting host stats.', '\n'.join(errors))
def main(argv): """Entry point for dut_mon.""" logging.getLogger().setLevel(logging.INFO) with ts_mon_config.SetupTsMonGlobalState('dut_mon', indirect=True): afe = frontend.AFE() counters = collections.defaultdict(lambda: 0) field_spec = [ts_mon.StringField('board'), ts_mon.StringField('model'), ts_mon.StringField('pool'), ts_mon.BooleanField('is_locked'), ts_mon.StringField('status'), ] dut_count = metrics.Gauge('chromeos/autotest/dut_mon/dut_count', description='The number of duts in a given ' 'state and bucket.', field_spec=field_spec) tick_count = metrics.Counter('chromeos/autotest/dut_mon/tick', description='Tick counter of dut_mon.') while True: # Note: We reset all counters to zero in each loop rather than # creating a new defaultdict, because we want to ensure that any # gauges that were previously set to a nonzero value by this process # get set back to zero if necessary. for k in counters: counters[k] = 0 logging.info('Fetching all hosts.') hosts = afe.get_hosts() logging.info('Fetched %s hosts.', len(hosts)) for host in hosts: fields = _get_bucket_for_host(host) counters[fields] += 1 for field, value in counters.iteritems(): logging.info('%s %s', field, value) dut_count.set(value, fields=field.__dict__) tick_count.increment() logging.info('Sleeping for 2 minutes.') time.sleep(120)
def main():
    parser = commandline.ArgumentParser(description=__doc__,
                                        default_log_level='DEBUG')
    parser.add_argument(
        '--interval', default=60, type=int,
        help='time (in seconds) between sampling system metrics')
    opts = parser.parse_args()
    opts.Freeze()

    # This call returns a context manager that doesn't do anything, so we
    # ignore the return value.
    ts_mon_config.SetupTsMonGlobalState('sysmon', auto_flush=False)
    # The default prefix is '/chrome/infra/'.
    interface.state.metric_name_prefix = (interface.state.metric_name_prefix
                                          + 'chromeos/sysmon/')

    collector = _MetricCollector()
    loop.SleepLoop(callback=collector, interval=opts.interval).loop_forever()