def _stage_build_artifacts(build):
    """
    Ensure components of |build| necessary for installing images are staged.

    @param build: image we want to stage.

    @raises StageControlFileFailure: if the dev server throws 500 while
                                     staging suite control files.

    @return: dev_server.ImageServer instance to use with this build.
    @return: timings dictionary containing staging start/end times.
    """
    timings = {}
    # Ensure components of |build| necessary for installing images are staged
    # on the dev server. However, set synchronous to False to allow other
    # components to be downloaded in the background.
    ds = dev_server.ImageServer.resolve(build)
    timings[constants.DOWNLOAD_STARTED_TIME] = formatted_now()
    timer = autotest_stats.Timer('control_files.stage.%s' % (
            ds.get_server_name(ds.url()).replace('.', '_')))
    try:
        with timer:
            ds.stage_artifacts(build, ['test_suites'])
    except dev_server.DevServerException as e:
        raise error.StageControlFileFailure(
                "Failed to stage %s: %s" % (build, e))
    timings[constants.PAYLOAD_FINISHED_TIME] = formatted_now()
    return (ds, timings)
def collect_client_job_results(self):
    """
    A method that collects all the current results of a running client job
    into the results dir. By default does nothing as no client job is
    running, but when running a client job you can override this with
    something that will actually do something.
    """
    # make an effort to wait for the machine to come up
    try:
        self.host.wait_up(timeout=30)
    except error.AutoservError:
        # don't worry about any errors, we'll try and
        # get the results anyway
        pass

    # Copy all dirs in default to results_dir
    timer = autotest_stats.Timer('collect_client_job_results')
    timer.start()
    try:
        self.host.get_file(self.client_results_dir + '/',
                           self.server_results_dir,
                           preserve_symlinks=True)
        # Only report time used for successful get_file calls.
        timer.stop()
    except Exception:
        # well, don't stop running just because we couldn't get logs
        e_msg = "Unexpected error copying test result logs, continuing ..."
        logging.error(e_msg)
        traceback.print_exc(file=sys.stdout)
def persist_records_sent_from_shard(shard, jobs, hqes):
    """
    Sanity-check, then save, serialized records sent to the master from a
    shard.

    During heartbeats shards upload jobs and hostqueueentries. This performs
    some sanity checks on these and then updates the existing records for
    those entries with the updated ones from the heartbeat.

    The sanity checks include:
    - Checking if the objects sent already exist on the master.
    - Checking if the objects sent were assigned to this shard.
    - hostqueueentries must be sent together with their jobs.

    @param shard: The shard the records were sent from.
    @param jobs: The jobs the shard sent.
    @param hqes: The hostqueueentries the shard sent.

    @raises error.UnallowedRecordsSentToMaster if any of the sanity checks
            fail.
    """
    timer = autotest_stats.Timer('shard_heartbeat')
    with timer.get_client('persist_jobs'):
        job_ids_sent = _persist_records_with_type_sent_from_shard(
                shard, jobs, models.Job)

    with timer.get_client('persist_hqes'):
        _persist_records_with_type_sent_from_shard(
                shard, hqes, models.HostQueueEntry, job_ids_sent=job_ids_sent)
def dispatchRequest(self, request):
    """
    Invoke a json RPC call from a decoded json request.

    @param request: a decoded json_request.
    @returns a dictionary with keys id, result, err and err_traceback.
    """
    results = self.blank_result_dict()

    try:
        results['id'] = self._getRequestId(request)
        methName = request['method']
        args = request['params']
    except KeyError:
        raise BadServiceRequest(request)

    autotest_stats.Counter('rpc').increment(methName)

    metadata = request.copy()
    metadata['_type'] = 'rpc'
    timer = autotest_stats.Timer('rpc', metadata=metadata)

    try:
        timer.start()
        meth = self.findServiceEndpoint(methName)
        results['result'] = self.invokeServiceEndpoint(meth, args)
    except Exception as err:
        results['err_traceback'] = traceback.format_exc()
        results['err'] = err
def _run():
    """Report metadata in the queue until being aborted."""
    # Time when the first upload failed. None if the last upload succeeded.
    first_failed_upload = None
    # True if an email alert was sent when upload has been failing
    # continuously for _MAX_UPLOAD_FAIL_DURATION seconds.
    email_alert = False
    upload_size = _MIN_RETRY_ENTRIES
    try:
        while True:
            start_time = time.time()
            data_list = []
            if (first_failed_upload and
                    time.time() - first_failed_upload >
                    _MAX_UPLOAD_FAIL_DURATION):
                upload_size = _MIN_RETRY_ENTRIES
                if not email_alert:
                    _email_alert()
                    email_alert = True
            else:
                upload_size = min(upload_size * 2, _MAX_UPLOAD_SIZE)
            while (not metadata_queue.empty() and
                   len(data_list) < upload_size):
                data_list.append(metadata_queue.get_nowait())
            if data_list:
                if autotest_es.bulk_post(data_list=data_list):
                    time_used = time.time() - start_time
                    logging.info('%d entries of metadata uploaded in %s '
                                 'seconds.', len(data_list), time_used)
                    autotest_stats.Timer('metadata_reporter').send(
                            'time_used', time_used)
                    autotest_stats.Gauge('metadata_reporter').send(
                            'entries_uploaded', len(data_list))
                    first_failed_upload = None
                    email_alert = False
                else:
                    logging.warn('Failed to upload %d entries of metadata, '
                                 'they will be retried later.',
                                 len(data_list))
                    autotest_stats.Gauge('metadata_reporter').send(
                            'entries_failed', len(data_list))
                    for data in data_list:
                        queue(data)
                    if not first_failed_upload:
                        first_failed_upload = time.time()
            sleep_time = _REPORT_INTERVAL_SECONDS - time.time() + start_time
            if sleep_time < 0:
                sleep_time = 0.5
            _abort.wait(timeout=sleep_time)
    except Exception as e:
        logging.error('Metadata reporter thread failed with error: %s', e)
        raise
    finally:
        logging.info('Metadata reporting thread is exiting.')
        _abort.clear()
        _report_lock.release()
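# A minimal sketch (not part of the reporter above) of how its upload batch
# size evolves: the size doubles on every pass, capped at _MAX_UPLOAD_SIZE,
# and resets to _MIN_RETRY_ENTRIES once uploads have been failing for longer
# than _MAX_UPLOAD_FAIL_DURATION. The concrete values below are assumptions
# for illustration only; the real constants live elsewhere in that module.
_MIN_RETRY_ENTRIES = 10   # assumed value
_MAX_UPLOAD_SIZE = 1000   # assumed value

def _next_upload_size(current_size, failing_too_long):
    """Return the batch size the loop above would pick next."""
    if failing_too_long:
        return _MIN_RETRY_ENTRIES
    return min(current_size * 2, _MAX_UPLOAD_SIZE)

# While uploads keep succeeding: 10 -> 20 -> 40 -> ... -> 1000 (capped).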
def run(self):
    """Wrapper around the thread's run method."""
    try:
        with autotest_stats.Timer(self.name):
            super(ExceptionRememberingThread, self).run()
    except Exception as self.err:
        logging.error('%s raised an exception that will be re-raised by '
                      'the thread pool manager.', self.getName())
def wait_for_restart(self, timeout=DEFAULT_REBOOT_TIMEOUT,
                     down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
                     down_warning=WAIT_DOWN_REBOOT_WARNING,
                     log_failure=True, old_boot_id=None, **dargs):
    """
    Wait for the host to come back from a reboot.

    This is a generic implementation based entirely on wait_up and
    wait_down.
    """
    key_string = 'Reboot.%s' % dargs.get('board')

    total_reboot_timer = autotest_stats.Timer(
            '%s.total' % key_string,
            metadata=self._construct_host_metadata('reboot_total'))
    wait_down_timer = autotest_stats.Timer(
            '%s.wait_down' % key_string,
            metadata=self._construct_host_metadata('reboot_down'))

    total_reboot_timer.start()
    wait_down_timer.start()
    if not self.wait_down(timeout=down_timeout,
                          warning_timer=down_warning,
                          old_boot_id=old_boot_id):
        if log_failure:
            self.record("ABORT", None, "reboot.verify", "shut down failed")
        raise error.AutoservShutdownError("Host did not shut down")
    wait_down_timer.stop()

    wait_up_timer = autotest_stats.Timer(
            '%s.wait_up' % key_string,
            metadata=self._construct_host_metadata('reboot_up'))
    wait_up_timer.start()
    if self.wait_up(timeout):
        self.record("GOOD", None, "reboot.verify")
        self.reboot_followup(**dargs)
        wait_up_timer.stop()
        total_reboot_timer.stop()
    else:
        self.record("ABORT", None, "reboot.verify",
                    "Host did not return from reboot")
        raise error.AutoservRebootError("Host did not return from reboot")
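# A minimal sketch of the nested-timer pattern wait_for_restart uses above:
# one timer brackets the whole reboot while phase timers bracket each stage,
# and a phase timer is only stopped (and hence reported) if its stage
# succeeded. 'example_board' and do_phase() are placeholders, not real names.
total = autotest_stats.Timer('Reboot.example_board.total')
phase = autotest_stats.Timer('Reboot.example_board.wait_down')
total.start()
phase.start()
do_phase()     # e.g. wait_down() above; raises on failure, skipping stop()
phase.stop()   # only reached, and hence reported, on success
total.stop()   # the total timer spans every phase together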
def retrieve_shard(shard_hostname):
    """
    Retrieves the shard with the given hostname from the database.

    @param shard_hostname: Hostname of the shard to retrieve.

    @raises models.Shard.DoesNotExist, if no shard with this hostname was
            found.

    @returns: Shard object.
    """
    timer = autotest_stats.Timer('shard_heartbeat.retrieve_shard')
    with timer:
        return models.Shard.smart_get(shard_hostname)
def find_records_for_shard(shard, known_job_ids, known_host_ids):
    """Find records that should be sent to a shard.

    @param shard: Shard to find records for.
    @param known_job_ids: List of ids of jobs the shard already has.
    @param known_host_ids: List of ids of hosts the shard already has.

    @returns: Tuple of three lists for hosts, jobs, and suite job keyvals:
              (hosts, jobs, suite_job_keyvals).
    """
    timer = autotest_stats.Timer('shard_heartbeat')
    with timer.get_client('find_hosts'):
        hosts = models.Host.assign_to_shard(shard, known_host_ids)
    with timer.get_client('find_jobs'):
        jobs = models.Job.assign_to_shard(shard, known_job_ids)
    with timer.get_client('find_suite_job_keyvals'):
        parent_job_ids = [job.parent_job_id for job in jobs]
        suite_job_keyvals = models.JobKeyval.objects.filter(
                job_id__in=parent_job_ids)
    return hosts, jobs, suite_job_keyvals
def _get_control_file_contents_by_name(build, ds, suite_name):
    """Return control file contents for |suite_name|.

    Query the dev server at |ds| for the control file |suite_name|,
    included in |build|.

    @param build: unique name by which to refer to the image from now on.
    @param ds: a dev_server.DevServer instance to fetch control file with.
    @param suite_name: canonicalized suite name, e.g.
                       test_suites/control.bvt.

    @raises ControlFileNotFound if a unique suite control file doesn't
            exist.
    @raises NoControlFileList if we can't list the control files at all.
    @raises ControlFileEmpty if the control file exists on the server, but
            can't be read.

    @return: the contents of the desired control file.
    """
    getter = control_file_getter.DevServerGetter.create(build, ds)
    timer = autotest_stats.Timer(
            'control_files.parse.%s.%s' %
            (ds.get_server_name(ds.url()).replace('.', '_'),
             suite_name.rsplit('.')[-1]))
    # Get the control file for the suite.
    try:
        with timer:
            control_file_in = getter.get_control_file_contents_by_name(
                    suite_name)
    except error.CrosDynamicSuiteException as e:
        raise type(e)("%s while testing %s." % (e, build))
    if not control_file_in:
        raise error.ControlFileEmpty(
                "Fetching %s returned no data." % suite_name)
    # Force control files to only contain ascii characters.
    try:
        control_file_in.encode('ascii')
    except UnicodeDecodeError as e:
        raise error.ControlFileMalformed(str(e))

    return control_file_in
class HostScheduler(BaseHostScheduler):
    """A scheduler capable of managing host acquisition for new jobs."""

    _timer = autotest_stats.Timer('host_scheduler')

    def __init__(self):
        super(HostScheduler, self).__init__()
        self.job_query_manager = query_managers.AFEJobQueryManager()
        # Keeping track of how many hosts each suite is holding:
        # {suite_job_id: num_hosts}
        self._suite_recorder = SuiteRecorder(self.job_query_manager)

    def _record_host_assignment(self, host, queue_entry):
        """Record that |host| is assigned to |queue_entry|.

        Record:
        1. How long it takes to assign a host to a job in the metadata db.
        2. The host assignment of a suite.

        @param host: A Host object.
        @param queue_entry: A HostQueueEntry object.
        """
        secs_in_queued = (datetime.datetime.now() -
                          queue_entry.job.created_on).total_seconds()
        job_overhead.record_state_duration(
                queue_entry.job_id, host.hostname,
                job_overhead.STATUS.QUEUED, secs_in_queued)
        self._suite_recorder.record_assignment(queue_entry)

    @_timer.decorate
    def _schedule_jobs(self):
        """Schedule new jobs against hosts."""
        key = 'host_scheduler.jobs_per_tick'
        new_jobs_with_hosts = 0
        queue_entries = self.job_query_manager.get_pending_queue_entries(
                only_hostless=False)
        unverified_host_jobs = [job for job in queue_entries
                                if not job.is_hostless()]
        if not unverified_host_jobs:
            return
        for acquisition in self.find_hosts_for_jobs(unverified_host_jobs):
            self.schedule_host_job(acquisition.host, acquisition.job)
            self._record_host_assignment(acquisition.host, acquisition.job)
            new_jobs_with_hosts += 1
        autotest_stats.Gauge(key).send('new_jobs_with_hosts',
                                       new_jobs_with_hosts)
        autotest_stats.Gauge(key).send('new_jobs_without_hosts',
                                       len(unverified_host_jobs) -
                                       new_jobs_with_hosts)

    @_timer.decorate
    def _lease_hosts_of_frontend_tasks(self):
        """Lease hosts of tasks scheduled through the frontend."""
        # We really don't need to get all the special tasks here, just the
        # ones without hqes, but reusing the method used by the scheduler
        # ensures we prioritize the same way.
        lease_hostnames = [
                task.host.hostname for task in
                self.job_query_manager.get_prioritized_special_tasks(
                        only_tasks_with_leased_hosts=False)
                if task.queue_entry_id is None and not task.host.leased]
        # Leasing an already-leased host here shouldn't be a problem:
        # 1. The only way a host can be leased is if it's been assigned to
        #    an active hqe or another similar frontend task, but doing so
        #    will have already precluded it from the list of tasks returned
        #    by the job_query_manager.
        # 2. The unleasing is done based on global conditions. Eg: Even if
        #    a task has already leased a host and we lease it again, the
        #    host scheduler won't release the host till both tasks are
        #    complete.
        if lease_hostnames:
            self.host_query_manager.set_leased(
                    True, hostname__in=lease_hostnames)

    def acquire_hosts(self, host_jobs):
        """Override acquire_hosts.

        This method overrides the method in the parent class. It figures
        out the set of suites that |host_jobs| belong to, gets the min_duts
        requirement for each suite, and pipes min_duts for each suite to
        the rdb.
        """
        parent_job_ids = set([q.job.parent_job_id
                              for q in host_jobs if q.job.parent_job_id])
        suite_min_duts = self._suite_recorder.get_min_duts(parent_job_ids)
        return rdb_lib.acquire_hosts(host_jobs, suite_min_duts)

    @_timer.decorate
    def tick(self):
        logging.info('Calling new tick.')
        logging.info('Leasing hosts for frontend tasks.')
        self._lease_hosts_of_frontend_tasks()
        logging.info('Finding hosts for new jobs.')
        self._schedule_jobs()
        logging.info('Releasing unused hosts.')
        released_hosts = self._release_hosts()
        logging.info('Updating suite assignment with released hosts')
        self._suite_recorder.record_release(released_hosts)
        logging.info('Calling email_manager.')
        email_manager.manager.send_queued_emails()
import contextlib
import logging
import time
from multiprocessing import pool

import base_event, board_enumerator, build_event
import task, timed_event

import common
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.server import utils

POOL_SIZE = 32

_timer = autotest_stats.Timer('suite_scheduler')


class Driver(object):
    """Implements the main loop of the suite_scheduler.

    @var EVENT_CLASSES: list of the event classes Driver supports.
    @var _LOOP_INTERVAL_SECONDS: seconds to wait between loop iterations.

    @var _scheduler: a DedupingScheduler, used to schedule jobs with the
                     AFE.
    @var _enumerator: a BoardEnumerator, used to list platforms known to
                      the AFE.
    @var _events: dict of BaseEvents to be handled each time through the
                  main loop.
    """

    EVENT_CLASSES = [
""" import logging import socket import subprocess import sys import common from autotest_lib.client.common_lib.cros.graphite import autotest_stats from autotest_lib.frontend import database_settings_helper from autotest_lib.scheduler import email_manager # Format Appears as: [Date] [Time] - [Msg Level] - [Message] LOGGING_FORMAT = '%(asctime)s - %(levelname)s - %(message)s' STATS_KEY = 'db_optimize.%s' % socket.gethostname() timer = autotest_stats.Timer(STATS_KEY) @timer.decorate def main_without_exception_handling(): database_settings = database_settings_helper.get_default_db_config() command = [ 'mysqlcheck', '-o', database_settings['NAME'], '-u', database_settings['USER'], '-p%s' % database_settings['PASSWORD'], # we want to do db optimation on each master/slave # in rotation. Do not write otimize table to bin log # so that it won't be picked up by slaves automatically
class SiteDispatcher(object):
    """
    SiteDispatcher subclasses BaseDispatcher in monitor_db.
    """
    DEFAULT_REQUESTED_BY_USER_ID = 1

    _timer = autotest_stats.Timer('scheduler')
    _gauge = autotest_stats.Gauge('scheduler_rel')
    _tick_start = None

    @_timer.decorate
    def tick(self):
        self._tick_start = time.time()
        super(SiteDispatcher, self).tick()
        self._gauge.send('tick', time.time() - self._tick_start)

    @_timer.decorate
    def _garbage_collection(self):
        super(SiteDispatcher, self)._garbage_collection()
        if self._tick_start:
            self._gauge.send('_garbage_collection',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _run_cleanup(self):
        super(SiteDispatcher, self)._run_cleanup()
        if self._tick_start:
            self._gauge.send('_run_cleanup',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _find_aborting(self):
        super(SiteDispatcher, self)._find_aborting()
        if self._tick_start:
            self._gauge.send('_find_aborting',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _process_recurring_runs(self):
        super(SiteDispatcher, self)._process_recurring_runs()
        if self._tick_start:
            self._gauge.send('_process_recurring_runs',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_delay_tasks(self):
        super(SiteDispatcher, self)._schedule_delay_tasks()
        if self._tick_start:
            self._gauge.send('_schedule_delay_tasks',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_running_host_queue_entries(self):
        super(SiteDispatcher, self)._schedule_running_host_queue_entries()
        if self._tick_start:
            self._gauge.send('_schedule_running_host_queue_entries',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_special_tasks(self):
        super(SiteDispatcher, self)._schedule_special_tasks()
        if self._tick_start:
            self._gauge.send('_schedule_special_tasks',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_new_jobs(self):
        super(SiteDispatcher, self)._schedule_new_jobs()
        if self._tick_start:
            self._gauge.send('_schedule_new_jobs',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _handle_agents(self):
        super(SiteDispatcher, self)._handle_agents()
        if self._tick_start:
            self._gauge.send('_handle_agents',
                             time.time() - self._tick_start)

    def _reverify_hosts_where(self, where,
                              print_message='Reverifying host %s'):
        """
        This is an altered version of _reverify_hosts_where that passes a
        requested_by argument to models.SpecialTask.objects.create, in
        order to allow the Reset task to be created properly.
        """
        full_where = 'locked = 0 AND invalid = 0 AND ' + where
        for host in scheduler_models.Host.fetch(where=full_where):
            if self.host_has_agent(host):
                # host has already been recovered in some way
                continue
            if self._host_has_scheduled_special_task(host):
                # host will have a special task scheduled on the next cycle
                continue
            if print_message:
                logging.error(print_message, host.hostname)
            try:
                user = models.User.objects.get(login='******')
            except models.User.DoesNotExist:
                user = models.User.objects.get(
                        id=SiteDispatcher.DEFAULT_REQUESTED_BY_USER_ID)
            models.SpecialTask.objects.create(
                    task=models.SpecialTask.Task.RESET,
                    host=models.Host.objects.get(id=host.id),
                    requested_by=user)

    def _check_for_unrecovered_verifying_entries(self):
        # Verify is replaced by Reset.
        queue_entries = scheduler_models.HostQueueEntry.fetch(
                where='status = "%s"' %
                      models.HostQueueEntry.Status.RESETTING)
        for queue_entry in queue_entries:
            special_tasks = models.SpecialTask.objects.filter(
                    task__in=(models.SpecialTask.Task.CLEANUP,
                              models.SpecialTask.Task.VERIFY,
                              models.SpecialTask.Task.RESET),
                    queue_entry__id=queue_entry.id,
                    is_complete=False)
            if special_tasks.count() == 0:
                logging.error('Unrecovered Resetting host queue entry: %s. '
                              'Setting status to Queued.', str(queue_entry))
                # Essentially this host queue entry was set to be Verifying
                # however no special task exists for the entry. This occurs
                # if the scheduler dies between changing the status and
                # creating the special task. By setting it to Queued, the
                # job can restart from the beginning and proceed correctly.
                # This is much preferable to having monitor_db fail to
                # launch.
                queue_entry.set_status('Queued')
class SiteDroneManager(object):

    _timer = autotest_stats.Timer('drone_manager')

    def copy_to_results_repository(self, process, source_path,
                                   destination_path=None):
        """
        Copy results from the given process at source_path to
        destination_path in the results repository.

        This site subclassed version will only copy the results back for
        Special Agent Tasks (Cleanup, Verify, Repair) that reside in the
        hosts/ subdirectory of results if the copy_task_results_back flag
        has been set to True inside global_config.ini.

        It will also only copy .parse.log files back to the scheduler if
        the copy_parse_log_back flag in global_config.ini has been set to
        True.
        """
        if not ENABLE_ARCHIVING:
            return
        copy_task_results_back = global_config.global_config.get_config_value(
                scheduler_config.CONFIG_SECTION, 'copy_task_results_back',
                type=bool)
        copy_parse_log_back = global_config.global_config.get_config_value(
                scheduler_config.CONFIG_SECTION, 'copy_parse_log_back',
                type=bool)
        special_task = source_path.startswith(HOSTS_JOB_SUBDIR)
        parse_log = source_path.endswith(PARSE_LOG)
        if (copy_task_results_back or not special_task) and (
                copy_parse_log_back or not parse_log):
            super(SiteDroneManager, self).copy_to_results_repository(
                    process, source_path, destination_path)

    def kill_process(self, process):
        """
        Kill the given process.
        """
        logging.info('killing %s', process)
        drone = self._get_drone_for_process(process)
        drone.queue_kill_process(process)

    def _add_drone(self, hostname):
        """
        Forked from drone_manager.py.

        Catches AutoservRunError if the drone fails initialization and
        does not add it to the list of usable drones.

        @param hostname: Hostname of the drone we are trying to add.
        """
        logging.info('Adding drone %s', hostname)
        drone = drones.get_drone(hostname)
        if drone:
            try:
                drone.call('initialize', self.absolute_path(''))
            except error.AutoservRunError as e:
                logging.error('Failed to initialize drone %s with error: %s',
                              hostname, e)
                return
            self._drones[drone.hostname] = drone

    @_timer.decorate
    def refresh(self):
        super(SiteDroneManager, self).refresh()

    @_timer.decorate
    def execute_actions(self):
        super(SiteDroneManager, self).execute_actions()
import argparse
import getpass
import logging
import os
import sys
from datetime import datetime

import common
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.server import frontend
from autotest_lib.server import utils

LOG_NAME_TEMPLATE = 'abort_suite-%s.log'
SUITE_JOB_NAME_TEMPLATE = '%s-test_suites/control.%s'
_timer = autotest_stats.Timer('abort_suites')


def parse_args():
    """
    Parse the arguments to this script.

    @return The arguments to this script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--suite_name', dest='name')
    parser.add_argument('-i', '--build', dest='build')
    return parser.parse_args()
from autotest_lib.client.common_lib import control_data
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.tko import db

try:
    from autotest_lib.server.site_common import site_utils as server_utils
except ImportError:
    from autotest_lib.server import utils as server_utils
form_ntuples_from_machines = server_utils.form_ntuples_from_machines

GLOBAL_CONFIG = global_config.global_config
DEFAULT_SERVER = 'autotest'

_tko_timer = autotest_stats.Timer('tko')


def dump_object(header, obj):
    """
    Standard way to print out the frontend objects (eg job, host, acl,
    label) in a human-readable fashion for debugging.
    """
    result = header + '\n'
    for key in obj.hash:
        if key == 'afe' or key == 'hash':
            continue
        result += '%20s: %s\n' % (key, obj.hash[key])
    return result
class BaseHostScheduler(object):
    """Base class containing host acquisition logic.

    This class contains all the core host acquisition logic needed by the
    scheduler to run jobs on hosts. It is only capable of releasing hosts
    back to the rdb through its tick; any other action must be instigated
    by the job scheduler.
    """

    _timer = autotest_stats.Timer('base_host_scheduler')

    host_assignment = collections.namedtuple('host_assignment',
                                             ['host', 'job'])

    def __init__(self):
        self.host_query_manager = query_managers.AFEHostQueryManager()

    @_timer.decorate
    def _release_hosts(self):
        """Release hosts to the RDB.

        Release all hosts that are ready and are currently not being used
        by an active hqe, and don't have a new special task scheduled
        against them.

        @return: A list of hosts that are released.
        """
        release_hosts = self.host_query_manager.find_unused_healty_hosts()
        release_hostnames = [host.hostname for host in release_hosts]
        if release_hostnames:
            self.host_query_manager.set_leased(
                    False, hostname__in=release_hostnames)
        return release_hosts

    @classmethod
    def schedule_host_job(cls, host, queue_entry):
        """Schedule a job on a host.

        Scheduling a job involves:
        1. Setting the active bit on the queue_entry.
        2. Scheduling a special task on behalf of the queue_entry.

        Performing these actions will lead the job scheduler through a
        chain of events, culminating in running the test and collecting
        results from the host.

        @param host: The host against which to schedule the job.
        @param queue_entry: The queue_entry to schedule.
        """
        if queue_entry.host_id is None:
            queue_entry.set_host(host)
        elif host.id != queue_entry.host_id:
            raise rdb_utils.RDBException(
                    'The rdb returned host: %s '
                    'but the job: %s was already assigned a host: %s. ' %
                    (host.hostname, queue_entry.job_id,
                     queue_entry.host.hostname))
        queue_entry.update_field('active', True)

        # TODO: crbug.com/373936. The host scheduler should only be
        # assigning jobs to hosts, but the criterion we use to release
        # hosts depends on it not being used by an active hqe. Since we're
        # activating the hqe here, we also need to schedule its first
        # prejob task. OTOH, we could converge to having the host scheduler
        # manage all special tasks, since their only use today is to
        # verify/cleanup/reset a host.
        logging.info('Scheduling pre job tasks for entry: %s', queue_entry)
        queue_entry.schedule_pre_job_tasks()

    def acquire_hosts(self, host_jobs):
        """Acquire hosts for the given jobs.

        This method sends jobs that need hosts to the rdb. A child class
        can override this method to pipe more args to the rdb.

        @param host_jobs: A list of queue entries that either require
                          hosts, or require host assignment validation
                          through the rdb.

        @return: A generator that yields an rdb_hosts.RDBClientHostWrapper
                 for each host acquired on behalf of a queue_entry, or None
                 if a host wasn't found.
        """
        return rdb_lib.acquire_hosts(host_jobs)

    def find_hosts_for_jobs(self, host_jobs):
        """Find and verify hosts for a list of jobs.

        @param host_jobs: A list of queue entries that either require
                          hosts, or require host assignment validation
                          through the rdb.

        @return: A list of tuples of the form (host, queue_entry) for each
                 valid host-queue_entry assignment.
        """
        jobs_with_hosts = []
        hosts = self.acquire_hosts(host_jobs)
        for host, job in zip(hosts, host_jobs):
            if host:
                jobs_with_hosts.append(self.host_assignment(host, job))
        return jobs_with_hosts

    @_timer.decorate
    def tick(self):
        """Schedule core host management activities."""
        self._release_hosts()
# TODO(crbug.com/464834): Snapshot clone is disabled until Moblab can
# support overlayfs or aufs, which requires a newer kernel.
SUPPORT_SNAPSHOT_CLONE = not IS_MOBLAB

# Number of seconds to wait for network to be up in a container.
NETWORK_INIT_TIMEOUT = 300
# Network bring up is slower in Moblab.
NETWORK_INIT_CHECK_INTERVAL = 2 if IS_MOBLAB else 0.1

# Type string for container related metadata.
CONTAINER_CREATE_METADB_TYPE = 'container_create'
CONTAINER_CREATE_RETRY_METADB_TYPE = 'container_create_retry'
CONTAINER_RUN_TEST_METADB_TYPE = 'container_run_test'

STATS_KEY = 'lxc.%s' % socket.gethostname().replace('.', '_')
timer = autotest_stats.Timer(STATS_KEY)
# A timer used inside a container should not include the hostname, as that
# would create an individual timer for each container.
container_timer = autotest_stats.Timer('lxc')


def _get_container_info_moblab(container_path, **filters):
    """Get a collection of container information in the given container
    path in a Moblab.

    TODO(crbug.com/457496): remove this method once python 3 can be
    installed in Moblab and the lxc-ls command can use python 3 code.

    When running in Moblab, lxc-ls behaves differently from a server with
    python 3 installed:
    1. lxc-ls returns a list of containers installed under /etc/lxc, the
       default
class UserCleanup(PeriodicCleanup):
    """User cleanup that is controlled by the global config variable
    clean_interval_minutes in the SCHEDULER section.
    """

    timer = autotest_stats.Timer('monitor_db_cleanup.user_cleanup')

    def __init__(self, db, clean_interval_minutes):
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        self._last_reverify_time = time.time()

    @timer.decorate
    def _cleanup(self):
        logging.info('Running periodic cleanup')
        self._abort_timed_out_jobs()
        self._abort_jobs_past_max_runtime()
        self._clear_inactive_blocks()
        self._check_for_db_inconsistencies()
        self._reverify_dead_hosts()
        self._django_session_cleanup()

    @timer.decorate
    def _abort_timed_out_jobs(self):
        msg = 'Aborting all jobs that have timed out and are not complete'
        logging.info(msg)
        query = models.Job.objects.filter(
                hostqueueentry__complete=False).extra(
                where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
        for job in query.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            job.abort()

    @timer.decorate
    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max
        runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        rows = self._db.execute("""
            SELECT hqe.id FROM afe_host_queue_entries AS hqe
            INNER JOIN afe_jobs ON (hqe.job_id = afe_jobs.id)
            WHERE NOT hqe.complete AND NOT hqe.aborted AND
            hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE <
            NOW()""")
        query = models.HostQueueEntry.objects.filter(
                id__in=[row[0] for row in rows])
        for queue_entry in query.distinct():
            logging.warning('Aborting entry %s due to max runtime',
                            queue_entry)
            queue_entry.abort()

    @timer.decorate
    def _check_for_db_inconsistencies(self):
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()

    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field,
                                               second_model):
        if 'invalid' not in first_model.get_field_dict():
            return []
        invalid_objects = list(first_model.objects.filter(invalid=True))
        first_model.objects.populate_relationships(
                invalid_objects, second_model, 'related_objects')
        error_lines = []
        for invalid_object in invalid_objects:
            if invalid_object.related_objects:
                related_list = ', '.join(
                        str(related_object) for related_object
                        in invalid_object.related_objects)
                error_lines.append(
                        'Invalid %s %s is related to %ss: %s' %
                        (first_model.__name__, invalid_object,
                         second_model.__name__, related_list))
                related_manager = getattr(invalid_object, relation_field)
                related_manager.clear()
        return error_lines

    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        errors = self._check_invalid_related_objects_one_way(
                first_model, first_field, second_model)
        errors.extend(self._check_invalid_related_objects_one_way(
                second_model, second_field, first_model))
        return errors

    def _check_all_invalid_related_objects(self):
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host,
                        'aclgroup_set'),
                       (models.AclGroup, 'users', models.User,
                        'aclgroup_set'),
                       (models.Test, 'dependency_labels', models.Label,
                        'test_set'))
        errors = []
        for first_model, first_field, second_model, second_field in (
                model_pairs):
            errors.extend(self._check_invalid_related_objects(
                    first_model, first_field, second_model, second_field))

        if errors:
            subject = ('%s relationships to invalid models, cleaned all' %
                       len(errors))
            message = '\n'.join(errors)
            logging.warning(subject)
            logging.warning(message)
            email_manager.manager.enqueue_notify_email(subject, message)
    @timer.decorate
    def _clear_inactive_blocks(self):
        msg = 'Clear out blocks for all completed jobs.'
        logging.info(msg)
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM afe_ineligible_host_queues ihq
            LEFT JOIN (SELECT DISTINCT job_id FROM afe_host_queue_entries
                       WHERE NOT complete) hqe
            USING (job_id) WHERE hqe.job_id IS NULL""")

    def _should_reverify_hosts_now(self):
        reverify_period_sec = (
                scheduler_config.config.reverify_period_minutes * 60)
        if reverify_period_sec == 0:
            return False
        return (self._last_reverify_time + reverify_period_sec) <= (
                time.time())

    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify."""
        max_at_once = scheduler_config.config.reverify_max_hosts_at_once
        if (max_at_once > 0 and len(hosts) > max_at_once):
            return random.sample(hosts, max_at_once)
        return sorted(hosts)

    @timer.decorate
    def _reverify_dead_hosts(self):
        if not self._should_reverify_hosts_now():
            return
        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        hosts = models.Host.objects.filter(
                status=models.Host.Status.REPAIR_FAILED,
                locked=False,
                invalid=False)
        hosts = hosts.exclude(
                protection=host_protections.Protection.DO_NOT_VERIFY)
        if not hosts:
            return

        hosts = list(hosts)
        total_hosts = len(hosts)
        hosts = self._choose_subset_of_hosts_to_reverify(hosts)
        logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts),
                     total_hosts,
                     ', '.join(host.hostname for host in hosts))
        for host in hosts:
            models.SpecialTask.schedule_special_task(
                    host=host, task=models.SpecialTask.Task.VERIFY)

    @timer.decorate
    def _django_session_cleanup(self):
        """Clean up django_session since django doesn't for us.

        http://www.djangoproject.com/documentation/0.96/sessions/
        """
        logging.info('Deleting old sessions from django_session')
        sql = 'TRUNCATE TABLE django_session'
        self._db.execute(sql)
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
    twenty four hours.
    """

    timer = autotest_stats.Timer('monitor_db_cleanup.twentyfourhour_cleanup')

    def __init__(self, db, drone_manager, run_at_initialize=True):
        """Initialize TwentyFourHourUpkeep.

        @param db: Database connection object.
        @param drone_manager: DroneManager to access drones.
        @param run_at_initialize: True to run cleanup when scheduler
                                  starts. Default is set to True.
        """
        self.drone_manager = drone_manager
        clean_interval_minutes = 24 * 60  # 24 hours
        super(TwentyFourHourUpkeep, self).__init__(
                db, clean_interval_minutes,
                run_at_initialize=run_at_initialize)

    @timer.decorate
    def _cleanup(self):
        logging.info('Running 24 hour clean up')
        self._check_for_uncleanable_db_inconsistencies()
        self._cleanup_orphaned_containers()

    @timer.decorate
    def _check_for_uncleanable_db_inconsistencies(self):
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()

    @timer.decorate
    def _check_for_active_and_complete_queue_entries(self):
        query = models.HostQueueEntry.objects.filter(active=True,
                                                     complete=True)
        if query.count() != 0:
            subject = ('%d queue entries found with active=complete=1' %
                       query.count())
            lines = []
            for entry in query:
                lines.append(str(entry.get_object_dict()))
                if entry.status == 'Aborted':
                    logging.error('Aborted entry: %s is both active and '
                                  'complete. Setting active value to '
                                  'False.', str(entry))
                    entry.active = False
                    entry.save()
            self._send_inconsistency_message(subject, lines)

    @timer.decorate
    def _check_for_multiple_platform_hosts(self):
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON
                    afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row) for row in rows]
            self._send_inconsistency_message(subject, lines)

    @timer.decorate
    def _check_for_no_platform_hosts(self):
        rows = self._db.execute("""
            SELECT hostname FROM afe_hosts
            LEFT JOIN afe_hosts_labels
                    ON afe_hosts.id = afe_hosts_labels.host_id
                    AND afe_hosts_labels.label_id IN
                            (SELECT id FROM afe_labels WHERE platform)
            WHERE NOT afe_hosts.invalid
                    AND afe_hosts_labels.host_id IS NULL""")
        if rows:
            logging.warning('%s hosts with no platform\n%s',
                            self._db.rowcount,
                            ', '.join(row[0] for row in rows))

    @timer.decorate
    def _check_for_multiple_atomic_group_hosts(self):
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name)
                           AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON
                    afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                    afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = ('%s hosts with multiple atomic groups' %
                       self._db.rowcount)
            lines = [' '.join(str(item) for item in row) for row in rows]
            self._send_inconsistency_message(subject, lines)

    def _send_inconsistency_message(self, subject, lines):
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)

    @timer.decorate
    def _cleanup_orphaned_containers(self):
        """Cleanup orphaned containers in each drone.

        The function queues a lxc_cleanup call in each drone without
        waiting for the script to finish, as the cleanup procedure could
        take minutes and the script output is logged.
        """
        ssp_enabled = global_config.global_config.get_config_value(
                'AUTOSERV', 'enable_ssp_container')
        if not ssp_enabled:
            logging.info('Server-side packaging is not enabled, no need to '
                         'clean up orphaned containers.')
            return
        self.drone_manager.cleanup_orphaned_containers()
import logging
import os
import re

from autotest_lib.client.common_lib import utils as client_utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.client.cros import constants
from autotest_lib.server.cros.dynamic_suite.constants import JOB_BUILD_KEY
from autotest_lib.server import utils

CRASH_SERVER_OVERLOAD = 'crash_server_overload'
CRASH_SERVER_FOUND = 'crash_server_found'
SYMBOLICATE_TIMEDOUT = 'symbolicate_timedout'

timer = autotest_stats.Timer('crash_collect')


def generate_minidump_stacktrace(minidump_path):
    """
    Generates a stacktrace for the specified minidump.

    This function expects the debug symbols to reside under:
        /build/<board>/usr/lib/debug

    @param minidump_path: absolute path to the minidump to be symbolicated.
    @raise client_utils.error.CmdError if minidump_stackwalk return code
           != 0.
    """
    symbol_dir = '%s/../../../lib/debug' % utils.get_server_dir()
    logging.info('symbol_dir: %s', symbol_dir)
    client_utils.run('minidump_stackwalk "%s" "%s" > "%s.txt"' %
class BaseDroneManager(object):
    """
    This class acts as an interface from the scheduler to drones, whether
    it be only a single "drone" for localhost or multiple remote drones.

    All paths going into and out of this class are relative to the full
    results directory, except for those returned by absolute_path().
    """

    # Minimum time to wait before the next email about a drone hitting its
    # process limit is sent.
    NOTIFY_INTERVAL = 60 * 60 * 24  # one day

    _STATS_KEY = 'drone_manager'

    _timer = autotest_stats.Timer(_STATS_KEY)

    def __init__(self):
        # absolute path of base results dir
        self._results_dir = None
        # holds Process objects
        self._process_set = set()
        # holds the list of all processes running on all drones
        self._all_processes = {}
        # maps PidfileId to PidfileContents
        self._pidfiles = {}
        # same as _pidfiles
        self._pidfiles_second_read = {}
        # maps PidfileId to _PidfileInfo
        self._registered_pidfile_info = {}
        # used to generate unique temporary paths
        self._temporary_path_counter = 0
        # maps hostname to Drone object
        self._drones = {}
        self._results_drone = None
        # maps results dir to dict mapping file path to contents
        self._attached_files = {}
        # heapq of _DroneHeapWrappers
        self._drone_queue = []
        # maps drone hostname to the time stamp of the email that has been
        # sent about the drone hitting its process limit.
        self._notify_record = {}
        # A threaded task queue used to refresh drones asynchronously.
        if _THREADED_DRONE_MANAGER:
            self._refresh_task_queue = thread_lib.ThreadedTaskQueue(
                    name='%s.refresh_queue' % self._STATS_KEY)
        else:
            self._refresh_task_queue = drone_task_queue.DroneTaskQueue()

    def initialize(self, base_results_dir, drone_hostnames,
                   results_repository_hostname):
        self._results_dir = base_results_dir

        for hostname in drone_hostnames:
            self._add_drone(hostname)

        if not self._drones:
            # all drones failed to initialize
            raise DroneManagerError('No valid drones found')

        self.refresh_drone_configs()

        logging.info('Using results repository on %s',
                     results_repository_hostname)
        self._results_drone = drones.get_drone(results_repository_hostname)
        results_installation_dir = (
                global_config.global_config.get_config_value(
                        scheduler_config.CONFIG_SECTION,
                        'results_host_installation_directory',
                        default=None))
        if results_installation_dir:
            self._results_drone.set_autotest_install_dir(
                    results_installation_dir)
        # don't initialize() the results drone - we don't want to clear out
        # any directories and we don't need to kill any processes

    def reinitialize_drones(self):
        self._call_all_drones('initialize', self._results_dir)

    def shutdown(self):
        for drone in self.get_drones():
            drone.shutdown()

    def _get_max_pidfile_refreshes(self):
        """
        Normally refresh() is called on every monitor_db.Dispatcher.tick().

        @returns: The number of refresh() calls before we forget a pidfile.
        """
        pidfile_timeout = global_config.global_config.get_config_value(
                scheduler_config.CONFIG_SECTION, 'max_pidfile_refreshes',
                type=int, default=2000)
        return pidfile_timeout

    def _add_drone(self, hostname):
        logging.info('Adding drone %s', hostname)
        drone = drones.get_drone(hostname)
        if drone:
            self._drones[drone.hostname] = drone
            drone.call('initialize', self.absolute_path(''))

    def _remove_drone(self, hostname):
        self._drones.pop(hostname, None)

    def refresh_drone_configs(self):
        """
        Reread global config options for all drones.
        """
        # Importing server_manager_utils is delayed rather than done at the
        # beginning of this module because test_that imports drone_manager
        # when importing autoserv_utils. That import happens before
        # test_that sets up django (test_that only sets up django in
        # setup_local_afe, since it's not needed when test_that runs tests
        # on lab duts through the :lab: option). Therefore, if
        # server_manager_utils were imported at the beginning of this
        # module, test_that would fail since django is not set up yet.
        from autotest_lib.site_utils import server_manager_utils
        config = global_config.global_config
        section = scheduler_config.CONFIG_SECTION
        config.parse_config_file()
        for hostname, drone in self._drones.iteritems():
            if server_manager_utils.use_server_db():
                server = server_manager_utils.get_servers(
                        hostname=hostname)[0]
                attributes = dict([(a.attribute, a.value)
                                   for a in server.attributes.all()])
                drone.enabled = (int(attributes.get('disabled', 0)) == 0)
                drone.max_processes = int(attributes.get(
                        'max_processes',
                        scheduler_config.config.max_processes_per_drone))
                allowed_users = attributes.get('users', None)
            else:
                disabled = config.get_config_value(
                        section, '%s_disabled' % hostname, default='')
                drone.enabled = not bool(disabled)
                drone.max_processes = config.get_config_value(
                        section, '%s_max_processes' % hostname, type=int,
                        default=scheduler_config.config.
                                max_processes_per_drone)
                allowed_users = config.get_config_value(
                        section, '%s_users' % hostname, default=None)
            if allowed_users:
                drone.allowed_users = set(allowed_users.split())
            else:
                drone.allowed_users = None
            logging.info('Drone %s.max_processes: %s', hostname,
                         drone.max_processes)
            logging.info('Drone %s.enabled: %s', hostname, drone.enabled)
            logging.info('Drone %s.allowed_users: %s', hostname,
                         drone.allowed_users)
            logging.info('Drone %s.support_ssp: %s', hostname,
                         drone.support_ssp)

        self._reorder_drone_queue()  # max_processes may have changed
        # Clear notification record about reaching max_processes limit.
        self._notify_record = {}

    def get_drones(self):
        return self._drones.itervalues()

    def cleanup_orphaned_containers(self):
        """Queue a cleanup_orphaned_containers call at each drone."""
        for drone in self._drones.values():
            logging.info('Queue cleanup_orphaned_containers at %s',
                         drone.hostname)
            drone.queue_call('cleanup_orphaned_containers')

    def _get_drone_for_process(self, process):
        return self._drones[process.hostname]

    def _get_drone_for_pidfile_id(self, pidfile_id):
        pidfile_contents = self.get_pidfile_contents(pidfile_id)
        assert pidfile_contents.process is not None
        return self._get_drone_for_process(pidfile_contents.process)

    def _drop_old_pidfiles(self):
        # use items() since the dict is modified in unregister_pidfile()
        for pidfile_id, info in self._registered_pidfile_info.items():
            if info.age > self._get_max_pidfile_refreshes():
                logging.warning('dropping leaked pidfile %s', pidfile_id)
                self.unregister_pidfile(pidfile_id)
            else:
                info.age += 1

    def _reset(self):
        self._process_set = set()
        self._all_processes = {}
        self._pidfiles = {}
        self._pidfiles_second_read = {}
        self._drone_queue = []

    def _call_all_drones(self, method, *args, **kwargs):
        all_results = {}
        for drone in self.get_drones():
            with self._timer.get_client(
                    '%s.%s' % (drone.hostname.replace('.', '_'), method)):
                all_results[drone] = drone.call(method, *args, **kwargs)
        return all_results

    def _parse_pidfile(self, drone, raw_contents):
        """Parse raw pidfile contents.

        @param drone: The drone on which this pidfile was found.
        @param raw_contents: The raw contents of a pidfile, eg:
                             "pid\nexit_status\nnum_tests_failed\n".
        """
        contents = PidfileContents()
        if not raw_contents:
            return contents
        lines = raw_contents.splitlines()
        if len(lines) > 3:
            return InvalidPidfile('Corrupt pid file (%d lines):\n%s' %
                                  (len(lines), lines))
        try:
            pid = int(lines[0])
            contents.process = Process(drone.hostname, pid)
            # if len(lines) == 2, assume we caught Autoserv between writing
            # exit_status and num_failed_tests, so just ignore it and wait
            # for the next cycle
            if len(lines) == 3:
                contents.exit_status = int(lines[1])
                contents.num_tests_failed = int(lines[2])
        except ValueError as exc:
            return InvalidPidfile('Corrupt pid file: ' + str(exc.args))

        return contents
class SuiteRecorder(object):
    """Records the host assignment for suites.

    The recorder holds two things:
    * suite_host_num: records how many duts a suite is holding, as a map
      <suite_job_id -> num_of_hosts>
    * hosts_to_suites: records which host is assigned to which suite, as a
      map <host_id -> suite_job_id>

    The two data structures get updated when a host is assigned to or
    released by a job.

    The reason to maintain hosts_to_suites is that, when a host is
    released, we need to know which suite it was leased to. Querying the
    db for the latest completed job that has run on a host is slow.
    Therefore, we go with an alternative: keeping a <host id, suite job id>
    map in memory (for 10K hosts, the map should take less than 1M memory
    on a 64-bit machine with python 2.7).
    """

    _timer = autotest_stats.Timer('suite_recorder')

    def __init__(self, job_query_manager):
        """Initialize.

        @param job_query_manager: A job query manager object.
        """
        self.job_query_manager = job_query_manager
        self.suite_host_num, self.hosts_to_suites = (
                self.job_query_manager.get_suite_host_assignment())

    def record_assignment(self, queue_entry):
        """Record that the hqe has got a host.

        @param queue_entry: A scheduler_models.HostQueueEntry object which
                            has got a host.
        """
        parent_id = queue_entry.job.parent_job_id
        if not parent_id:
            return
        if self.hosts_to_suites.get(queue_entry.host_id, None) == parent_id:
            logging.error('HQE (id: %d, parent_job_id: %d, host: %s) '
                          'seems already recorded', queue_entry.id,
                          parent_id, queue_entry.host.hostname)
            return
        num_hosts = self.suite_host_num.get(parent_id, 0)
        self.suite_host_num[parent_id] = num_hosts + 1
        self.hosts_to_suites[queue_entry.host_id] = parent_id
        logging.debug('Suite %d got host %s, currently holding %d hosts',
                      parent_id, queue_entry.host.hostname,
                      self.suite_host_num[parent_id])

    def record_release(self, hosts):
        """Update the record with a host releasing event.

        @param hosts: A list of scheduler_models.Host objects.
        """
        for host in hosts:
            if host.id in self.hosts_to_suites:
                parent_job_id = self.hosts_to_suites.pop(host.id)
                count = self.suite_host_num[parent_job_id] - 1
                if count == 0:
                    del self.suite_host_num[parent_job_id]
                else:
                    self.suite_host_num[parent_job_id] = count
                logging.debug(
                        'Suite %d releases host %s, currently holding '
                        '%d hosts', parent_job_id, host.hostname, count)

    def get_min_duts(self, suite_job_ids):
        """Figure out the min duts to request.

        Given a set of ids of suite jobs, figure out the minimum duts to
        request for each suite. It is determined by two factors: the
        min_duts specified for each suite in its job keyvals, and how many
        duts a suite is currently holding.

        @param suite_job_ids: A set of suite job ids.

        @returns: A dictionary; the key is suite_job_id and the value is
                  the minimum number of duts to request.
        """
        suite_min_duts = self.job_query_manager.get_min_duts_of_suites(
                suite_job_ids)
        for parent_id in suite_job_ids:
            min_duts = suite_min_duts.get(parent_id, 0)
            cur_duts = self.suite_host_num.get(parent_id, 0)
            suite_min_duts[parent_id] = max(0, min_duts - cur_duts)
        logging.debug('Minimum duts to get for suites (suite_id: '
                      'min_duts): %s', suite_min_duts)
        return suite_min_duts
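# A worked example (hypothetical ids) of how SuiteRecorder's two maps above
# evolve. Suppose suite job 100 is assigned hosts 1 and 2, then releases
# host 1:
#
#   after record_assignment(hqe on host 1):
#       suite_host_num  == {100: 1};  hosts_to_suites == {1: 100}
#   after record_assignment(hqe on host 2):
#       suite_host_num  == {100: 2};  hosts_to_suites == {1: 100, 2: 100}
#   after record_release([host 1]):
#       suite_host_num  == {100: 1};  hosts_to_suites == {2: 100}
#
# get_min_duts({100}) then subtracts the held count (1) from the suite's
# min_duts keyval, flooring the result at zero.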
def shard_heartbeat(shard_hostname, jobs=(), hqes=(), known_job_ids=(),
                    known_host_ids=(), known_host_statuses=()):
    """Receive updates for job statuses from shards and assign hosts and
    jobs.

    @param shard_hostname: Hostname of the calling shard.
    @param jobs: Jobs in serialized form that should be updated with newer
                 status from a shard.
    @param hqes: Hostqueueentries in serialized form that should be updated
                 with newer status from a shard. Note that for every
                 hostqueueentry the corresponding job must be in jobs.
    @param known_job_ids: List of ids of jobs the shard already has.
    @param known_host_ids: List of ids of hosts the shard already has.
    @param known_host_statuses: List of statuses of hosts the shard already
                                has.

    @returns: Serialized representations of hosts, jobs, suite job keyvals
              and their dependencies to be inserted into a shard's
              database.
    """
    # The following alternatives to sending host and job ids in every
    # heartbeat have been considered:
    # 1. Sending the highest known job and host ids. This would work for
    #    jobs: Newer jobs always have larger ids. Also, if a job is not
    #    assigned to a particular shard during a heartbeat, it never will
    #    be assigned to this shard later.
    #    This is not true for hosts though: A host that is leased won't be
    #    sent to the shard now, but might be sent in a future heartbeat.
    #    This means sometimes hosts should be transferred that have a
    #    lower id than the maximum host id the shard knows.
    # 2. Sending the number of jobs/hosts the shard knows to the master in
    #    each heartbeat. Compare these to the number of records that
    #    already have the shard_id set to this shard. In the normal case,
    #    they should match. In case they don't, resend all entities of
    #    that type.
    #    This would work well for hosts, because there aren't that many.
    #    Resending all jobs is quite a big overhead though.
    #    Also, this approach might run into edge cases when entities are
    #    ever deleted.
    # 3. Mixtures of the above: Use 1 for jobs and 2 for hosts.
    #    Using two different approaches isn't consistent and might cause
    #    confusion. Also the issues with the case of deletions might
    #    still occur.
    #
    # The overhead of sending all job and host ids in every heartbeat is
    # low: At peaks one board has about 1200 created but unfinished jobs.
    # See the numbers here: http://goo.gl/gQCGWH
    # Assuming that job ids have 6 digits and that json serialization
    # takes a comma and a space as overhead, the traffic per id sent is
    # about 8 bytes. If 5000 ids need to be sent, this means 40 kilobytes
    # of traffic.
    # A NOT IN query with 5000 ids took about 30ms in tests made.
    # These numbers seem low enough to outweigh the disadvantages of the
    # solutions described above.
    timer = autotest_stats.Timer('shard_heartbeat')
    with timer:
        shard_obj = rpc_utils.retrieve_shard(shard_hostname=shard_hostname)
        rpc_utils.persist_records_sent_from_shard(shard_obj, jobs, hqes)
        assert len(known_host_ids) == len(known_host_statuses)
        for i in range(len(known_host_ids)):
            host_model = models.Host.objects.get(pk=known_host_ids[i])
            if host_model.status != known_host_statuses[i]:
                host_model.status = known_host_statuses[i]
                host_model.save()

        hosts, jobs, suite_keyvals = rpc_utils.find_records_for_shard(
                shard_obj, known_job_ids=known_job_ids,
                known_host_ids=known_host_ids)
        return {
            'hosts': [host.serialize() for host in hosts],
            'jobs': [job.serialize() for job in jobs],
            'suite_keyvals': [kv.serialize() for kv in suite_keyvals],
        }
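# Sanity-checking the traffic estimate quoted in the comment block above
# (the numbers are the ones stated there, not new measurements): a 6-digit
# job id serialized as '123456, ' costs about 8 bytes per id.
id_bytes = 6 + len(', ')          # digits plus JSON separator overhead
print 5000 * id_bytes             # -> 40000 bytes, i.e. ~40 kilobytes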
"""Scheduler library classes. """ import collections import logging import common from autotest_lib.client.common_lib.cros.graphite import autotest_stats from autotest_lib.frontend import setup_django_environment from autotest_lib.frontend.afe import models from autotest_lib.server.cros.dynamic_suite import constants from autotest_lib.scheduler import scheduler_models from autotest_lib.scheduler import scheduler_lib _job_timer = autotest_stats.Timer('scheduler.job_query_manager') class AFEJobQueryManager(object): """Query manager for AFE Jobs.""" # A subquery to only get inactive hostless jobs. hostless_query = 'host_id IS NULL AND meta_host IS NULL' @_job_timer.decorate def get_pending_queue_entries(self, only_hostless=False): """ Fetch a list of new host queue entries. The ordering of this list is important, as every new agent we schedule can potentially contribute to the process count
def analyze_suites(start_time, end_time):
    """
    Calculates timing stats (i.e., suite runtime, scheduling overhead) for
    the suites that finished within the timestamps given by parameters.

    @param start_time: Beginning timestamp.
    @param end_time: Ending timestamp.
    """
    print('Analyzing suites from %s to %s...' %
          (time_utils.epoch_time_to_date_string(start_time),
           time_utils.epoch_time_to_date_string(end_time)))

    if _options.bvtonly:
        batch_constraints = [
                ('suite_name', ['bvt-inline', 'bvt-cq', 'bvt-perbuild'])]
    else:
        batch_constraints = []

    start_time_epoch = time_utils.to_epoch_time(start_time)
    end_time_epoch = time_utils.to_epoch_time(end_time)
    results = autotest_es.query(
            fields_returned=['suite_name', 'suite_job_id', 'board', 'build',
                             'num_child_jobs', 'duration'],
            equality_constraints=[
                ('_type', job_overhead.SUITE_RUNTIME_KEY),
            ],
            range_constraints=[
                ('time_recorded', start_time_epoch, end_time_epoch)],
            sort_specs=[{'time_recorded': 'asc'}],
            batch_constraints=batch_constraints)
    print('Found %d suites' % (results.total))

    for hit in results.hits:
        suite_job_id = hit['suite_job_id']

        try:
            suite_name = hit['suite_name']
            num_child_jobs = int(hit['num_child_jobs'])
            suite_runtime = float(hit['duration'])

            print('Suite: %s (%s), Board: %s, Build: %s, '
                  'Num child jobs: %d' %
                  (suite_name, suite_job_id, hit['board'], hit['build'],
                   num_child_jobs))

            suite_stats = get_scheduling_overhead(suite_job_id,
                                                  num_child_jobs)
            print('Suite: %s (%s) runtime: %f,' %
                  (suite_name, suite_job_id, suite_runtime)),
            print_suite_stats(suite_stats)

            if _options.cron_mode:
                key = utils.get_data_key('suite_time_stats', suite_name,
                                         hit['build'], hit['board'])
                autotest_stats.Timer(key).send('suite_runtime',
                                               suite_runtime)
                for stat, val in suite_stats.iteritems():
                    autotest_stats.Timer(key).send(stat, val)
        except Exception as e:
            print('ERROR: Exception is raised while processing suite %s' %
                  (suite_job_id))
            print e
def start(self):
    """Create and start a new timer."""
    self.timer = autotest_stats.Timer(self.name)
    self.timer.start()
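# A side-by-side sketch of the autotest_stats.Timer styles exercised across
# the snippets above; the 'example' key and the do_work()/do_step() helpers
# are placeholders, and only calls the snippets themselves use appear here.
timer = autotest_stats.Timer('example')

with timer:                      # context manager: time a block
    do_work()

with timer.get_client('step'):   # sub-timer reporting under 'example.step'
    do_step()

@timer.decorate                  # decorator: time every call
def timed_function():
    do_work()

timer.start()                    # explicit start/stop pair
do_work()
timer.stop()

timer.send('elapsed', 1.5)       # report an externally measured duration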