def main(options):
    """Main script.

    @param options: Options to run the script.
    """
    config = logging_config.LoggingConfig()
    if options.logfile:
        config.add_file_handler(
                file_path=os.path.abspath(options.logfile),
                level=logging.DEBUG if options.verbose else logging.INFO)

    bucket = lxc.ContainerBucket()
    logging.info('')
    logging.info('Cleaning container bucket %s', bucket.container_path)
    success_count = 0
    failure_count = 0
    for container in bucket.get_all().values():
        if is_container_orphaned(container):
            if cleanup(container, options):
                success_count += 1
            else:
                failure_count += 1
    if options.execute:
        key = 'container_cleanup.%s' % socket.gethostname().replace('.', '_')
        autotest_stats.Gauge(key).send('success', success_count)
        autotest_stats.Gauge(key).send('failure', failure_count)
    logging.info('Cleanup finished.')
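# A minimal invocation sketch, assuming the flag names used above
# (--logfile, --verbose, --execute); the script's real option parser is
# defined elsewhere and may differ.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--logfile', default=None)
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--execute', action='store_true')
    main(parser.parse_args())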
def kill_processes(self, process_list):
    """Send signals escalating in severity to the processes in process_list.

    @param process_list: A list of drone_manager.Process objects
                         representing the processes to kill.
    """
    kill_proc_key = 'kill_processes'
    autotest_stats.Gauge(_STATS_KEY).send('%s.%s' % (kill_proc_key, 'net'),
                                          len(process_list))
    try:
        logging.info('List of processes to be killed: %s', process_list)

        # Group the processes by the signal sequence they should be
        # killed with.
        processes_to_kill = {}
        for p in process_list:
            signal_queue = self.get_signal_queue_to_kill(p)
            processes_to_kill[signal_queue] = (
                    processes_to_kill.get(signal_queue, []) + [p])

        sig_counts = {}
        for signal_queue, processes in processes_to_kill.iteritems():
            sig_counts.update(
                    utils.nuke_pids(
                            [-process.pid for process in processes],
                            signal_queue=signal_queue))
        for name, count in sig_counts.iteritems():
            autotest_stats.Gauge(_STATS_KEY).send(
                    '%s.%s' % (kill_proc_key, name), count)
    except error.AutoservRunError as e:
        self._warn('Error occurred when killing processes. Error: %s' % e)
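# A hedged sketch of what get_signal_queue_to_kill (not shown above) might
# return: an escalating tuple of signals, tried in order by
# utils.nuke_pids. A tuple is used because the queue serves as a dict key
# above, so it must be hashable. The process attribute checked below is an
# assumption for illustration.
import signal

def get_signal_queue_to_kill_example(process):
    # Hypothetical policy: give long-lived workers a graceful SIGTERM
    # first, hard-kill everything else.
    if getattr(process, 'name', '') == 'autoserv':
        return (signal.SIGTERM, signal.SIGKILL)
    return (signal.SIGKILL,)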
def do_heartbeat(self):
    """Perform a heartbeat: retrieve new jobs.

    This function executes a `shard_heartbeat` RPC. It retrieves the
    response of this call and processes the response by storing the
    returned objects in the local database.
    """
    logging.info("Performing heartbeat.")
    packet = self._heartbeat_packet()
    autotest_stats.Gauge(STATS_KEY).send('heartbeat.request_size',
                                         len(str(packet)))

    try:
        response = self.afe.run(HEARTBEAT_AFE_ENDPOINT, **packet)
    except urllib2.HTTPError as e:
        self._heartbeat_failure("HTTPError %d: %s" % (e.code, e.reason))
        return
    except urllib2.URLError as e:
        self._heartbeat_failure("URLError: %s" % e.reason)
        return
    except httplib.HTTPException as e:
        self._heartbeat_failure("HTTPException: %s" % e)
        return
    except timeout_util.TimeoutError as e:
        self._heartbeat_failure("TimeoutError: %s" % e)
        return

    autotest_stats.Gauge(STATS_KEY).send('heartbeat.response_size',
                                         len(str(response)))
    self._mark_jobs_as_uploaded([job['id'] for job in packet['jobs']])
    self.process_heartbeat_response(response)
    logging.info("Heartbeat completed.")
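# For orientation, a sketch of the packet shape. Only the 'jobs' key (a
# list of dicts carrying at least 'id') is implied by the code above; the
# other field names are assumptions about what _heartbeat_packet builds.
example_packet = {
    'shard_hostname': 'shard1',   # assumed field
    'jobs': [{'id': 42}],         # serialized jobs to upload to the master
    'known_job_ids': [],          # assumed field
    'known_host_ids': [],         # assumed field
}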
def _run():
    """Report metadata in the queue until being aborted."""
    # Time when the upload first failed. None if the last upload succeeded.
    first_failed_upload = None
    # True if an email alert was sent because uploads have been failing
    # continuously for _MAX_UPLOAD_FAIL_DURATION seconds.
    email_alert = False
    upload_size = _MIN_RETRY_ENTRIES
    try:
        while True:
            start_time = time.time()
            data_list = []
            if (first_failed_upload and
                time.time() - first_failed_upload >
                _MAX_UPLOAD_FAIL_DURATION):
                upload_size = _MIN_RETRY_ENTRIES
                if not email_alert:
                    _email_alert()
                    email_alert = True
            else:
                upload_size = min(upload_size * 2, _MAX_UPLOAD_SIZE)
            while (not metadata_queue.empty() and
                   len(data_list) < upload_size):
                data_list.append(metadata_queue.get_nowait())
            if data_list:
                if autotest_es.bulk_post(data_list=data_list):
                    time_used = time.time() - start_time
                    logging.info('%d entries of metadata uploaded in %s '
                                 'seconds.', len(data_list), time_used)
                    autotest_stats.Timer('metadata_reporter').send(
                            'time_used', time_used)
                    autotest_stats.Gauge('metadata_reporter').send(
                            'entries_uploaded', len(data_list))
                    first_failed_upload = None
                    email_alert = False
                else:
                    logging.warn('Failed to upload %d entries of metadata, '
                                 'they will be retried later.',
                                 len(data_list))
                    autotest_stats.Gauge('metadata_reporter').send(
                            'entries_failed', len(data_list))
                    for data in data_list:
                        queue(data)
                    if not first_failed_upload:
                        first_failed_upload = time.time()
            sleep_time = _REPORT_INTERVAL_SECONDS - time.time() + start_time
            if sleep_time < 0:
                sleep_time = 0.5
            _abort.wait(timeout=sleep_time)
    except Exception as e:
        logging.error('Metadata reporter thread failed with error: %s', e)
        raise
    finally:
        logging.info('Metadata reporting thread is exiting.')
        _abort.clear()
        _report_lock.release()
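# The batch-size logic above in isolation: the upload size doubles each
# cycle (capped at _MAX_UPLOAD_SIZE) and drops back to _MIN_RETRY_ENTRIES
# once uploads have been failing for _MAX_UPLOAD_FAIL_DURATION seconds.
# A minimal sketch; the default values below are assumptions, the real
# constants are defined elsewhere in this module.
def next_upload_size(current_size, failing_too_long,
                     min_entries=10, max_size=1000):
    if failing_too_long:
        return min_entries
    return min(current_size * 2, max_size)

# Healthy path: 10 -> 20 -> 40 -> 80 -> ... -> 1000, then steady at 1000.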
def process_heartbeat_response(self, heartbeat_response):
    """Save objects returned by a heartbeat to the local database.

    This deserializes hosts and jobs including their dependencies and
    saves them to the local database.

    @param heartbeat_response: A dictionary with keys 'hosts' and 'jobs',
                               as returned by the `shard_heartbeat` rpc
                               call.
    """
    hosts_serialized = heartbeat_response['hosts']
    jobs_serialized = heartbeat_response['jobs']
    suite_keyvals_serialized = heartbeat_response['suite_keyvals']

    autotest_stats.Gauge(STATS_KEY).send('hosts_received',
                                         len(hosts_serialized))
    autotest_stats.Gauge(STATS_KEY).send('jobs_received',
                                         len(jobs_serialized))
    autotest_stats.Gauge(STATS_KEY).send('suite_keyvals_received',
                                         len(suite_keyvals_serialized))

    self._deserialize_many(hosts_serialized, models.Host, 'host')
    self._deserialize_many(jobs_serialized, models.Job, 'job')
    self._deserialize_many(suite_keyvals_serialized, models.JobKeyval,
                           'jobkeyval')

    host_ids = [h['id'] for h in hosts_serialized]
    logging.info('Heartbeat response contains hosts %s', host_ids)
    job_ids = [j['id'] for j in jobs_serialized]
    logging.info('Heartbeat response contains jobs %s', job_ids)
    parent_jobs_with_keyval = set([kv['job_id']
                                   for kv in suite_keyvals_serialized])
    logging.info('Heartbeat response contains suite_keyvals for jobs %s',
                 list(parent_jobs_with_keyval))

    # If the master has just sent any jobs that we think have completed,
    # re-sync them with the master. This is especially useful when a
    # heartbeat or job is silently dropped, as the next heartbeat will
    # have a disagreement. Updating the shard_id to NULL will mark these
    # jobs for upload on the next heartbeat.
    job_models = models.Job.objects.filter(
            id__in=job_ids, hostqueueentry__complete=True)
    if job_models:
        job_models.update(shard=None)
        job_ids_repr = ', '.join([str(job.id) for job in job_models])
        logging.warn('The following completed jobs had shard_id reset to '
                     'NULL to be uploaded to the master again: %s',
                     job_ids_repr)
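# A minimal response shape accepted by process_heartbeat_response,
# inferred from the keys the method reads above; real entries carry the
# full serialized model fields, not just the ids shown here.
example_response = {
    'hosts': [{'id': 7}],
    'jobs': [{'id': 42}],
    'suite_keyvals': [{'job_id': 40}],
}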
def main():
    """Counts the number of AFE jobs in the last day, then pushes the
    count to statsd.
    """
    parser = argparse.ArgumentParser(
            description=('A script which records the number of afe jobs '
                         'run in a time interval.'))
    parser.parse_args(sys.argv[1:])
    count = number_of_jobs_since(timedelta(days=1))
    autotest_stats.Gauge('jobs_rate').send('afe_daily_count', count)
def _execute_calls(self, calls):
    autotest_stats.Gauge('drone_execute_call_count').send(
            self.hostname.replace('.', '_'), len(calls))
    return_message = self._execute_calls_impl(calls)
    for warning in return_message['warnings']:
        subject = 'Warning from drone %s' % self.hostname
        logging.warning(subject + '\n' + warning)
        email_manager.manager.enqueue_notify_email(subject, warning)
    return return_message['results']
def time_wait(server):
    """
    Submits a stat for the number of TIME_WAIT sockets that are on the
    server.

    @param server: The AFE server.
    """
    out = infra.execute_command(server, 'ss -o state time-wait | wc -l')
    stat = autotest_stats.Gauge(server, bare=True)
    # ss prints out a header for the columns also, so we subtract one to
    # report only the data lines.
    stat.send('time_wait_sockets', int(out.strip()) - 1)
def num_devserver_processes(server):
    """
    Submits a stat for the number of devserver processes that are on the
    server.

    @param server: The AFE server.
    """
    out = infra.execute_command(server, 'ps -C devserver.py | wc -l')
    stat = autotest_stats.Gauge(server, bare=True)
    # ps prints out a header for the columns also, so we subtract one to
    # report only the data lines.
    stat.send('num_devserver_processes', int(out.strip()) - 1)
def _schedule_jobs(self):
    """Schedule new jobs against hosts."""
    key = 'host_scheduler.jobs_per_tick'
    new_jobs_with_hosts = 0
    queue_entries = self.job_query_manager.get_pending_queue_entries(
            only_hostless=False)
    unverified_host_jobs = [job for job in queue_entries
                            if not job.is_hostless()]
    if not unverified_host_jobs:
        return
    for acquisition in self.find_hosts_for_jobs(unverified_host_jobs):
        self.schedule_host_job(acquisition.host, acquisition.job)
        self._record_host_assignment(acquisition.host, acquisition.job)
        new_jobs_with_hosts += 1
    autotest_stats.Gauge(key).send('new_jobs_with_hosts',
                                   new_jobs_with_hosts)
    autotest_stats.Gauge(key).send('new_jobs_without_hosts',
                                   len(unverified_host_jobs) -
                                   new_jobs_with_hosts)
def _compute_active_processes(self, drone):
    drone.active_processes = 0
    for pidfile_id, contents in self._pidfiles.iteritems():
        is_running = contents.exit_status is None
        on_this_drone = (contents.process and
                         contents.process.hostname == drone.hostname)
        if is_running and on_this_drone:
            info = self._registered_pidfile_info[pidfile_id]
            if info.num_processes is not None:
                drone.active_processes += info.num_processes
    autotest_stats.Gauge(self._STATS_KEY).send(
            '%s.%s' % (drone.hostname.replace('.', '_'),
                       'active_processes'),
            drone.active_processes)
def main():
    logging_manager.configure_logging(
            drone_logging_config.DroneLoggingConfig())
    with timer.get_client('decode'):
        calls = parse_input()
    args = _parse_args(sys.argv[1:])
    if args.call_time is not None:
        autotest_stats.Gauge(_STATS_KEY).send('invocation_overhead',
                                              time.time() - args.call_time)

    drone_utility = DroneUtility()
    return_value = drone_utility.execute_calls(calls)

    with timer.get_client('encode'):
        return_data(return_value)
def check_proc(prog, max_elapsed_sec):
    """Check the number of long-running processes for a given program.

    Finds the number of processes for a given program that have run
    longer than a given elapsed time, and sends the number to the stats
    dashboard.

    @param prog: Program name.
    @param max_elapsed_sec: Max elapsed time in seconds. Processes that
                            have run longer than this value are counted.
    """
    cmd = ('ps -eo etimes,args | grep "%s" | awk \'{if($1 > %d) print $0}\''
           ' | wc -l' % (prog, max_elapsed_sec))
    count = int(subprocess.check_output(cmd, shell=True))
    autotest_stats.Gauge(STATS_KEY).send(prog, count)
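# Example usage with hypothetical program names and thresholds: count
# apache2 processes that have been running for more than an hour, and
# gsutil processes older than 30 minutes.
check_proc('apache2', 60 * 60)
check_proc('gsutil', 30 * 60)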
def rpcs_per_sec(server):
    """
    Scrape the requests/sec number from the apache server-status page and
    submit it as a stat to statsd.

    @param server: The AFE server.
    """
    try:
        page = requests.get('http://%s/server-status' % server).text
    except requests.ConnectionError as e:
        logging.exception(e)
        return

    m = re.search(r"(\d+) requests/sec", page)
    if m:
        val = int(m.groups(0)[0])
        stat = autotest_stats.Gauge(server, bare=True)
        stat.send('requests_per_sec', val)
class RequestAccountant(object):
    """A helper class that counts requests and manages min_duts requirements.

    On initialization, this object takes a list of host requests. It
    batches the requests by grouping similar requests together and
    generates a mapping from unique request -> count of the request. It
    also generates a mapping from suite_job_id -> min_duts.

    The RDB does two rounds of host acquisition. The first round attempts
    to get min_duts for each suite. The second round attempts to satisfy
    the remaining requests. The RDB calls get_min_duts and get_duts to
    figure out how many duts it should attempt to get for a unique
    request in the first and second round respectively.

    Assume we have two distinct requests
          R1 (parent_job_id: 10, need hosts: 2)
          R2 (parent_job_id: 10, need hosts: 4)
    And parent job P (job_id: 10) has a min dut requirement of 3. So we
    get
          requests_to_counts = {R1: 2, R2: 4}
          min_duts_map = {P: 3}

    First round acquiring:
    Call get_min_duts(R1)
          returns 2, because P hasn't reached its min dut limit (3) yet
          requests_to_counts -> {R1: 2-2=0, R2: 4}
          min_duts_map -> {P: 3-2=1}

    Call get_min_duts(R2)
          returns 1, because although R2 needs 4 duts, P's min dut limit
          is now 1
          requests_to_counts -> {R1: 0, R2: 4-1=3}
          min_duts_map -> {P: 1-1=0}

    Second round acquiring:
    Call get_duts(R1):
          returns 0, requests_to_counts[R1]
    Call get_duts(R2):
          returns 3, requests_to_counts[R2]

    Note it is possible that in the first round of acquiring, although R1
    requested 2 duts, it may only get 1 or none. However, get_duts doesn't
    need to care whether the first round succeeded or not: if the first
    round failed, the request will not be fulfilled anyway, regardless of
    how many duts get_duts asks for.
    """

    _gauge = autotest_stats.Gauge(RDB_STATS_KEY)

    def __init__(self, host_requests):
        """Initialize.

        @param host_requests: A list of requests to acquire hosts.
        """
        self.requests_to_counts = {}
        # The order matters; it determines which request gets fulfilled
        # first.
        self.requests = []
        for request, count in self._batch_requests(host_requests):
            self.requests.append(request)
            self.requests_to_counts[request] = count
        self.min_duts_map = dict(
                (r.parent_job_id, r.suite_min_duts)
                for r in self.requests_to_counts.iterkeys()
                if r.parent_job_id)

    @classmethod
    def _batch_requests(cls, requests):
        """Group similar requests, sort by priority and parent_job_id.

        @param requests: A list of unsorted, unordered requests.

        @return: A list of tuples of the form (request, number of
                 occurrences) formed by counting the number of requests
                 with the same acls/deps/priority in the input list of
                 requests, and sorting by priority. The order of this
                 list ensures against priority inversion.
        """
        sort_function = lambda request: (request[0].priority,
                                         -request[0].parent_job_id)
        return sorted(collections.Counter(requests).items(),
                      key=sort_function, reverse=True)

    def get_min_duts(self, host_request):
        """Given a distinct host request, figure out the min duts to
        request for.

        @param host_request: A request.

        @returns: The minimum number of duts that should be requested.
        """
        parent_id = host_request.parent_job_id
        count = self.requests_to_counts[host_request]
        if parent_id:
            min_duts = self.min_duts_map.get(parent_id, 0)
            to_acquire = min(count, min_duts)
            self.min_duts_map[parent_id] = max(0, min_duts - to_acquire)
        else:
            to_acquire = 0
        self.requests_to_counts[host_request] -= to_acquire
        return to_acquire

    def get_duts(self, host_request):
        """Return the number of duts host_request still needs.

        @param host_request: A request.

        @returns: The number of duts that still need to be requested.
        """
        return self.requests_to_counts[host_request]

    @classmethod
    def record_acquire_min_duts(cls, host_request, hosts_required,
                                acquired_host_count):
        """Send stats to graphite about host acquisition.

        @param host_request: A request.
        @param hosts_required: Number of hosts required to satisfy the
                               request.
        @param acquired_host_count: Number of hosts acquired.
        """
        try:
            priority = priorities.Priority.get_string(host_request.priority)
        except ValueError:
            return
        key = 'min_duts_acquisition_rate.%s' % priority
        cls._gauge.send(key, acquired_host_count / float(hosts_required))
def offload_dir(dir_entry, dest_path):
    """Offload the specified directory entry to Google storage.

    @param dir_entry: Directory entry to offload.
    @param dest_path: Location in google storage where we will offload
                      the directory.
    """
    try:
        counter = autotest_stats.Counter(STATS_KEY)
        counter.increment('jobs_offload_started')

        sanitize_dir(dir_entry)

        if LIMIT_FILE_COUNT:
            limit_file_count(dir_entry)

        error = False
        stdout_file = tempfile.TemporaryFile('w+')
        stderr_file = tempfile.TemporaryFile('w+')
        process = None
        signal.alarm(OFFLOAD_TIMEOUT_SECS)
        gs_path = '%s%s' % (gs_uri, dest_path)
        process = subprocess.Popen(
                get_cmd_list(multiprocessing, dir_entry, gs_path),
                stdout=stdout_file, stderr=stderr_file)
        process.wait()
        signal.alarm(0)

        if process.returncode == 0:
            dir_size = get_directory_size_kibibytes(dir_entry)
            counter.increment('kibibytes_transferred_total', dir_size)
            metadata = {
                '_type': METADATA_TYPE,
                'size_KB': dir_size,
                'result_dir': dir_entry,
                'drone': socket.gethostname().replace('.', '_')
            }
            autotest_stats.Gauge(STATS_KEY, metadata=metadata).send(
                    'kibibytes_transferred', dir_size)
            counter.increment('jobs_offloaded')
            shutil.rmtree(dir_entry)
        else:
            error = True
    except TimeoutException:
        # If we finished the call to Popen(), we may need to
        # terminate the child process. We don't bother calling
        # process.poll(); that inherently races because the child
        # can die any time it wants.
        if process:
            try:
                process.terminate()
            except OSError:
                # We don't expect any error other than "No such
                # process".
                pass
        logging.error('Offloading %s timed out after waiting %d seconds.',
                      dir_entry, OFFLOAD_TIMEOUT_SECS)
        error = True
    except OSError as e:
        # A wrong file permission can lead the call
        # `shutil.rmtree(dir_entry)` to raise OSError with message
        # 'Permission denied'. Details can be found in crbug.com/536151.
        if e.errno == errno.EACCES:
            logging.warn('Try to correct file permission of %s.', dir_entry)
            correct_results_folder_permission(dir_entry)
    finally:
        signal.alarm(0)
        if error:
            # Rewind the log files for stdout and stderr and log
            # their contents.
            stdout_file.seek(0)
            stderr_file.seek(0)
            stderr_content = stderr_file.read()
            logging.error('Error occurred when offloading %s:', dir_entry)
            logging.error('Stdout:\n%s \nStderr:\n%s',
                          stdout_file.read(), stderr_content)
            # Some result files may have the wrong file permission. Try
            # to correct such errors so a later try can succeed.
            # TODO(dshi): The code is added to correct result files
            # with wrong file permission caused by bug 511778. After
            # this code is pushed to lab and run for a while to
            # clean up these files, following code and function
            # correct_results_folder_permission can be deleted.
            if 'CommandException: Error opening file' in stderr_content:
                logging.warn('Try to correct file permission of %s.',
                             dir_entry)
                correct_results_folder_permission(dir_entry)
        stdout_file.close()
        stderr_file.close()
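# The OFFLOAD_TIMEOUT_SECS alarm above relies on a SIGALRM handler,
# installed elsewhere in this module, that raises TimeoutException. A
# minimal sketch of that wiring, assuming the handler does nothing else:
import signal

class TimeoutException(Exception):
    """Raised when an offload attempt runs past OFFLOAD_TIMEOUT_SECS."""

def _handle_sigalrm(signum, frame):
    raise TimeoutException()

signal.signal(signal.SIGALRM, _handle_sigalrm)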
class SiteDispatcher(object):
    """
    SiteDispatcher subclasses BaseDispatcher in monitor_db.
    """
    DEFAULT_REQUESTED_BY_USER_ID = 1

    _timer = autotest_stats.Timer('scheduler')
    _gauge = autotest_stats.Gauge('scheduler_rel')
    _tick_start = None

    @_timer.decorate
    def tick(self):
        self._tick_start = time.time()
        super(SiteDispatcher, self).tick()
        self._gauge.send('tick', time.time() - self._tick_start)

    @_timer.decorate
    def _garbage_collection(self):
        super(SiteDispatcher, self)._garbage_collection()
        if self._tick_start:
            self._gauge.send('_garbage_collection',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _run_cleanup(self):
        super(SiteDispatcher, self)._run_cleanup()
        if self._tick_start:
            self._gauge.send('_run_cleanup',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _find_aborting(self):
        super(SiteDispatcher, self)._find_aborting()
        if self._tick_start:
            self._gauge.send('_find_aborting',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _process_recurring_runs(self):
        super(SiteDispatcher, self)._process_recurring_runs()
        if self._tick_start:
            self._gauge.send('_process_recurring_runs',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_delay_tasks(self):
        super(SiteDispatcher, self)._schedule_delay_tasks()
        if self._tick_start:
            self._gauge.send('_schedule_delay_tasks',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_running_host_queue_entries(self):
        super(SiteDispatcher, self)._schedule_running_host_queue_entries()
        if self._tick_start:
            self._gauge.send('_schedule_running_host_queue_entries',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_special_tasks(self):
        super(SiteDispatcher, self)._schedule_special_tasks()
        if self._tick_start:
            self._gauge.send('_schedule_special_tasks',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_new_jobs(self):
        super(SiteDispatcher, self)._schedule_new_jobs()
        if self._tick_start:
            self._gauge.send('_schedule_new_jobs',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _handle_agents(self):
        super(SiteDispatcher, self)._handle_agents()
        if self._tick_start:
            self._gauge.send('_handle_agents',
                             time.time() - self._tick_start)

    def _reverify_hosts_where(self, where,
                              print_message='Reverifying host %s'):
        """
        This is an altered version of _reverify_hosts_where: the call to
        models.SpecialTask.objects.create passes in an argument for
        requested_by, in order to allow the Reset task to be created
        properly.
        """
        full_where = 'locked = 0 AND invalid = 0 AND ' + where
        for host in scheduler_models.Host.fetch(where=full_where):
            if self.host_has_agent(host):
                # host has already been recovered in some way
                continue
            if self._host_has_scheduled_special_task(host):
                # host will have a special task scheduled on the next cycle
                continue
            if print_message:
                logging.error(print_message, host.hostname)
            try:
                user = models.User.objects.get(login='******')
            except models.User.DoesNotExist:
                user = models.User.objects.get(
                        id=SiteDispatcher.DEFAULT_REQUESTED_BY_USER_ID)
            models.SpecialTask.objects.create(
                    task=models.SpecialTask.Task.RESET,
                    host=models.Host.objects.get(id=host.id),
                    requested_by=user)

    def _check_for_unrecovered_verifying_entries(self):
        # Verify is replaced by Reset.
        queue_entries = scheduler_models.HostQueueEntry.fetch(
                where='status = "%s"' %
                      models.HostQueueEntry.Status.RESETTING)
        for queue_entry in queue_entries:
            special_tasks = models.SpecialTask.objects.filter(
                    task__in=(models.SpecialTask.Task.CLEANUP,
                              models.SpecialTask.Task.VERIFY,
                              models.SpecialTask.Task.RESET),
                    queue_entry__id=queue_entry.id,
                    is_complete=False)
            if special_tasks.count() == 0:
                logging.error('Unrecovered Resetting host queue entry: %s. '
                              'Setting status to Queued.', str(queue_entry))
                # Essentially this host queue entry was set to be Verifying
                # however no special task exists for the entry. This occurs
                # if the scheduler dies between changing the status and
                # creating the special task. By setting it to Queued, the
                # job can restart from the beginning and proceed correctly.
                # This is preferable to having monitor_db fail to launch.
                queue_entry.set_status('Queued')
def report_stats(board, pool, start_time, end_time, span):
    """Report machine stats for a given board, pool and time period.

    @param board: Name of the board.
    @param pool: Name of the pool.
    @param start_time: Start time to collect stats.
    @param end_time: End time to collect stats.
    @param span: Number of hours that the stats should be collected for.

    @return: Error message collected when calculating the stats.
    """
    print '================ %-12s %-12s ================' % (board, pool)
    try:
        history = host_history.get_history_details(start_time=start_time,
                                                   end_time=end_time,
                                                   board=board,
                                                   pool=pool)
    except host_history_utils.NoHostFoundException as e:
        print 'No history found. Error:\n%s' % e
        history = None

    mur = -1
    mar = -1
    mir = -1

    if history:
        status_intervals = host_history_utils.get_status_intervals(history)
        stats_all, num_hosts = host_history_utils.aggregate_hosts(
                status_intervals)
        total = 0
        total_time = span * 3600 * num_hosts
        for status, interval in stats_all.iteritems():
            total += interval
        if abs(total - total_time) > 10:
            error = ('Status intervals do not add up. No stats will be '
                     'collected for board: %s, pool: %s, diff: %s' %
                     (board, pool, total - total_time))
            hosts = []
            for history_for_host in status_intervals:
                total = 0
                for interval in history_for_host.keys():
                    total += interval[1] - interval[0]
                if total > span * 3600:
                    hosts.append(
                            history_for_host.values()[0]
                            ['metadata']['hostname'])
            error += ' hosts: %s' % ','.join(hosts)
            print error
            return error

        mur = host_history_utils.get_machine_utilization_rate(stats_all)
        mar = host_history_utils.get_machine_availability_rate(stats_all)
        mir = mar - mur

        for status, interval in stats_all.iteritems():
            print '%-18s %-16s %-10.2f%%' % (status, interval,
                                             100 * interval / total_time)
        print 'Machine utilization rate = %-4.2f%%' % (100 * mur)
        print 'Machine availability rate = %-4.2f%%' % (100 * mar)

    autotest_stats.Gauge('machine_utilization_rate').send(
            '%s_hours.%s.%s' % (span, board, pool), mur)
    autotest_stats.Gauge('machine_availability_rate').send(
            '%s_hours.%s.%s' % (span, board, pool), mar)
    autotest_stats.Gauge('machine_idle_rate').send(
            '%s_hours.%s.%s' % (span, board, pool), mir)
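# A worked example of the span accounting above, with assumed numbers and
# the assumption (taken only from the helper names, whose definitions are
# not shown here) that utilization counts Running time and availability
# counts non-Repair time:
span, num_hosts = 24, 10
total_time = span * 3600 * num_hosts          # 864000 host-seconds
running, ready = 600000.0, 264000.0           # hypothetical intervals
mur = running / total_time                    # ~0.694 utilization
mar = (running + ready) / total_time          # 1.0 if no repair/down time
mir = mar - mur                               # ~0.306 idle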
@classmethod
def _decrement_running_processes(cls):
    cls._num_running_processes -= 1
    autotest_stats.Gauge('scheduler').send(
            '%s.num_running_processes' % cls.__name__,
            cls._num_running_processes)