Example #1
def _stage_build_artifacts(build):
    """
    Ensure components of |build| necessary for installing images are staged.

    @param build: the image we want to stage.

    @raises StageControlFileFailure: if the dev server throws 500 while staging
        suite control files.

    @return: a tuple (ds, timings), where ds is the dev_server.ImageServer
        instance to use with this build and timings is a dictionary containing
        staging start/end times.
    """
    timings = {}
    # Ensure components of |build| necessary for installing images are staged
    # on the dev server. However set synchronous to False to allow other
    # components to be downloaded in the background.
    ds = dev_server.ImageServer.resolve(build)
    timings[constants.DOWNLOAD_STARTED_TIME] = formatted_now()
    timer = autotest_stats.Timer(
        'control_files.stage.%s' %
        (ds.get_server_name(ds.url()).replace('.', '_')))
    try:
        with timer:
            ds.stage_artifacts(build, ['test_suites'])
    except dev_server.DevServerException as e:
        raise error.StageControlFileFailure("Failed to stage %s: %s" %
                                            (build, e))
    timings[constants.PAYLOAD_FINISHED_TIME] = formatted_now()
    return (ds, timings)
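
The examples on this page exercise a handful of recurring autotest_stats.Timer
idioms. The sketch below collects those patterns in one place for reference;
the key name 'example_key' and the _do_work()/_tick() helpers are placeholders,
not taken from any one example.

from autotest_lib.client.common_lib.cros.graphite import autotest_stats


def _do_work():
    """Stand-in for whatever operation is being timed."""
    pass


# 1. Context manager: report how long a block took under the given key.
with autotest_stats.Timer('example_key'):
    _do_work()

# 2. Explicit start()/stop(): only report timing for the successful path.
timer = autotest_stats.Timer('example_key')
timer.start()
try:
    _do_work()
    timer.stop()
except Exception:
    pass

# 3. Sub-keys via get_client(): time named phases under one base key.
timer = autotest_stats.Timer('example_key')
with timer.get_client('phase_one'):
    _do_work()

# 4. Decorator: report the duration of every call to the wrapped function.
_timer = autotest_stats.Timer('example_key')


@_timer.decorate
def _tick():
    _do_work()


# 5. Manual send(): report an externally measured duration.
autotest_stats.Timer('example_key').send('time_used', 1.5)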
Example #2
    def collect_client_job_results(self):
        """ A method that collects all the current results of a running
        client job into the results dir. By default does nothing as no
        client job is running, but when running a client job you can override
        this with something that will actually do something. """

        # make an effort to wait for the machine to come up
        try:
            self.host.wait_up(timeout=30)
        except error.AutoservError:
            # don't worry about any errors, we'll try and
            # get the results anyway
            pass

        # Copy all dirs in default to results_dir
        timer = autotest_stats.Timer('collect_client_job_results')
        timer.start()
        try:
            self.host.get_file(self.client_results_dir + '/',
                               self.server_results_dir,
                               preserve_symlinks=True)

            # Only report time used for successful get_file calls.
            timer.stop()
        except Exception:
            # well, don't stop running just because we couldn't get logs
            e_msg = "Unexpected error copying test result logs, continuing ..."
            logging.error(e_msg)
            traceback.print_exc(file=sys.stdout)
def persist_records_sent_from_shard(shard, jobs, hqes):
    """
    Sanity check and save serialized records sent to the master from a shard.

    During heartbeats shards upload jobs and hostqueueentries. This performs
    some sanity checks on these and then updates the existing records for those
    entries with the updated ones from the heartbeat.

    The sanity checks include:
    - Checking if the objects sent already exist on the master.
    - Checking if the objects sent were assigned to this shard.
    - Checking that hostqueueentries are sent together with their jobs.

    @param shard: The shard the records were sent from.
    @param jobs: The jobs the shard sent.
    @param hqes: The hostqueueentries the shard sent.

    @raises error.UnallowedRecordsSentToMaster if any of the sanity checks fail.
    """
    timer = autotest_stats.Timer('shard_heartbeat')
    with timer.get_client('persist_jobs'):
        job_ids_sent = _persist_records_with_type_sent_from_shard(
                shard, jobs, models.Job)

    with timer.get_client('persist_hqes'):
        _persist_records_with_type_sent_from_shard(
                shard, hqes, models.HostQueueEntry, job_ids_sent=job_ids_sent)
Example #4
    def dispatchRequest(self, request):
        """
        Invoke a json RPC call from a decoded json request.
        @param request: a decoded json_request
        @returns a dictionary with keys id, result, err and err_traceback
        """
        results = self.blank_result_dict()

        try:
            results['id'] = self._getRequestId(request)
            methName = request['method']
            args = request['params']
        except KeyError:
            raise BadServiceRequest(request)

        autotest_stats.Counter('rpc').increment(methName)

        metadata = request.copy()
        metadata['_type'] = 'rpc'
        timer = autotest_stats.Timer('rpc', metadata=metadata)

        try:
            timer.start()
            meth = self.findServiceEndpoint(methName)
            results['result'] = self.invokeServiceEndpoint(meth, args)
        except Exception as err:
            results['err_traceback'] = traceback.format_exc()
            results['err'] = err
Example #5
def _run():
    """Report metadata in the queue until being aborted.
    """
    # Time when the upload first started failing. None if the last upload
    # succeeded.
    first_failed_upload = None
    # True if an email alert was sent after uploads had been failing
    # continuously for _MAX_UPLOAD_FAIL_DURATION seconds.
    email_alert = False
    upload_size = _MIN_RETRY_ENTRIES
    try:
        while True:
            start_time = time.time()
            data_list = []
            if (first_failed_upload and time.time() - first_failed_upload >
                    _MAX_UPLOAD_FAIL_DURATION):
                upload_size = _MIN_RETRY_ENTRIES
                if not email_alert:
                    _email_alert()
                    email_alert = True
            else:
                upload_size = min(upload_size * 2, _MAX_UPLOAD_SIZE)
            while (not metadata_queue.empty()
                   and len(data_list) < upload_size):
                data_list.append(metadata_queue.get_nowait())
            if data_list:
                if autotest_es.bulk_post(data_list=data_list):
                    time_used = time.time() - start_time
                    logging.info(
                        '%d entries of metadata uploaded in %s '
                        'seconds.', len(data_list), time_used)
                    autotest_stats.Timer('metadata_reporter').send(
                        'time_used', time_used)
                    autotest_stats.Gauge('metadata_reporter').send(
                        'entries_uploaded', len(data_list))
                    first_failed_upload = None
                    email_alert = False
                else:
                    logging.warning(
                        'Failed to upload %d entries of metadata, '
                        'they will be retried later.', len(data_list))
                    autotest_stats.Gauge('metadata_reporter').send(
                        'entries_failed', len(data_list))
                    for data in data_list:
                        queue(data)
                    if not first_failed_upload:
                        first_failed_upload = time.time()
            sleep_time = _REPORT_INTERVAL_SECONDS - time.time() + start_time
            if sleep_time < 0:
                sleep_time = 0.5
            _abort.wait(timeout=sleep_time)
    except Exception as e:
        logging.error('Metadata reporter thread failed with error: %s', e)
        raise
    finally:
        logging.info('Metadata reporting thread is exiting.')
        _abort.clear()
        _report_lock.release()
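
The retry sizing in _run() above grows geometrically while uploads keep being
attempted and snaps back to the minimum once uploads have been failing for
longer than _MAX_UPLOAD_FAIL_DURATION. A small illustrative loop (the constant
values here are made up, not the real configuration) shows the growth pattern:

_MIN_RETRY_ENTRIES = 10    # illustrative value only
_MAX_UPLOAD_SIZE = 1000    # illustrative value only

upload_size = _MIN_RETRY_ENTRIES
sizes = []
for _ in range(8):
    upload_size = min(upload_size * 2, _MAX_UPLOAD_SIZE)
    sizes.append(upload_size)
print sizes    # [20, 40, 80, 160, 320, 640, 1000, 1000]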
Example #6
    def run(self):
        """Wrapper around the thread's run method."""
        try:
            with autotest_stats.Timer(self.name):
                super(ExceptionRememberingThread, self).run()
        except Exception as self.err:
            logging.error(
                '%s raised an exception that will be re-raised by '
                'the thread pool manager.', self.getName())
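
The run() wrapper above stores the exception in self.err rather than letting it
escape the thread. A hypothetical consumer (not part of the original example)
would then join the thread and surface the remembered error, roughly like this:

def join_and_reraise(thread):
    """Hypothetical pool-manager helper: wait for an ExceptionRememberingThread
    to finish, then re-raise any exception its run() method remembered."""
    thread.join()
    if getattr(thread, 'err', None):
        raise thread.err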
Example #7
    def wait_for_restart(self,
                         timeout=DEFAULT_REBOOT_TIMEOUT,
                         down_timeout=WAIT_DOWN_REBOOT_TIMEOUT,
                         down_warning=WAIT_DOWN_REBOOT_WARNING,
                         log_failure=True,
                         old_boot_id=None,
                         **dargs):
        """ Wait for the host to come back from a reboot. This is a generic
        implementation based entirely on wait_up and wait_down. """
        key_string = 'Reboot.%s' % dargs.get('board')

        total_reboot_timer = autotest_stats.Timer(
            '%s.total' % key_string,
            metadata=self._construct_host_metadata('reboot_total'))
        wait_down_timer = autotest_stats.Timer(
            '%s.wait_down' % key_string,
            metadata=self._construct_host_metadata('reboot_down'))

        total_reboot_timer.start()
        wait_down_timer.start()
        if not self.wait_down(timeout=down_timeout,
                              warning_timer=down_warning,
                              old_boot_id=old_boot_id):
            if log_failure:
                self.record("ABORT", None, "reboot.verify", "shut down failed")
            raise error.AutoservShutdownError("Host did not shut down")
        wait_down_timer.stop()
        wait_up_timer = autotest_stats.Timer(
            '%s.wait_up' % key_string,
            metadata=self._construct_host_metadata('reboot_up'))
        wait_up_timer.start()
        if self.wait_up(timeout):
            self.record("GOOD", None, "reboot.verify")
            self.reboot_followup(**dargs)
            wait_up_timer.stop()
            total_reboot_timer.stop()
        else:
            self.record("ABORT", None, "reboot.verify",
                        "Host did not return from reboot")
            raise error.AutoservRebootError("Host did not return from reboot")
def retrieve_shard(shard_hostname):
    """
    Retrieves the shard with the given hostname from the database.

    @param shard_hostname: Hostname of the shard to retrieve

    @raises models.Shard.DoesNotExist, if no shard with this hostname was found.

    @returns: Shard object
    """
    timer = autotest_stats.Timer('shard_heartbeat.retrieve_shard')
    with timer:
        return models.Shard.smart_get(shard_hostname)
def find_records_for_shard(shard, known_job_ids, known_host_ids):
    """Find records that should be sent to a shard.

    @param shard: Shard to find records for.
    @param known_job_ids: List of ids of jobs the shard already has.
    @param known_host_ids: List of ids of hosts the shard already has.

    @returns: Tuple of three lists for hosts, jobs, and suite job keyvals:
              (hosts, jobs, suite_job_keyvals).
    """
    timer = autotest_stats.Timer('shard_heartbeat')
    with timer.get_client('find_hosts'):
        hosts = models.Host.assign_to_shard(shard, known_host_ids)
    with timer.get_client('find_jobs'):
        jobs = models.Job.assign_to_shard(shard, known_job_ids)
    with timer.get_client('find_suite_job_keyvals'):
        parent_job_ids = [job.parent_job_id for job in jobs]
        suite_job_keyvals = models.JobKeyval.objects.filter(
                job_id__in=parent_job_ids)
    return hosts, jobs, suite_job_keyvals
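
retrieve_shard(), find_records_for_shard() and persist_records_sent_from_shard()
above are pieces of the same shard-heartbeat flow. A hedged sketch (not the
actual RPC handler) of how they fit together:

def shard_heartbeat_sketch(shard_hostname, jobs, hqes,
                           known_job_ids, known_host_ids):
    """Illustrative wiring only: look up the shard, persist what it sent,
    then collect the records it should receive in return."""
    shard = retrieve_shard(shard_hostname)
    persist_records_sent_from_shard(shard, jobs, hqes)
    return find_records_for_shard(shard, known_job_ids, known_host_ids)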
Example #10
def _get_control_file_contents_by_name(build, ds, suite_name):
    """Return control file contents for |suite_name|.

    Query the dev server at |ds| for the control file |suite_name|, included
    in |build| for |board|.

    @param build: unique name by which to refer to the image from now on.
    @param ds: a dev_server.DevServer instance to fetch control file with.
    @param suite_name: canonicalized suite name, e.g. test_suites/control.bvt.
    @raises ControlFileNotFound if a unique suite control file doesn't exist.
    @raises NoControlFileList if we can't list the control files at all.
    @raises ControlFileEmpty if the control file exists on the server, but
                             can't be read.

    @return the contents of the desired control file.
    """
    getter = control_file_getter.DevServerGetter.create(build, ds)
    timer = autotest_stats.Timer('control_files.parse.%s.%s' %
                                 (ds.get_server_name(ds.url()).replace(
                                     '.', '_'), suite_name.rsplit('.')[-1]))
    # Get the control file for the suite.
    try:
        with timer:
            control_file_in = getter.get_control_file_contents_by_name(
                suite_name)
    except error.CrosDynamicSuiteException as e:
        raise type(e)("%s while testing %s." % (e, build))
    if not control_file_in:
        raise error.ControlFileEmpty("Fetching %s returned no data." %
                                     suite_name)
    # Force control files to only contain ascii characters.
    try:
        control_file_in.encode('ascii')
    except UnicodeDecodeError as e:
        raise error.ControlFileMalformed(str(e))

    return control_file_in
class HostScheduler(BaseHostScheduler):
    """A scheduler capable managing host acquisition for new jobs."""

    _timer = autotest_stats.Timer('host_scheduler')

    def __init__(self):
        super(HostScheduler, self).__init__()
        self.job_query_manager = query_managers.AFEJobQueryManager()
        # Keeping track of how many hosts each suite is holding
        # {suite_job_id: num_hosts}
        self._suite_recorder = SuiteRecorder(self.job_query_manager)

    def _record_host_assignment(self, host, queue_entry):
        """Record that |host| is assigned to |queue_entry|.

        Record:
            1. How long it takes to assign a host to a job in metadata db.
            2. Record host assignment of a suite.

        @param host: A Host object.
        @param queue_entry: A HostQueueEntry object.
        """
        secs_in_queued = (datetime.datetime.now() -
                          queue_entry.job.created_on).total_seconds()
        job_overhead.record_state_duration(queue_entry.job_id, host.hostname,
                                           job_overhead.STATUS.QUEUED,
                                           secs_in_queued)
        self._suite_recorder.record_assignment(queue_entry)

    @_timer.decorate
    def _schedule_jobs(self):
        """Schedule new jobs against hosts."""

        key = 'host_scheduler.jobs_per_tick'
        new_jobs_with_hosts = 0
        queue_entries = self.job_query_manager.get_pending_queue_entries(
            only_hostless=False)
        unverified_host_jobs = [
            job for job in queue_entries if not job.is_hostless()
        ]
        if not unverified_host_jobs:
            return
        for acquisition in self.find_hosts_for_jobs(unverified_host_jobs):
            self.schedule_host_job(acquisition.host, acquisition.job)
            self._record_host_assignment(acquisition.host, acquisition.job)
            new_jobs_with_hosts += 1
        autotest_stats.Gauge(key).send('new_jobs_with_hosts',
                                       new_jobs_with_hosts)
        autotest_stats.Gauge(key).send(
            'new_jobs_without_hosts',
            len(unverified_host_jobs) - new_jobs_with_hosts)

    @_timer.decorate
    def _lease_hosts_of_frontend_tasks(self):
        """Lease hosts of tasks scheduled through the frontend."""
        # We really don't need to get all the special tasks here, just the ones
        # without hqes, but reusing the method used by the scheduler ensures
        # we prioritize the same way.
        lease_hostnames = [
            task.host.hostname
            for task in self.job_query_manager.get_prioritized_special_tasks(
                only_tasks_with_leased_hosts=False)
            if task.queue_entry_id is None and not task.host.leased
        ]
        # Leasing a leased host here shouldn't be a problem:
        # 1. The only way a host can be leased is if it's been assigned to
        #    an active hqe or another similar frontend task, but doing so will
        #    have already precluded it from the list of tasks returned by the
        #    job_query_manager.
        # 2. The unleasing is done based on global conditions. Eg: Even if a
        #    task has already leased a host and we lease it again, the
        #    host scheduler won't release the host till both tasks are complete.
        if lease_hostnames:
            self.host_query_manager.set_leased(True,
                                               hostname__in=lease_hostnames)

    def acquire_hosts(self, host_jobs):
        """Override acquire_hosts.

        This method overrides the method in parent class.
        It figures out the set of suites that |host_jobs| belong to, gets the
        min_duts requirement for each suite, and pipes min_duts for each suite
        to the rdb.

        """
        parent_job_ids = set(
            [q.job.parent_job_id for q in host_jobs if q.job.parent_job_id])
        suite_min_duts = self._suite_recorder.get_min_duts(parent_job_ids)
        return rdb_lib.acquire_hosts(host_jobs, suite_min_duts)

    @_timer.decorate
    def tick(self):
        logging.info('Calling new tick.')
        logging.info('Leasing hosts for frontend tasks.')
        self._lease_hosts_of_frontend_tasks()
        logging.info('Finding hosts for new jobs.')
        self._schedule_jobs()
        logging.info('Releasing unused hosts.')
        released_hosts = self._release_hosts()
        logging.info('Updating suite assignment with released hosts')
        self._suite_recorder.record_release(released_hosts)
        logging.info('Calling email_manager.')
        email_manager.manager.send_queued_emails()
import contextlib
import logging
import time
from multiprocessing import pool

import base_event, board_enumerator, build_event
import task, timed_event

import common
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.server import utils

POOL_SIZE = 32

_timer = autotest_stats.Timer('suite_scheduler')


class Driver(object):
    """Implements the main loop of the suite_scheduler.

    @var EVENT_CLASSES: list of the event classes Driver supports.
    @var _LOOP_INTERVAL_SECONDS: seconds to wait between loop iterations.

    @var _scheduler: a DedupingScheduler, used to schedule jobs with the AFE.
    @var _enumerator: a BoardEnumerator, used to list platforms known to
                      the AFE.
    @var _events: dict of BaseEvents to be handled each time through main loop.
    """

    EVENT_CLASSES = [
"""

import logging
import socket
import subprocess
import sys

import common
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.frontend import database_settings_helper
from autotest_lib.scheduler import email_manager

# Format Appears as: [Date] [Time] - [Msg Level] - [Message]
LOGGING_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
STATS_KEY = 'db_optimize.%s' % socket.gethostname()
timer = autotest_stats.Timer(STATS_KEY)


@timer.decorate
def main_without_exception_handling():
    database_settings = database_settings_helper.get_default_db_config()
    command = [
        'mysqlcheck',
        '-o',
        database_settings['NAME'],
        '-u',
        database_settings['USER'],
        '-p%s' % database_settings['PASSWORD'],
        # we want to do db optimization on each master/slave
        # in rotation. Do not write the optimize-table statements to the bin
        # log so that they won't be picked up by slaves automatically
Example #14
class SiteDispatcher(object):
    """
    SiteDispatcher subclasses BaseDispatcher in monitor_db.
    """
    DEFAULT_REQUESTED_BY_USER_ID = 1

    _timer = autotest_stats.Timer('scheduler')
    _gauge = autotest_stats.Gauge('scheduler_rel')
    _tick_start = None

    @_timer.decorate
    def tick(self):
        self._tick_start = time.time()
        super(SiteDispatcher, self).tick()
        self._gauge.send('tick', time.time() - self._tick_start)

    @_timer.decorate
    def _garbage_collection(self):
        super(SiteDispatcher, self)._garbage_collection()
        if self._tick_start:
            self._gauge.send('_garbage_collection',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _run_cleanup(self):
        super(SiteDispatcher, self)._run_cleanup()
        if self._tick_start:
            self._gauge.send('_run_cleanup', time.time() - self._tick_start)

    @_timer.decorate
    def _find_aborting(self):
        super(SiteDispatcher, self)._find_aborting()
        if self._tick_start:
            self._gauge.send('_find_aborting', time.time() - self._tick_start)

    @_timer.decorate
    def _process_recurring_runs(self):
        super(SiteDispatcher, self)._process_recurring_runs()
        if self._tick_start:
            self._gauge.send('_process_recurring_runs',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_delay_tasks(self):
        super(SiteDispatcher, self)._schedule_delay_tasks()
        if self._tick_start:
            self._gauge.send('_schedule_delay_tasks',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_running_host_queue_entries(self):
        super(SiteDispatcher, self)._schedule_running_host_queue_entries()
        if self._tick_start:
            self._gauge.send('_schedule_running_host_queue_entries',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_special_tasks(self):
        super(SiteDispatcher, self)._schedule_special_tasks()
        if self._tick_start:
            self._gauge.send('_schedule_special_tasks',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _schedule_new_jobs(self):
        super(SiteDispatcher, self)._schedule_new_jobs()
        if self._tick_start:
            self._gauge.send('_schedule_new_jobs',
                             time.time() - self._tick_start)

    @_timer.decorate
    def _handle_agents(self):
        super(SiteDispatcher, self)._handle_agents()
        if self._tick_start:
            self._gauge.send('_handle_agents', time.time() - self._tick_start)

    def _reverify_hosts_where(self,
                              where,
                              print_message='Reverifying host %s'):
        """
        This is an altered version of _reverify_hosts_where that passes a
        requested_by argument to models.SpecialTask.objects.create, in order
        to allow the Reset task to be created properly.
        """
        full_where = 'locked = 0 AND invalid = 0 AND ' + where
        for host in scheduler_models.Host.fetch(where=full_where):
            if self.host_has_agent(host):
                # host has already been recovered in some way
                continue
            if self._host_has_scheduled_special_task(host):
                # host will have a special task scheduled on the next cycle
                continue
            if print_message:
                logging.error(print_message, host.hostname)
            try:
                user = models.User.objects.get(login='******')
            except models.User.DoesNotExist:
                user = models.User.objects.get(
                    id=SiteDispatcher.DEFAULT_REQUESTED_BY_USER_ID)
            models.SpecialTask.objects.create(
                task=models.SpecialTask.Task.RESET,
                host=models.Host.objects.get(id=host.id),
                requested_by=user)

    def _check_for_unrecovered_verifying_entries(self):
        # Verify is replaced by Reset.
        queue_entries = scheduler_models.HostQueueEntry.fetch(
            where='status = "%s"' % models.HostQueueEntry.Status.RESETTING)
        for queue_entry in queue_entries:
            special_tasks = models.SpecialTask.objects.filter(
                task__in=(models.SpecialTask.Task.CLEANUP,
                          models.SpecialTask.Task.VERIFY,
                          models.SpecialTask.Task.RESET),
                queue_entry__id=queue_entry.id,
                is_complete=False)
            if special_tasks.count() == 0:
                logging.error(
                    'Unrecovered Resetting host queue entry: %s. '
                    'Setting status to Queued.', str(queue_entry))
                # Essentially this host queue entry was set to Verifying, but
                # no special task exists for the entry. This occurs if the
                # scheduler dies between changing the status and creating the
                # special task. By setting it to Queued, the job can restart
                # from the beginning and proceed correctly. This is much
                # preferable to having monitor_db fail to launch.
                queue_entry.set_status('Queued')
Example #15
class SiteDroneManager(object):

    _timer = autotest_stats.Timer('drone_manager')

    def copy_to_results_repository(self,
                                   process,
                                   source_path,
                                   destination_path=None):
        """
        Copy results from the given process at source_path to destination_path
        in the results repository.

        This site subclassed version will only copy the results back for Special
        Agent Tasks (Cleanup, Verify, Repair) that reside in the hosts/
        subdirectory of results if the copy_task_results_back flag has been set
        to True inside global_config.ini

        It will also only copy .parse.log files back to the scheduler if the
        copy_parse_log_back flag in global_config.ini has been set to True.
        """
        if not ENABLE_ARCHIVING:
            return
        copy_task_results_back = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION,
            'copy_task_results_back',
            type=bool)
        copy_parse_log_back = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION, 'copy_parse_log_back', type=bool)
        special_task = source_path.startswith(HOSTS_JOB_SUBDIR)
        parse_log = source_path.endswith(PARSE_LOG)
        if (copy_task_results_back or
                not special_task) and (copy_parse_log_back or not parse_log):
            super(SiteDroneManager,
                  self).copy_to_results_repository(process, source_path,
                                                   destination_path)

    def kill_process(self, process):
        """
        Kill the given process.
        """
        logging.info('killing %s', process)
        drone = self._get_drone_for_process(process)
        drone.queue_kill_process(process)

    def _add_drone(self, hostname):
        """
        Forked from drone_manager.py

        Catches AutoservRunError if the drone fails initialization and does not
        add it to the list of usable drones.

        @param hostname: Hostname of the drone we are trying to add.
        """
        logging.info('Adding drone %s', hostname)
        drone = drones.get_drone(hostname)
        if drone:
            try:
                drone.call('initialize', self.absolute_path(''))
            except error.AutoservRunError as e:
                logging.error('Failed to initialize drone %s with error: %s',
                              hostname, e)
                return
            self._drones[drone.hostname] = drone

    @_timer.decorate
    def refresh(self):
        super(SiteDroneManager, self).refresh()

    @_timer.decorate
    def execute_actions(self):
        super(SiteDroneManager, self).execute_actions()
import argparse
import getpass
import logging
import os
import sys
from datetime import datetime

import common
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.server import frontend
from autotest_lib.server import utils

LOG_NAME_TEMPLATE = 'abort_suite-%s.log'
SUITE_JOB_NAME_TEMPLATE = '%s-test_suites/control.%s'
_timer = autotest_stats.Timer('abort_suites')


def parse_args():
    """
    Parse the arguments to this script.

    @return The arguments to this script.

    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--suite_name', dest='name')
    parser.add_argument('-i', '--build', dest='build')
    return parser.parse_args()

Example #17
from autotest_lib.client.common_lib import control_data
from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import utils
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.tko import db

try:
    from autotest_lib.server.site_common import site_utils as server_utils
except:
    from autotest_lib.server import utils as server_utils
form_ntuples_from_machines = server_utils.form_ntuples_from_machines

GLOBAL_CONFIG = global_config.global_config
DEFAULT_SERVER = 'autotest'

_tko_timer = autotest_stats.Timer('tko')


def dump_object(header, obj):
    """
    Standard way to print out the frontend objects (e.g. job, host, acl,
    label) in a human-readable fashion for debugging.
    """
    result = header + '\n'
    for key in obj.hash:
        if key == 'afe' or key == 'hash':
            continue
        result += '%20s: %s\n' % (key, obj.hash[key])
    return result
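
A quick usage sketch for dump_object(): the helper only needs an object exposing
a .hash dict, so a hypothetical stand-in record (field names are illustrative,
and dict ordering determines the line order) is enough to show the output
format:

class _FakeRecord(object):
    def __init__(self, fields):
        self.hash = fields


host = _FakeRecord({'hostname': 'host1', 'status': 'Ready', 'afe': None})
print dump_object('HOST RECORD', host)
# Prints something like (the 'afe' key is skipped):
# HOST RECORD
#             hostname: host1
#               status: Ready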

class BaseHostScheduler(object):
    """Base class containing host acquisition logic.

    This class contains all the core host acquisition logic needed by the
    scheduler to run jobs on hosts. It is only capable of releasing hosts
    back to the rdb through its tick; any other action must be instigated by
    the job scheduler.
    """

    _timer = autotest_stats.Timer('base_host_scheduler')
    host_assignment = collections.namedtuple('host_assignment',
                                             ['host', 'job'])

    def __init__(self):
        self.host_query_manager = query_managers.AFEHostQueryManager()

    @_timer.decorate
    def _release_hosts(self):
        """Release hosts to the RDB.

        Release all hosts that are ready and are currently not being used by an
        active hqe, and don't have a new special task scheduled against them.

        @return a list of hosts that are released.
        """
        release_hosts = self.host_query_manager.find_unused_healty_hosts()
        release_hostnames = [host.hostname for host in release_hosts]
        if release_hostnames:
            self.host_query_manager.set_leased(False,
                                               hostname__in=release_hostnames)
        return release_hosts

    @classmethod
    def schedule_host_job(cls, host, queue_entry):
        """Schedule a job on a host.

        Scheduling a job involves:
            1. Setting the active bit on the queue_entry.
            2. Scheduling a special task on behalf of the queue_entry.
        Performing these actions will lead the job scheduler through a chain of
        events, culminating in running the test and collecting results from
        the host.

        @param host: The host against which to schedule the job.
        @param queue_entry: The queue_entry to schedule.
        """
        if queue_entry.host_id is None:
            queue_entry.set_host(host)
        elif host.id != queue_entry.host_id:
            raise rdb_utils.RDBException(
                'The rdb returned host: %s '
                'but the job:%s was already assigned a host: %s. ' %
                (host.hostname, queue_entry.job_id, queue_entry.host.hostname))
        queue_entry.update_field('active', True)

        # TODO: crbug.com/373936. The host scheduler should only be assigning
        # jobs to hosts, but the criterion we use to release hosts depends
        # on it not being used by an active hqe. Since we're activating the
        # hqe here, we also need to schedule its first prejob task. OTOH,
        # we could converge to having the host scheduler manage all special
        # tasks, since their only use today is to verify/cleanup/reset a host.
        logging.info('Scheduling pre job tasks for entry: %s', queue_entry)
        queue_entry.schedule_pre_job_tasks()

    def acquire_hosts(self, host_jobs):
        """Accquire hosts for given jobs.

        This method sends jobs that need hosts to the rdb.
        Child classes can override this method to pipe more args
        to the rdb.

        @param host_jobs: A list of queue entries that either require hosts,
            or require host assignment validation through the rdb.

        @return: A generator that yields an rdb_hosts.RDBClientHostWrapper
                 for each host acquired on behalf of a queue_entry, or None
                 if a host wasn't found.
        """
        return rdb_lib.acquire_hosts(host_jobs)

    def find_hosts_for_jobs(self, host_jobs):
        """Find and verify hosts for a list of jobs.

        @param host_jobs: A list of queue entries that either require hosts,
            or require host assignment validation through the rdb.
        @return: A list of tuples of the form (host, queue_entry) for each
            valid host-queue_entry assignment.
        """
        jobs_with_hosts = []
        hosts = self.acquire_hosts(host_jobs)
        for host, job in zip(hosts, host_jobs):
            if host:
                jobs_with_hosts.append(self.host_assignment(host, job))
        return jobs_with_hosts

    @_timer.decorate
    def tick(self):
        """Schedule core host management activities."""
        self._release_hosts()
# TODO(crbug.com/464834): Snapshot clone is disabled until Moblab can
# support overlayfs or aufs, which requires a newer kernel.
SUPPORT_SNAPSHOT_CLONE = not IS_MOBLAB

# Number of seconds to wait for network to be up in a container.
NETWORK_INIT_TIMEOUT = 300
# Network bring up is slower in Moblab.
NETWORK_INIT_CHECK_INTERVAL = 2 if IS_MOBLAB else 0.1

# Type string for container related metadata.
CONTAINER_CREATE_METADB_TYPE = 'container_create'
CONTAINER_CREATE_RETRY_METADB_TYPE = 'container_create_retry'
CONTAINER_RUN_TEST_METADB_TYPE = 'container_run_test'

STATS_KEY = 'lxc.%s' % socket.gethostname().replace('.', '_')
timer = autotest_stats.Timer(STATS_KEY)
# Timer used inside a container should not include the hostname, as that
# would create an individual timer for each container.
container_timer = autotest_stats.Timer('lxc')


def _get_container_info_moblab(container_path, **filters):
    """Get a collection of container information in the given container path
    in a Moblab.

    TODO(crbug.com/457496): remove this method once python 3 can be installed
    in Moblab and lxc-ls command can use python 3 code.

    When running in Moblab, lxc-ls behaves differently than it does on a server
    with python 3 installed:
    1. lxc-ls returns a list of containers installed under /etc/lxc, the default
class UserCleanup(PeriodicCleanup):
    """User cleanup that is controlled by the global config variable
       clean_interval_minutes in the SCHEDULER section.
    """
    timer = autotest_stats.Timer('monitor_db_cleanup.user_cleanup')

    def __init__(self, db, clean_interval_minutes):
        super(UserCleanup, self).__init__(db, clean_interval_minutes)
        self._last_reverify_time = time.time()

    @timer.decorate
    def _cleanup(self):
        logging.info('Running periodic cleanup')
        self._abort_timed_out_jobs()
        self._abort_jobs_past_max_runtime()
        self._clear_inactive_blocks()
        self._check_for_db_inconsistencies()
        self._reverify_dead_hosts()
        self._django_session_cleanup()

    @timer.decorate
    def _abort_timed_out_jobs(self):
        msg = 'Aborting all jobs that have timed out and are not complete'
        logging.info(msg)
        query = models.Job.objects.filter(
            hostqueueentry__complete=False).extra(
                where=['created_on + INTERVAL timeout_mins MINUTE < NOW()'])
        for job in query.distinct():
            logging.warning('Aborting job %d due to job timeout', job.id)
            job.abort()

    @timer.decorate
    def _abort_jobs_past_max_runtime(self):
        """
        Abort executions that have started and are past the job's max runtime.
        """
        logging.info('Aborting all jobs that have passed maximum runtime')
        rows = self._db.execute("""
            SELECT hqe.id
            FROM afe_host_queue_entries AS hqe
            INNER JOIN afe_jobs ON (hqe.job_id = afe_jobs.id)
            WHERE NOT hqe.complete AND NOT hqe.aborted AND
            hqe.started_on + INTERVAL afe_jobs.max_runtime_mins MINUTE <
            NOW()""")
        query = models.HostQueueEntry.objects.filter(
            id__in=[row[0] for row in rows])
        for queue_entry in query.distinct():
            logging.warning('Aborting entry %s due to max runtime',
                            queue_entry)
            queue_entry.abort()

    @timer.decorate
    def _check_for_db_inconsistencies(self):
        logging.info('Cleaning db inconsistencies')
        self._check_all_invalid_related_objects()

    def _check_invalid_related_objects_one_way(self, first_model,
                                               relation_field, second_model):
        if 'invalid' not in first_model.get_field_dict():
            return []
        invalid_objects = list(first_model.objects.filter(invalid=True))
        first_model.objects.populate_relationships(invalid_objects,
                                                   second_model,
                                                   'related_objects')
        error_lines = []
        for invalid_object in invalid_objects:
            if invalid_object.related_objects:
                related_list = ', '.join(
                    str(related_object)
                    for related_object in invalid_object.related_objects)
                error_lines.append('Invalid %s %s is related to %ss: %s' %
                                   (first_model.__name__, invalid_object,
                                    second_model.__name__, related_list))
                related_manager = getattr(invalid_object, relation_field)
                related_manager.clear()
        return error_lines

    def _check_invalid_related_objects(self, first_model, first_field,
                                       second_model, second_field):
        errors = self._check_invalid_related_objects_one_way(
            first_model, first_field, second_model)
        errors.extend(
            self._check_invalid_related_objects_one_way(
                second_model, second_field, first_model))
        return errors

    def _check_all_invalid_related_objects(self):
        model_pairs = ((models.Host, 'labels', models.Label, 'host_set'),
                       (models.AclGroup, 'hosts', models.Host, 'aclgroup_set'),
                       (models.AclGroup, 'users', models.User,
                        'aclgroup_set'), (models.Test, 'dependency_labels',
                                          models.Label, 'test_set'))
        errors = []
        for first_model, first_field, second_model, second_field in model_pairs:
            errors.extend(
                self._check_invalid_related_objects(first_model, first_field,
                                                    second_model,
                                                    second_field))

        if errors:
            subject = ('%s relationships to invalid models, cleaned all' %
                       len(errors))
            message = '\n'.join(errors)
            logging.warning(subject)
            logging.warning(message)
            email_manager.manager.enqueue_notify_email(subject, message)

    @timer.decorate
    def _clear_inactive_blocks(self):
        msg = 'Clear out blocks for all completed jobs.'
        logging.info(msg)
        # this would be simpler using NOT IN (subquery), but MySQL
        # treats all IN subqueries as dependent, so this optimizes much
        # better
        self._db.execute("""
            DELETE ihq FROM afe_ineligible_host_queues ihq
            LEFT JOIN (SELECT DISTINCT job_id FROM afe_host_queue_entries
                       WHERE NOT complete) hqe
            USING (job_id) WHERE hqe.job_id IS NULL""")

    def _should_reverify_hosts_now(self):
        reverify_period_sec = (
            scheduler_config.config.reverify_period_minutes * 60)
        if reverify_period_sec == 0:
            return False
        return (self._last_reverify_time + reverify_period_sec) <= time.time()

    def _choose_subset_of_hosts_to_reverify(self, hosts):
        """Given hosts needing verification, return a subset to reverify."""
        max_at_once = scheduler_config.config.reverify_max_hosts_at_once
        if (max_at_once > 0 and len(hosts) > max_at_once):
            return random.sample(hosts, max_at_once)
        return sorted(hosts)

    @timer.decorate
    def _reverify_dead_hosts(self):
        if not self._should_reverify_hosts_now():
            return

        self._last_reverify_time = time.time()
        logging.info('Checking for dead hosts to reverify')
        hosts = models.Host.objects.filter(
            status=models.Host.Status.REPAIR_FAILED,
            locked=False,
            invalid=False)
        hosts = hosts.exclude(
            protection=host_protections.Protection.DO_NOT_VERIFY)
        if not hosts:
            return

        hosts = list(hosts)
        total_hosts = len(hosts)
        hosts = self._choose_subset_of_hosts_to_reverify(hosts)
        logging.info('Reverifying dead hosts (%d of %d) %s', len(hosts),
                     total_hosts, ', '.join(host.hostname for host in hosts))
        for host in hosts:
            models.SpecialTask.schedule_special_task(
                host=host, task=models.SpecialTask.Task.VERIFY)

    @timer.decorate
    def _django_session_cleanup(self):
        """Clean up django_session since django doesn't for us.
           http://www.djangoproject.com/documentation/0.96/sessions/
        """
        logging.info('Deleting old sessions from django_session')
        sql = 'TRUNCATE TABLE django_session'
        self._db.execute(sql)
class TwentyFourHourUpkeep(PeriodicCleanup):
    """Cleanup that runs at the startup of monitor_db and every subsequent
       twenty four hours.
    """
    timer = autotest_stats.Timer('monitor_db_cleanup.twentyfourhour_cleanup')

    def __init__(self, db, drone_manager, run_at_initialize=True):
        """Initialize TwentyFourHourUpkeep.

        @param db: Database connection object.
        @param drone_manager: DroneManager to access drones.
        @param run_at_initialize: True to run cleanup when scheduler starts.
                                  Default is set to True.

        """
        self.drone_manager = drone_manager
        clean_interval_minutes = 24 * 60  # 24 hours
        super(TwentyFourHourUpkeep,
              self).__init__(db,
                             clean_interval_minutes,
                             run_at_initialize=run_at_initialize)

    @timer.decorate
    def _cleanup(self):
        logging.info('Running 24 hour clean up')
        self._check_for_uncleanable_db_inconsistencies()
        self._cleanup_orphaned_containers()

    @timer.decorate
    def _check_for_uncleanable_db_inconsistencies(self):
        logging.info('Checking for uncleanable DB inconsistencies')
        self._check_for_active_and_complete_queue_entries()
        self._check_for_multiple_platform_hosts()
        self._check_for_no_platform_hosts()
        self._check_for_multiple_atomic_group_hosts()

    @timer.decorate
    def _check_for_active_and_complete_queue_entries(self):
        query = models.HostQueueEntry.objects.filter(active=True,
                                                     complete=True)
        if query.count() != 0:
            subject = ('%d queue entries found with active=complete=1' %
                       query.count())
            lines = []
            for entry in query:
                lines.append(str(entry.get_object_dict()))
                if entry.status == 'Aborted':
                    logging.error(
                        'Aborted entry: %s is both active and '
                        'complete. Setting active value to False.', str(entry))
                    entry.active = False
                    entry.save()
            self._send_inconsistency_message(subject, lines)

    @timer.decorate
    def _check_for_multiple_platform_hosts(self):
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname, COUNT(1) AS platform_count,
                   GROUP_CONCAT(afe_labels.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            WHERE afe_labels.platform
            GROUP BY afe_hosts.id
            HAVING platform_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple platforms' % self._db.rowcount
            lines = [' '.join(str(item) for item in row) for row in rows]
            self._send_inconsistency_message(subject, lines)

    @timer.decorate
    def _check_for_no_platform_hosts(self):
        rows = self._db.execute("""
            SELECT hostname
            FROM afe_hosts
            LEFT JOIN afe_hosts_labels
              ON afe_hosts.id = afe_hosts_labels.host_id
              AND afe_hosts_labels.label_id IN (SELECT id FROM afe_labels
                                                WHERE platform)
            WHERE NOT afe_hosts.invalid AND afe_hosts_labels.host_id IS NULL"""
                                )
        if rows:
            logging.warning('%s hosts with no platform\n%s', self._db.rowcount,
                            ', '.join(row[0] for row in rows))

    @timer.decorate
    def _check_for_multiple_atomic_group_hosts(self):
        rows = self._db.execute("""
            SELECT afe_hosts.id, hostname,
                   COUNT(DISTINCT afe_atomic_groups.name) AS atomic_group_count,
                   GROUP_CONCAT(afe_labels.name),
                   GROUP_CONCAT(afe_atomic_groups.name)
            FROM afe_hosts
            INNER JOIN afe_hosts_labels ON
                    afe_hosts.id = afe_hosts_labels.host_id
            INNER JOIN afe_labels ON afe_hosts_labels.label_id = afe_labels.id
            INNER JOIN afe_atomic_groups ON
                       afe_labels.atomic_group_id = afe_atomic_groups.id
            WHERE NOT afe_hosts.invalid AND NOT afe_labels.invalid
            GROUP BY afe_hosts.id
            HAVING atomic_group_count > 1
            ORDER BY hostname""")
        if rows:
            subject = '%s hosts with multiple atomic groups' % self._db.rowcount
            lines = [' '.join(str(item) for item in row) for row in rows]
            self._send_inconsistency_message(subject, lines)

    def _send_inconsistency_message(self, subject, lines):
        logging.error(subject)
        message = '\n'.join(lines)
        if len(message) > 5000:
            message = message[:5000] + '\n(truncated)\n'
        email_manager.manager.enqueue_notify_email(subject, message)

    @timer.decorate
    def _cleanup_orphaned_containers(self):
        """Cleanup orphaned containers in each drone.

        The function queues an lxc_cleanup call on each drone without waiting
        for the script to finish, as the cleanup procedure could take minutes;
        the script output is logged.

        """
        ssp_enabled = global_config.global_config.get_config_value(
            'AUTOSERV', 'enable_ssp_container')
        if not ssp_enabled:
            logging.info(
                'Server-side packaging is not enabled, no need to clean'
                ' up orphaned containers.')
            return
        self.drone_manager.cleanup_orphaned_containers()
import logging
import os
import re
from autotest_lib.client.common_lib import utils as client_utils
from autotest_lib.client.common_lib.cros import dev_server
from autotest_lib.client.common_lib.cros import retry
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.client.cros import constants
from autotest_lib.server.cros.dynamic_suite.constants import JOB_BUILD_KEY
from autotest_lib.server import utils

CRASH_SERVER_OVERLOAD = 'crash_server_overload'
CRASH_SERVER_FOUND = 'crash_server_found'
SYMBOLICATE_TIMEDOUT = 'symbolicate_timedout'

timer = autotest_stats.Timer('crash_collect')


def generate_minidump_stacktrace(minidump_path):
    """
    Generates a stacktrace for the specified minidump.

    This function expects the debug symbols to reside under:
        /build/<board>/usr/lib/debug

    @param minidump_path: absolute path to the minidump to be symbolicated.
    @raise client_utils.error.CmdError if minidump_stackwalk return code != 0.
    """
    symbol_dir = '%s/../../../lib/debug' % utils.get_server_dir()
    logging.info('symbol_dir: %s', symbol_dir)
    client_utils.run('minidump_stackwalk "%s" "%s" > "%s.txt"' %
class BaseDroneManager(object):
    """
    This class acts as an interface from the scheduler to drones, whether it be
    only a single "drone" for localhost or multiple remote drones.

    All paths going into and out of this class are relative to the full results
    directory, except for those returned by absolute_path().
    """


    # Minimum time to wait before next email
    # about a drone hitting process limit is sent.
    NOTIFY_INTERVAL = 60 * 60 * 24 # one day
    _STATS_KEY = 'drone_manager'
    _timer = autotest_stats.Timer(_STATS_KEY)


    def __init__(self):
        # absolute path of base results dir
        self._results_dir = None
        # holds Process objects
        self._process_set = set()
        # holds the list of all processes running on all drones
        self._all_processes = {}
        # maps PidfileId to PidfileContents
        self._pidfiles = {}
        # same as _pidfiles
        self._pidfiles_second_read = {}
        # maps PidfileId to _PidfileInfo
        self._registered_pidfile_info = {}
        # used to generate unique temporary paths
        self._temporary_path_counter = 0
        # maps hostname to Drone object
        self._drones = {}
        self._results_drone = None
        # maps results dir to dict mapping file path to contents
        self._attached_files = {}
        # heapq of _DroneHeapWrappers
        self._drone_queue = []
        # map drone hostname to time stamp of email that
        # has been sent about the drone hitting process limit.
        self._notify_record = {}
        # A threaded task queue used to refresh drones asynchronously.
        if _THREADED_DRONE_MANAGER:
            self._refresh_task_queue = thread_lib.ThreadedTaskQueue(
                    name='%s.refresh_queue' % self._STATS_KEY)
        else:
            self._refresh_task_queue = drone_task_queue.DroneTaskQueue()


    def initialize(self, base_results_dir, drone_hostnames,
                   results_repository_hostname):
        self._results_dir = base_results_dir

        for hostname in drone_hostnames:
            self._add_drone(hostname)

        if not self._drones:
            # all drones failed to initialize
            raise DroneManagerError('No valid drones found')

        self.refresh_drone_configs()

        logging.info('Using results repository on %s',
                     results_repository_hostname)
        self._results_drone = drones.get_drone(results_repository_hostname)
        results_installation_dir = global_config.global_config.get_config_value(
                scheduler_config.CONFIG_SECTION,
                'results_host_installation_directory', default=None)
        if results_installation_dir:
            self._results_drone.set_autotest_install_dir(
                    results_installation_dir)
        # don't initialize() the results drone - we don't want to clear out any
        # directories and we don't need to kill any processes


    def reinitialize_drones(self):
        self._call_all_drones('initialize', self._results_dir)


    def shutdown(self):
        for drone in self.get_drones():
            drone.shutdown()


    def _get_max_pidfile_refreshes(self):
        """
        Normally refresh() is called on every monitor_db.Dispatcher.tick().

        @returns: The number of refresh() calls before we forget a pidfile.
        """
        pidfile_timeout = global_config.global_config.get_config_value(
                scheduler_config.CONFIG_SECTION, 'max_pidfile_refreshes',
                type=int, default=2000)
        return pidfile_timeout


    def _add_drone(self, hostname):
        logging.info('Adding drone %s', hostname)
        drone = drones.get_drone(hostname)
        if drone:
            self._drones[drone.hostname] = drone
            drone.call('initialize', self.absolute_path(''))


    def _remove_drone(self, hostname):
        self._drones.pop(hostname, None)


    def refresh_drone_configs(self):
        """
        Reread global config options for all drones.
        """
        # The import of server_manager_utils is delayed rather than done at
        # the beginning of this module. The reason is that test_that imports
        # drone_manager when importing autoserv_utils, and that import happens
        # before test_that sets up django (test_that only sets up django in
        # setup_local_afe, since it's not needed when test_that runs tests on
        # lab duts through the :lab: option). Therefore, if
        # server_manager_utils were imported at the beginning of this module,
        # test_that would fail since django is not set up yet.
        from autotest_lib.site_utils import server_manager_utils
        config = global_config.global_config
        section = scheduler_config.CONFIG_SECTION
        config.parse_config_file()
        for hostname, drone in self._drones.iteritems():
            if server_manager_utils.use_server_db():
                server = server_manager_utils.get_servers(hostname=hostname)[0]
                attributes = dict([(a.attribute, a.value)
                                   for a in server.attributes.all()])
                drone.enabled = (
                        int(attributes.get('disabled', 0)) == 0)
                drone.max_processes = int(
                        attributes.get(
                            'max_processes',
                            scheduler_config.config.max_processes_per_drone))
                allowed_users = attributes.get('users', None)
            else:
                disabled = config.get_config_value(
                        section, '%s_disabled' % hostname, default='')
                drone.enabled = not bool(disabled)
                drone.max_processes = config.get_config_value(
                        section, '%s_max_processes' % hostname, type=int,
                        default=scheduler_config.config.max_processes_per_drone)

                allowed_users = config.get_config_value(
                        section, '%s_users' % hostname, default=None)
            if allowed_users:
                drone.allowed_users = set(allowed_users.split())
            else:
                drone.allowed_users = None
            logging.info('Drone %s.max_processes: %s', hostname,
                         drone.max_processes)
            logging.info('Drone %s.enabled: %s', hostname, drone.enabled)
            logging.info('Drone %s.allowed_users: %s', hostname,
                         drone.allowed_users)
            logging.info('Drone %s.support_ssp: %s', hostname,
                         drone.support_ssp)

        self._reorder_drone_queue() # max_processes may have changed
        # Clear notification record about reaching max_processes limit.
        self._notify_record = {}


    def get_drones(self):
        return self._drones.itervalues()


    def cleanup_orphaned_containers(self):
        """Queue cleanup_orphaned_containers call at each drone.
        """
        for drone in self._drones.values():
            logging.info('Queue cleanup_orphaned_containers at %s',
                         drone.hostname)
            drone.queue_call('cleanup_orphaned_containers')


    def _get_drone_for_process(self, process):
        return self._drones[process.hostname]


    def _get_drone_for_pidfile_id(self, pidfile_id):
        pidfile_contents = self.get_pidfile_contents(pidfile_id)
        assert pidfile_contents.process is not None
        return self._get_drone_for_process(pidfile_contents.process)


    def _drop_old_pidfiles(self):
        # use items() since the dict is modified in unregister_pidfile()
        for pidfile_id, info in self._registered_pidfile_info.items():
            if info.age > self._get_max_pidfile_refreshes():
                logging.warning('dropping leaked pidfile %s', pidfile_id)
                self.unregister_pidfile(pidfile_id)
            else:
                info.age += 1


    def _reset(self):
        self._process_set = set()
        self._all_processes = {}
        self._pidfiles = {}
        self._pidfiles_second_read = {}
        self._drone_queue = []


    def _call_all_drones(self, method, *args, **kwargs):
        all_results = {}
        for drone in self.get_drones():
            with self._timer.get_client(
                    '%s.%s' % (drone.hostname.replace('.', '_'), method)):
                all_results[drone] = drone.call(method, *args, **kwargs)
        return all_results
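
    # Illustrative note (not part of the original method): the per-drone stat
    # key built above is "<hostname with dots replaced by underscores>.<method>",
    # so a call fanned out to a hypothetical drone "drone1.example.com" for a
    # method named "refresh" would be timed under "drone1_example_com.refresh".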


    def _parse_pidfile(self, drone, raw_contents):
        """Parse raw pidfile contents.

        @param drone: The drone on which this pidfile was found.
        @param raw_contents: The raw contents of a pidfile, e.g.
            "pid\nexit_status\nnum_tests_failed\n".
        """
        contents = PidfileContents()
        if not raw_contents:
            return contents
        lines = raw_contents.splitlines()
        if len(lines) > 3:
            return InvalidPidfile('Corrupt pid file (%d lines):\n%s' %
                                  (len(lines), lines))
        try:
            pid = int(lines[0])
            contents.process = Process(drone.hostname, pid)
            # if len(lines) == 2, assume we caught Autoserv between writing
            # exit_status and num_failed_tests, so just ignore it and wait for
            # the next cycle
            if len(lines) == 3:
                contents.exit_status = int(lines[1])
                contents.num_tests_failed = int(lines[2])
        except ValueError as exc:
            return InvalidPidfile('Corrupt pid file: ' + str(exc.args))

        return contents
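
# Illustrative sketch (not part of the original class): the pidfile written by
# autoserv holds at most three lines -- pid, exit_status and num_tests_failed.
# Assuming a manager instance (hypothetical name) and a drone object, the
# possible outcomes of _parse_pidfile look roughly like this:
#
#   manager._parse_pidfile(drone, '')              # -> empty PidfileContents
#   manager._parse_pidfile(drone, '1234\n')        # -> process only; autoserv
#                                                  #    has not exited yet
#   manager._parse_pidfile(drone, '1234\n0\n0\n')  # -> process, exit_status=0,
#                                                  #    num_tests_failed=0
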
class SuiteRecorder(object):
    """Recording the host assignment for suites.

    The recorder holds two things:
        * suite_host_num, records how many duts a suite is holding,
          which is a map <suite_job_id -> num_of_hosts>
        * hosts_to_suites, records which host is assigned to which
          suite; it is a map <host_id -> suite_job_id>
    The two data structures are updated when a host is assigned to or released
    by a job.

    The reason to maintain hosts_to_suites is that, when a host is released,
    we need to know which suite it was leased to. Querying the db for the
    latest completed job that has run on a host is slow.  Therefore, we go with
    an alternative: keeping a <host id, suite job id> map
    in memory (for 10K hosts, the map should take less than 1 MB of memory on
    a 64-bit machine with Python 2.7).

    """

    _timer = autotest_stats.Timer('suite_recorder')

    def __init__(self, job_query_manager):
        """Initialize.

        @param job_query_manager: A JobQueryManager object.
        """
        self.job_query_manager = job_query_manager
        self.suite_host_num, self.hosts_to_suites = (
            self.job_query_manager.get_suite_host_assignment())

    def record_assignment(self, queue_entry):
        """Record that the hqe has got a host.

        @param queue_entry: A scheduler_models.HostQueueEntry object which has
                            got a host.
        """
        parent_id = queue_entry.job.parent_job_id
        if not parent_id:
            return
        if self.hosts_to_suites.get(queue_entry.host_id, None) == parent_id:
            logging.error(
                'HQE (id: %d, parent_job_id: %d, host: %s) '
                'seems already recorded', queue_entry.id, parent_id,
                queue_entry.host.hostname)
            return
        num_hosts = self.suite_host_num.get(parent_id, 0)
        self.suite_host_num[parent_id] = num_hosts + 1
        self.hosts_to_suites[queue_entry.host_id] = parent_id
        logging.debug('Suite %d got host %s, currently holding %d hosts',
                      parent_id, queue_entry.host.hostname,
                      self.suite_host_num[parent_id])

    def record_release(self, hosts):
        """Update the record with host releasing event.

        @param hosts: A list of scheduler_models.Host objects.
        """
        for host in hosts:
            if host.id in self.hosts_to_suites:
                parent_job_id = self.hosts_to_suites.pop(host.id)
                count = self.suite_host_num[parent_job_id] - 1
                if count == 0:
                    del self.suite_host_num[parent_job_id]
                else:
                    self.suite_host_num[parent_job_id] = count
                logging.debug(
                    'Suite %d releases host %s, currently holding %d hosts',
                    parent_job_id, host.hostname, count)

    def get_min_duts(self, suite_job_ids):
        """Figure out min duts to request.

        Given a set of suite job ids, figure out the minimum number of duts to
        request for each suite. This is determined by two factors: the
        min_duts specified for each suite in its job keyvals, and how many
        duts the suite is currently holding.

        @param suite_job_ids: A set of suite job ids.

        @returns: A dictionary, the key is suite_job_id, the value
                  is the minimum number of duts to request.
        """
        suite_min_duts = self.job_query_manager.get_min_duts_of_suites(
            suite_job_ids)
        for parent_id in suite_job_ids:
            min_duts = suite_min_duts.get(parent_id, 0)
            cur_duts = self.suite_host_num.get(parent_id, 0)
            suite_min_duts[parent_id] = max(0, min_duts - cur_duts)
        logging.debug(
            'Minimum duts to get for suites (suite_id: min_duts): %s',
            suite_min_duts)
        return suite_min_duts
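
# Illustrative usage sketch (hypothetical values, not part of the original
# class): get_min_duts() asks only for the shortfall between a suite's
# configured minimum and the hosts it already holds. Assuming the query
# manager reports min_duts=10 for suite job 42 and the recorder currently
# shows it holding 4 hosts:
#
#   recorder = SuiteRecorder(job_query_manager)
#   recorder.suite_host_num = {42: 4}
#   recorder.get_min_duts({42})   # -> {42: max(0, 10 - 4)} == {42: 6}
#
# A suite already holding at least its minimum yields 0 for that suite.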
Exemple #25
0
def shard_heartbeat(shard_hostname,
                    jobs=(),
                    hqes=(),
                    known_job_ids=(),
                    known_host_ids=(),
                    known_host_statuses=()):
    """Receive updates for job statuses from shards and assign hosts and jobs.

    @param shard_hostname: Hostname of the calling shard
    @param jobs: Jobs in serialized form that should be updated with newer
                 status from a shard.
    @param hqes: Hostqueueentries in serialized form that should be updated with
                 newer status from a shard. Note that for every hostqueueentry
                 the corresponding job must be in jobs.
    @param known_job_ids: List of ids of jobs the shard already has.
    @param known_host_ids: List of ids of hosts the shard already has.
    @param known_host_statuses: List of statuses of hosts the shard already has.

    @returns: Serialized representations of hosts, jobs, suite job keyvals
              and their dependencies to be inserted into a shard's database.
    """
    # The following alternatives to sending host and job ids in every heartbeat
    # have been considered:
    # 1. Sending the highest known job and host ids. This would work for jobs:
    #    Newer jobs always have larger ids. Also, if a job is not assigned to a
    #    particular shard during a heartbeat, it never will be assigned to this
    #    shard later.
    #    This is not true for hosts though: A host that is leased won't be sent
    #    to the shard now, but might be sent in a future heartbeat. This means
    #    sometimes hosts should be transferred that have a lower id than the
    #    maximum host id the shard knows.
    # 2. Send the number of jobs/hosts the shard knows to the master in each
    #    heartbeat. Compare these to the number of records that already have
    #    the shard_id set to this shard. In the normal case, they should match.
    #    In case they don't, resend all entities of that type.
    #    This would work well for hosts, because there aren't that many.
    #    Resending all jobs is quite a big overhead though.
    #    Also, this approach might run into edge cases if entities are
    #    ever deleted.
    # 3. Mixtures of the above: Use 1 for jobs and 2 for hosts.
    #    Using two different approaches isn't consistent and might cause
    #    confusion. Also the issues with the case of deletions might still
    #    occur.
    #
    # The overhead of sending all job and host ids in every heartbeat is low:
    # At peak, one board has about 1200 created but unfinished jobs.
    # See the numbers here: http://goo.gl/gQCGWH
    # Assuming that job ids have 6 digits and that json serialization adds a
    # comma and a space of overhead, the traffic per id sent is about 8 bytes.
    # If 5000 ids need to be sent, this means 40 kilobytes of traffic.
    # A NOT IN query with 5000 ids took about 30ms in tests made.
    # These numbers seem low enough to outweigh the disadvantages of the
    # solutions described above.
    timer = autotest_stats.Timer('shard_heartbeat')
    with timer:
        shard_obj = rpc_utils.retrieve_shard(shard_hostname=shard_hostname)
        rpc_utils.persist_records_sent_from_shard(shard_obj, jobs, hqes)
        assert len(known_host_ids) == len(known_host_statuses)
        for i in range(len(known_host_ids)):
            host_model = models.Host.objects.get(pk=known_host_ids[i])
            if host_model.status != known_host_statuses[i]:
                host_model.status = known_host_statuses[i]
                host_model.save()
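
        # Equivalent, more idiomatic form of the loop above (shown only as an
        # illustration; the behaviour is identical):
        #
        #   for host_id, status in zip(known_host_ids, known_host_statuses):
        #       host_model = models.Host.objects.get(pk=host_id)
        #       if host_model.status != status:
        #           host_model.status = status
        #           host_model.save()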

        hosts, jobs, suite_keyvals = rpc_utils.find_records_for_shard(
            shard_obj,
            known_job_ids=known_job_ids,
            known_host_ids=known_host_ids)
        return {
            'hosts': [host.serialize() for host in hosts],
            'jobs': [job.serialize() for job in jobs],
            'suite_keyvals': [kv.serialize() for kv in suite_keyvals],
        }
"""Scheduler library classes.
"""

import collections
import logging

import common

from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.frontend import setup_django_environment
from autotest_lib.frontend.afe import models
from autotest_lib.server.cros.dynamic_suite import constants
from autotest_lib.scheduler import scheduler_models
from autotest_lib.scheduler import scheduler_lib

_job_timer = autotest_stats.Timer('scheduler.job_query_manager')


class AFEJobQueryManager(object):
    """Query manager for AFE Jobs."""

    # A subquery to only get inactive hostless jobs.
    hostless_query = 'host_id IS NULL AND meta_host IS NULL'

    @_job_timer.decorate
    def get_pending_queue_entries(self, only_hostless=False):
        """
        Fetch a list of new host queue entries.

        The ordering of this list is important, as every new agent
        we schedule can potentially contribute to the process count
Exemple #27
0
def analyze_suites(start_time, end_time):
    """
    Calculates timing stats (i.e., suite runtime, scheduling overhead)
    for the suites that finished between the given timestamps.

    @param start_time: Beginning timestamp.
    @param end_time: Ending timestamp.
    """
    print('Analyzing suites from %s to %s...' %
          (time_utils.epoch_time_to_date_string(start_time),
           time_utils.epoch_time_to_date_string(end_time)))

    if _options.bvtonly:
        batch_constraints = [('suite_name',
                              ['bvt-inline', 'bvt-cq', 'bvt-perbuild'])]
    else:
        batch_constraints = []

    start_time_epoch = time_utils.to_epoch_time(start_time)
    end_time_epoch = time_utils.to_epoch_time(end_time)
    results = autotest_es.query(
        fields_returned=['suite_name', 'suite_job_id', 'board', 'build',
                         'num_child_jobs', 'duration'],
        equality_constraints=[('_type', job_overhead.SUITE_RUNTIME_KEY)],
        range_constraints=[('time_recorded', start_time_epoch,
                            end_time_epoch)],
        sort_specs=[{'time_recorded': 'asc'}],
        batch_constraints=batch_constraints)
    print('Found %d suites' % (results.total))

    for hit in results.hits:
        suite_job_id = hit['suite_job_id']

        try:
            suite_name = hit['suite_name']
            num_child_jobs = int(hit['num_child_jobs'])
            suite_runtime = float(hit['duration'])

            print('Suite: %s (%s), Board: %s, Build: %s, Num child jobs: %d' %
                  (suite_name, suite_job_id, hit['board'], hit['build'],
                   num_child_jobs))

            suite_stats = get_scheduling_overhead(suite_job_id, num_child_jobs)
            print('Suite: %s (%s) runtime: %f,' %
                  (suite_name, suite_job_id, suite_runtime)),
            print_suite_stats(suite_stats)

            if _options.cron_mode:
                key = utils.get_data_key('suite_time_stats', suite_name,
                                         hit['build'], hit['board'])
                autotest_stats.Timer(key).send('suite_runtime', suite_runtime)
                for stat, val in suite_stats.iteritems():
                    autotest_stats.Timer(key).send(stat, val)
        except Exception as e:
            print('ERROR: Exception raised while processing suite %s' %
                  (suite_job_id))
            print(e)
Exemple #28
0
    def start(self):
        """Create and start a new timer."""
        self.timer = autotest_stats.Timer(self.name)
        self.timer.start()
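
# Illustrative usage sketch (hypothetical stat name): autotest_stats.Timer can
# be driven in the three ways shown elsewhere in this document -- explicit
# start()/stop(), as a context manager, and via get_client() for per-sub-key
# timing:
#
#   timer = autotest_stats.Timer('my_stat')
#   timer.start()
#   ...                                      # timed work
#   timer.stop()
#
#   with autotest_stats.Timer('my_stat'):    # context-manager form
#       ...
#
#   with timer.get_client('sub_key'):        # per-sub-key form
#       ...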