Example #1
def QueryAndEmit(baselines, cursor):
    """Queries MySQL for important stats and emits Monarch metrics

    @param baselines: A dict containing the initial values for the cumulative
                      metrics.
    @param cursor: A MySQL database cursor used to run the queries.
    """

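    # Each cumulative status is emitted as a delta against the baseline value
    # captured when monitoring started.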
    for status in EMITTED_STATUSES_COUNTERS:
        delta = GetStatus(cursor, status) - baselines[status]
        metric_name = 'chromeos/autotest/afe_db/%s' % status.lower()
        metrics.Counter(metric_name).set(delta)

    for status in EMITTED_STATUS_GAUGES:
        metric_name = 'chromeos/autotest/afe_db/%s' % status.lower()
        metrics.Gauge(metric_name).set(GetStatus(cursor, status))

    pages_free = GetStatus(cursor, 'Innodb_buffer_pool_pages_free')
    pages_total = GetStatus(cursor, 'Innodb_buffer_pool_pages_total')

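    # The free and used page counts share one metric; the boolean 'used'
    # field distinguishes the two streams.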
    metrics.Gauge('chromeos/autotest/afe_db/buffer_pool_pages').set(
        pages_free, fields={'used': False})

    metrics.Gauge('chromeos/autotest/afe_db/buffer_pool_pages').set(
        pages_total - pages_free, fields={'used': True})
Example #2
def _modify_table(cursor, mysql_cmds, table):
    """Helper method to commit a list of mysql_cmds.

    @param cursor: mysql cursor instance.
    @param mysql_cmds: the list of sql cmds to be executed.
    @param table: the name of the table modified.
    """
    try:
        succeed = False
        for cmd in mysql_cmds:
            logging.info('running command: %s', cmd)
            cursor.execute(cmd)
        succeed = True
    except Exception as e:
        msg = ('Failed to run the following sql command:\n%s\nError:\n%s\n'
               'All changes made to the server db will be rolled back.'
               % (cmd, e))
        logging.error(msg)
        raise UpdateDatabaseException(msg)
    finally:
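        # Report metrics from the finally block so both successful and
        # rolled-back runs are recorded; the 'succeed' field tells them apart.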
        num_deletes = len([cmd for cmd in mysql_cmds
                           if cmd.startswith('DELETE')])
        num_inserts = len([cmd for cmd in mysql_cmds
                           if cmd.startswith('INSERT')])
        metrics.Gauge(_METRICS_PREFIX + '/inconsistency_fixed').set(
            num_deletes,
            fields={
                'table': table,
                'action': 'delete',
                'succeed': succeed
            })
        metrics.Gauge(_METRICS_PREFIX + '/inconsistency_fixed').set(
            num_inserts,
            fields={
                'table': table,
                'action': 'insert',
                'succeed': succeed
            })
Example #3
    def do_heartbeat(self):
        """Perform a heartbeat: Retrieve new jobs.

        This function executes a `shard_heartbeat` RPC. It retrieves the
        response of this call and processes the response by storing the returned
        objects in the local database.
        """
        heartbeat_metrics_prefix = 'chromeos/autotest/shard_client/heartbeat/'

        logging.info("Performing heartbeat.")
        packet = self._heartbeat_packet()
        metrics.Gauge(heartbeat_metrics_prefix + 'request_size').set(
            len(str(packet)))

        try:
            response = self.afe.run(HEARTBEAT_AFE_ENDPOINT, **packet)
        except urllib2.HTTPError as e:
            self._heartbeat_failure("HTTPError %d: %s" % (e.code, e.reason))
            return
        except urllib2.URLError as e:
            self._heartbeat_failure("URLError: %s" % e.reason)
            return
        except httplib.HTTPException as e:
            self._heartbeat_failure("HTTPException: %s" % e)
            return
        except timeout_util.TimeoutError as e:
            self._heartbeat_failure("TimeoutError: %s" % e)
            return

        metrics.Gauge(heartbeat_metrics_prefix + 'response_size').set(
            len(str(response)))
        self._mark_jobs_as_uploaded([job['id'] for job in packet['jobs']])
        self.process_heartbeat_response(response)
        logging.info("Heartbeat completed.")
Example #4
    def _report_packet_metrics(self, packet):
        """Report stats about outgoing packet to monarch."""
        metrics.Gauge(_METRICS_PREFIX + 'known_job_ids_count').set(
            len(packet['known_job_ids']))
        metrics.Gauge(_METRICS_PREFIX + 'jobs_upload_count').set(
            len(packet['jobs']))
        metrics.Gauge(_METRICS_PREFIX + 'known_host_ids_count').set(
            len(packet['known_host_ids']))
Example #5
    def process_heartbeat_response(self, heartbeat_response):
        """Save objects returned by a heartbeat to the local database.

        This deserializes hosts and jobs including their dependencies and saves
        them to the local database.

        @param heartbeat_response: A dictionary with keys 'hosts' and 'jobs',
                                   as returned by the `shard_heartbeat` rpc
                                   call.
        """
        hosts_serialized = heartbeat_response['hosts']
        jobs_serialized = heartbeat_response['jobs']
        suite_keyvals_serialized = heartbeat_response['suite_keyvals']
        incorrect_host_ids = heartbeat_response.get('incorrect_host_ids', [])

        metrics.Gauge('chromeos/autotest/shard_client/hosts_received').set(
            len(hosts_serialized))
        metrics.Gauge('chromeos/autotest/shard_client/jobs_received').set(
            len(jobs_serialized))
        metrics.Gauge(
            'chromeos/autotest/shard_client/suite_keyvals_received').set(
                len(suite_keyvals_serialized))

        self._deserialize_many(hosts_serialized, models.Host, 'host')
        self._deserialize_many(jobs_serialized, models.Job, 'job')
        self._deserialize_many(suite_keyvals_serialized, models.JobKeyval,
                               'jobkeyval')

        host_ids = [h['id'] for h in hosts_serialized]
        logging.info('Heartbeat response contains hosts %s', host_ids)
        job_ids = [j['id'] for j in jobs_serialized]
        logging.info('Heartbeat response contains jobs %s', job_ids)
        parent_jobs_with_keyval = set(
            [kv['job_id'] for kv in suite_keyvals_serialized])
        logging.info('Heartbeat response contains suite_keyvals for jobs %s',
                     list(parent_jobs_with_keyval))
        if incorrect_host_ids:
            logging.info(
                'Heartbeat response contains incorrect_host_ids %s '
                'which will be deleted.', incorrect_host_ids)
            self._remove_incorrect_hosts(incorrect_host_ids)

        # If the master has just sent any jobs that we think have completed,
        # re-sync them with the master. This is especially useful when a
        # heartbeat or job is silently dropped, as the next heartbeat will
        # have a disagreement. Updating the shard_id to NULL will mark these
        # jobs for upload on the next heartbeat.
        job_models = models.Job.objects.filter(id__in=job_ids,
                                               hostqueueentry__complete=True)
        if job_models:
            job_models.update(shard=None)
            job_ids_repr = ', '.join([str(job.id) for job in job_models])
            logging.warn(
                'Reset shard_id to NULL for the following completed jobs so '
                'they will be uploaded to the master again: %s', job_ids_repr)
Example #6
def clean_labels(options):
    """Cleans unused labels from the AFE database."""
    msg = 'Label cleaner starts. Will delete '
    if options.prefix:
        msg += 'all labels whose prefix is "%s".'
    else:
        msg += 'a label "%s".'
    logging.info(msg, options.label)
    logging.info('Target database: %s.', options.db_server)
    if options.check_status and not is_primary_server():
        raise Exception('Cannot run in a non-primary server')

    conn = MySQLdb.connect(
        host=options.db_server,
        user=options.db_user,
        passwd=options.db_password,
        db=DATABASE,
    )

    all_labels = fetch_labels(conn)
    logging.info('Found total %d labels', len(all_labels))
    metrics.Gauge(_METRICS_PREFIX + '/total_labels_count').set(
        len(all_labels),
        fields={
            'target_db': options.db_server,
            'label_prefix': '',
        },
    )

    labels = fetch_labels(conn, options.label, options.prefix)
    logging.info('Found total %d labels matching %s', len(labels),
                 options.label)
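    # Per-prefix counts are reported only for whitelisted prefixes, presumably
    # to keep the metric's field cardinality bounded.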
    if options.prefix and options.label in _LABEL_PREFIX_METRICS_WHITELIST:
        metrics.Gauge(_METRICS_PREFIX + '/total_labels_count').set(
            len(labels),
            fields={
                'target_db': options.db_server,
                'label_prefix': options.label,
            },
        )

    used_labels = get_used_labels(conn)
    logging.info('Found %d labels in use', len(used_labels))
    metrics.Gauge(_METRICS_PREFIX + '/used_labels_count').set(
        len(used_labels), fields={'target_db': options.db_server})

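    # Only labels that matched the query and are referenced by no host or job
    # get deleted.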
    to_delete = list(labels - used_labels)
    logging.info('Deleting %d unused labels', len(to_delete))
    delete_labels(conn, to_delete, options.max_delete, options.dry_run)
    metrics.Counter(_METRICS_PREFIX + '/labels_deleted').increment_by(
        len(to_delete), fields={'target_db': options.db_server})
Example #7
    def _report_failed_jobs_count(self, failed_jobs):
        """Report the number of outstanding failed offload jobs to monarch.

        @param failed_jobs: List of failed jobs.
        """
        metrics.Gauge('chromeos/autotest/gs_offloader/failed_jobs_count').set(
            len(failed_jobs))
Example #8
def _report_detected_errors(metric_name, gauge, fields=None):
    """Reports a gauge metric for errors detected.

    @param metric_name: Name of the metric to report about.
    @param gauge: Outstanding number of unrecoverable errors of this type.
    @param fields: Optional fields to include with the metric.
    """
    m = '%s/errors_detected/%s' % (_METRICS_PREFIX, metric_name)
    metrics.Gauge(m).set(gauge, fields=fields or {})
Example #9
    def do_heartbeat(self):
        """Perform a heartbeat: Retreive new jobs.

        This function executes a `shard_heartbeat` RPC. It retrieves the
        response of this call and processes the response by storing the returned
        objects in the local database.

        Returns: True if the heartbeat ran successfully, False otherwise.
        """

        logging.info("Performing heartbeat.")
        packet = self._heartbeat_packet()
        self._report_packet_metrics(packet)
        metrics.Gauge(_METRICS_PREFIX + 'request_size').set(len(str(packet)))

        try:
            response = self.afe.run(HEARTBEAT_AFE_ENDPOINT, **packet)
            logging.info('Finished heartbeat upload.')
        except urllib2.HTTPError as e:
            self._heartbeat_failure('HTTPError %d: %s' % (e.code, e.reason),
                                    'HTTPError')
            return False
        except urllib2.URLError as e:
            self._heartbeat_failure('URLError: %s' % e.reason, 'URLError')
            return False
        except httplib.HTTPException as e:
            self._heartbeat_failure('HTTPException: %s' % e, 'HTTPException')
            return False
        except timeout_util.TimeoutError as e:
            self._heartbeat_failure('TimeoutError: %s' % e, 'TimeoutError')
            return False
        except proxy.JSONRPCException as e:
            self._heartbeat_failure('JSONRPCException: %s' % e,
                                    'JSONRPCException')
            return False

        metrics.Gauge(_METRICS_PREFIX + 'response_size').set(
            len(str(response)))
        logging.info('Marking jobs as uploaded.')
        self._mark_jobs_as_uploaded([job['id'] for job in packet['jobs']])
        logging.info('Processing heartbeat response.')
        self.process_heartbeat_response(response)
        logging.info("Heartbeat completed.")
        return True
Example #10
def _run():
    """Report metadata in the queue until being aborted.
    """
    # Timestamp of the first failed upload; None if the last upload succeeded.
    first_failed_upload = None
    upload_size = _MIN_RETRY_ENTRIES

    try:
        while True:
            start_time = time.time()
            data_list = []
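            # Grow the upload batch size exponentially up to a cap; once
            # uploads have been failing longer than _MAX_UPLOAD_FAIL_DURATION,
            # drop back to the minimum batch size.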
            if (first_failed_upload and
                time.time() - first_failed_upload > _MAX_UPLOAD_FAIL_DURATION):
                upload_size = _MIN_RETRY_ENTRIES
            else:
                upload_size = min(upload_size * 2, _MAX_UPLOAD_SIZE)
            while not metadata_queue.empty() and len(data_list) < upload_size:
                data_list.append(metadata_queue.get_nowait())
            if data_list:
                success = False
                fields = _get_metrics_fields().copy()
                fields['success'] = success
                metrics.Gauge(
                        _METADATA_METRICS_PREFIX + 'upload/batch_sizes').set(
                                len(data_list), fields=fields)
                metrics.Counter(
                        _METADATA_METRICS_PREFIX + 'upload/attempts').increment(
                                fields=fields)

            metrics.Gauge(_METADATA_METRICS_PREFIX + 'queue_size').set(
                    metadata_queue.qsize(), fields=_get_metrics_fields())
            sleep_time = _REPORT_INTERVAL_SECONDS - time.time() + start_time
            if sleep_time < 0:
                sleep_time = 0.5
            _abort.wait(timeout=sleep_time)
    except Exception as e:
        logging.exception('Metadata reporter thread failed with error: %s', e)
        raise
    finally:
        logging.info('Metadata reporting thread is exiting.')
        _abort.clear()
        _report_lock.release()
Example #11
    def _compute_active_processes(self, drone):
        drone.active_processes = 0
        for pidfile_id, contents in self._pidfiles.iteritems():
            is_running = contents.exit_status is None
            on_this_drone = (contents.process
                             and contents.process.hostname == drone.hostname)
            if is_running and on_this_drone:
                info = self._registered_pidfile_info[pidfile_id]
                if info.num_processes is not None:
                    drone.active_processes += info.num_processes

        metrics.Gauge('chromeos/autotest/drone/active_processes').set(
            drone.active_processes, fields={'drone_hostname': drone.hostname})
Example #12
def main():
    """Counts the number of AFE jobs in the last day, then pushes the count to statsd"""
    parser = argparse.ArgumentParser(description=(
        'A script which records the number of afe jobs run in a time interval.'
    ))
    parser.parse_args(sys.argv[1:])
    count = number_of_jobs_since(timedelta(days=1))

    with site_utils.SetupTsMonGlobalState('count_jobs', short_lived=True):
        # TODO: Reporting a stat for each job created from the afe directly could be better.
        # More discussions are needed to decide whether to remove this file.
        metrics.Gauge(
            'chromeos/autotest/experimental/jobs_rate/afe_daily_count').set(
                count)
Example #13
def main(argv):
    """Entry point for dut_mon."""
    logging.getLogger().setLevel(logging.INFO)

    with ts_mon_config.SetupTsMonGlobalState('dut_mon', indirect=True):
        afe = frontend.AFE()
        counters = collections.defaultdict(lambda: 0)

        field_spec = [ts_mon.StringField('board'),
                      ts_mon.StringField('model'),
                      ts_mon.StringField('pool'),
                      ts_mon.BooleanField('is_locked'),
                      ts_mon.StringField('status'),
                      ]
        dut_count = metrics.Gauge('chromeos/autotest/dut_mon/dut_count',
                                  description='The number of duts in a given '
                                              'state and bucket.',
                                  field_spec=field_spec)
        tick_count = metrics.Counter('chromeos/autotest/dut_mon/tick',
                                     description='Tick counter of dut_mon.')

        while True:
            # Note: We reset all counters to zero in each loop rather than
            # creating a new defaultdict, because we want to ensure that any
            # gauges that were previously set to a nonzero value by this process
            # get set back to zero if necessary.
            for k in counters:
                counters[k] = 0

            logging.info('Fetching all hosts.')
            hosts = afe.get_hosts()
            logging.info('Fetched %s hosts.', len(hosts))
            for host in hosts:
                fields = _get_bucket_for_host(host)
                counters[fields] += 1

            for field, value in counters.iteritems():
                logging.info('%s %s', field, value)
                dut_count.set(value, fields=field.__dict__)

            tick_count.increment()
            logging.info('Sleeping for 2 minutes.')
            time.sleep(120)
Example #14
def check_proc(prog, max_elapsed_sec):
    """Check the number of long-running processes for a given program.

    Finds the number of processes for a given program that have run longer
    than a given elapsed time and reports that number to the stats dashboard.

    @param prog: Program name.
    @param max_elapsed_sec: Max elapsed time in seconds. Processes that
                            have run more than this value will be caught.
    """
    cmd = ('ps -eo etimes,args | grep "%s" | awk \'{if($1 > %d) print $0}\' | '
           'wc -l' % (prog, max_elapsed_sec))
    count = int(subprocess.check_output(cmd, shell=True))

    if prog not in PROGRAM_TO_CHECK_SET:
        prog = 'unknown'

    metrics.Gauge('chromeos/autotest/hung_processes').set(
        count, fields={'program': prog})
Example #15
    def _schedule_jobs(self):
        """Schedule new jobs against hosts."""

        new_jobs_with_hosts = 0
        queue_entries = self.job_query_manager.get_pending_queue_entries(
            only_hostless=False)
        unverified_host_jobs = [
            job for job in queue_entries if not job.is_hostless()
        ]
        if unverified_host_jobs:
            for acquisition in self.find_hosts_for_jobs(unverified_host_jobs):
                self.schedule_host_job(acquisition.host, acquisition.job)
                self._record_host_assignment(acquisition.host, acquisition.job)
                new_jobs_with_hosts += 1
            metrics.Counter('%s/new_jobs_with_hosts' %
                            _METRICS_PREFIX).increment_by(new_jobs_with_hosts)

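        # Jobs that could not be matched to a host this tick remain pending
        # and are reported as a gauge so the backlog is visible.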
        num_jobs_without_hosts = (len(unverified_host_jobs) -
                                  new_jobs_with_hosts)
        metrics.Gauge('%s/current_jobs_without_hosts' %
                      _METRICS_PREFIX).set(num_jobs_without_hosts)

        metrics.Counter('%s/tick' % _METRICS_PREFIX).increment()
Example #16
    def batch_acquire_hosts(self, host_requests):
        """Acquire hosts for a list of requests.

        The act of acquisition involves finding and leasing a set of hosts
        that match the parameters of a request. Each acquired host is added
        to the response_map dictionary as an RDBServerHostWrapper.

        @param host_requests: A list of requests to acquire hosts.
        """
        distinct_requests = 0

        logging.debug('Processing %s host acquisition requests',
                      len(host_requests))
        metrics.Gauge('chromeos/autotest/scheduler/pending_host_acq_requests'
                      ).set(len(host_requests))

        self.request_accountant = rdb_utils.RequestAccountant(host_requests)
        # First pass tries to satisfy min_duts for each suite.
        for request in self.request_accountant.requests:
            to_acquire = self.request_accountant.get_min_duts(request)
            if to_acquire > 0:
                self._acquire_hosts(request, to_acquire,
                                    is_acquire_min_duts=True)
            distinct_requests += 1

        # Second pass tries to allocate duts to the remaining unsatisfied
        # requests.
        for request in self.request_accountant.requests:
            to_acquire = self.request_accountant.get_duts(request)
            if to_acquire > 0:
                self._acquire_hosts(request, to_acquire,
                                    is_acquire_min_duts=False)

        self.cache.record_stats()
        logging.debug('Host acquisition stats: distinct requests: %s, leased '
                      'hosts: %s, unsatisfied requests: %s', distinct_requests,
                      self.leased_hosts_count, self.unsatisfied_requests)
Example #17
    def _report_current_jobs_count(self):
        """Report the number of outstanding jobs to monarch."""
        metrics.Gauge('chromeos/autotest/gs_offloader/current_jobs_count').set(
            len(self._open_jobs))
Example #18
class BaseDroneManager(object):
    """
    This class acts as an interface from the scheduler to drones, whether it be
    only a single "drone" for localhost or multiple remote drones.

    All paths going into and out of this class are relative to the full results
    directory, except for those returned by absolute_path().
    """

    # Minimum time to wait before next email
    # about a drone hitting process limit is sent.
    NOTIFY_INTERVAL = 60 * 60 * 24  # one day
    _STATS_KEY = 'drone_manager'

    _ACTIVE_PROCESS_GAUGE = metrics.Gauge(
        'chromeos/autotest/drone/active_processes')

    def __init__(self):
        # absolute path of base results dir
        self._results_dir = None
        # holds Process objects
        self._process_set = set()
        # holds the list of all processes running on all drones
        self._all_processes = {}
        # maps PidfileId to PidfileContents
        self._pidfiles = {}
        # same as _pidfiles
        self._pidfiles_second_read = {}
        # maps PidfileId to _PidfileInfo
        self._registered_pidfile_info = {}
        # used to generate unique temporary paths
        self._temporary_path_counter = 0
        # maps hostname to Drone object
        self._drones = {}
        self._results_drone = None
        # maps results dir to dict mapping file path to contents
        self._attached_files = {}
        # heapq of _DroneHeapWrappers
        self._drone_queue = []
        # A threaded task queue used to refresh drones asynchronously.
        if _THREADED_DRONE_MANAGER:
            self._refresh_task_queue = thread_lib.ThreadedTaskQueue(
                name='%s.refresh_queue' % self._STATS_KEY)
        else:
            self._refresh_task_queue = drone_task_queue.DroneTaskQueue()

    def initialize(self, base_results_dir, drone_hostnames,
                   results_repository_hostname):
        self._results_dir = base_results_dir

        for hostname in drone_hostnames:
            self._add_drone(hostname)

        if not self._drones:
            # all drones failed to initialize
            raise DroneManagerError('No valid drones found')

        self.refresh_drone_configs()

        logging.info('Using results repository on %s',
                     results_repository_hostname)
        self._results_drone = drones.get_drone(results_repository_hostname)
        results_installation_dir = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION,
            'results_host_installation_directory',
            default=None)
        if results_installation_dir:
            self._results_drone.set_autotest_install_dir(
                results_installation_dir)
        # don't initialize() the results drone - we don't want to clear out any
        # directories and we don't need to kill any processes

    def reinitialize_drones(self):
        for drone in self.get_drones():
            with metrics.SecondsTimer(
                    'chromeos/autotest/drone_manager/'
                    'reinitialize_drones_duration',
                    fields={'drone': drone.hostname}):
                drone.call('initialize', self._results_dir)

    def shutdown(self):
        for drone in self.get_drones():
            drone.shutdown()

    def _get_max_pidfile_refreshes(self):
        """
        Normally refresh() is called on every monitor_db.Dispatcher.tick().

        @returns: The number of refresh() calls before we forget a pidfile.
        """
        pidfile_timeout = global_config.global_config.get_config_value(
            scheduler_config.CONFIG_SECTION,
            'max_pidfile_refreshes',
            type=int,
            default=2000)
        return pidfile_timeout

    def _add_drone(self, hostname):
        logging.info('Adding drone %s', hostname)
        drone = drones.get_drone(hostname)
        if drone:
            self._drones[drone.hostname] = drone
            drone.call('initialize', self.absolute_path(''))

    def _remove_drone(self, hostname):
        self._drones.pop(hostname, None)

    def refresh_drone_configs(self):
        """
        Reread global config options for all drones.
        """
        # The import of server_manager_utils is delayed rather than done at
        # the beginning of this module because test_that imports drone_manager
        # when importing autoserv_utils, and that import happens before
        # test_that sets up django (test_that only sets up django in
        # setup_local_afe, since it is not needed when test_that runs tests on
        # lab duts through the :lab: option). If server_manager_utils were
        # imported at the beginning of this module, test_that would fail
        # because django is not yet set up.
        from autotest_lib.site_utils import server_manager_utils
        config = global_config.global_config
        section = scheduler_config.CONFIG_SECTION
        config.parse_config_file()
        for hostname, drone in self._drones.iteritems():
            if server_manager_utils.use_server_db():
                server = server_manager_utils.get_servers(hostname=hostname)[0]
                attributes = dict([(a.attribute, a.value)
                                   for a in server.attributes.all()])
                drone.enabled = (int(attributes.get('disabled', 0)) == 0)
                drone.max_processes = int(
                    attributes.get(
                        'max_processes',
                        scheduler_config.config.max_processes_per_drone))
                allowed_users = attributes.get('users', None)
            else:
                disabled = config.get_config_value(section,
                                                   '%s_disabled' % hostname,
                                                   default='')
                drone.enabled = not bool(disabled)
                drone.max_processes = config.get_config_value(
                    section,
                    '%s_max_processes' % hostname,
                    type=int,
                    default=scheduler_config.config.max_processes_per_drone)

                allowed_users = config.get_config_value(section,
                                                        '%s_users' % hostname,
                                                        default=None)
            if allowed_users:
                drone.allowed_users = set(allowed_users.split())
            else:
                drone.allowed_users = None
            logging.info('Drone %s.max_processes: %s', hostname,
                         drone.max_processes)
            logging.info('Drone %s.enabled: %s', hostname, drone.enabled)
            logging.info('Drone %s.allowed_users: %s', hostname,
                         drone.allowed_users)
            logging.info('Drone %s.support_ssp: %s', hostname,
                         drone.support_ssp)

        self._reorder_drone_queue()  # max_processes may have changed
        # Clear notification record about reaching max_processes limit.
        self._notify_record = {}

    def get_drones(self):
        return self._drones.itervalues()

    def cleanup_orphaned_containers(self):
        """Queue cleanup_orphaned_containers call at each drone.
        """
        for drone in self._drones.values():
            logging.info('Queue cleanup_orphaned_containers at %s',
                         drone.hostname)
            drone.queue_call('cleanup_orphaned_containers')

    def _get_drone_for_process(self, process):
        return self._drones[process.hostname]

    def _get_drone_for_pidfile_id(self, pidfile_id):
        pidfile_contents = self.get_pidfile_contents(pidfile_id)
        assert pidfile_contents.process is not None
        return self._get_drone_for_process(pidfile_contents.process)

    def _drop_old_pidfiles(self):
        # use items() since the dict is modified in unregister_pidfile()
        for pidfile_id, info in self._registered_pidfile_info.items():
            if info.age > self._get_max_pidfile_refreshes():
                logging.warning('dropping leaked pidfile %s', pidfile_id)
                self.unregister_pidfile(pidfile_id)
            else:
                info.age += 1

    def _reset(self):
        self._process_set = set()
        self._all_processes = {}
        self._pidfiles = {}
        self._pidfiles_second_read = {}
        self._drone_queue = []

    def _parse_pidfile(self, drone, raw_contents):
        """Parse raw pidfile contents.

        @param drone: The drone on which this pidfile was found.
        @param raw_contents: The raw contents of a pidfile, eg:
            "pid\nexit_staus\nnum_tests_failed\n".
        """
        contents = PidfileContents()
        if not raw_contents:
            return contents
        lines = raw_contents.splitlines()
        if len(lines) > 3:
            return InvalidPidfile('Corrupt pid file (%d lines):\n%s' %
                                  (len(lines), lines))
        try:
            pid = int(lines[0])
            contents.process = Process(drone.hostname, pid)
            # if len(lines) == 2, assume we caught Autoserv between writing
            # exit_status and num_failed_tests, so just ignore it and wait for
            # the next cycle
            if len(lines) == 3:
                contents.exit_status = int(lines[1])
                contents.num_tests_failed = int(lines[2])
        except ValueError as exc:
            return InvalidPidfile('Corrupt pid file: ' + str(exc.args))

        return contents
Example #19
    def PerformStage(self):
        """Perform the actual work for this stage.

    This includes final metadata archival, and update CIDB with our final status
    as well as producting a logged build result summary.
    """
        build_identifier, _ = self._run.GetCIDBHandle()
        build_id = build_identifier.cidb_id
        buildbucket_id = build_identifier.buildbucket_id
        if results_lib.Results.BuildSucceededSoFar(self.buildstore,
                                                   buildbucket_id, self.name):
            final_status = constants.BUILDER_STATUS_PASSED
        else:
            final_status = constants.BUILDER_STATUS_FAILED

        if not hasattr(self._run.attrs, 'release_tag'):
            # If, for some reason, the sync stage did not complete, then
            # release_tag was never set. Set it to None here because
            # ArchiveResults() depends on the existence of this attr.
            self._run.attrs.release_tag = None

        # Set up our report metadata.
        self._run.attrs.metadata.UpdateWithDict(
            self.GetReportMetadata(
                final_status=final_status,
                completion_instance=self._completion_instance))

        src_root = self._build_root
        # Workspace builders use a different buildroot for overlays.
        if self._run.config.workspace_branch and self._run.options.workspace:
            src_root = self._run.options.workspace

        # Add tags for the arches and statuses of the build. Computing arches
        # requires crossdev, which isn't available during the early part of
        # the build.
        arches = []
        for board in self._run.config['boards']:
            toolchains = toolchain.GetToolchainsForBoard(board,
                                                         buildroot=src_root)
            default = list(
                toolchain.FilterToolchains(toolchains, 'default', True))
            if default:
                try:
                    arches.append(toolchain.GetArchForTarget(default[0]))
                except cros_build_lib.RunCommandError as e:
                    logging.warning(
                        'Unable to retrieve arch for board %s default toolchain %s: %s',
                        board, default, e)
        tags = {
            'arches': arches,
            'status': final_status,
        }
        results = self._run.attrs.metadata.GetValue('results')
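        # Record each stage's status and summary as tags on the build.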
        for stage in results:
            tags['stage_status:%s' % stage['name']] = stage['status']
            tags['stage_summary:%s' % stage['name']] = stage['summary']
        self._run.attrs.metadata.UpdateKeyDictWithDict(constants.METADATA_TAGS,
                                                       tags)

        # Some operations can only be performed if a valid version is available.
        try:
            self._run.GetVersionInfo()
            self.ArchiveResults(final_status)
            metadata_url = os.path.join(self.upload_url,
                                        constants.METADATA_JSON)
        except cbuildbot_run.VersionNotSetError:
            logging.error('A valid version was never set for this run. '
                          'Can not archive results.')
            metadata_url = ''

        results_lib.Results.Report(sys.stdout,
                                   current_version=(self._run.attrs.release_tag
                                                    or ''))

        # Upload goma log if used for BuildPackage and TestSimpleChrome.
        _UploadAndLinkGomaLogIfNecessary(
            'BuildPackages', self._run.config.name, self._run.options.goma_dir,
            self._run.options.goma_client_json,
            self._run.attrs.metadata.GetValueWithDefault('goma_tmp_dir'))
        _UploadAndLinkGomaLogIfNecessary(
            'TestSimpleChromeWorkflow', self._run.config.name,
            self._run.options.goma_dir, self._run.options.goma_client_json,
            self._run.attrs.metadata.GetValueWithDefault(
                'goma_tmp_dir_for_simple_chrome'))

        if self.buildstore.AreClientsReady():
            status_for_db = final_status

            # TODO(pprabhu): After BuildData and CBuildbotMetadata are merged,
            # remove this extra temporary object creation.
            # XXX:HACK We're creating a BuildData with an empty URL. Don't try to
            # MarkGathered this object.
            build_data = metadata_lib.BuildData(
                '', self._run.attrs.metadata.GetDict())
            # TODO(akeshet): Find a clearer way to get the "primary upload url" for
            # the metadata.json file. One alternative is _GetUploadUrls(...)[0].
            # Today it seems that element 0 of its return list is the primary upload
            # url, but there is no guarantee or unit test coverage of that.
            self.buildstore.FinishBuild(build_id,
                                        status=status_for_db,
                                        summary=build_data.failure_message,
                                        metadata_url=metadata_url)

            duration = self._GetBuildDuration()

            mon_fields = {
                'status': status_for_db,
                'build_config': self._run.config.name,
                'important': self._run.config.important
            }
            metrics.Counter(
                constants.MON_BUILD_COMP_COUNT).increment(fields=mon_fields)
            metrics.CumulativeSecondsDistribution(
                constants.MON_BUILD_DURATION).add(duration, fields=mon_fields)

            if self._run.options.sanity_check_build:
                metrics.Counter(
                    constants.MON_BUILD_SANITY_COMP_COUNT).increment(
                        fields=mon_fields)
                metrics.Gauge(
                    constants.MON_BUILD_SANITY_ID,
                    description=
                    'The build number of the latest sanity build. Used '
                    'for recovering the link to the latest failing build '
                    'in the alert when a sanity build fails.',
                    field_spec=[
                        ts_mon.StringField('status'),
                        ts_mon.StringField('build_config'),
                        ts_mon.StringField('builder_name'),
                        ts_mon.BooleanField('important')
                    ]).set(self._run.buildnumber,
                           fields=dict(
                               mon_fields,
                               builder_name=self._run.GetBuilderName()))

            if config_lib.IsMasterCQ(self._run.config):
                self_destructed = self._run.attrs.metadata.GetValueWithDefault(
                    constants.SELF_DESTRUCTED_BUILD, False)
                mon_fields = {
                    'status': status_for_db,
                    'self_destructed': self_destructed
                }
                metrics.CumulativeSecondsDistribution(
                    constants.MON_CQ_BUILD_DURATION).add(duration,
                                                         fields=mon_fields)
                annotator_link = uri_lib.ConstructAnnotatorUri(build_id)
                logging.PrintBuildbotLink('Build annotator', annotator_link)

            # From this point forward, treat all exceptions as warnings.
            self._post_completion = True

            # Dump report about things we retry.
            retry_stats.ReportStats(sys.stdout)
Example #20
def create_mysql_updates(api_output, db_output, table, server_id_map,
                         warn_only):
    """Sync up the servers table in server db with the inventory service.

    First, entries that are in server db but not in the inventory service are
    deleted from the db. Then, entries that are in the inventory service but
    not in server db are inserted into server db.

    @param api_output: a dict mapping table name to list of corresponding
                       namedtuples parsed from inventory. This is the only
                       source of truth.
    @param db_output: a dict mapping table name to list of corresponding
                      namedtuples parsed from server db.
    @param table: name of the targeted server_db table.
    @param server_id_map: server hostname to id mapping dict.
    @param warn_only: whether it is warn_only. If yes, there will be no server
                      id for server_attributes and server_roles.

    @returns a list of mysql update commands, e.g.
        ['DELETE FROM a WHERE xx', 'INSERT ...']
    """
    logging.info('Checking table %s with inventory service...', table)

    mysql_cmds = []
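    # Entries present only in server db are stale and become DELETEs; entries
    # present only in the inventory service are new and become INSERTs.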
    delete_entries = set(db_output[table]) - set(api_output[table])
    insert_entries = set(api_output[table]) - set(db_output[table])

    if delete_entries:
        logging.info(
            '\nTable %s is not synced up! Below is a list of entries '
            'that exist only in server db. These invalid entries will be '
            'deleted from server db:\n%s', table, delete_entries)

        for entry in delete_entries:
            if table == 'servers':
                cmd = 'DELETE FROM servers WHERE hostname=%r' % entry.hostname
            elif table == 'server_attributes':
                cmd = (
                    'DELETE FROM server_attrs WHERE server_id=%d and attribute=%r'
                    % (server_id_map[entry.hostname], entry.attribute))
            else:
                cmd = (
                    'DELETE FROM server_roles WHERE server_id=%d and role=%r' %
                    (server_id_map[entry.hostname], entry.role))
            mysql_cmds.append(cmd)

    if insert_entries:
        logging.info(
            '\nTable %s is not synced up! Below is a list of entries '
            'that exist only in the inventory service. These new entries '
            'will be inserted into server db:\n%s', table, insert_entries)

        for entry in insert_entries:
            # If this is warn_only, it is very likely that the server id for a
            # new entry does not exist yet, since the server has not been
            # inserted into the servers table. For this case, fake it as 0.
            if warn_only and not server_id_map.get(entry.hostname):
                server_id = 0
            else:
                server_id = server_id_map[entry.hostname]

            if table == 'servers':
                cname = repr(entry.cname) if entry.cname else 'NULL'
                cmd = ('INSERT INTO servers (hostname, cname, status, note) '
                       'VALUES(%r, %s, %r, %r)' %
                       (entry.hostname, cname, entry.status, entry.note))
            elif table == 'server_attributes':
                cmd = (
                    'INSERT INTO server_attributes (server_id, attribute, value) '
                    'VALUES(%d, %r, %r)' %
                    (server_id, entry.attribute, entry.value))
            else:
                cmd = (
                    'INSERT INTO server_roles (server_id, role) VALUES(%d, %r)'
                    % (server_id, entry.role))
            mysql_cmds.append(cmd)

    metrics.Gauge(_METRICS_PREFIX + '/inconsistency_found').set(
        len(delete_entries), fields={
            'table': table,
            'action': 'to_delete'
        })
    metrics.Gauge(_METRICS_PREFIX + '/inconsistency_found').set(
        len(insert_entries), fields={
            'table': table,
            'action': 'to_add'
        })

    return mysql_cmds
Example #21
    @classmethod
    def _gauge_metrics(cls):
        """Report to monarch the number of running processes."""
        m = metrics.Gauge('chromeos/autotest/scheduler/postjob_tasks')
        m.set(cls._num_running_processes, fields={'task_name': cls.__name__})