Ejemplo n.º 1
0
        def counters_inc(counter_name, failure_name):
            """Bump the ssh metrics counter selected by `counter_name`.

            Two counters are supported:
              'call' - records the outcome of each ssh invocation inside
                       _run(), including exceptions.
              'run'  - records each call to _run() with its result and how
                       many tries were made; recorded when _run() exits
                       (including exiting with an exception).
            Any other `counter_name` is silently ignored.

            Both counters are incremented with the same two fields: 'error'
            and 'attempt'.  'attempt' is read from `ssh_call_count`, which
            is not defined in this function (presumably a variable of the
            enclosing scope).

            @param counter_name: string indicating which counter to use
            @param failure_name: string identifying an error; any falsy
                                 value is reported as 'success'
            """
            if counter_name == 'call':
                metric_path = 'chromeos/autotest/ssh/calls'
            elif counter_name == 'run':
                metric_path = 'chromeos/autotest/ssh/runs'
            else:
                # Unknown counter names are a no-op.
                return

            metrics.Counter(metric_path).increment(fields={
                'error': failure_name or 'success',
                'attempt': ssh_call_count,
            })
def main(argv):
    """Entry point: run _main() in a loop until shutdown is requested.

    Configures logging, validates the command line (printing help and
    exiting with status 1 when validation fails), installs `handle_signal`
    for SIGINT and SIGTERM, and then calls `_main(options)` repeatedly while
    the module-level `_shutdown` flag is false, sleeping `options.sleep`
    seconds after each pass.

    Metrics: '/start' is incremented once; '/tick' is incremented with
    success=True after every completed pass and with success=False when
    anything escapes the loop (the exception is then re-raised).

    @param argv: Not used by this function; options come from
                 parse_options().
    """
    log_format = "%(asctime)s - %(name)s - " + "%(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)

    parser, options, args = parse_options()
    if not verify_options_and_args(options, args):
        parser.print_help()
        sys.exit(1)

    ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
        service_name='sync_server_db', indirect=True)
    with ts_mon_state:
        try:
            metrics.Counter(_METRICS_PREFIX + '/start').increment()
            logging.info("Setting signal handler")
            for signum in (signal.SIGINT, signal.SIGTERM):
                signal.signal(signum, handle_signal)

            while not _shutdown:
                _main(options)
                success_tick = metrics.Counter(_METRICS_PREFIX + '/tick')
                success_tick.increment(fields={'success': True})
                time.sleep(options.sleep)
        except:
            # Deliberately catches everything: the failure is only recorded
            # here and is always re-raised below.
            failure_tick = metrics.Counter(_METRICS_PREFIX + '/tick')
            failure_tick.increment(fields={'success': False})
            raise
def main():
    """Main entry: kill slow db queries in an endless loop.

    Every pass calls kill_slow_queries() and, when it reports killed
    queries, mails the log to `options.mail` and adds the returned count to
    the 'killed_slow_queries' counter.  The loop sleeps `options.timeout`
    seconds between passes and only ends through an exception, which is
    counted in 'failed_to_kill_query', logged and re-raised.

    @return: 1 when the command line options fail verification.
    """
    # Clear all loggers to make sure the following basicConfig take effect.
    logging.shutdown()
    reload(logging)
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.DEBUG)

    killed_metric = 'chromeos/autotest/afe_db/killed_slow_queries'
    failed_metric = 'chromeos/autotest/afe_db/failed_to_kill_query'

    ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
        service_name='kill_slow_queries', indirect=True)
    with ts_mon_state:
        count = 0
        parser, options, args = parse_options()
        options_ok = verify_options_and_args(options, args)
        if not options_ok:
            parser.print_help()
            return 1
        try:
            while True:
                result_log_strs, count = kill_slow_queries(
                    options.user, options.password, options.timeout)
                if result_log_strs:
                    mail_body = ('Below are killed queries:\n%s'
                                 % result_log_strs)
                    gmail_lib.send_email(
                        options.mail,
                        'Successfully killed slow autotest db queries',
                        mail_body)
                    metrics.Counter(killed_metric).increment_by(count)
                time.sleep(options.timeout)
        except Exception as e:
            metrics.Counter(failed_metric).increment()
            logging.error('Failed to kill slow db queries.\n%s', e)
            raise
Ejemplo n.º 4
0
 def __init__(self, tag, dependencies, triggers, host_class):
     """
     Construct a repair action.

     @param tag           Forwarded to the base class constructor, together
                          with the literal record type 'repair'.
     @param dependencies  Forwarded to the base class constructor.
     @param triggers      Stored as this action's trigger list.
     @param host_class    Stored on the instance as `host_class`.
     """
     super(RepairAction, self).__init__(tag, 'repair', dependencies)
     failure_modes_metric = 'chromeos/autotest/repair/failure_modes'
     failure_detail_metric = 'chromeos/autotest/repair/failure_detail'
     self._trigger_list = triggers
     self._failure_modes_counter = metrics.Counter(failure_modes_metric)
     self._failure_detail_counter = metrics.Counter(failure_detail_metric)
     self.host_class = host_class
Ejemplo n.º 5
0
 def _verify_store_infos(self, primary_info, shadow_info):
     """
     Compare the two infos, report the result and repair the shadow store.

     Equal infos only increment the refresh metric with
     validation_result='success'.  On a mismatch the mismatch callback is
     invoked, the metric is incremented with 'fail_mismatch', and
     `primary_info` is committed to the shadow store.

     @param primary_info: Info obtained from the primary store.
     @param shadow_info: Info obtained from the shadow store.
     """
     infos_match = primary_info == shadow_info
     if infos_match:
         success_fields = {'validation_result': 'success'}
         metrics.Counter(_REFRESH_METRIC_NAME).increment(
                 fields=success_fields)
         return

     self._mismatch_callback(primary_info, shadow_info)
     mismatch_fields = {'validation_result': 'fail_mismatch'}
     metrics.Counter(_REFRESH_METRIC_NAME).increment(fields=mismatch_fields)
     # Overwrite the shadow copy with the primary info.
     self._shadow_store.commit(primary_info)
Ejemplo n.º 6
0
 def _commit_to_shadow_store(self, info):
     """
     Commit `info` to the shadow store and report the outcome.

     A host_info.StoreError from the shadow store is logged and swallowed;
     either way the commit metric is incremented once, with
     file_commit_result set to 'fail' or 'success'.

     @param info: The info object to commit.
     """
     try:
         self._shadow_store.commit(info)
     except host_info.StoreError:
         logger.exception('shadow commit failed. '
                          'Expect primary / shadow desync in the future.')
         commit_result = 'fail'
     else:
         commit_result = 'success'

     metrics.Counter(_COMMIT_METRIC_NAME).increment(
         fields={'file_commit_result': commit_result})
Ejemplo n.º 7
0
    def __init__(self, verifier_data, repair_data, host_class):
        """
        Construct a `RepairStrategy` from simplified DAG data.

        `verifier_data` describes the verify nodes and the dependencies
        that relate them; `repair_data` describes the repair actions with
        their dependencies and triggers.

        Every verifier that no other verifier depends on becomes a
        dependency of a synthetic root verifier (tagged `ROOT_TAG`), which
        is stored as the entry point of the verification DAG.

        @param verifier_data  Iterable of (constructor, tag, deps) tuples
                              for the elements of the verification DAG and
                              their dependencies.
        @param repair_data    Iterable of (constructor, tag, deps,
                              triggers) tuples for the elements of the
                              repair action list, and their dependencies
                              and triggers.
        @param host_class     A string identifier for the class of host
                              this repair strategy targets; it is stored on
                              the instance and passed to every repair
                              action constructor.
        """
        # Metrics - 'actions' is reported for every repair action we
        # execute; 'strategy' is reported for every complete repair
        # operation.
        self._strategy_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_strategy_v2')
        self._actions_counter = metrics.Counter(
            'chromeos/autotest/repair/repair_actions')
        self.host_class = host_class

        # Tags are collected in a list (not a set) so that the root
        # verifier receives its dependencies in the order provided to us
        # by our caller.
        verifiers_by_tag = {}
        ordered_tags = []
        depended_upon = set()
        for verifier_ctor, verifier_tag, verifier_deps in verifier_data:
            self._add_verifier(verifiers_by_tag, verifier_ctor,
                               verifier_tag, verifier_deps)
            depended_upon.update(verifier_deps)
            ordered_tags.append(verifier_tag)

        # Capture all the verifiers that have nothing depending on them.
        root_tags = [tag for tag in ordered_tags if tag not in depended_upon]
        self._add_verifier(verifiers_by_tag, _RootVerifier,
                           self.ROOT_TAG, root_tags)
        self._verify_root = verifiers_by_tag[self.ROOT_TAG]

        self._repair_actions = []
        for action_ctor, action_tag, action_deps, triggers in repair_data:
            # Resolve tags to the verifier objects built above.
            dep_verifiers = [verifiers_by_tag[d] for d in action_deps]
            trigger_verifiers = [verifiers_by_tag[t] for t in triggers]
            action = action_ctor(action_tag, dep_verifiers,
                                 trigger_verifiers, self.host_class)
            self._repair_actions.append(action)
Ejemplo n.º 8
0
def main():
    """Run the host scheduler tick loop.

    Exits with status 0 right away when `_monitor_db_host_acquisition` is
    set.  Otherwise parses the arguments, checks the production settings
    and (when the server database is in use) that localhost has the
    `host_scheduler` role, initializes, and then ticks a HostScheduler
    until `_shutdown` is set or the configured lifetime expires.  Each
    iteration is padded with a sleep so that it lasts at least
    `host_scheduler_minimum_tick_sec` seconds.

    Queued emails are always sent and the database connection is always
    closed on the way out.  Exceptions are re-raised; all of them except
    ServerActionError also increment the 'uncaught_exception' counter.
    """
    if _monitor_db_host_acquisition:
        logging.info('Please set inline_host_acquisition=False in the shadow '
                     'config before starting the host scheduler.')
        sys.exit(0)
    try:
        options = parse_arguments(sys.argv[1:])
        scheduler_lib.check_production_settings(options)

        # If server database is enabled, check if the server has role
        # `host_scheduler`. If the server does not have host_scheduler role,
        # exception will be raised and host scheduler will not continue to run.
        if server_manager_utils.use_server_db():
            server_manager_utils.confirm_server_has_role(hostname='localhost',
                                                         role='host_scheduler')

        initialize(options.testing)

        ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
            'autotest_host_scheduler',
            indirect=True,
            debug_file=options.metrics_file,
        )
        with ts_mon_state:
            metrics.Counter('%s/start' % _METRICS_PREFIX).increment()
            process_start_time = time.time()
            host_scheduler = HostScheduler()
            minimum_tick_sec = global_config.global_config.get_config_value(
                'SCHEDULER', 'host_scheduler_minimum_tick_sec', type=float)
            while (not _shutdown and
                   not _lifetime_expired(options.lifetime_hours,
                                         process_start_time)):
                tick_started = time.time()
                host_scheduler.tick()
                tick_elapsed = time.time() - tick_started
                if minimum_tick_sec > tick_elapsed:
                    # Pad a fast tick up to the configured minimum.
                    pause_sec = minimum_tick_sec - tick_elapsed
                else:
                    pause_sec = 0.0001
                time.sleep(pause_sec)
            logging.info('Shutdown request recieved. Bye! Bye!')
    except server_manager_utils.ServerActionError:
        # This error is expected when the server is not in primary status
        # for host-scheduler role. Thus do not send email for it.
        raise
    except Exception:
        metrics.Counter('%s/uncaught_exception' % _METRICS_PREFIX).increment()
        raise
    finally:
        email_manager.manager.send_queued_emails()
        if _db_manager:
            _db_manager.disconnect()
Ejemplo n.º 9
0
def _emit_offload_metrics(dirpath):
    """Emit gs offload metrics for one offloaded directory.

    Increments the 'jobs_offloaded' counter by one and adds the directory
    size, as returned by file_utils.get_directory_size_kibibytes(), to the
    'kilobytes_transferred' counter.  Both use the fields derived from
    `dirpath`.

    @param dirpath: Offloaded directory path.
    """
    size_kib = file_utils.get_directory_size_kibibytes(dirpath)
    fields = _get_metrics_fields(dirpath)

    jobs_counter = metrics.Counter(
        'chromeos/autotest/gs_offloader/jobs_offloaded')
    jobs_counter.increment(fields=fields)

    size_counter = metrics.Counter(
        'chromeos/autotest/gs_offloader/kilobytes_transferred')
    size_counter.increment_by(size_kib, fields=fields)
Ejemplo n.º 10
0
def main():
    """Run the shard client with metrics and email reporting around it.

    Increments the 'start' counter and runs
    main_without_exception_handling().  An escaping exception increments
    the 'uncaught_exception' counter, is recorded through the email manager
    and the log, and is re-raised.  Queued emails are sent in every case.
    """
    ts_mon_config.SetupTsMonGlobalState('shard_client')

    start_metric = 'chromeos/autotest/shard_client/start'
    exception_metric = 'chromeos/autotest/shard_client/uncaught_exception'
    try:
        metrics.Counter(start_metric).increment()
        main_without_exception_handling()
    except Exception:
        metrics.Counter(exception_metric).increment()
        message = 'Uncaught exception. Terminating shard_client.'
        email_manager.manager.log_stacktrace(message)
        logging.exception(message)
        raise
    finally:
        email_manager.manager.send_queued_emails()
Ejemplo n.º 11
0
    def start(self, pool_size=constants.DEFAULT_CONTAINER_POOL_SIZE):
        """Starts the service and blocks in its event loop until stopped.

        The loop runs for as long as `self._stop_event` is None.  Once it
        exits, all client threads are stopped, the connection listener is
        closed and the container pool is cleaned up; the stop event is then
        set and reset to None to signal that shutdown is complete.

        @param pool_size: The desired size of the container pool.  This
                          parameter has no effect if a pre-created pool was
                          DI'd into the Service constructor.
        """
        self._running = True

        # Create the container pool unless one was supplied already.
        if self._pool is None:
            self._pool = pool.Pool(
                factory=container_factory.ContainerFactory(
                    base_container=base_image.BaseImage().get(),
                    container_class=zygote.Zygote),
                size=pool_size)

        # Start listening asynchronously for incoming connections.
        self._connection_listener.start()

        # Poll for incoming connections, and spawn threads to handle them.
        logging.debug('Start event loop.')
        while self._stop_event is None:
            self._handle_incoming_connections()
            self._cleanup_closed_connections()
            # TODO(kenobi): Poll for and log errors from pool.
            tick_counter = metrics.Counter(METRICS_PREFIX + '/tick')
            tick_counter.increment()
            time.sleep(_MIN_POLLING_PERIOD)

        logging.debug('Exit event loop.')

        # Stopped - stop all the client threads, stop listening, then signal
        # that shutdown is complete.
        for client_thread in self._client_threads:
            client_thread.stop()
        try:
            self._connection_listener.close()
        except Exception as e:
            logging.error('Error stopping pool service: %r', e)
            raise
        finally:
            # Runs whether or not closing the listener failed.
            # Clean up the container pool.
            self._pool.cleanup()
            # Make sure state is consistent.
            self._stop_event.set()
            self._stop_event = None
            self._running = False
            stopped_counter = metrics.Counter(
                METRICS_PREFIX + '/service_stopped')
            stopped_counter.increment()
            logging.debug('Container pool stopped.')
Ejemplo n.º 12
0
    def _RepoSelfupdate(self):
        """Execute repo selfupdate command.

        'repo selfupdate' would clean up the .repo/repo dir on certain
        exceptions and warnings, it must be followed by the 'repo init'
        command, which would recover .repo/repo in this circumstance.

        The selfupdate counts as failed when the command raises
        cros_build_lib.RunCommandError or when its captured error output
        matches SELFUPDATE_WARNING_RE.  On failure the failure counter is
        incremented and the .repo/repo directory under self.directory is
        removed.
        """
        cmd = [self.repo_cmd, 'selfupdate']
        try:
            cmd_result = cros_build_lib.run(cmd,
                                            cwd=self.directory,
                                            capture_output=True,
                                            log_output=True,
                                            encoding='utf-8')
        except cros_build_lib.RunCommandError as e:
            logging.warning('repo selfupdate failed with exception: %s', e)
            failed_to_selfupdate = True
        else:
            error_output = cmd_result.error
            if (error_output is not None
                    and SELFUPDATE_WARNING_RE.search(error_output)):
                logging.warning('Unable to selfupdate because of warning "%s"',
                                SELFUPDATE_WARNING)
                failed_to_selfupdate = True
            else:
                failed_to_selfupdate = False

        if not failed_to_selfupdate:
            return

        metrics.Counter(
            constants.MON_REPO_SELFUPDATE_FAILURE_COUNT).increment()
        logging.warning(
            'Failed to selfupdate repo, cleaning .repo/repo in %s',
            self.directory)
        stale_repo_dir = os.path.join(self.directory, '.repo', 'repo')
        osutils.RmDir(stale_repo_dir, ignore_missing=True)
Ejemplo n.º 13
0
def poll_rpc_servers(servers,
                     servers_lock,
                     shards=None,
                     period=60,
                     stop_event=None):
    """Blocking function that polls all servers and shards

    Each round takes a snapshot of `servers` and `shards` under
    `servers_lock`, runs afe_rpc_call on every entry through a worker pool,
    increments the tick metric and then sleeps for whatever is left of
    `period`.  The worker pool is shut down when the loop ends or when an
    exception escapes it.

    @param servers: list of servers to poll
    @param servers_lock: lock to be used when accessing servers or shards
    @param shards: list of shards to poll; None (the default) means there
                   are no shards
    @param period: time between polls, in seconds
    @param stop_event: Event that can be set to stop polling; when omitted
                       the function polls forever
    """
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() * 4)
    try:
        while (not stop_event or not stop_event.is_set()):
            start_time = time.time()
            with servers_lock:
                # `shards` defaults to None, which set.union() cannot
                # iterate over; treat it as an empty collection.
                all_servers = set(servers).union(shards or ())

            logging.debug('Starting Server Polling: %s',
                          ', '.join(all_servers))
            pool.map(afe_rpc_call, all_servers)

            logging.debug('Finished Server Polling')

            metrics.Counter(METRIC_TICK).increment()

            wait_time = (start_time + period) - time.time()
            if wait_time > 0:
                time.sleep(wait_time)
    finally:
        # pool.map() is blocking, so no work is in flight here; release the
        # worker processes instead of leaking them.
        pool.terminate()
        pool.join()
Ejemplo n.º 14
0
def is_server_in_prod(server_name, afe):
    """Validate server's role and status.

    Asks the AFE whether `server_name` is registered with status 'primary'
    and role 'golo_proxy'.  The result is also reported through the
    'server_in_prod_check' metric.

    Failures of the RPC are logged and reported as False rather than
    raised.  KeyboardInterrupt and SystemExit are not swallowed.

    @param server_name: the server name to be validated.
    @param afe: the afe server to get role & status info in server_db.

    @return: A boolean value, True when the server_name is in prod, False
             otherwise, or if RPC fails.
    """
    logging.info('Validating server: %s', server_name)
    # Use a separate name for the proxy so that `afe` keeps referring to the
    # server passed in, which is what the log messages below should show.
    afe_client = frontend_wrappers.RetryingAFE(timeout_min=5,
                                               delay_sec=10,
                                               server=afe)
    is_prod_proxy_server = False
    try:
        if afe_client.run('get_servers',
                          hostname=server_name,
                          status='primary',
                          role='golo_proxy'):
            is_prod_proxy_server = True

    except urllib2.URLError as e:
        logging.warning('RPC get_servers failed on afe %s: %s', afe, str(e))
    except Exception:
        # The previous implementation returned from the `finally` clause,
        # which silently discarded every exception.  Keep the best-effort
        # contract for ordinary errors, but leave a traceback in the log.
        logging.exception('Unexpected error validating server %s on afe %s',
                          server_name, afe)
    finally:
        metrics.Counter(metrics_template % 'server_in_prod_check').increment(
            fields={'success': is_prod_proxy_server})
    # Returning outside of `finally` lets KeyboardInterrupt / SystemExit
    # propagate instead of being swallowed.
    return is_prod_proxy_server
Ejemplo n.º 15
0
    def _deserialize_many(self, serialized_list, djmodel, message):
        """Deserialize data in JSON format to database.

        Deserialize a list of JSON-formatted data to database using Django.
        Each entry is deserialized inside its own
        transaction.commit_on_success() block.  A failing entry is logged
        and counted in the 'deserialization_failed' metric, and processing
        continues with the next entry.

        @param serialized_list: A list of JSON-formatted data or python dict
                                literals.
        @param djmodel: Django model type.
        @param message: A string to be used in a logging message.
        """
        logging.info('Deserializing %s %ss', len(serialized_list), message)
        for entry_number, serialized in enumerate(serialized_list, 1):
            # Progress is logged at entries 1, 101, 201, ...
            if entry_number % 100 == 1:
                logging.info('Progress: at entry %s', entry_number)
            with transaction.commit_on_success():
                try:
                    djmodel.deserialize(serialized)
                except Exception as e:
                    logging.error('Deserializing a %s fails: %s, Error: %s',
                                  message, serialized, e)
                    failure_counter = metrics.Counter(
                        'chromeos/autotest/shard_client/deserialization_failed'
                    )
                    failure_counter.increment()
        logging.info('Done deserializing %ss', message)
    def __init__(self, container_path=constants.DEFAULT_CONTAINER_PATH,
                 container_factory=None):
        """Initialize a ContainerBucket.

        When no factory is supplied, one is built from the base image found
        under the container path.  The '/base_image' metric is incremented
        for that lookup, with 'corrupted' set to True when the lookup raised
        error.ContainerError (the error is then re-raised).

        @param container_path: Path to the directory used to store containers.
                               Default is set to AUTOSERV/container_path in
                               global config.
        @param container_factory: A factory for creating Containers.

        @raise error.ContainerError: If the base image lookup fails.
        """
        self.container_path = os.path.realpath(container_path)
        if container_factory is not None:
            self._factory = container_factory
        else:
            # Pass in the container path so that the bucket is hermetic (i.e. so
            # that if the container path is customized, the base image doesn't
            # fall back to using the default container path).
            base_image_ok = True
            try:
                container = BaseImage(self.container_path).get()
            except error.ContainerError:
                base_image_ok = False
                # A bare raise keeps the original traceback; `raise e` would
                # restart it from this line under Python 2.
                raise
            finally:
                metrics.Counter(METRICS_PREFIX + '/base_image',
                                field_spec=[ts_mon.BooleanField('corrupted')]
                                ).increment(
                                    fields={'corrupted': not base_image_ok})
            self._factory = ContainerFactory(
                base_container=container,
                lxc_path=self.container_path)
        self.container_cache = {}
Ejemplo n.º 17
0
def _emit_gs_returncode_metric(returncode):
    """Increment the gs_returncode counter based on |returncode|.

    The value is converted with int(); anything outside the range 0..255
    is reported as -1.

    @param returncode: The return code to report; must be accepted by
                       int().
    """
    rcode = int(returncode)
    if not 0 <= rcode <= 255:
        rcode = -1
    returncode_counter = metrics.Counter(
        'chromeos/autotest/gs_offloader/gs_returncode')
    returncode_counter.increment(fields={'return_code': rcode})
Ejemplo n.º 18
0
def main():
    """Main entry: kill slow db queries in an endless loop.

    Every pass calls kill_slow_queries() and, when it reports killed
    queries, mails the log to `options.mail` and adds the returned count to
    the 'killed_slow_queries' counter.  The loop sleeps `options.timeout`
    seconds between passes and only ends through an exception, which is
    logged, reported by email and re-raised.

    @return: 1 when the command line options fail verification.
    """
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.DEBUG)

    killed_metric = 'chromeos/autotest/afe_db/killed_slow_queries'

    ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
        service_name='kill_slow_queries', indirect=True)
    with ts_mon_state:
        count = 0
        parser, options, args = parse_options()
        options_ok = verify_options_and_args(options, args)
        if not options_ok:
            parser.print_help()
            return 1
        try:
            while True:
                result_log_strs, count = kill_slow_queries(
                    options.user, options.password, options.timeout)
                if result_log_strs:
                    success_body = ('Below are killed queries:\n%s'
                                    % result_log_strs)
                    gmail_lib.send_email(
                        options.mail,
                        'Successfully killed slow autotest db queries',
                        success_body)
                    metrics.Counter(killed_metric).increment_by(count)
                time.sleep(options.timeout)
        except Exception as e:
            logging.error('Failed to kill slow db queries.\n%s', e)
            failure_body = (
                'Error occurred during killing slow db queries:\n%s\n'
                'Detailed logs can be found in /var/log/slow_queries.log on db'
                ' backup server.\nTo avoid db crash, please check ASAP.') % e
            gmail_lib.send_email(
                options.mail, 'Failed to kill slow autotest db queries.',
                failure_body)
            raise
Ejemplo n.º 19
0
def _MaybeCleanDistfiles(repo, distfiles_ts, metrics_fields):
    """Cleans the distfiles directory if too old.

    Args:
      repo: repository.RepoRepository instance; the cache is looked up under
        repo.directory/.cache/distfiles.
      distfiles_ts: Timestamp of the last time distfiles was cleaned, on the
        same scale as time.time(). May be None.
      metrics_fields: Dictionary of fields to include in metrics.

    Returns:
      The new distfiles_ts to persist in state: the unchanged timestamp while
      the cache has not expired, None when no timestamp was given or after
      the cache has been removed.
    """
    if distfiles_ts is None:
        return None

    age_hours = (time.time() - distfiles_ts) / 3600.0
    cache_still_fresh = age_hours < _DISTFILES_CACHE_EXPIRY_HOURS
    if cache_still_fresh:
        return distfiles_ts

    logging.info('Remove old distfiles cache (cache expiry %d hours)',
                 _DISTFILES_CACHE_EXPIRY_HOURS)
    distfiles_dir = os.path.join(repo.directory, '.cache', 'distfiles')
    osutils.RmDir(distfiles_dir, ignore_missing=True, sudo=True)
    cleanup_fields = field(metrics_fields, reason='cache_expired')
    metrics.Counter(METRIC_DISTFILES_CLEANUP).increment(cleanup_fields)
    # Cleaned cache, so reset distfiles_ts
    return None
Ejemplo n.º 20
0
def main():
    """Runs the program: dump the database, upload the dump and clean up.

    The whole backup is timed with a SecondsTimer.  Whether or not it
    succeeds, the 'db_backup/completed' counter is incremented with the
    outcome and the backup type; an exception raised by any backup step
    propagates to the caller.
    """
    options = parse_options()
    logging_manager.configure_logging(
        test_importer.TestImporterLoggingConfig(), verbose=options.verbose)
    backup_succeeded = False
    backup_type = options.type

    ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
        service_name='mysql_db_backup', indirect=True)
    with ts_mon_state:
        with metrics.SecondsTimer('chromeos/autotest/afe_db/backup/durations',
                                  fields={'type': options.type}):
            try:
                logging.debug('Start db backup: %s', options.type)
                archiver = MySqlArchiver(options.type, options.keep,
                                         options.gs_bucket)
                dump_file = archiver.dump()
                logging.debug('Uploading backup: %s', options.type)
                archiver.upload_to_google_storage(dump_file)
                archiver.cleanup()
                logging.debug('Db backup completed: %s', options.type)
                backup_succeeded = True
            finally:
                completed_fields = {
                    'success': backup_succeeded,
                    'type': options.type,
                }
                metrics.Counter(
                    'chromeos/autotest/db/db_backup/completed'
                ).increment(fields=completed_fields)
Ejemplo n.º 21
0
    def cleanup(self, timeout=0):
        """Cleans up the container pool.

        Stops the monitor, then removes containers from the pool one at a
        time and destroys each, until the pool reports that it is empty.
        The number of containers taken out of the pool is reported through
        the '/containers_cleaned_up' counter even if destroying one fails.

        @param timeout: For testing.  If this is non-zero, it specifies the
                        number of seconds to wait for each worker to shut
                        down.  An error is raised if shutdown has not
                        occurred by then.  If zero (the default), don't wait
                        for worker threads to shut down, just return
                        immediately.
        """
        logging.info('Pool.cleanup called.')
        # Stop the monitor thread, then drain the pool.
        self._monitor.stop(timeout)

        dcount = 0
        try:
            logging.debug('Emptying container pool')
            while True:
                # Non-blocking get: raises Queue.Empty once drained.
                doomed = self._pool.get(block=False)
                dcount += 1
                doomed.destroy()
        except Queue.Empty:
            # The pool is empty - this is the normal way out of the loop.
            pass
        finally:
            cleaned_counter = metrics.Counter(
                METRICS_PREFIX + '/containers_cleaned_up')
            cleaned_counter.increment_by(dcount)
            logging.debug('Done.  Destroyed %d containers', dcount)
Ejemplo n.º 22
0
 def __init__(self, primary_store, shadow_store,
              mismatch_callback=None):
     """
     Set up the store pair and seed the shadow store from the primary.

     A host_info.StoreError raised while seeding the shadow store is
     counted in the 'initialization_fail_count' metric and logged, but not
     propagated.

     @param primary_store: A CachingHostInfoStore to be used as the primary
             store.
     @param shadow_store: A CachingHostInfoStore to be used to shadow the
             primary store.
     @param mismatch_callback: A callback used to notify whenever we notice a
             mismatch between primary_store and shadow_store. The signature
             of the callback must match:
                 callback(primary_info, shadow_info)
             where primary_info and shadow_info are HostInfo objects obtained
             from the two stores respectively.
             Mostly used by unittests. Actual users don't know / nor care
             that they're using a ShadowingStore.
             Defaults to _log_info_mismatch when None.
     """
     super(ShadowingStore, self).__init__()
     self._primary_store = primary_store
     self._shadow_store = shadow_store
     if mismatch_callback is None:
         self._mismatch_callback = _log_info_mismatch
     else:
         self._mismatch_callback = mismatch_callback

     try:
         primary_info = self._primary_store.get()
         self._shadow_store.commit(primary_info)
     except host_info.StoreError:
         fail_counter = metrics.Counter(
                 _METRICS_PREFIX + 'initialization_fail_count')
         fail_counter.increment()
         logger.exception(
                 'Failed to initialize shadow store. '
                 'Expect primary / shadow desync in the future.')
Ejemplo n.º 23
0
    def _FinishBuildStageInCIDBAndMonarch(self,
                                          status,
                                          elapsed_time_seconds=0):
        """Mark the stage as finished in cidb and emit the stage metrics.

        The cidb update is skipped when there is no build stage id or no
        database handle; the duration and completion metrics are emitted
        either way.

        Args:
          status: The finish status of the build. Enum type
            constants.BUILDER_COMPLETED_STATUSES
          elapsed_time_seconds: (optional) Elapsed time in stage, in seconds.
        """
        _, db = self._run.GetCIDBHandle()
        can_update_cidb = self._build_stage_id is not None and db is not None
        if can_update_cidb:
            db.FinishBuildStage(self._build_stage_id, status)

        fields = {
            'status': status,
            'name': self.name,
            'build_config': self._run.config.name,
            'important': self._run.config.important,
        }

        duration_metric = metrics.SecondsDistribution(
            constants.MON_STAGE_DURATION)
        duration_metric.add(elapsed_time_seconds, fields=fields)

        completion_counter = metrics.Counter(constants.MON_STAGE_COMP_COUNT)
        completion_counter.increment(fields=fields)
Ejemplo n.º 24
0
def QueryAndEmit(baselines, cursor):
    """Queries MySQL for important stats and emits Monarch metrics

    Statuses in EMITTED_STATUSES_COUNTERS are emitted as counters holding
    the difference from their baseline; statuses in EMITTED_STATUS_GAUGES
    are emitted as gauges holding the current value.  The InnoDB buffer
    pool is emitted as one gauge with two data points, free and used pages.

    @param baselines: A dict containing the initial values for the cumulative
                      metrics.
    @param cursor: The mysql command line.
    """
    metric_template = 'chromeos/autotest/afe_db/%s'

    for counter_status in EMITTED_STATUSES_COUNTERS:
        current_value = GetStatus(cursor, counter_status)
        delta = current_value - baselines[counter_status]
        counter_name = metric_template % counter_status.lower()
        metrics.Counter(counter_name).set(delta)

    for gauge_status in EMITTED_STATUS_GAUGES:
        gauge_name = metric_template % gauge_status.lower()
        metrics.Gauge(gauge_name).set(GetStatus(cursor, gauge_status))

    pages_free = GetStatus(cursor, 'Innodb_buffer_pool_pages_free')
    pages_total = GetStatus(cursor, 'Innodb_buffer_pool_pages_total')
    pages_used = pages_total - pages_free

    metrics.Gauge('chromeos/autotest/afe_db/buffer_pool_pages').set(
        pages_free, fields={'used': False})
    metrics.Gauge('chromeos/autotest/afe_db/buffer_pool_pages').set(
        pages_used, fields={'used': True})
Ejemplo n.º 25
0
    def _get(self, id, timeout):
        """Gets a container from the pool.

        Every request is counted in the '/container_requests' metric, with
        the 'success' field telling whether a container was obtained.

        @param id: A ContainerId to assign to the new container.
        @param timeout: A timeout (in seconds) to wait for the pool.  If a
                        container is not available from the pool within the
                        given period, None will be returned.

        @return: A container from the pool, or None.
        """
        logging.debug('Received get request (id=%s)', id)
        container = self._pool.get(timeout)
        got_container = container is not None
        if got_container:
            # Assign an ID to the container as soon as it is removed from
            # the pool.  This associates the container with the process to
            # which it will be handed off.
            logging.debug('Assigning container (name=%s, id=%s)',
                          container.name, id)
            container.id = id
        else:
            logging.debug('No container (id=%s)', id)

        request_counter = metrics.Counter(
            METRICS_PREFIX + '/container_requests',
            field_spec=[ts_mon.BooleanField('success')])
        request_counter.increment(fields={'success': got_container})
        return container
Ejemplo n.º 26
0
 def _commit_to_primary_store(self, info):
     """
     Commit `info` to the primary store.

     If the primary store raises host_info.StoreError, the commit metric is
     incremented with file_commit_result='skipped' and the error is
     re-raised.

     @param info: The info object to commit.
     """
     try:
         self._primary_store.commit(info)
     except host_info.StoreError:
         skipped_fields = {'file_commit_result': 'skipped'}
         commit_counter = metrics.Counter(_COMMIT_METRIC_NAME)
         commit_counter.increment(fields=skipped_fields)
         raise
def CancelBuilds(buildbucket_ids, buildbucket_client,
                 debug=True, config=None):
  """Cancel Buildbucket builds in a set.

  Does nothing when buildbucket_ids is empty. Otherwise sends one batch
  cancel request and logs a warning for every build whose result carries an
  error. The cancel counter is only incremented for a real (non-debug) run
  with a config.

  Args:
    buildbucket_ids: A list of build_ids (strings).
    buildbucket_client: Instance of buildbucket_lib.buildbucket_client.
    debug: Boolean indicating whether it's a dry run. Default to True.
    config: Instance of config_lib.BuildConfig. Config dict for the master
      build initiating the cancel. Optional.
  """
  if not buildbucket_ids:
    return

  logging.info('Canceling buildbucket_ids: %s', buildbucket_ids)
  if (not debug) and config:
    cancel_counter = metrics.Counter(
        constants.MON_BB_CANCEL_BATCH_BUILDS_COUNT)
    cancel_counter.increment(fields={'build_type': config.build_type,
                                     'build_name': config.name})

  cancel_results = buildbucket_client.CancelBatchBuildsRequest(
      buildbucket_ids,
      dryrun=debug)
  result_map = buildbucket_lib.GetResultMap(cancel_results)
  for buildbucket_id, result in result_map.items():
    # Only results carrying an error are reported.
    if not buildbucket_lib.GetNestedAttr(result, ['error']):
      continue
    # TODO(nxia): Get build url and log url in the warnings.
    logging.warning('Error cancelling build %s with reason: %s. '
                    'Please check the status of the build.',
                    buildbucket_id,
                    buildbucket_lib.GetErrorReason(result))
Ejemplo n.º 28
0
 def _refresh_from_shadow_store(self):
     """
     Fetch the info from the shadow store, forcing a refresh.

     If the shadow store raises host_info.StoreError, the refresh metric is
     incremented with validation_result='fail_shadow_store_refresh' and the
     error is re-raised.

     @return: Whatever the shadow store's get(force_refresh=True) returns.
     """
     try:
         shadow_info = self._shadow_store.get(force_refresh=True)
     except host_info.StoreError:
         failure_fields = {'validation_result': 'fail_shadow_store_refresh'}
         metrics.Counter(_REFRESH_METRIC_NAME).increment(
                 fields=failure_fields)
         raise
     return shadow_info
Ejemplo n.º 29
0
def send_email(bug, bug_template):
    """Send email to the owner and cc's to notify the TestBug.

    Recipients are the union of the bug's cc list and owner and the 'cc'
    and 'owner' entries of `bug_template`.  When that union is empty a
    warning is logged and no mail is sent.  Every send attempt is counted
    in the 'send_bug_email' metric together with its outcome; an error
    raised while sending propagates to the caller.

    @param bug: TestBug instance.
    @param bug_template: A template dictionary specifying the default bug
                         filing options for failures in this suite.
    """
    if bug.cc:
        to_set = set(bug.cc)
    else:
        to_set = set()
    if bug.owner:
        to_set.add(bug.owner)

    template_cc = bug_template.get('cc')
    if template_cc:
        to_set = to_set.union(template_cc)
    template_owner = bug_template.get('owner')
    if template_owner:
        to_set.add(template_owner)

    recipients = ', '.join(to_set)
    if not recipients:
        logging.warning('No owner/cc found. Will skip sending a mail.')
        return

    success = False
    try:
        creds_path = site_utils.get_creds_abspath(EMAIL_CREDS_FILE)
        gmail_lib.send_email(
            recipients,
            bug.title(),
            bug.summary(),
            retry=False,
            creds_path=creds_path)
        success = True
    finally:
        email_counter = metrics.Counter(
            'chromeos/autotest/errors/send_bug_email')
        email_counter.increment(fields={'success': success})
Ejemplo n.º 30
0
def _MaybeCleanDistfiles(cache_dir, distfiles_ts):
    """Cleans the distfiles directory if too old.

    Args:
      cache_dir: Directory of the cache, as a string.
      distfiles_ts: Timestamp of the last time distfiles was cleaned, on the
        same scale as time.time(). May be None.

    Returns:
      The new distfiles_ts to persist in state: the unchanged timestamp while
      the cache has not expired, otherwise the current time.
    """
    # distfiles_ts can be None for a fresh environment, which means clean.
    if distfiles_ts is None:
        return time.time()

    age_hours = (time.time() - distfiles_ts) / 3600.0
    cache_still_fresh = age_hours < _DISTFILES_CACHE_EXPIRY_HOURS
    if cache_still_fresh:
        return distfiles_ts

    logging.info('Remove old distfiles cache (cache expiry %d hours)',
                 _DISTFILES_CACHE_EXPIRY_HOURS)
    distfiles_dir = os.path.join(cache_dir, 'distfiles')
    osutils.RmDir(distfiles_dir, ignore_missing=True, sudo=True)
    cleanup_fields = field({}, reason='cache_expired')
    metrics.Counter(METRIC_DISTFILES_CLEANUP).increment(fields=cleanup_fields)

    # Cleaned cache, so reset distfiles_ts
    return time.time()