예제 #1
0
    def SendInfraAlertIfNeeded(self, failing, inflight, no_stat):
        """Email an infra alert when any builder shows an infra problem.

        The alert body lists the infra failure messages of the failing
        builders, followed by one line for every builder that failed without
        a message, timed out, or never started. Nothing is sent when that
        list turns out to be empty.

        Args:
          failing: The names of the failing builders.
          inflight: The names of the builders that are still running.
          no_stat: The names of the builders that had status None.
        """
        problems = []
        problems.extend(
            str(message) for message in self._GetInfraFailMessages(failing))
        # A failing builder that reported no message at all is treated as an
        # infra failure too.
        problems.extend(
            '%s failed with unknown reason.' % builder
            for builder in self._GetBuildersWithNoneMessages(failing))
        problems.extend('%s timed out' % builder for builder in inflight)
        problems.extend('%s did not start' % builder for builder in no_stat)

        if not problems:
            return

        builder_name = self._run.config.name
        # Title first, then the individual problems, then the dashboard link.
        report = ['%s has encountered infra failures:' % (builder_name, )]
        report.extend(problems)
        report.append('See %s' % self.ConstructDashboardURL())
        body = '\n\n'.join(report)

        tree_status.SendHealthAlert(
            self._run,
            '%s infra failures' % (builder_name, ),
            body,
            extra_fields={'X-cbuildbot-alert': 'cq-infra-alert'})
예제 #2
0
    def _UpdateRunStreak(self, builder_run, final_status):
        """Record this run in the builder's streak counter and alert if due.

        Does nothing unless builder_run reports that it is in an email
        reporting environment. Otherwise the pass/fail streak counter named
        after the builder is updated and logged, and a health alert email is
        sent once the failure streak reaches the configured threshold.

        Args:
          builder_run: BuilderRun for this run.
          final_status: Final status string for this run.
        """
        if not builder_run.InEmailReportingEnvironment():
            return

        streak = self._UpdateStreakCounter(
            final_status=final_status,
            counter_name=builder_run.config.name,
            dry_run=self._run.debug)

        # A positive streak counts passes; anything else is reported as
        # failures.
        if streak > 0:
            outcome = 'passed'
        else:
            outcome = 'failed'
        logging.info('Builder %s has %s %s time(s) in a row.',
                     builder_run.config.name, outcome, abs(streak))

        # An alert is due only when recipients are configured, the threshold
        # is enabled (> 0), and the failure streak has reached it.
        alert_due = (
            builder_run.config.health_alert_recipients
            and builder_run.config.health_threshold > 0
            and streak <= -builder_run.config.health_threshold)
        if not alert_due:
            return

        logging.info(
            'Builder failed %i consecutive times, sending health '
            'alert email to %s.', -streak,
            builder_run.config.health_alert_recipients)

        alert_subject = '%s health alert' % builder_run.config.name
        alert_body = self._HealthAlertMessage(-streak)
        tree_status.SendHealthAlert(
            builder_run,
            alert_subject,
            alert_body,
            extra_fields={'X-cbuildbot-alert': 'cq-health'})
예제 #3
0
    def SendCanaryFailureAlert(self, failing, inflight, no_stat):
        """Log and email a summary of the canary slave failures.

        Unlike an infra alert, this summary is always sent: it starts with a
        title line, lists every failure message and problem builder, and
        ends with a pointer to the dashboard.

        Args:
          failing: The names of the failing builders.
          inflight: The names of the builders that are still running.
          no_stat: The names of the builders that had status None.
        """
        master_name = 'Canary Master'
        report = ['%s has detected build failures:' % master_name]
        report.extend(
            str(message) for message in self._GetFailedMessages(failing))
        report.extend(
            '%s failed with unknown reason.' % builder
            for builder in self._GetBuildersWithNoneMessages(failing))
        report.extend('%s timed out' % builder for builder in inflight)
        report.extend('%s did not start' % builder for builder in no_stat)

        footer = ('You can also view the summary of the slave failures from '
                  'the %s stage of %s. Click on the failure message to go '
                  'to an individual slave\'s build status page: %s' %
                  (self.name, master_name, self.ConstructDashboardURL()))
        report.append(footer)

        summary = '\n\n'.join(report)
        # The same text goes to the build log and into the alert email.
        logging.warning(summary)
        tree_status.SendHealthAlert(
            self._run,
            'Canary builder failures',
            summary,
            extra_fields={'X-cbuildbot-alert': 'canary-fail-alert'})
예제 #4
0
 def _SendPreCQInfraAlertMessageIfNeeded(self):
   """Send a health alert when a Pre-CQ run hit an infrastructure failure.

   The build failure message is always constructed first; the alert is only
   sent when this config is a Pre-CQ config and the message contains an
   InfrastructureFailure.
   """
   failure_msg = completion_stages.CreateBuildFailureMessage(
       self._run.config.overlays,
       self._run.config.name,
       self._run.ConstructDashboardURL())

   if not self._run.config.pre_cq:
     return
   if not failure_msg.HasFailureType(failures_lib.InfrastructureFailure):
     return

   # First paragraph says which builder failed on which host; the second is
   # the failure message itself.
   headline = '%s failed on %s' % (self._run.config.name,
                                   cros_build_lib.GetHostName())
   details = '%s' % failure_msg
   tree_status.SendHealthAlert(
       self._run,
       'pre-cq infra failures',
       '\n\n'.join([headline, details]),
       extra_fields={'X-cbuildbot-alert': 'pre-cq-infra-alert'})
예제 #5
0
    def CQMasterHandleFailure(self, failing, inflight, no_stat):
        """Decide which CLs to reject after a CQ build failure or timeout.

        Sends an infra alert if one is warranted and, when partial
        submission is allowed, submits the changes that are unaffected by
        the failures. The remaining changes then go to the validation pool's
        timeout or failure handler, together with a verdict on whether ToT
        looked sane.

        Args:
          failing: Names of the builders that failed.
          inflight: Names of the builders that timed out.
          no_stat: Set of builder names of slave builders that had status
            None.
        """
        failure_messages = self._GetFailedMessages(failing)
        self.SendInfraAlertIfNeeded(failing, inflight, no_stat)

        candidates = self.sync_stage.pool.applied

        if not self._ShouldSubmitPartialPool():
            logging.warning(
                'Not doing any partial submission, due to critical stage '
                'failure(s).')
            alert_title = 'CQ encountered a critical failure.'
            alert_body = ('CQ encountered a critical failure, and hence '
                          'skipped board-aware submission. See %s' %
                          self.ConstructDashboardURL())
            tree_status.SendHealthAlert(self._run, alert_title, alert_body)
        else:
            relevant_by_config = self.GetRelevantChangesForSlaves(
                candidates, no_stat)
            subsys_by_config = self.GetSubsysResultForSlaves()

            # Changes that declare they do not care about this failure can
            # still be submitted; whatever SubmitPartialPool returns is what
            # remains to be judged below.
            candidates = self.sync_stage.pool.SubmitPartialPool(
                candidates, failure_messages, relevant_by_config,
                subsys_by_config, failing, inflight, no_stat)

        tot_is_sane = self._ToTSanity(
            set(self._run.config.sanity_check_slaves), self._slave_statuses)

        if not tot_is_sane:
            # A failed sanity-check slave points at bug(s) in ToT or at
            # broken infrastructure, so no change should take the blame.
            logging.warning('Detected that a sanity-check builder failed. '
                            'Will not reject any changes.')

        # A tree that was not open when the pool was acquired gives no
        # grounds to assume ToT was sane.
        if not self.sync_stage.pool.tree_was_open:
            logging.info(
                'The tree was not open when changes were acquired so we are '
                'attributing failures to the broken tree rather than the '
                'changes.')
            tot_is_sane = False

        if inflight:
            # Some slave(s) timed out for unknown reasons, so only infra
            # changes (probably just chromite changes) get rejected.
            self.sync_stage.pool.HandleValidationTimeout(
                sanity=tot_is_sane, changes=candidates)
        else:
            # Some builder failed, did not report stats, or both; let
            # HandleValidationFailure choose what to reject.
            self.sync_stage.pool.HandleValidationFailure(
                failure_messages,
                sanity=tot_is_sane,
                changes=candidates,
                no_stat=no_stat)