Example #1
0
def main():
    alerts = getAlerts()
    for alert in alerts:
        # new alert
        if alert['stage'] == 0:
            if checkMerge(alert['revision'], alert['buildername']) or 'pgo' in alert['buildername']:
                LOG.info("We are ignoring this alert since it is either a merge or a pgo job.")
                alert['stage'] = -1 # We need to have manual inspection in this case.
                alert['user'] = '******'
                updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
            else:
                alert['stage'] = 1

        # trigger jobs for backfill
        if alert['stage'] == 1:
            LOG.info("We are in stage 1, and going to backfill jobs.")
            revisionList = getRevisions(alert['revision'], alert['buildername'], start=-2, end=2)
            trigger_range(alert['buildername'], revisionList, times=6, dry_run=DRY_RUN)
            alert['stage'] = 2
            # We want some time interval between stage 1 and 2, so we exit.
            updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
            continue


        # verify jobs for backfill
        if alert['stage'] == 2:
            LOG.info("We are in stage 2, and going to verify if jobs are backfilled.")
            revisionList = getRevisions(alert['revision'], alert['buildername'], start=-2, end=2)
            for revision in revisionList:
                dataPoints = getSuccessfulJobs(revision, alert['buildername'])

                # If dataPoints are less than 6, it means that builds/jobs are still running.
                if dataPoints < 6:
                    # We wait for 6 hours for all triggered tests to complete,
                    # And if they don't then we mark them for manual intervention/
                    alert['loop'] += 1
                    if alert['loop'] > (TIME_TO_BUILD + TIME_TO_TEST + PENDING_TIME) / CYCLE_TIME:
                        alert['stage'] = -1
                        alert['user'] = '******'
                    else:
                        alert['stage'] = 1

                    break

            if alert['stage'] != 2:
                updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
                continue

            badRevisions = []
            # Reset the loop for upcoming stages
            alert['loop'] = 0
            for i in range(1, len(revisionList)):
                results = compare(alert['test'], alert['buildername'], revisionList[i], revisionList[i-1])
                if results < -2.0:
                    badRevisions.append(revisionList[i])

            if len(badRevisions) != 1:
                alert['stage'] = -1 # too noisy, something bad happened
                alert['user'] = '******'
                updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
                continue

            if checkMerge(badRevisions[0], alert['buildername']):
                alert['stage'] = -1 # A merge revision is a bad revision, manually inspect
                alert['user'] = '******'

            if alert['revision'] != badRevisions[0]:
                alert['revision'] = badRevisions[0] # we misreported initially, change the actual regression revision

            alert['stage'] = 3

        # Trigger all talos stage
        if alert['stage'] == 3:
            LOG.info("We are in stage 3, and going to trigger all_talos jobs.")
            repo_name = query_repo_name_from_buildername(alert['buildername'])
            trigger_all_talos_jobs(repo_name, alert['revision'], times=6, dry_run=DRY_RUN)
            previousRevision = getRevisions(alert['revision'], alert['buildername'], start=-1, end=-1)[0]
            trigger_all_talos_jobs(repo_name, previousRevision, times=6, dry_run=DRY_RUN)
            alert['stage'] = 4
            updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
            continue

        # Verify All talos stage is completed
        if alert['stage'] == 4:
            LOG.info("We are in stage 4, and going to verify if all_talos ran successfully.")
            previousRevision = getRevisions(alert['revision'], alert['buildername'], start=-1, end=-1)[0]
            repo_name = query_repo_name_from_buildername(alert['buildername'])
            all_buildernames = build_talos_buildernames_for_repo(repo_name)
            for revision in [alert['revision'], previousRevision]:
                for buildername in all_buildernames:
                    dataPoints = getSuccessfulJobs(revision, buildername)
                    if dataPoints < 6:
                        # We wait for 8 hours for all talos tests to complete,
                        # And if they don't then we mark them for manual intervention
                        alert['loop'] += 1
                        if alert['loop'] > (TIME_TO_BUILD + TIME_TO_TEST + PENDING_TIME + TIME_TO_WAIT) / CYCLE_TIME:
                            alert['stage'] = -1
                            alert['user'] = '******'
                        else:
                            alert['stage'] = 3
                        break

                if alert['stage'] != 4:
                    break

            if alert['stage'] != 4:
                updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
                continue

            alert['stage'] = 5  # final stage, sheriff will check for this.

        updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'], alert['stage'], alert['loop'], alert['user'])
Example #2
0
def main():
    alerts = getAlerts()
    for alert in alerts:
        # new alert
        LOG.info("Running alert for: [%s, %s, %s]" % (alert['test'], alert['buildername'], alert['revision']))
        if alert['stage'] == 0:
            LOG.info("We are in stage 0.")
            if checkMerge(alert['revision'], alert['buildername']) or 'pgo' in alert['buildername']:
                LOG.info("We are ignoring alert: %s since it is either a merge or a pgo job." % alert['test'])
                alert['stage'] = -1 # We need to have manual inspection in this case.
                alert['user'] = '******'
                updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
            else:
                alert['stage'] = 1

        # trigger jobs for backfill
        if alert['stage'] == 1:
            LOG.info("We are in stage 1, and going to backfill jobs.")
            revisionList = getRevisions(alert['revision'], alert['buildername'], start=-2, end=2)
            # Setting Treeherder as the source for querying.
            set_query_source("treeherder")
            trigger_range(alert['buildername'], revisionList, times=6, dry_run=DRY_RUN)
            alert['stage'] = 2
            # We want some time interval between stage 1 and 2, so we exit.
            updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
            continue


        # verify jobs for backfill
        if alert['stage'] == 2:
            LOG.info("We are in stage 2, and going to verify if jobs are backfilled.")
            revisionList = getRevisions(alert['revision'], alert['buildername'], start=-2, end=2)
            for revision in revisionList:
                dataPoints = getSuccessfulJobs(revision, alert['buildername'])

                # If dataPoints are less than 6, it means that builds/jobs are still running.
                if dataPoints < 6:
                    print "data points <6 for revision: %s" % revision
                    # We wait for 6 hours for all triggered tests to complete,
                    # And if they don't then we mark them for manual intervention/
                    alert['loop'] += 1
                    if alert['loop'] > (TIME_TO_BUILD + TIME_TO_TEST + PENDING_TIME) / CYCLE_TIME:
                        LOG.info("The jobs did not complete backfilling in time, assigning for human inspection.")
                        alert['stage'] = -1
                        alert['user'] = '******'
                    else:
                        LOG.info("The jobs have not completed backfilling. Looping back to stage 1.")
                        alert['stage'] = 1

                    break

            if alert['stage'] != 2:
                print "updating alert and then continue, not stage 2"
                updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
                continue

            badRevisions = []
            # Reset the loop for upcoming stages
            alert['loop'] = 0
            for i in range(1, len(revisionList)):
                print "getting results for revision number: %s" % i
                results = compare(alert['test'], alert['buildername'], revisionList[i], revisionList[i-1])
                print "compare returned: %s" % results
                if results < -2.0:
                    print "appending bad revision to list: %s"% revisionList[i]
                    badRevisions.append(revisionList[i])

            if len(badRevisions) != 1:
                LOG.info("There are too many bad revisions: %s for alert %s on buildername %s, "
                         "assigning for human inspection." % (badRevisions, alert['test'], alert['buildername']))
                alert['stage'] = -1 # too noisy, something bad happened
                alert['user'] = '******'
                print "too many bad revisions, update alert to human"
                updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
                continue

            if checkMerge(badRevisions[0], alert['buildername']):
                LOG.info("The bad revision %s identified for alert %s on buildername %s is a merge, "
                         "assigning for human inspection" % (badRevisions[0], alert['test'], alert['buildername']))
                alert['stage'] = -1 # A merge revision is a bad revision, manually inspect
                alert['user'] = '******'

            if alert['revision'] != badRevisions[0]:
                LOG.info("Alert_Manager misreported the bad revision. The actual bad revision is %s "
                         "for alert %s on %s buildername." % (badRevisions[0], alert['test'], alert['buildername']))
                alert['revision'] = badRevisions[0] # we misreported initially, change the actual regression revision

            print "setting stage = 3!"
            alert['stage'] = 3

        # Trigger all talos stage
        if alert['stage'] == 3:
            LOG.info("We are in stage 3, and going to trigger all_talos jobs.")
            repo_name = query_repo_name_from_buildername(alert['buildername'])
            # Setting Treeherder as the source for querying.
            set_query_source("treeherder")
            trigger_all_talos_jobs(repo_name, alert['revision'], times=6, dry_run=DRY_RUN)
            previousRevision = getRevisions(alert['revision'], alert['buildername'], start=-1, end=-1)[0]
            trigger_all_talos_jobs(repo_name, previousRevision, times=6, dry_run=DRY_RUN)
            alert['stage'] = 4
            updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
            continue

        # Verify All talos stage is completed
        if alert['stage'] == 4:
            LOG.info("We are in stage 4, and going to verify if all_talos ran successfully.")
            previousRevision = getRevisions(alert['revision'], alert['buildername'], start=-1, end=-1)[0]
            repo_name = query_repo_name_from_buildername(alert['buildername'])
            all_buildernames = build_talos_buildernames_for_repo(repo_name)
            for revision in [alert['revision'], previousRevision]:
                for buildername in all_buildernames:
                    dataPoints = getSuccessfulJobs(revision, buildername)
                    if dataPoints < 6:
                        # We wait for 8 hours for all talos tests to complete,
                        # And if they don't then we mark them for manual intervention
                        alert['loop'] += 1
                        if alert['loop'] > (TIME_TO_BUILD + TIME_TO_TEST + PENDING_TIME + TIME_TO_WAIT) / CYCLE_TIME:
                            LOG.info("The all talos jobs for alert %s on %s revision did not complete in time, "
                                     " assigning for human inspection." % (alert['test'], alert['revision']))
                            alert['stage'] = -1
                            alert['user'] = '******'
                        else:
                            alert['stage'] = 3
                        break

                if alert['stage'] != 4:
                    break

            if alert['stage'] != 4:
                updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
                continue

            alert['stage'] = 5  # final stage, sheriff will check for this.
            alert['user'] = '******'
            LOG.info("All automated parts are complete.")

        updateAlert(alert['id'], alert['revision'], alert['buildername'], alert['test'], alert['stage'], alert['loop'], alert['user'])
Example #3
0
def main():
    alerts = getAlerts()
    for alert in alerts:
        # new alert
        if alert['stage'] == 0:
            if checkMerge(
                    alert['revision'],
                    alert['buildername']) or 'pgo' in alert['buildername']:
                LOG.info(
                    "We are ignoring this alert since it is either a merge or a pgo job."
                )
                alert[
                    'stage'] = -1  # We need to have manual inspection in this case.
                alert['user'] = '******'
                updateAlert(alert['id'], alert['revision'],
                            alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
            else:
                alert['stage'] = 1

        # trigger jobs for backfill
        if alert['stage'] == 1:
            LOG.info("We are in stage 1, and going to backfill jobs.")
            revisionList = getRevisions(alert['revision'],
                                        alert['buildername'],
                                        start=-2,
                                        end=2)
            trigger_range(alert['buildername'],
                          revisionList,
                          times=6,
                          dry_run=DRY_RUN)
            alert['stage'] = 2
            # We want some time interval between stage 1 and 2, so we exit.
            updateAlert(alert['id'], alert['revision'], alert['buildername'],
                        alert['test'], alert['stage'], alert['loop'],
                        alert['user'])
            continue

        # verify jobs for backfill
        if alert['stage'] == 2:
            LOG.info(
                "We are in stage 2, and going to verify if jobs are backfilled."
            )
            revisionList = getRevisions(alert['revision'],
                                        alert['buildername'],
                                        start=-2,
                                        end=2)
            for revision in revisionList:
                dataPoints = getSuccessfulJobs(revision, alert['buildername'])

                # If dataPoints are less than 6, it means that builds/jobs are still running.
                if dataPoints < 6:
                    # We wait for 6 hours for all triggered tests to complete,
                    # And if they don't then we mark them for manual intervention/
                    alert['loop'] += 1
                    if alert['loop'] > (TIME_TO_BUILD + TIME_TO_TEST +
                                        PENDING_TIME) / CYCLE_TIME:
                        alert['stage'] = -1
                        alert['user'] = '******'
                    else:
                        alert['stage'] = 1

                    break

            if alert['stage'] != 2:
                updateAlert(alert['id'], alert['revision'],
                            alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
                continue

            badRevisions = []
            # Reset the loop for upcoming stages
            alert['loop'] = 0
            for i in range(1, len(revisionList)):
                results = compare(alert['test'], alert['buildername'],
                                  revisionList[i], revisionList[i - 1])
                if results < -2.0:
                    badRevisions.append(revisionList[i])

            if len(badRevisions) != 1:
                alert['stage'] = -1  # too noisy, something bad happened
                alert['user'] = '******'
                updateAlert(alert['id'], alert['revision'],
                            alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
                continue

            if checkMerge(badRevisions[0], alert['buildername']):
                alert[
                    'stage'] = -1  # A merge revision is a bad revision, manually inspect
                alert['user'] = '******'

            if alert['revision'] != badRevisions[0]:
                alert['revision'] = badRevisions[
                    0]  # we misreported initially, change the actual regression revision

            alert['stage'] = 3

        # Trigger all talos stage
        if alert['stage'] == 3:
            LOG.info("We are in stage 3, and going to trigger all_talos jobs.")
            repo_name = query_repo_name_from_buildername(alert['buildername'])
            trigger_all_talos_jobs(repo_name,
                                   alert['revision'],
                                   times=6,
                                   dry_run=DRY_RUN)
            previousRevision = getRevisions(alert['revision'],
                                            alert['buildername'],
                                            start=-1,
                                            end=-1)[0]
            trigger_all_talos_jobs(repo_name,
                                   previousRevision,
                                   times=6,
                                   dry_run=DRY_RUN)
            alert['stage'] = 4
            updateAlert(alert['id'], alert['revision'], alert['buildername'],
                        alert['test'], alert['stage'], alert['loop'],
                        alert['user'])
            continue

        # Verify All talos stage is completed
        if alert['stage'] == 4:
            LOG.info(
                "We are in stage 4, and going to verify if all_talos ran successfully."
            )
            previousRevision = getRevisions(alert['revision'],
                                            alert['buildername'],
                                            start=-1,
                                            end=-1)[0]
            repo_name = query_repo_name_from_buildername(alert['buildername'])
            all_buildernames = build_talos_buildernames_for_repo(repo_name)
            for revision in [alert['revision'], previousRevision]:
                for buildername in all_buildernames:
                    dataPoints = getSuccessfulJobs(revision, buildername)
                    if dataPoints < 6:
                        # We wait for 8 hours for all talos tests to complete,
                        # And if they don't then we mark them for manual intervention
                        alert['loop'] += 1
                        if alert['loop'] > (TIME_TO_BUILD + TIME_TO_TEST +
                                            PENDING_TIME +
                                            TIME_TO_WAIT) / CYCLE_TIME:
                            alert['stage'] = -1
                            alert['user'] = '******'
                        else:
                            alert['stage'] = 3
                        break

                if alert['stage'] != 4:
                    break

            if alert['stage'] != 4:
                updateAlert(alert['id'], alert['revision'],
                            alert['buildername'], alert['test'],
                            alert['stage'], alert['loop'], alert['user'])
                continue

            alert['stage'] = 5  # final stage, sheriff will check for this.

        updateAlert(alert['id'], alert['revision'], alert['buildername'],
                    alert['test'], alert['stage'], alert['loop'],
                    alert['user'])