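# Overview (added for orientation): main() walks each alert through a small
# stage machine. Stage semantics, reconstructed from the checks below:
#   -1  needs manual inspection by a human
#    0  new alert; merges and pgo jobs are filtered out immediately
#    1  backfill jobs around the alerting revision (revisions -2..+2)
#    2  verify the backfill finished, then compare neighbouring revisions
#       to pin down the single bad revision
#    3  trigger all talos jobs on the bad revision and its parent
#    4  verify the all_talos jobs finished
#    5  done; handed over to the sheriff
# The helpers getAlerts, updateAlert, getRevisions, getSuccessfulJobs, compare
# and checkMerge, the LOG object, and the constants DRY_RUN, TIME_TO_BUILD,
# TIME_TO_TEST, PENDING_TIME, TIME_TO_WAIT and CYCLE_TIME (presumably the
# interval between runs of this script) are assumed to be defined elsewhere in
# this module. trigger_range, trigger_all_talos_jobs, set_query_source,
# query_repo_name_from_buildername and build_talos_buildernames_for_repo
# appear to be mozci APIs.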
def main():
    alerts = getAlerts()
    for alert in alerts:
        LOG.info("Running alert for: [%s, %s, %s]"
                 % (alert['test'], alert['buildername'], alert['revision']))

        # New alert.
        if alert['stage'] == 0:
            LOG.info("We are in stage 0.")
            if checkMerge(alert['revision'], alert['buildername']) or \
                    'pgo' in alert['buildername']:
                LOG.info("We are ignoring alert: %s since it is either a merge "
                         "or a pgo job." % alert['test'])
                alert['stage'] = -1  # This case needs manual inspection.
                alert['user'] = '******'
                updateAlert(alert['id'], alert['revision'], alert['buildername'],
                            alert['test'], alert['stage'], alert['loop'], alert['user'])
            else:
                alert['stage'] = 1

        # Trigger jobs for backfill.
        if alert['stage'] == 1:
            LOG.info("We are in stage 1, and going to backfill jobs.")
            revisionList = getRevisions(alert['revision'], alert['buildername'],
                                        start=-2, end=2)
            # Set Treeherder as the source for querying.
            set_query_source("treeherder")
            trigger_range(alert['buildername'], revisionList, times=6, dry_run=DRY_RUN)
            alert['stage'] = 2
            # We want some time interval between stage 1 and 2, so we exit.
            updateAlert(alert['id'], alert['revision'], alert['buildername'],
                        alert['test'], alert['stage'], alert['loop'], alert['user'])
            continue

        # Verify jobs for backfill.
        if alert['stage'] == 2:
            LOG.info("We are in stage 2, and going to verify if jobs are backfilled.")
            revisionList = getRevisions(alert['revision'], alert['buildername'],
                                        start=-2, end=2)
            for revision in revisionList:
                dataPoints = getSuccessfulJobs(revision, alert['buildername'])
                # Fewer than 6 data points means builds/jobs are still running.
                if dataPoints < 6:
                    LOG.debug("Data points < 6 for revision: %s" % revision)
                    # We wait up to 6 hours for all triggered tests to complete;
                    # if they don't, we mark the alert for manual intervention.
                    alert['loop'] += 1
                    if alert['loop'] > (TIME_TO_BUILD + TIME_TO_TEST +
                                        PENDING_TIME) / CYCLE_TIME:
                        LOG.info("The jobs did not complete backfilling in time, "
                                 "assigning for human inspection.")
                        alert['stage'] = -1
                        alert['user'] = '******'
                    else:
                        LOG.info("The jobs have not completed backfilling. "
                                 "Looping back to stage 1.")
                        alert['stage'] = 1
                    break

            if alert['stage'] != 2:
                LOG.debug("Not in stage 2 anymore; updating the alert and moving on.")
                updateAlert(alert['id'], alert['revision'], alert['buildername'],
                            alert['test'], alert['stage'], alert['loop'], alert['user'])
                continue

            badRevisions = []
            # Reset the loop counter for the upcoming stages.
            alert['loop'] = 0
            for i in range(1, len(revisionList)):
                LOG.debug("Getting results for revision number: %s" % i)
                results = compare(alert['test'], alert['buildername'],
                                  revisionList[i], revisionList[i - 1])
                LOG.debug("compare returned: %s" % results)
                # A result below -2.0 is treated as a regression.
                if results < -2.0:
                    LOG.debug("Appending bad revision to list: %s" % revisionList[i])
                    badRevisions.append(revisionList[i])

            if len(badRevisions) != 1:
                # Too noisy, something bad happened.
                LOG.info("There are too many bad revisions: %s for alert %s on "
                         "buildername %s, assigning for human inspection."
                         % (badRevisions, alert['test'], alert['buildername']))
                alert['stage'] = -1
                alert['user'] = '******'
                updateAlert(alert['id'], alert['revision'], alert['buildername'],
                            alert['test'], alert['stage'], alert['loop'], alert['user'])
                continue

            if checkMerge(badRevisions[0], alert['buildername']):
                # A merge revision is a bad revision; inspect it manually.
                LOG.info("The bad revision %s identified for alert %s on buildername "
                         "%s is a merge, assigning for human inspection."
                         % (badRevisions[0], alert['test'], alert['buildername']))
                alert['stage'] = -1
                alert['user'] = '******'
                updateAlert(alert['id'], alert['revision'], alert['buildername'],
                            alert['test'], alert['stage'], alert['loop'], alert['user'])
                continue

            if alert['revision'] != badRevisions[0]:
                # We misreported initially; record the actual regression revision.
                LOG.info("Alert_Manager misreported the bad revision. The actual bad "
                         "revision is %s for alert %s on %s buildername."
                         % (badRevisions[0], alert['test'], alert['buildername']))
                alert['revision'] = badRevisions[0]

            LOG.debug("Setting stage = 3.")
            alert['stage'] = 3

        # Trigger the all_talos stage.
        if alert['stage'] == 3:
            LOG.info("We are in stage 3, and going to trigger all_talos jobs.")
            repo_name = query_repo_name_from_buildername(alert['buildername'])
            # Set Treeherder as the source for querying.
            set_query_source("treeherder")
            trigger_all_talos_jobs(repo_name, alert['revision'], times=6,
                                   dry_run=DRY_RUN)
            previousRevision = getRevisions(alert['revision'], alert['buildername'],
                                            start=-1, end=-1)[0]
            trigger_all_talos_jobs(repo_name, previousRevision, times=6,
                                   dry_run=DRY_RUN)
            alert['stage'] = 4
            updateAlert(alert['id'], alert['revision'], alert['buildername'],
                        alert['test'], alert['stage'], alert['loop'], alert['user'])
            continue

        # Verify that the all_talos stage is completed.
        if alert['stage'] == 4:
            LOG.info("We are in stage 4, and going to verify if all_talos ran "
                     "successfully.")
            previousRevision = getRevisions(alert['revision'], alert['buildername'],
                                            start=-1, end=-1)[0]
            repo_name = query_repo_name_from_buildername(alert['buildername'])
            all_buildernames = build_talos_buildernames_for_repo(repo_name)
            for revision in [alert['revision'], previousRevision]:
                for buildername in all_buildernames:
                    dataPoints = getSuccessfulJobs(revision, buildername)
                    if dataPoints < 6:
                        # We wait up to 8 hours for all talos tests to complete;
                        # if they don't, we mark the alert for manual intervention.
                        alert['loop'] += 1
                        if alert['loop'] > (TIME_TO_BUILD + TIME_TO_TEST +
                                            PENDING_TIME + TIME_TO_WAIT) / CYCLE_TIME:
                            LOG.info("The all_talos jobs for alert %s on revision %s "
                                     "did not complete in time, assigning for human "
                                     "inspection."
                                     % (alert['test'], alert['revision']))
                            alert['stage'] = -1
                            alert['user'] = '******'
                        else:
                            alert['stage'] = 3
                        break
                if alert['stage'] != 4:
                    break

            if alert['stage'] != 4:
                updateAlert(alert['id'], alert['revision'], alert['buildername'],
                            alert['test'], alert['stage'], alert['loop'], alert['user'])
                continue

            alert['stage'] = 5  # Final stage; the sheriff will check for this.
            alert['user'] = '******'
            LOG.info("All automated parts are complete.")
            updateAlert(alert['id'], alert['revision'], alert['buildername'],
                        alert['test'], alert['stage'], alert['loop'], alert['user'])
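# A minimal, hypothetical entry point, assuming this file is run directly as a
# periodic (cron-style) script; nothing below is required if the module is
# imported and main() is called from elsewhere.
if __name__ == '__main__':
    main()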