def dump_summary(info, label):
    """Print a count/duration table for ``info`` and return the total count.

    ``info`` is iterated as ``{(operation, platform, cell): count}``. For
    every key a row is added holding the count plus formatted min, max and
    average durations. The duration figures are read from ``durations[key]``,
    which is unpacked as ``(min, max, count, total)``.

    After the table is printed (sorted by the "Count" column), three
    breakdowns are emitted through ``dump_breakdown``: by operation, by cell
    and by platform text.

    NOTE(review): ``durations`` is not defined inside this function, so it
    is presumably a module-level mapping -- confirm it is populated before
    this is called.

    Args:
        info: mapping of ``(operation, platform, cell)`` tuples to counts.
        label: text interpolated into the table and breakdown headings.

    Returns:
        The sum of all counts in ``info``.
    """
    print("-- %s by operation by cell by platform --" % (label,))

    numeric_columns = ["Count", "Min", "Max", "Avg"]
    table = prettytable.PrettyTable(
        ["Operation", "Cell", "Platform"] + numeric_columns)
    for column in numeric_columns:
        table.align[column] = 'r'

    grand_total = 0
    by_operation = {}
    by_cell = {}
    by_platform = {}

    for key, count in info.iteritems():
        operation, platform, cell = key

        # Platform is shown as a comma-separated list of readable names,
        # or "n/a" when image_type.readable() returns nothing truthy.
        names = image_type.readable(platform)
        platform_text = ", ".join(names) if names else "n/a"

        shortest, longest, samples, elapsed = durations[key]
        average = float(elapsed) / float(samples)
        shortest_text = dt.sec_to_str(shortest)
        longest_text = dt.sec_to_str(longest)
        # NOTE(review): the average is scaled by 100 before formatting while
        # min and max are passed through unscaled -- confirm this is intended.
        average_text = dt.sec_to_str(average * 100.0)

        by_operation[operation] = by_operation.get(operation, 0) + count
        by_cell[cell] = by_cell.get(cell, 0) + count
        by_platform[platform_text] = by_platform.get(platform_text, 0) + count

        table.add_row([operation, cell, platform_text, count,
                       shortest_text, longest_text, average_text])
        grand_total += count

    table.sortby = 'Count'
    print(table)

    breakdowns = (
        (by_operation, "Total %s by Operation"),
        (by_cell, "Total %s by Cell"),
        (by_platform, "Total %s by Platform"),
    )
    for totals, heading in breakdowns:
        dump_breakdown(totals, heading % label)

    # Trailing blank line (equivalent to a bare ``print`` statement).
    print("")

    return grand_total
def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
                store=False, region=None, too_long=1800):
    """Build the per-operation request/failure report for one time window.

    The window starts at ``start_hour`` on ``yesterday`` and covers ``hours``
    hours (ending at minute 59, second 59 of the last hour). For each
    instance that has a ``compute.instance.update`` event in the window,
    every request id seen for that instance in the window is examined: its
    non-``compute.instance.exists`` events are walked in ``when`` order to
    determine the operation, the image kind, the elapsed time between the
    first and last event, and whether the request failed.

    Args:
        yesterday: object exposing ``year``/``month``/``day``. Defaults to
            the UTC date one day before now.
        start_hour: hour of the day at which the window begins.
        hours: length of the window in hours.
        percentile: duration percentile reported for each row.
        store: accepted for interface compatibility; not used here.
        region: optional text matched case-insensitively as a substring of
            deployment names; when falsy every deployment is included.
        too_long: duration threshold (presumably seconds -- TODO confirm).
            A request above it with no other failure is counted under the
            ``'> N'`` column, where ``N = too_long // 60``.

    Returns:
        ``(rstart, rend, report)``. ``rstart``/``rend`` are the window bounds
        as ``datetime`` objects. ``report`` is a list holding, in order: a
        ``details`` dict, the list of column names, and one row list per
        ``(operation, image)`` key.

    Side effects:
        Prints a message and calls ``sys.exit(1)`` when no deployment
        matches ``region``.
    """
    # NOTE(review): this file contains a second, later ``def make_report``
    # with the same logic; the later definition is the one bound at import.
    if not yesterday:
        yesterday = datetime.datetime.utcnow().date() - \
            datetime.timedelta(days=1)

    rstart = datetime.datetime(year=yesterday.year, month=yesterday.month,
                               day=yesterday.day, hour=start_hour)
    rend = rstart + datetime.timedelta(hours=hours - 1, minutes=59,
                                       seconds=59)

    dstart = dt.dt_to_decimal(rstart)
    dend = dt.dt_to_decimal(rend)

    # Explicit floor division keeps the column label an integer quotient.
    too_long_col = '> %d' % (too_long // 60)

    cells = []
    regions = []
    if region:
        region = region.upper()
    for deployment in models.Deployment.objects.all():
        name = deployment.name.upper()
        if not region or region in name:
            regions.append(deployment.id)
            cells.append(deployment.name)

    if not regions:
        print("No regions found for '%s'" % region)
        sys.exit(1)

    # Get all the instances that have changed in the last N hours ...
    updates = models.RawData.objects.filter(event='compute.instance.update',
                                            when__gt=dstart, when__lte=dend,
                                            deployment__in=regions)\
                                    .values('instance').distinct()

    cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']

    failures = {}   # { key : {failure_type: count} }
    durations = {}  # { key : [elapsed, ...] }
    attempts = {}   # { key : number of requests }

    for uuid_dict in updates:
        instance_uuid = uuid_dict['instance']

        # All the unique Request ID's for this instance during that timespan.
        reqs = models.RawData.objects.filter(instance=instance_uuid,
                                             when__gt=dstart,
                                             when__lte=dend)\
                                     .values('request_id').distinct()

        for req_dict in reqs:
            req = req_dict['request_id']
            raws = models.RawData.objects.filter(request_id=req)\
                                         .exclude(event='compute.instance.exists')\
                                         .order_by('when')

            start = None
            err = None
            failure_type = None
            operation = "aux"
            image_type_num = 0

            for raw in raws:
                if not start:
                    start = raw.when

                if 'error' in raw.routing_key:
                    err = raw
                    failure_type = 'http'

                if raw.old_state != 'error' and raw.state == 'error':
                    failure_type = 'state'

                # Leaving the error state (other than by deletion) clears
                # any failure recorded so far for this request.
                if raw.old_state == 'error' and \
                        raw.state not in ('deleted', 'error'):
                    failure_type = None

                for cmd in cmds:
                    if cmd in raw.event:
                        operation = cmd
                        break

                if raw.image_type:
                    image_type_num |= raw.image_type

            image = "?"
            if image_type.isset(image_type_num, image_type.BASE_IMAGE):
                image = "base"
            if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
                image = "snap"

            # No usable events for this request: nothing to time.
            if not start:
                continue

            # ``raw`` is the last event from the loop above.
            end = raw.when
            diff = end - start

            if diff > too_long and failure_type is None:
                failure_type = too_long_col

            key = (operation, image)

            # Track durations for all attempts, good and bad ...
            durations.setdefault(key, []).append(diff)
            attempts[key] = attempts.get(key, 0) + 1

            if failure_type:
                if err:
                    # The stored JSON is unpacked as a two-item sequence;
                    # only the second item (the body) is needed.
                    _queue, body = json.loads(err.json)
                    payload = body['payload']
                    exc = payload.get('exception')
                    if exc:
                        code = int(exc.get('kwargs', {}).get('code', 0))
                        if 400 <= code < 500:
                            failure_type = "4xx"
                        if 500 <= code < 600:
                            failure_type = "5xx"
                breakdown = failures.setdefault(key, {})
                breakdown[failure_type] = breakdown.get(failure_type, 0) + 1

    # Summarize the results ...
    report = []
    pct = (float(100 - percentile) / 2.0) / 100.0
    details = {'percentile': percentile, 'pct': pct,
               'hours': hours, 'start': float(dstart), 'end': float(dend),
               'region': region, 'cells': cells}
    report.append(details)

    # Only these failure types get columns and totals; a failure left as
    # 'http' is recorded in ``failures`` but not reported here.
    failure_types = ["4xx", "5xx", too_long_col, "state"]
    cols = ["Operation", "Image", "Min", "Max", "Med", "%d%%" % percentile,
            "Requests"]
    for failure_type in failure_types:
        cols.append("%s" % failure_type)
        cols.append("%% %s" % failure_type)
    report.append(cols)

    total = 0
    failure_totals = {}
    for key, count in attempts.iteritems():
        total += count
        operation, image = key

        breakdown = failures.get(key, {})
        this_failure_pair = []
        for failure_type in failure_types:
            # Failure counts for this attempt.
            # Sum for grand totals.
            failure_count = breakdown.get(failure_type, 0)
            failure_totals[failure_type] = \
                failure_totals.get(failure_type, 0) + failure_count

            # Failure percentage for this attempt. This is a 0..1 fraction,
            # unlike the grand totals below which are scaled by 100.
            percentage = float(failure_count) / float(count)
            this_failure_pair.append((failure_count, percentage))

        # N-th % of durations ...
        _values = durations[key]
        _values.sort()
        _num = len(_values)  # always >= 1: one duration per counted attempt
        _min = _values[0]
        _max = _values[-1]
        _median = _values[_num // 2]
        # Clamp so that percentile >= 100 selects the largest sample
        # instead of indexing one past the end of the list.
        _percentile_index = int((float(percentile) / 100.0) * float(_num))
        _percentile_index = max(0, min(_percentile_index, _num - 1))
        _percentile = _values[_percentile_index]

        _fmin = dt.sec_to_str(_min)
        _fmax = dt.sec_to_str(_max)
        _fmedian = dt.sec_to_str(_median)
        _fpercentile = dt.sec_to_str(_percentile)

        row = [operation, image, _fmin, _fmax, _fmedian, _fpercentile, count]
        for failure_count, failure_percentage in this_failure_pair:
            row.append(failure_count)
            row.append(failure_percentage)
        report.append(row)

    details['total'] = total
    failure_grand_total = 0
    for failure_type in failure_types:
        failure_total = failure_totals.get(failure_type, 0)
        failure_grand_total += failure_total
        details["%s failure count" % failure_type] = failure_total
        # With no requests in the window, report 0% rather than dividing
        # by zero.
        if total:
            failure_percentage = (float(failure_total) / float(total)) * 100.0
        else:
            failure_percentage = 0.0
        details["%s failure percentage" % failure_type] = failure_percentage

    details['failure_grand_total'] = failure_grand_total
    if total:
        details['failure_grand_rate'] = \
            (float(failure_grand_total) / float(total)) * 100.0
    else:
        details['failure_grand_rate'] = 0.0
    return (rstart, rend, report)
def make_report(yesterday=None, start_hour=0, hours=24, percentile=97,
                store=False, region=None, too_long=1800):
    """Build the per-operation request/failure report for one time window.

    The window starts at ``start_hour`` on ``yesterday`` and covers ``hours``
    hours (ending at minute 59, second 59 of the last hour). For each
    instance that has a ``compute.instance.update`` event in the window,
    every request id seen for that instance in the window is examined: its
    non-``compute.instance.exists`` events are walked in ``when`` order to
    determine the operation, the image kind, the elapsed time between the
    first and last event, and whether the request failed.

    Args:
        yesterday: object exposing ``year``/``month``/``day``. Defaults to
            the UTC date one day before now.
        start_hour: hour of the day at which the window begins.
        hours: length of the window in hours.
        percentile: duration percentile reported for each row.
        store: accepted for interface compatibility; not used here.
        region: optional text matched case-insensitively as a substring of
            deployment names; when falsy every deployment is included.
        too_long: duration threshold (presumably seconds -- TODO confirm).
            A request above it with no other failure is counted under the
            ``'> N'`` column, where ``N = too_long // 60``.

    Returns:
        ``(rstart, rend, report)``. ``rstart``/``rend`` are the window bounds
        as ``datetime`` objects. ``report`` is a list holding, in order: a
        ``details`` dict, the list of column names, and one row list per
        ``(operation, image)`` key.

    Side effects:
        Prints a message and calls ``sys.exit(1)`` when no deployment
        matches ``region``.
    """
    # NOTE(review): an earlier ``def make_report`` with the same logic also
    # exists in this file; this later definition replaces it at import.
    if not yesterday:
        yesterday = datetime.datetime.utcnow().date() - \
            datetime.timedelta(days=1)

    rstart = datetime.datetime(year=yesterday.year, month=yesterday.month,
                               day=yesterday.day, hour=start_hour)
    rend = rstart + datetime.timedelta(hours=hours - 1, minutes=59,
                                       seconds=59)

    dstart = dt.dt_to_decimal(rstart)
    dend = dt.dt_to_decimal(rend)

    # Explicit floor division keeps the column label an integer quotient.
    too_long_col = '> %d' % (too_long // 60)

    cells = []
    regions = []
    if region:
        region = region.upper()
    for deployment in models.Deployment.objects.all():
        name = deployment.name.upper()
        if not region or region in name:
            regions.append(deployment.id)
            cells.append(deployment.name)

    if not regions:
        print("No regions found for '%s'" % region)
        sys.exit(1)

    # Get all the instances that have changed in the last N hours ...
    updates = models.RawData.objects.filter(event='compute.instance.update',
                                            when__gt=dstart, when__lte=dend,
                                            deployment__in=regions)\
                                    .values('instance').distinct()

    cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']

    failures = {}   # { key : {failure_type: count} }
    durations = {}  # { key : [elapsed, ...] }
    attempts = {}   # { key : number of requests }

    for uuid_dict in updates:
        instance_uuid = uuid_dict['instance']

        # All the unique Request ID's for this instance during that timespan.
        reqs = models.RawData.objects.filter(instance=instance_uuid,
                                             when__gt=dstart,
                                             when__lte=dend)\
                                     .values('request_id').distinct()

        for req_dict in reqs:
            req = req_dict['request_id']
            raws = models.RawData.objects.filter(request_id=req)\
                                         .exclude(event='compute.instance.exists')\
                                         .order_by('when')

            start = None
            err = None
            failure_type = None
            operation = "aux"
            image_type_num = 0

            for raw in raws:
                if not start:
                    start = raw.when

                if 'error' in raw.routing_key:
                    err = raw
                    failure_type = 'http'

                if raw.old_state != 'error' and raw.state == 'error':
                    failure_type = 'state'

                # Leaving the error state (other than by deletion) clears
                # any failure recorded so far for this request.
                if raw.old_state == 'error' and \
                        raw.state not in ('deleted', 'error'):
                    failure_type = None

                for cmd in cmds:
                    if cmd in raw.event:
                        operation = cmd
                        break

                if raw.image_type:
                    image_type_num |= raw.image_type

            image = "?"
            if image_type.isset(image_type_num, image_type.BASE_IMAGE):
                image = "base"
            if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE):
                image = "snap"

            # No usable events for this request: nothing to time.
            if not start:
                continue

            # ``raw`` is the last event from the loop above.
            end = raw.when
            diff = end - start

            if diff > too_long and failure_type is None:
                failure_type = too_long_col

            key = (operation, image)

            # Track durations for all attempts, good and bad ...
            durations.setdefault(key, []).append(diff)
            attempts[key] = attempts.get(key, 0) + 1

            if failure_type:
                if err:
                    # The stored JSON is unpacked as a two-item sequence;
                    # only the second item (the body) is needed.
                    _queue, body = json.loads(err.json)
                    payload = body['payload']
                    exc = payload.get('exception')
                    if exc:
                        code = int(exc.get('kwargs', {}).get('code', 0))
                        if 400 <= code < 500:
                            failure_type = "4xx"
                        if 500 <= code < 600:
                            failure_type = "5xx"
                breakdown = failures.setdefault(key, {})
                breakdown[failure_type] = breakdown.get(failure_type, 0) + 1

    # Summarize the results ...
    report = []
    pct = (float(100 - percentile) / 2.0) / 100.0
    details = {'percentile': percentile, 'pct': pct,
               'hours': hours, 'start': float(dstart), 'end': float(dend),
               'region': region, 'cells': cells}
    report.append(details)

    # Only these failure types get columns and totals; a failure left as
    # 'http' is recorded in ``failures`` but not reported here.
    failure_types = ["4xx", "5xx", too_long_col, "state"]
    cols = ["Operation", "Image", "Min", "Max", "Med", "%d%%" % percentile,
            "Requests"]
    for failure_type in failure_types:
        cols.append("%s" % failure_type)
        cols.append("%% %s" % failure_type)
    report.append(cols)

    total = 0
    failure_totals = {}
    for key, count in attempts.iteritems():
        total += count
        operation, image = key

        breakdown = failures.get(key, {})
        this_failure_pair = []
        for failure_type in failure_types:
            # Failure counts for this attempt.
            # Sum for grand totals.
            failure_count = breakdown.get(failure_type, 0)
            failure_totals[failure_type] = \
                failure_totals.get(failure_type, 0) + failure_count

            # Failure percentage for this attempt. This is a 0..1 fraction,
            # unlike the grand totals below which are scaled by 100.
            percentage = float(failure_count) / float(count)
            this_failure_pair.append((failure_count, percentage))

        # N-th % of durations ...
        _values = durations[key]
        _values.sort()
        _num = len(_values)  # always >= 1: one duration per counted attempt
        _min = _values[0]
        _max = _values[-1]
        _median = _values[_num // 2]
        # Clamp so that percentile >= 100 selects the largest sample
        # instead of indexing one past the end of the list.
        _percentile_index = int((float(percentile) / 100.0) * float(_num))
        _percentile_index = max(0, min(_percentile_index, _num - 1))
        _percentile = _values[_percentile_index]

        _fmin = dt.sec_to_str(_min)
        _fmax = dt.sec_to_str(_max)
        _fmedian = dt.sec_to_str(_median)
        _fpercentile = dt.sec_to_str(_percentile)

        row = [operation, image, _fmin, _fmax, _fmedian, _fpercentile, count]
        for failure_count, failure_percentage in this_failure_pair:
            row.append(failure_count)
            row.append(failure_percentage)
        report.append(row)

    details['total'] = total
    failure_grand_total = 0
    for failure_type in failure_types:
        failure_total = failure_totals.get(failure_type, 0)
        failure_grand_total += failure_total
        details["%s failure count" % failure_type] = failure_total
        # With no requests in the window, report 0% rather than dividing
        # by zero.
        if total:
            failure_percentage = (float(failure_total) / float(total)) * 100.0
        else:
            failure_percentage = 0.0
        details["%s failure percentage" % failure_type] = failure_percentage

    details['failure_grand_total'] = failure_grand_total
    if total:
        details['failure_grand_rate'] = \
            (float(failure_grand_total) / float(total)) * 100.0
    else:
        details['failure_grand_rate'] = 0.0
    return (rstart, rend, report)