def email_instance_hours(date, dry_run=False):
    """Send the daily instance-hours-by-route report.

    Runs a BigQuery query over the request-log table for `date`, saves
    the rows as that day's "instance_hours" data, and adds three derived
    columns to every row: its share of the day's total instance hours,
    its instance hours per 1000 requests, and a per-request history built
    from bq_util.process_past_data.  The first 50 entries of the converted
    table are emailed and the first 20 are sent to stackdriver.

    Arguments:
        date: datetime.date object naming the day to report on.
        dry_run: forwarded as-is to _send_email and
            _send_table_to_stackdriver.
    """
    yyyymmdd = date.strftime("%Y%m%d")

    # One WHEN branch per entry of _MODULE_CPU_COUNT: the cost of a
    # request is its latency scaled by the value stored for its module.
    when_clauses = [
        "WHEN module_id == '%s' THEN latency * %s" % module_and_cpus
        for module_and_cpus in _MODULE_CPU_COUNT.iteritems()]
    cost_fn = '\n'.join(when_clauses)

    query = """\
SELECT COUNT(*) as count_,
elog_url_route as url_route,
SUM(CASE
    %s
    ELSE 0
END) / 3600 as instance_hours
FROM [logs.requestlogs_%s]
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)

    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "instance_hours", date, 14, lambda past_row: past_row['url_route'])

    # The grand total is needed before any per-row percentage can be
    # computed, hence the two passes over the rows.
    total_instance_hours = 0.0
    for row in data:
        total_instance_hours += row['instance_hours']

    for row in data:
        hours = row['instance_hours']
        row['%% of total'] = hours / total_instance_hours * 100
        row['per 1k requests'] = hours / row['count_'] * 1000

        # One entry per historical data set; None where the route has no
        # row in that data set.
        route = row['url_route']
        per_request_history = []
        for past_day in historical_data:
            past_row = past_day.get(route)
            if past_row:
                per_request_history.append(
                    past_row['instance_hours'] / past_row['count_'])
            else:
                per_request_history.append(None)
        row['last 2 weeks (per request)'] = per_request_history

    column_order = ('%% of total', 'instance_hours', 'count_',
                    'per 1k requests', 'last 2 weeks (per request)',
                    'url_route')
    table = _convert_table_rows_to_lists(data, column_order)

    yyyymmdd_pretty = _pretty_date(yyyymmdd)
    heading = 'Instance hours by route for %s' % yyyymmdd_pretty
    subject = 'Instance Hours by Route'

    # Only the top of the table goes out by email; the query already
    # ordered it by instance_hours, descending.
    _send_email({heading: table[:50]}, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)

    # An even shorter prefix of the table is forwarded to stackdriver.
    _send_table_to_stackdriver(
        table[:20],
        'webapp.routes.instance_hours.week_over_week',
        'url_route',
        metric_label_col='url_route',
        data_col='last 2 weeks (per request)',
        dry_run=dry_run)
def email_client_api_usage(date, dry_run=False):
    """Emails a report of API usage, segmented by client and build version.

    Queries the request-log table for `date` for routes matching
    '^api.main:/api/internal', saves the rows as that day's
    "client_api_usage" data, emails the full table to the address returned
    by initiatives.email('infrastructure'), and then emails one table per
    group yielded by _by_initiative(data, key='route').

    Arguments:
        date: datetime.date object naming the day to report on.
        dry_run: forwarded as-is to every _send_email call.
    """
    yyyymmdd = date.strftime("%Y%m%d")

    # Raw literals so the regex backslashes are spelled exactly as they
    # are sent to BigQuery.
    ios_user_agent_regex = (
        r'^Khan%20Academy\.(.*)/(.*) CFNetwork/([.0-9]*)'
        r' Darwin/([.0-9]*)$')

    # We group all non-ios user agents into a single bucket to keep this
    # report down to a reasonable size.
    #
    # The backreferences are written \\1 and \\2 because this is a
    # non-raw Python literal: a bare \1 would be read by Python as the
    # octal escape for chr(1), so BigQuery would never see the
    # backreference.
    query = """\
SELECT IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
          REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'iOS \\1'),
          'Web Browsers/other') as client,
       IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
          REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'\\2'),
          '') as build,
       elog_url_route as route,
       count(elog_url_route) as request_count
FROM logs.requestlogs_%(date_format)s
WHERE REGEXP_MATCH(elog_url_route, '^api.main:/api/internal')
      AND user_agent IS NOT NULL
      AND elog_url_route IS NOT NULL
      AND LEFT(version_id, 3) != 'znd' # ignore znds
GROUP BY client, build, route
ORDER BY client DESC, build DESC, request_count DESC;
""" % {
        'ios_user_agent_regex': ios_user_agent_regex,
        'date_format': yyyymmdd
    }

    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "client_api_usage", yyyymmdd)

    _ORDER = ('client', 'build', 'route', 'request_count')
    all_data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'API usage by client - '
    heading = 'API usage by client for %s' % _pretty_date(yyyymmdd)
    _send_email({heading: all_data}, None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data, key='route'):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        _send_email({heading: table}, None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
def email_instance_hours(date):
    """Send the daily instance-hours-by-route report.

    Runs a BigQuery query over the request-log table for `date` (a
    datetime.date), saves the rows as that day's "instance_hours" data,
    and adds three derived columns to every row: its share of the day's
    total instance hours, its instance hours per 1000 requests, and a
    per-request history built from bq_util.process_past_data.  The first
    50 entries of the converted table are emailed.
    """
    yyyymmdd = date.strftime("%Y%m%d")

    # One WHEN branch per entry of _MODULE_CPU_COUNT: the cost of a
    # request is its latency scaled by the value stored for its module.
    when_clauses = [
        "WHEN module_id == '%s' THEN latency * %s" % module_and_cpus
        for module_and_cpus in _MODULE_CPU_COUNT.iteritems()]
    cost_fn = '\n'.join(when_clauses)

    query = """\
SELECT COUNT(*) as count_,
elog_url_route as url_route,
SUM(CASE
    %s
    ELSE 0
END) / 3600 as instance_hours
FROM [logs.requestlogs_%s]
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)

    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "instance_hours", date, 14, lambda past_row: past_row['url_route'])

    # First pass: the grand total, needed for the percentage column.
    total_instance_hours = 0.0
    for row in data:
        total_instance_hours += row['instance_hours']

    # Second pass: attach the derived columns to each row in place.
    for row in data:
        hours = row['instance_hours']
        row['%% of total'] = hours / total_instance_hours * 100
        row['per 1k requests'] = hours / row['count_'] * 1000

        route = row['url_route']
        per_request_history = []
        for past_day in historical_data:
            past_row = past_day.get(route)
            if past_row:
                per_request_history.append(
                    past_row['instance_hours'] / past_row['count_'])
            else:
                # No row for this route in that data set.
                per_request_history.append(None)
        row['last 2 weeks (per request)'] = per_request_history

    column_order = ('%% of total', 'instance_hours', 'count_',
                    'per 1k requests', 'last 2 weeks (per request)',
                    'url_route')
    table = _convert_table_rows_to_lists(data, column_order)

    heading = 'Instance hours by route for %s' % _pretty_date(yyyymmdd)
    subject = 'Instance Hours by Route'

    # Only the top of the table is sent; the query already ordered it by
    # instance_hours, descending.
    _send_email({heading: table[:50]}, None,
                to=['*****@*****.**'],
                subject=subject)
def email_instance_hours(date):
    """Send the daily instance-hours-by-route report.

    `date` is a datetime.date.  The request-log table for that day is
    queried, the result is saved as the day's "instance_hours" data, each
    row gains a percent-of-total column, a per-1k-requests column and a
    per-request history taken from bq_util.process_past_data, and the
    first 50 entries of the converted table are emailed.
    """
    yyyymmdd = date.strftime("%Y%m%d")

    # Build the body of the SQL CASE: one branch per module listed in
    # _MODULE_CPU_COUNT, weighting latency by that module's value.
    branches = []
    for module_and_cpus in _MODULE_CPU_COUNT.iteritems():
        branches.append(
            "WHEN module_id == '%s' THEN latency * %s" % module_and_cpus)
    cost_fn = "\n".join(branches)

    query = """\
SELECT COUNT(*) as count_,
elog_url_route as url_route,
SUM(CASE
    %s
    ELSE 0
END) / 3600 as instance_hours
FROM [logs.requestlogs_%s]
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)

    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "instance_hours", date, 14, lambda entry: entry["url_route"])

    # The percentage column needs the sum over all rows first.
    total_instance_hours = 0.0
    for row in data:
        total_instance_hours += row["instance_hours"]

    for row in data:
        row["%% of total"] = (
            row["instance_hours"] / total_instance_hours * 100)
        row["per 1k requests"] = (
            row["instance_hours"] / row["count_"] * 1000)

        # Per-request cost of this route in each historical data set, or
        # None where the route is absent from it.
        trend = []
        for snapshot in historical_data:
            earlier = snapshot.get(row["url_route"])
            trend.append(
                earlier["instance_hours"] / earlier["count_"]
                if earlier else None)
        row["last 2 weeks (per request)"] = trend

    columns = (
        "%% of total",
        "instance_hours",
        "count_",
        "per 1k requests",
        "last 2 weeks (per request)",
        "url_route",
    )
    rows_as_lists = _convert_table_rows_to_lists(data, columns)

    subject = "Instance Hours by Route"
    heading = "Instance hours by route for %s" % _pretty_date(yyyymmdd)

    # Send just the leading 50 entries rather than every route.
    _send_email({heading: rows_as_lists[:50]}, None,
                to=["*****@*****.**"],
                subject=subject)
def email_client_api_usage(date, dry_run=False):
    """Emails a report of API usage, segmented by client and build version.

    Queries the request-log table for `date` for routes matching
    '^api.main:/api/internal', saves the rows as that day's
    "client_api_usage" data, and emails the converted table.

    Arguments:
        date: datetime.date object naming the day to report on.
        dry_run: forwarded as-is to _send_email.
    """
    yyyymmdd = date.strftime("%Y%m%d")

    # Raw literals so the regex backslashes are spelled exactly as they
    # are sent to BigQuery.
    ios_user_agent_regex = (
        r'^Khan%20Academy\.(.*)/(.*) CFNetwork/([.0-9]*)'
        r' Darwin/([.0-9]*)$')

    # We group all non-ios user agents into a single bucket to keep this
    # report down to a reasonable size.
    #
    # The backreferences are written \\1 and \\2 because this is a
    # non-raw Python literal: a bare \1 would be read by Python as the
    # octal escape for chr(1), so BigQuery would never see the
    # backreference.
    query = """\
SELECT IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
          REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'iOS \\1'),
          'Web Browsers/other') as client,
       IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
          REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'\\2'),
          '') as build,
       elog_url_route as route,
       count(elog_url_route) as request_count
FROM logs.requestlogs_%(date_format)s
WHERE REGEXP_MATCH(elog_url_route, '^api.main:/api/internal')
      AND user_agent IS NOT NULL
      AND elog_url_route IS NOT NULL
GROUP BY client, build, route
ORDER BY client DESC, build DESC, request_count DESC;
""" % {'ios_user_agent_regex': ios_user_agent_regex,
       'date_format': yyyymmdd}

    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "client_api_usage", yyyymmdd)

    _ORDER = ('client', 'build', 'route', 'request_count')
    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'API usage by client'
    heading = 'API usage by client for %s' % _pretty_date(yyyymmdd)
    _send_email({heading: data}, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)
def email_rpcs(date):
    """Send the daily RPCs-per-route report for a datetime.date.

    Builds one query that joins, per route, the request count with one
    count per RPC kind in `rpc_fields` and with the summed RPC cost.  The
    rows are saved as the day's "rpcs" data, extended with per-request
    figures and a per-request cost history, and the first 75 entries of
    the converted table are emailed.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete')

    # For every RPC kind we need a selected column and the sub-query that
    # provides it; build both lists side by side.
    select_columns = []
    join_clauses = []
    for rpc in rpc_fields:
        select_columns.append(
            "IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (rpc, rpc, rpc))
        join_clauses.append(
            "LEFT OUTER JOIN ( "
            "SELECT elog_url_route AS url_route, COUNT(*) AS rpc_%s "
            "FROM FLATTEN([logs.requestlogs_%s], elog_stats_rpc) "
            "WHERE elog_stats_rpc.key = 'stats.rpc.datastore_v3.%s' "
            "GROUP BY url_route) AS t%s ON t1.url_route = t%s.url_route"
            % (rpc, yyyymmdd, rpc, rpc, rpc))

    # The cost column comes from its own sub-query, joined last.
    select_columns.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    join_clauses.append("""\
LEFT OUTER JOIN (
SELECT elog_url_route AS url_route,
       SUM(elog_stats_rpc_ops.value) AS rpc_cost
FROM [logs.requestlogs_%s]
WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost'
GROUP BY url_route) AS tcost ON t1.url_route = tcost.url_route
""" % yyyymmdd)

    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
SELECT elog_url_route AS url_route, COUNT(*) AS url_requests
FROM [logs.requestlogs_%s]
GROUP BY url_route) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(select_columns), yyyymmdd, '\n'.join(join_clauses))

    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "rpcs", date, 14, lambda past_row: past_row['url_route'])

    micropennies = 'μ¢'
    cost_per_req_col = micropennies + '/req'
    history_col = 'last 2 weeks (%s/req)' % micropennies

    # Add the per-request figures to each row in place.  The `* 1.0`
    # forces float division.
    for row in data:
        requests = row['requests']
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / requests
        row[cost_per_req_col] = row['rpc_cost'] * 1.0 / requests
        row['$'] = row['rpc_cost'] * 1.0e-8

        cost_history = []
        for past_day in historical_data:
            past_row = past_day.get(row['url_route'])
            # Historical rows without an rpc_cost entry yield a gap.
            if past_row and 'rpc_cost' in past_row:
                cost_history.append(
                    past_row['rpc_cost'] * 1.0 / past_row['requests'])
            else:
                cost_history.append(None)
        row[history_col] = cost_history

        # The raw cost is not one of the output columns.
        del row['rpc_cost']

    # Fixed column order for the dict -> list conversion.
    column_order = ['url_route', 'requests', '$', cost_per_req_col,
                    history_col]
    column_order += ['rpc_%s' % f for f in rpc_fields]
    column_order += ['%s/req' % f for f in rpc_fields]
    table = _convert_table_rows_to_lists(data, column_order)

    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    subject = 'RPC calls by route'

    # Only the top of the table is sent, not every route.
    _send_email({heading: table[:75]}, None,
                to=['*****@*****.**'],
                subject=subject)
def email_memory_increases(date, window_length=20, min_increase_in_mb=1):
    """Email the per-route memory increases for a datetime.date.

    The amount a request "really" added is estimated by discounting
    memory that is reclaimed within the next `window_length` requests on
    the same instance.  Rows whose total increase is not greater than
    `min_increase_in_mb` are left out of the email.  Results are grouped
    into one table per module, each limited to its first 50 rows.
    """
    yyyymmdd = date.strftime("%Y%m%d")

    offsets = range(1, window_length + 1)
    lead_selects = '\n'.join(
        "LEAD(total, %s) OVER (PARTITION BY instance_key ORDER BY start_time) "
        "AS lead_total_%s," % (offset, offset)
        for offset in offsets)
    candidates = ['total']
    candidates.extend('lead_total_%s' % offset for offset in offsets)

    # We need the minimum of (added + candidate - total) over all the
    # candidate columns: "total" itself and each "lead_total_i".
    # BigQuery gives us no convenient way to take that minimum (short of
    # a CROSS JOIN of the table with itself).  A nested IF could grow
    # exponentially, and queries have a maximum size, so instead we emit
    # a CASE with one branch per candidate, each branch testing that its
    # candidate is <= every other one.  That is O(n^2) text, which is
    # acceptable since n should stay small and we don't pay for BigQuery
    # execution time.  O(n) nested pairwise-min queries would be another
    # option but could well be slower in practice.
    # TODO(benkraft): If I get a useful answer to
    # http://stackoverflow.com/questions/24923101/computing-a-moving-maximum-in-bigquery
    # we should use that instead.  Or, once we have user-defined functions
    # in BigQuery, we can probably do something actually reasonable.
    when_clauses = []
    for candidate in candidates:
        is_smallest = ' AND '.join(
            "%s <= %s" % (candidate, other)
            for other in candidates if other != candidate)
        when_clauses.append(
            "WHEN %s THEN added + %s - total" % (is_smallest, candidate))
    case_expr = '\n'.join(when_clauses)

    # Reading the query from the innermost SELECT outwards:
    #  1. Pull the "added" and "total" figures out of the log messages.
    #  2. Add the LEAD() columns, i.e. what "total" will be a few
    #     requests later on the same instance, plus each request's rank
    #     within its instance.
    #  3. Compute "real_added", the amount we believe the request added
    #     to the heap excluding memory that was soon reclaimed.  This can
    #     come out negative.
    #  4. Clamp real_added at zero: if usage went down, this request
    #     probably shouldn't get the credit.
    #  5. Group by route and module and aggregate, keeping only rows with
    #     num > 25, since the earliest requests are probably loading
    #     things that have to be loaded anyway and shouldn't be blamed.
    query = """\
SELECT COUNT(*) AS count_,
       elog_url_route AS url_route,
       module_id AS module,
       AVG(real_added) AS added_avg,
       NTH(99, QUANTILES(real_added, 101)) AS added_98th,
       SUM(real_added) AS added_total,
FROM (
    SELECT IF(real_added > 0, real_added, 0) AS real_added,
           elog_url_route, module_id, num,
    FROM (
        SELECT (CASE
                %s
                ELSE added
                END) AS real_added,
               elog_url_route, module_id, num,
        FROM (
            SELECT %s
                   RANK() OVER (PARTITION BY instance_key
                                ORDER BY start_time) AS num,
                   added, total, elog_url_route, module_id,
            FROM (
                SELECT FLOAT(REGEXP_EXTRACT(
                           app_logs.message,
                           "This request added (.*) MB to the heap.")) AS added,
                       FLOAT(REGEXP_EXTRACT(
                           app_logs.message,
                           "Total memory now used: (.*) MB")) AS total,
                       instance_key, start_time, elog_url_route, module_id,
                FROM [logs.requestlogs_%s]
                WHERE app_logs.message CONTAINS 'This request added'
            )
        )
    )
)
WHERE num > 25
GROUP BY url_route, module
ORDER BY added_total DESC
""" % (case_expr, lead_selects, yyyymmdd)

    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "memory_increases", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "memory_increases", date, 14,
        lambda past_row: (past_row['module'], past_row['url_route']))

    # Bucket the rows that pass the threshold under a per-module heading.
    pretty_date = _pretty_date(yyyymmdd)
    by_module = collections.defaultdict(list)
    for row in data:
        if row['added_total'] > min_increase_in_mb:
            module_heading = (
                "Memory increases by route for %s module on %s"
                % (row['module'], pretty_date))
            by_module[module_heading].append(row)

    column_order = ['count_', 'added_avg', 'last 2 weeks (avg)',
                    'added_98th', 'added_total', 'added %%', 'url_route']
    for heading in by_module:
        module_rows = by_module[heading]
        total = sum(entry['added_total'] for entry in module_rows)
        for row in module_rows:
            row['added %%'] = row['added_total'] / total * 100

            history_key = (row['module'], row['url_route'])
            avg_history = []
            for past_day in historical_data:
                past_row = past_day.get(history_key)
                avg_history.append(past_row['added_avg'] if past_row
                                   else None)
            row['last 2 weeks (avg)'] = avg_history

            # The module is already part of the heading.
            del row['module']

        # Replace the list of dicts with the converted table, keeping
        # only the first 50 rows of each module.
        by_module[heading] = _convert_table_rows_to_lists(
            module_rows[:50], column_order)

    subject = "Memory Increases by Route"
    _send_email(by_module, None,
                to=['*****@*****.**'],
                subject=subject)
def email_out_of_memory_errors(date):
    """Email the day's out-of-memory error counts for a datetime.date.

    Two tables are built -- OOM errors per module and OOM errors per
    (module, route) -- each saved as daily data and extended with a
    "last 2 weeks" history column.  Both tables go out in one email.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    pretty_date = _pretty_date(yyyymmdd)
    subject = 'OOM errors'

    def count_history(past_days, key):
        """Return one count_ per entry of past_days for the given key."""
        counts = []
        for past_day in past_days:
            past_row = past_day.get(key)
            if past_row:
                counts.append(past_row['count_'])
            elif past_day:
                # There is data, just none for this key: it didn't OOM.
                counts.append(0)
            else:
                # No data at all for that entry: leave a gap.
                counts.append(None)
        return counts

    # Out-of-memory errors look like:
    #   Exceeded soft private memory limit with 260.109 MB after servicing 2406 requests total #@Nolint
    # With SDK 1.9.7 the message changed to:
    #   Exceeded soft private memory limit of 512 MB with 515 MB after servicing 9964 requests total #@Nolint
    numreqs = r"REGEXP_EXTRACT(app_logs.message, r'servicing (\d+) requests')"

    # ---- Table 1: by module ----
    by_module_query = """\
SELECT COUNT(module_id) AS count_,
       module_id,
       NTH(10, QUANTILES(INTEGER(%s), 101)) as numserved_10th,
       NTH(50, QUANTILES(INTEGER(%s), 101)) as numserved_50th,
       NTH(90, QUANTILES(INTEGER(%s), 101)) as numserved_90th
FROM [logs.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
      AND module_id IS NOT NULL
GROUP BY module_id
ORDER BY count_ DESC
""" % (numreqs, numreqs, numreqs, yyyymmdd)
    module_rows = bq_util.query_bigquery(by_module_query)
    bq_util.save_daily_data(module_rows, "out_of_memory_errors_by_module",
                            yyyymmdd)
    module_history = bq_util.process_past_data(
        "out_of_memory_errors_by_module", date, 14,
        lambda past_row: past_row['module_id'])

    for row in module_rows:
        row['last 2 weeks'] = count_history(module_history,
                                            row['module_id'])

    module_columns = ['count_', 'last 2 weeks', 'module_id',
                      'numserved_10th', 'numserved_50th', 'numserved_90th']
    module_table = _convert_table_rows_to_lists(module_rows, module_columns)
    email_content = {
        'OOM errors by module for %s' % pretty_date: module_table,
    }

    # ---- Table 2: by module and route ----
    by_route_query = """\
SELECT COUNT(*) as count_,
       module_id,
       elog_url_route as url_route
FROM [logs.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
GROUP BY module_id, url_route
ORDER BY count_ DESC
""" % yyyymmdd
    route_rows = bq_util.query_bigquery(by_route_query)
    bq_util.save_daily_data(route_rows, "out_of_memory_errors_by_route",
                            yyyymmdd)
    route_history = bq_util.process_past_data(
        "out_of_memory_errors_by_route", date, 14,
        lambda past_row: (past_row['module_id'], past_row['url_route']))

    for row in route_rows:
        row['last 2 weeks'] = count_history(
            route_history, (row['module_id'], row['url_route']))

    route_columns = ['count_', 'last 2 weeks', 'module_id', 'url_route']
    route_table = _convert_table_rows_to_lists(route_rows, route_columns)
    email_content['OOM errors by route for %s' % pretty_date] = route_table

    _send_email(email_content, None,
                to=['*****@*****.**'],
                subject=subject)
def email_rpcs(date):
    """Send the daily RPCs-per-route report for a datetime.date.

    Builds one query that joins, per route, the request count with one
    summed count per RPC kind in `rpc_fields` and with the summed RPC
    cost.  The rows are saved as the day's "rpcs" data, extended with
    per-request figures and a per-request cost history, and the first 75
    entries of the converted table are emailed.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')

    # For every RPC kind we need a selected column and the sub-query that
    # provides it; build both lists side by side.
    select_columns = []
    join_clauses = []
    for rpc in rpc_fields:
        select_columns.append(
            "IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (rpc, rpc, rpc))
        join_clauses.append(
            "LEFT OUTER JOIN ( "
            "SELECT elog_url_route AS url_route, "
            " SUM(elog_stats_rpc_ops.value) as rpc_%s "
            "FROM [logs.requestlogs_%s] "
            "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
            "GROUP BY url_route) AS t%s "
            "ON t1.url_route = t%s.url_route"
            % (rpc, yyyymmdd, rpc, rpc, rpc))

    # The cost column comes from its own sub-query, joined last.
    select_columns.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    join_clauses.append(
        "LEFT OUTER JOIN ( "
        "SELECT elog_url_route AS url_route, "
        " SUM(elog_stats_rpc_ops.value) AS rpc_cost "
        "FROM [logs.requestlogs_%s] "
        "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
        "GROUP BY url_route) AS tcost "
        "ON t1.url_route = tcost.url_route" % yyyymmdd)

    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
SELECT elog_url_route AS url_route, COUNT(*) AS url_requests
FROM [logs.requestlogs_%s]
GROUP BY url_route) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(select_columns), yyyymmdd, '\n'.join(join_clauses))

    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "rpcs", date, 14, lambda past_row: past_row['url_route'])

    micropennies = 'μ¢'
    cost_per_req_col = micropennies + '/req'
    history_col = 'last 2 weeks (%s/req)' % micropennies

    # Add the per-request figures to each row in place.  The `* 1.0`
    # forces float division.
    for row in data:
        requests = row['requests']
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / requests
        row[cost_per_req_col] = row['rpc_cost'] * 1.0 / requests
        row['$'] = row['rpc_cost'] * 1.0e-8

        cost_history = []
        for past_day in historical_data:
            past_row = past_day.get(row['url_route'])
            # Historical rows without an rpc_cost entry yield a gap.
            if past_row and 'rpc_cost' in past_row:
                cost_history.append(
                    past_row['rpc_cost'] * 1.0 / past_row['requests'])
            else:
                cost_history.append(None)
        row[history_col] = cost_history

        # The raw cost is not one of the output columns.
        del row['rpc_cost']

    # Fixed column order for the dict -> list conversion.
    column_order = ['url_route', 'requests', '$', cost_per_req_col,
                    history_col]
    column_order += ['rpc_%s' % f for f in rpc_fields]
    column_order += ['%s/req' % f for f in rpc_fields]
    table = _convert_table_rows_to_lists(data, column_order)

    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    subject = 'RPC calls by route'

    # Only the top of the table is sent, not every route.
    _send_email({heading: table[:75]}, None,
                to=['*****@*****.**'],
                subject=subject)
def email_rpcs(date, dry_run=False):
    """Send the daily RPCs-per-route report for a datetime.date.

    The report joins, per route, the request count with one summed count
    per RPC kind in `rpc_fields` and with the summed RPC cost.  The rows
    are saved as the day's "rpcs" data and extended with per-request
    figures and a per-request cost history.  The first 75 entries of the
    converted table are emailed and the first 20 are sent to stackdriver.

    A second, more urgent email is sent when any entry's '$' column
    exceeds 750, since that points at a bug that is costing us money.

    Arguments:
        date: datetime.date object naming the day to report on.
        dry_run: forwarded as-is to _send_email and
            _send_table_to_stackdriver.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')

    # For every RPC kind we need a selected column and the sub-query that
    # provides it; build both lists side by side.
    select_columns = []
    join_clauses = []
    for rpc in rpc_fields:
        select_columns.append(
            "IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (rpc, rpc, rpc))
        join_clauses.append(
            "LEFT OUTER JOIN ( "
            "SELECT elog_url_route AS url_route, "
            " SUM(elog_stats_rpc_ops.value) as rpc_%s "
            "FROM [logs.requestlogs_%s] "
            "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
            "GROUP BY url_route) AS t%s "
            "ON t1.url_route = t%s.url_route"
            % (rpc, yyyymmdd, rpc, rpc, rpc))

    # The cost column comes from its own sub-query, joined last.
    select_columns.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    join_clauses.append(
        "LEFT OUTER JOIN ( "
        "SELECT elog_url_route AS url_route, "
        " SUM(elog_stats_rpc_ops.value) AS rpc_cost "
        "FROM [logs.requestlogs_%s] "
        "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
        "GROUP BY url_route) AS tcost "
        "ON t1.url_route = tcost.url_route" % yyyymmdd)

    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
SELECT elog_url_route AS url_route, COUNT(*) AS url_requests
FROM [logs.requestlogs_%s]
GROUP BY url_route) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(select_columns), yyyymmdd, '\n'.join(join_clauses))

    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "rpcs", date, 14, lambda past_row: past_row['url_route'])

    micropennies = 'μ¢'
    cost_per_req_col = micropennies + '/req'
    history_col = 'last 2 weeks (%s/req)' % micropennies

    # Add the per-request figures to each row in place.  The `* 1.0`
    # forces float division.
    for row in data:
        requests = row['requests']
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / requests
        row[cost_per_req_col] = row['rpc_cost'] * 1.0 / requests
        row['$'] = row['rpc_cost'] * 1.0e-8

        cost_history = []
        for past_day in historical_data:
            past_row = past_day.get(row['url_route'])
            # Historical rows without an rpc_cost entry yield a gap.
            if past_row and 'rpc_cost' in past_row:
                cost_history.append(
                    past_row['rpc_cost'] * 1.0 / past_row['requests'])
            else:
                cost_history.append(None)
        row[history_col] = cost_history

        # The raw cost is not one of the output columns.
        del row['rpc_cost']

    # Fixed column order for the dict -> list conversion.  Note that '$'
    # sits at index 2; the cost check at the bottom relies on that.
    column_order = ['url_route', 'requests', '$', cost_per_req_col,
                    history_col]
    column_order += ['rpc_%s' % f for f in rpc_fields]
    column_order += ['%s/req' % f for f in rpc_fields]
    data = _convert_table_rows_to_lists(data, column_order)

    pretty_date = _pretty_date(yyyymmdd)
    heading = 'RPC calls by route for %s' % pretty_date
    subject = 'RPC calls by route'
    top_routes = {heading: data[:75]}

    # Only the top of the table is sent, not every route.
    _send_email(top_routes, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)

    # An even shorter prefix of the table is forwarded to stackdriver.
    _send_table_to_stackdriver(
        data[:20],
        'webapp.routes.rpc_cost.week_over_week',
        'url_route',
        metric_label_col='url_route',
        data_col=history_col,
        dry_run=dry_run)

    # As of 1 Feb 2016, the most expensive RPC route is about $300 a
    # day.  More than $750 a day and we should be very suspicious.
    # TODO(csilvers): do this check more frequently.
    # TODO(csilvers): send to slack and/or 911 as well as emailing
    # NOTE(review): this compares index 2 of *every* entry of the
    # converted table -- confirm _convert_table_rows_to_lists does not
    # prepend a header row, or the comparison would hit the string '$'.
    too_expensive = any(entry[2] > 750 for entry in data)
    if too_expensive:
        warning_subject = ('WARNING: some very expensive RPC calls on %s!'
                           % pretty_date)
        _send_email(top_routes, None,
                    to=['*****@*****.**'],
                    subject=warning_subject,
                    dry_run=dry_run)
def email_rrs_stats(date, dry_run=False):
    """Emails stats about rrs requests that are too slow or have errors.

    rrs == React Render Server.  Requests that take longer than one second
    are timed out and thus simply add a second to the response, rather than
    speeding it up.

    Latency and error data are taken from bq_util.get_daily_data when
    available; otherwise they are queried from BigQuery and saved.  If the
    latency data has to be queried and the day's log table does not exist,
    a message is printed and nothing is emailed.

    One email per initiative (as grouped by _by_initiative) is sent,
    followed by one email containing all the data.

    Arguments:
        date: datetime.date object naming the day to report on.
        dry_run: forwarded as-is to every _send_email call.
    """
    yyyymmdd = date.strftime("%Y%m%d")

    # The regex backslashes are doubled because these are non-raw Python
    # literals; the text sent to BigQuery still reads \? and \. .
    latency_q = """
SELECT REPLACE(REGEXP_EXTRACT(
    httpRequest.requestUrl, r'/render\\?path=\\.(.*)'), '%2F', '/') AS url,
    AVG(FLOAT(jsonPayload.latencyseconds)) AS average_latency,
    COUNT(1) AS count,
    SUM(CASE WHEN FLOAT(jsonPayload.latencyseconds) >= 1.0
        THEN 1 ELSE 0 END) AS timeouts,
    SUM(CASE WHEN FLOAT(jsonPayload.latencyseconds) >= 1.0
        THEN 100 ELSE 0 END) / count(1) as timeout_percent
FROM [khan-academy:react_render_logs.appengine_googleapis_com_nginx_request_{}]
GROUP BY url
ORDER BY timeout_percent DESC
""".format(yyyymmdd)

    error_q = """
SELECT REPLACE(REGEXP_EXTRACT(
    httpRequest.requestUrl, r'/render\\?path=\\.(.*)'), '%2F', '/') AS url,
    SUM(httpRequest.status == 500) AS error_count,
    (SUM(httpRequest.status == 500) / COUNT(1)) * 100 AS error_percent
FROM [khan-academy:react_render_logs.appengine_googleapis_com_nginx_request_{}]
GROUP BY url
ORDER BY error_percent DESC
""".format(yyyymmdd)

    latency_data = bq_util.get_daily_data('rrs_latency', yyyymmdd)
    if latency_data is None:
        # Nothing saved for this day yet, so we have to query -- but only
        # if the log table is actually there.
        table_name = (
            "khan-academy:react_render_logs" +
            ".appengine_googleapis_com_nginx_request_{}").format(yyyymmdd)
        table_exists = bq_util.does_table_exist(table_name)
        if not table_exists:
            # Single-argument parenthesized form prints the same text
            # whether `print` is a statement or a function.
            print("The RRS logs were not generated. No email will be sent.")
            print("Returning...")
            return
        latency_data = bq_util.query_bigquery(latency_q)
        bq_util.save_daily_data(latency_data, 'rrs_latency', yyyymmdd)

    error_data = bq_util.get_daily_data('rrs_errors', yyyymmdd)
    if error_data is None:
        error_data = bq_util.query_bigquery(error_q)
        bq_util.save_daily_data(error_data, 'rrs_errors', yyyymmdd)

    subject = 'React render server errors and timeouts - '
    error_heading = 'React component errors'
    latency_heading = 'React component render latency (> 1 sec = timeout)'

    error_order = ('url', 'error_count', 'error_percent')
    latency_order = ('url', 'average_latency', 'count', 'timeouts',
                     'timeout_percent')

    # Put data in tables by initiative: initiative_id -> {heading: table}
    tables = collections.defaultdict(dict)
    for initiative_id, initiative_data in _by_initiative(error_data,
                                                         key='url',
                                                         by_package=True):
        tables[initiative_id][error_heading] = _convert_table_rows_to_lists(
            initiative_data, error_order)
    for initiative_id, initiative_data in _by_initiative(latency_data,
                                                         key='url',
                                                         by_package=True):
        tables[initiative_id][latency_heading] = _convert_table_rows_to_lists(
            initiative_data, latency_order)

    # Send email to initiatives
    for initiative_id, initiative_tables in tables.items():
        _send_email(initiative_tables, graph=None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # Send all data to infra
    tables = {}
    tables[error_heading] = _convert_table_rows_to_lists(
        error_data, error_order)
    tables[latency_heading] = _convert_table_rows_to_lists(
        latency_data, latency_order)
    _send_email(tables, graph=None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)
def email_out_of_memory_errors(date, dry_run=False):
    """Email the day's out-of-memory error counts for a datetime.date.

    Two tables are built -- OOM errors per module and OOM errors per
    (module, route) -- each saved as daily data and extended with a
    "last 2 weeks" history column.  Both tables are emailed together to
    the address returned by initiatives.email('infrastructure'); the
    by-route rows are then split with _by_initiative and each group is
    emailed separately.  All emails share a subject prefix.

    Arguments:
        date: datetime.date object naming the day to report on.
        dry_run: forwarded as-is to every _send_email call.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    pretty_date = _pretty_date(yyyymmdd)
    subject = 'OOM errors - '

    def count_history(past_days, key):
        """Return one count_ per entry of past_days for the given key."""
        counts = []
        for past_day in past_days:
            past_row = past_day.get(key)
            if past_row:
                counts.append(past_row['count_'])
            elif past_day:
                # There is data, just none for this key: it didn't OOM.
                counts.append(0)
            else:
                # No data at all for that entry: leave a gap.
                counts.append(None)
        return counts

    # Out-of-memory errors for python look like:
    #   Exceeded soft memory limit of 2048 MB with 2078 MB after servicing 1497 requests total. Consider setting a larger instance class in app.yaml. #@Nolint
    # Out-of-memory errors for kotlin look like:
    #   java.lang.OutOfMemoryError: <reason>
    # (where <reason> is some text that may take on a number of values
    # depending on whether the problem is lack of heap space, the garbage
    # collector taking too long to stay ahead of garbage accumulation,
    # etc.)
    # Note that older messages (before the gVisor sandboxes) started with
    # "Exceeded soft private memory limit" instead.
    numreqs = r"REGEXP_EXTRACT(app_logs.message, r'servicing (\d+) requests')"

    # ---- Table 1: by module ----
    by_module_query = """\
SELECT COUNT(1) AS count_,
       IFNULL(module_id, 'default') AS module_id,
       NTH(10, QUANTILES(INTEGER(%s), 101)) as numserved_10th,
       NTH(50, QUANTILES(INTEGER(%s), 101)) as numserved_50th,
       NTH(90, QUANTILES(INTEGER(%s), 101)) as numserved_90th
FROM [logs.requestlogs_%s]
WHERE (app_logs.message CONTAINS 'Exceeded soft memory limit'
       OR app_logs.message CONTAINS 'OutOfMemoryError')
      AND LEFT(version_id, 3) != 'znd' # ignore znds
GROUP BY module_id
ORDER BY count_ DESC
""" % (numreqs, numreqs, numreqs, yyyymmdd)
    module_rows = bq_util.query_bigquery(by_module_query)
    bq_util.save_daily_data(module_rows, "out_of_memory_errors_by_module",
                            yyyymmdd)
    module_history = bq_util.process_past_data(
        "out_of_memory_errors_by_module", date, 14,
        lambda past_row: past_row['module_id'])

    for row in module_rows:
        row['last 2 weeks'] = count_history(module_history,
                                            row['module_id'])

    module_columns = [
        'count_', 'last 2 weeks', 'module_id', 'numserved_10th',
        'numserved_50th', 'numserved_90th'
    ]
    module_table = _convert_table_rows_to_lists(module_rows, module_columns)
    email_content = {
        'OOM errors by module for %s' % pretty_date: module_table,
    }

    # ---- Table 2: by module and route ----
    # The inner SELECT collapses the log to one row per request_id that
    # has at least one OOM message; the outer one counts those requests.
    by_route_query = """\
SELECT COUNT(1) AS count_,
       module_id,
       elog_url_route AS url_route
FROM (
    SELECT IFNULL(FIRST(module_id), 'default') AS module_id,
           FIRST(elog_url_route) AS elog_url_route,
           SUM(IF(
               app_logs.message CONTAINS 'Exceeded soft memory limit'
               OR app_logs.message CONTAINS 'OutOfMemoryError',
               1, 0)) AS oom_message_count
    FROM [logs.requestlogs_%s]
    WHERE LEFT(version_id, 3) != 'znd' # ignore znds
    GROUP BY request_id
    HAVING oom_message_count > 0
)
GROUP BY module_id, url_route
ORDER BY count_ DESC
""" % yyyymmdd
    route_rows = bq_util.query_bigquery(by_route_query)
    bq_util.save_daily_data(route_rows, "out_of_memory_errors_by_route",
                            yyyymmdd)
    route_history = bq_util.process_past_data(
        "out_of_memory_errors_by_route", date, 14,
        lambda past_row: (past_row['module_id'], past_row['url_route']))

    for row in route_rows:
        row['last 2 weeks'] = count_history(
            route_history, (row['module_id'], row['url_route']))

    route_columns = ['count_', 'last 2 weeks', 'module_id', 'url_route']
    route_heading = 'OOM errors by route for %s' % pretty_date
    email_content[route_heading] = _convert_table_rows_to_lists(
        route_rows, route_columns)

    # The combined email, with both tables.
    _send_email(email_content, None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports: only the by-route table is split up.
    for initiative_id, initiative_data in _by_initiative(route_rows):
        initiative_table = _convert_table_rows_to_lists(initiative_data,
                                                        route_columns)
        _send_email({route_heading: initiative_table}, None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
def email_rpcs(date, dry_run=False):
    """Email the RPCs-per-route report for the given datetime.date object.

    A second, more urgent email is sent when any single route's RPC cost
    for the day is too high, since that indicates a bug that is costing
    us money.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')

    # Each RPC type contributes one output column and one joined
    # subquery (aliased t<name>); the total cost gets its own pair
    # (aliased tcost).
    column_template = "IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s"
    join_template = (
        "LEFT OUTER JOIN ( "
        "SELECT elog_url_route AS url_route, "
        " SUM(elog_stats_rpc_ops.value) as rpc_%s "
        "FROM [logs.requestlogs_%s] "
        "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
        "GROUP BY url_route) AS t%s "
        "ON t1.url_route = t%s.url_route")
    cost_join_template = (
        "LEFT OUTER JOIN ( "
        "SELECT elog_url_route AS url_route, "
        " SUM(elog_stats_rpc_ops.value) AS rpc_cost "
        "FROM [logs.requestlogs_%s] "
        "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
        "GROUP BY url_route) AS tcost "
        "ON t1.url_route = tcost.url_route")

    columns = []
    join_clauses = []
    for rpc in rpc_fields:
        columns.append(column_template % (rpc, rpc, rpc))
        join_clauses.append(join_template % (rpc, yyyymmdd, rpc, rpc, rpc))
    columns.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    join_clauses.append(cost_join_template % yyyymmdd)

    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
    SELECT elog_url_route AS url_route, COUNT(1) AS url_requests
    FROM (
        SELECT FIRST(elog_url_route) AS elog_url_route
        FROM [logs.requestlogs_%s]
        WHERE LEFT(version_id, 3) != 'znd' # ignore znds
        GROUP BY request_id
    )
    GROUP BY url_route
) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(columns), yyyymmdd, '\n'.join(join_clauses))
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "rpcs", date, 14, lambda row: row['url_route'])

    # Add per-request figures for every RPC stat, the dollar cost, and
    # the two-week per-request cost history used for the sparkline.
    micropennies = 'μ¢'
    cost_per_req_col = micropennies + '/req'
    history_col = 'last 2 weeks (%s/req)' % micropennies
    for row in data:
        num_requests = row['requests']
        for rpc in rpc_fields:
            row['%s/req' % rpc] = row['rpc_%s' % rpc] * 1.0 / num_requests
        row[cost_per_req_col] = row['rpc_cost'] * 1.0 / num_requests
        # rpc_cost is converted to dollars with a 1e-8 factor.
        row['$'] = row['rpc_cost'] * 1.0e-8

        cost_history = []
        for past_day in historical_data:
            past_row = past_day.get(row['url_route'])
            if past_row and 'rpc_cost' in past_row:
                cost_history.append(
                    past_row['rpc_cost'] * 1.0 / past_row['requests'])
            else:
                # No usable data for this route on that day: show a gap.
                cost_history.append(None)
        row[history_col] = cost_history
        del row['rpc_cost']

    # Column order used when flattening each row dict into a list.
    _ORDER = (
        ['url_route', 'requests', '$', cost_per_req_col, history_col] +
        ['rpc_%s' % rpc for rpc in rpc_fields] +
        ['%s/req' % rpc for rpc in rpc_fields])
    all_data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'RPC calls by route - '
    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    # Only the top rows are emailed, not the full table.
    _send_email({heading: all_data[:75]}, None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports, likewise truncated to the top rows.
    for initiative_id, initiative_data in _by_initiative(data):
        initiative_table = _convert_table_rows_to_lists(
            initiative_data, _ORDER)
        _send_email({heading: initiative_table[:75]}, None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # The very top of the table also goes to stackdriver.
    _send_table_to_stackdriver(all_data[:20],
                               'webapp.routes.rpc_cost.week_over_week',
                               'url_route',
                               metric_label_col='url_route',
                               data_col=history_col,
                               dry_run=dry_run)

    # As of 1 Feb 2016, the most expensive RPC route is about $300 a
    # day.  More than $750 a day and we should be very suspicious.
    # Column 2 of each converted row is '$'; row 0 is the header line.
    # TODO(csilvers): do this check more frequently.
    # TODO(csilvers): send to slack and/or 911 as well as emailing
    if any(row[2] > 750 for row in all_data[1:]):
        warning_subject = ('WARNING: some very expensive RPC calls on %s!'
                           % _pretty_date(yyyymmdd))
        _send_email({heading: all_data[:75]}, None,
                    to=['*****@*****.**'],
                    subject=warning_subject,
                    dry_run=dry_run)
def email_instance_hours(date, dry_run=False):
    """Email instance hours report for the given datetime.date object.

    Runs a per-route BigQuery aggregation over the day's request logs,
    saves the result under "instance_hours", adds percentage /
    per-request / two-week-history columns, and then emails the top
    rows (to infrastructure and to each initiative) and forwards the
    top 20 to stackdriver.

    Arguments:
        date: the datetime.date to report on.
        dry_run: forwarded to _send_email and _send_table_to_stackdriver.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # One CASE branch per module, weighting latency by the module's
    # entry in _MODULE_CPU_COUNT.
    cost_fn = '\n'.join("WHEN module_id == '%s' THEN latency * %s" % kv
                        for kv in _MODULE_CPU_COUNT.iteritems())
    query = """\
SELECT COUNT(1) as count_,
elog_url_route as url_route,
SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM (
  -- When logs get split into multiple entries, each has latency calculated
  -- from the start of the request to the point where the log line was emitted.
  -- This means the total latency is the maximum value that appears, not the
  -- sum.
  SELECT FIRST(elog_url_route) AS elog_url_route,
         IFNULL(FIRST(module_id), 'default') AS module_id,
         MAX(latency) AS latency,
         FIRST(url_map_entry) AS url_map_entry
  FROM [logs.requestlogs_%s]
  WHERE LEFT(version_id, 3) != 'znd' # ignore znds
  GROUP BY request_id
)
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "instance_hours", date, 14, lambda row: row['url_route'])

    # Munge the table by adding a few columns.
    total_instance_hours = 0.0
    for row in data:
        total_instance_hours += row['instance_hours']

    for row in data:
        # Every route can come back with 0 instance hours (the CASE
        # falls through to ELSE 0 for modules not in _MODULE_CPU_COUNT),
        # which makes the total 0; report 0% rather than dividing by it.
        if total_instance_hours:
            row['%% of total'] = (
                row['instance_hours'] / total_instance_hours * 100)
        else:
            row['%% of total'] = 0.0
        row['per 1k requests'] = row['instance_hours'] / row['count_'] * 1000

        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row:
                sparkline_data.append(
                    old_row['instance_hours'] / old_row['count_'])
            else:
                # No data for this route on that day: show a gap.
                sparkline_data.append(None)
        row['last 2 weeks (per request)'] = sparkline_data

    _ORDER = ('%% of total', 'instance_hours', 'count_', 'per 1k requests',
              'last 2 weeks (per request)', 'url_route')

    subject = 'Instance Hours by Route - '
    heading = 'Cost-normalized instance hours by route for %s' % (
        _pretty_date(yyyymmdd))
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: all_data[:50]}, None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        _send_email({heading: table[:50]}, None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(all_data[:20],
                               'webapp.routes.instance_hours.week_over_week',
                               'url_route',
                               metric_label_col='url_route',
                               data_col='last 2 weeks (per request)',
                               dry_run=dry_run)
def email_out_of_memory_errors(date, dry_run=False):
    """Email out-of-memory error counts for the given datetime.date object.

    Builds two tables -- OOM errors grouped by module, and grouped by
    (module, route) -- each with a two-week history column, and passes
    both to a single _send_email call so the two views of the data
    share one subject.

    Arguments:
        date: the datetime.date to report on.
        dry_run: forwarded to _send_email, matching the other reports
            in this file.
    """
    def _count_history(past_days, key):
        """Return the 'count_' recorded for `key` on each past day."""
        history = []
        for past_day in past_days:
            past_row = past_day.get(key)
            if past_row:
                history.append(past_row['count_'])
            elif past_day:
                # If we have data, just not for this key, then it just
                # didn't OOM.
                history.append(0)
            else:
                # On the other hand, if we don't have data at all, we
                # should show a gap.
                history.append(None)
        return history

    yyyymmdd = date.strftime("%Y%m%d")
    subject = 'OOM errors'

    # Out-of-memory errors look like:
    #   Exceeded soft private memory limit with 260.109 MB after
    #   servicing 2406 requests total
    # With SDK 1.9.7, they changed.  Note the double-space after "after":
    #   Exceeded soft private memory limit of 512 MB with 515 MB after
    #   servicing 9964 requests total
    numreqs = r"REGEXP_EXTRACT(app_logs.message, r'servicing (\d+) requests')"

    # First table: OOM errors by module.
    query = """\
SELECT COUNT(module_id) AS count_,
       module_id,
       NTH(10, QUANTILES(INTEGER(%s), 101)) as numserved_10th,
       NTH(50, QUANTILES(INTEGER(%s), 101)) as numserved_50th,
       NTH(90, QUANTILES(INTEGER(%s), 101)) as numserved_90th
FROM [logs.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
      AND module_id IS NOT NULL
GROUP BY module_id
ORDER BY count_ DESC
""" % (numreqs, numreqs, numreqs, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_module", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_module", date, 14,
        lambda row: row['module_id'])

    for row in data:
        row['last 2 weeks'] = _count_history(historical_data,
                                             row['module_id'])

    _ORDER = ['count_', 'last 2 weeks', 'module_id',
              'numserved_10th', 'numserved_50th', 'numserved_90th']
    data = _convert_table_rows_to_lists(data, _ORDER)
    heading = 'OOM errors by module for %s' % _pretty_date(yyyymmdd)
    email_content = {heading: data}

    # Second table: OOM errors by (module, route).
    query = """\
SELECT COUNT(*) as count_,
       module_id,
       elog_url_route as url_route
FROM [logs.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
GROUP BY module_id, url_route
ORDER BY count_ DESC
""" % yyyymmdd
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_route", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_route", date, 14,
        lambda row: (row['module_id'], row['url_route']))

    for row in data:
        row['last 2 weeks'] = _count_history(
            historical_data, (row['module_id'], row['url_route']))

    _ORDER = ['count_', 'last 2 weeks', 'module_id', 'url_route']
    data = _convert_table_rows_to_lists(data, _ORDER)
    heading = 'OOM errors by route for %s' % _pretty_date(yyyymmdd)
    email_content[heading] = data

    _send_email(email_content, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)
def email_memory_increases(date, window_length=20, min_increase_in_mb=1,
                           dry_run=False):
    """Emails the increases in memory caused by particular routes.

    It attempts to compute the amount of memory ignoring memory which is
    reclaimed in the next few requests.  (The number of requests which are
    checked is specified by the window_length parameter.  Routes with a
    total increase less than the min_increase_in_mb parameter are ignored.)

    Arguments:
        date: the datetime.date to report on.
        window_length: how many following requests on the same instance
            to look at when deciding how much memory was soon reclaimed.
        min_increase_in_mb: rows whose added_total is not greater than
            this are left out of the report.
        dry_run: forwarded to _send_email, matching the other reports
            in this file.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    lead_lengths = range(1, window_length + 1)
    lead_selects = '\n'.join(
        "LEAD(total, %s) OVER (PARTITION BY instance_key ORDER BY start_time) "
        "AS lead_total_%s," % (i, i)
        for i in lead_lengths)
    fields = ['total'] + ['lead_total_%s' % i for i in lead_lengths]

    # We want to compute the minimal value of added + field - total, where
    # field is one of "total" or one of the "lead_total_i".  BigQuery
    # unfortunately doesn't give us a nice way to do this (at least that I
    # know of, without doing a CROSS JOIN of the table to itself).  One way
    # to do this would be a gigantic nested IF, but this could be
    # exponentially large, and queries have a fixed maximum size.  Instead
    # we use a CASE expression which could be O(n^2); since we don't pay for
    # BigQuery execution time, and n shouldn't be too huge, this seems like
    # a better approach.  In theory it might be better to do O(n) nested
    # queries (each of which does a single pairwise min), but this seems
    # like it could in practice be even slower, depending on the
    # implementation.
    # TODO(benkraft): If I get a useful answer to
    # http://stackoverflow.com/questions/24923101/computing-a-moving-maximum-in-bigquery
    # we should use that instead.  Or, once we have user-defined functions
    # in BigQuery, we can probably do something actually reasonable.
    #
    # Each WHEN branch fires when field1 is <= every other field, i.e.
    # when field1 is the minimum.
    case_expr = '\n'.join(
        "WHEN %s THEN added + %s - total" % (
            ' AND '.join("%s <= %s" % (field1, field2)
                         for field2 in fields if field2 != field1),
            field1)
        for field1 in fields)

    # This is a kind of large query, so here's what it's doing, from inside
    # to out:
    # First, extract the memory data from the logs.
    # Second, compute the appropriate LEAD() columns, which tell us what the
    #   total will be a few requests later on the same instance.
    # Third, compute "real_added", which is the amount we think the request
    #   actually added to the heap, not counting memory which was soon
    #   reclaimed.  This might come out negative, so
    # Fourth, make sure the memory added is at least zero, since if memory
    #   usage went down, this request probably shouldn't get the credit.
    # Fifth, group by route and do whatever aggregation we want.  Ignore
    #   the first 25 requests on each instance (the rank is partitioned by
    #   instance_key), since those are probably all loading things that
    #   have to be loaded anyway, and the request that does the loading
    #   shouldn't be blamed.
    query = """\
SELECT COUNT(*) AS count_,
elog_url_route AS url_route,
module_id AS module,
AVG(real_added) AS added_avg,
NTH(99, QUANTILES(real_added, 101)) AS added_98th,
SUM(real_added) AS added_total,
FROM (
    SELECT IF(real_added > 0, real_added, 0) AS real_added,
    elog_url_route, module_id, num,
    FROM (
        SELECT (CASE %s ELSE added END) AS real_added,
        elog_url_route, module_id, num,
        FROM (
            SELECT %s
            RANK() OVER (PARTITION BY instance_key
                         ORDER BY start_time) AS num,
            added, total, elog_url_route, module_id,
            FROM (
                SELECT FLOAT(REGEXP_EXTRACT(
                    app_logs.message,
                    "This request added (.*) MB to the heap.")) AS added,
                FLOAT(REGEXP_EXTRACT(
                    app_logs.message,
                    "Total memory now used: (.*) MB")) AS total,
                instance_key, start_time, elog_url_route, module_id,
                FROM [logs.requestlogs_%s]
                WHERE app_logs.message CONTAINS 'This request added'
            )
        )
    )
)
WHERE num > 25
GROUP BY url_route, module
ORDER BY added_total DESC
""" % (case_expr, lead_selects, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "memory_increases", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "memory_increases", date, 14,
        lambda row: (row['module'], row['url_route']))

    # Group the rows that pass the threshold into one table per module,
    # keyed by the heading that table will be emailed under.
    by_module = collections.defaultdict(list)
    for row in data:
        if row['added_total'] > min_increase_in_mb:
            heading = "Memory increases by route for %s module on %s" % (
                row['module'], _pretty_date(yyyymmdd))
            by_module[heading].append(row)

    _ORDER = ['count_', 'added_avg', 'last 2 weeks (avg)', 'added_98th',
              'added_total', 'added %%', 'url_route']
    # Iterate over a snapshot of the keys since the values get replaced.
    for heading in list(by_module):
        rows = by_module[heading]
        total = sum(row['added_total'] for row in rows)
        for row in rows:
            row['added %%'] = row['added_total'] / total * 100
            sparkline_data = []
            for old_data in historical_data:
                old_row = old_data.get((row['module'], row['url_route']))
                if old_row:
                    sparkline_data.append(old_row['added_avg'])
                else:
                    # No data for this module/route that day: show a gap.
                    sparkline_data.append(None)
            row['last 2 weeks (avg)'] = sparkline_data
            # The module is already named in the heading.
            del row['module']
        # Percentages are relative to all qualifying rows, but only the
        # first 50 rows are kept for the email.
        by_module[heading] = _convert_table_rows_to_lists(rows[:50], _ORDER)

    subject = "Memory Increases by Route"
    _send_email(by_module, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)
def email_applog_sizes(date, dry_run=False):
    """Email app-log report for the given datetime.date object.

    The report shows how much we log (via logging.info() and friends),
    bucketed by the first word of each log message -- usually, though
    not always, a good stand-in for one particular log statement in our
    app.  We pay per byte logged, and it is very easy to emit one
    message far too often by accident, so this is worth watching.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    query = """\
SELECT REGEXP_EXTRACT(app_logs.message, r'^([a-zA-Z0-9_-]*)') AS firstword,
       FIRST(app_logs.message) as sample_logline,
       SUM(LENGTH(app_logs.message)) / 1024 / 1024 AS size_mb,
       -- This cost comes from https://cloud.google.com/stackdriver/pricing_v2:
       -- "Stackdriver Logging: $0.50/GB". But it seems like app-log messages
       -- are actually encoded *twice* in the logging data, based on our
       -- best model of app-log sizes vs num-requests vs costs in the billing
       -- reports, so we assume our cost is $1/GB.
       SUM(LENGTH(app_logs.message)) / 1024 / 1024 / 1024 AS cost_usd
FROM logs.requestlogs_%s
GROUP BY firstword
ORDER BY cost_usd DESC
""" % (yyyymmdd)
    unfiltered_rows = bq_util.query_bigquery(query)
    # Drop the rows that have no usable first word.
    data = []
    for row in unfiltered_rows:
        if row['firstword'] in (None, '(None)'):
            continue
        data.append(row)
    bq_util.save_daily_data(data, "log_bytes", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "log_bytes", date, 14, lambda row: row['firstword'])

    # Add the share-of-total and two-week history columns.
    total_mb = sum((row['size_mb'] for row in data), 0.0)
    for row in data:
        row['%% of total'] = row['size_mb'] / total_mb * 100

        past_rows = [past_day.get(row['firstword'])
                     for past_day in historical_data]
        # A day with no entry for this first word shows up as a gap.
        row['last 2 weeks'] = [past_row['size_mb'] if past_row else None
                               for past_row in past_rows]

        # Sample log lines can be very long; keep only the first 80
        # characters.
        row['sample_logline'] = row['sample_logline'][:80]

    _ORDER = ('%% of total', 'size_mb', 'cost_usd', 'last 2 weeks',
              'firstword', 'sample_logline')

    subject = 'Log-bytes by first word of log-message - '
    heading = ('Cost-normalized log-bytes by firstword for %s'
               % _pretty_date(yyyymmdd))
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    # Only the most expensive first-words are emailed, not all of them.
    _send_email({heading: all_data[:50]}, None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # As of 1 Jun 2018, the most expensive firstword costs about
    # $2/day.  More than $20 a day and we should be very suspicious.
    # Column 2 of each converted row is cost_usd; row 0 is the header line.
    if any(row[2] > 20 for row in all_data[1:]):
        warning_subject = ('WARNING: some very expensive loglines on %s!'
                           % _pretty_date(yyyymmdd))
        _send_email({heading: all_data[:75]}, None,
                    to=['*****@*****.**'],
                    subject=warning_subject,
                    dry_run=dry_run)