def email_client_api_usage(date, dry_run=False):
    """Emails a report of API usage, segmented by client and build version.

    Arguments:
        date: the datetime.date to report on.
        dry_run: if True, pass dry_run through to _send_email rather
            than really sending.

    iOS clients are broken out by app flavor and build number (parsed
    out of the user-agent string); all other user-agents are lumped
    into a single 'Web Browsers/other' bucket to keep this report down
    to a reasonable size.  One report goes to infrastructure, plus one
    route-filtered report per initiative.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # Raw strings, so '\.' reaches BigQuery as a literal backslash-dot
    # instead of depending on Python passing an unrecognized escape
    # through unchanged (deprecated behavior).
    ios_user_agent_regex = (r'^Khan%20Academy\.(.*)/(.*) CFNetwork/([.0-9]*)'
                            r' Darwin/([.0-9]*)$')
    # We group all non-ios user agents into a single bucket to keep this
    # report down to a reasonable size.
    # NOTE: the replacement backreferences below are written '\\1'/'\\2'
    # so BigQuery receives a literal backslash-digit.  A bare '\1' in a
    # non-raw python string is an octal escape for chr(1), which would
    # send a control character instead of a backreference.
    query = """\
SELECT IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
          REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s',
                         r'iOS \\1'),
          'Web Browsers/other') as client,
       IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
          REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'\\2'),
          '') as build,
       elog_url_route as route,
       count(elog_url_route) as request_count
FROM logs.requestlogs_%(date_format)s
WHERE REGEXP_MATCH(elog_url_route, '^api.main:/api/internal')
      AND user_agent IS NOT NULL
      AND elog_url_route IS NOT NULL
      AND LEFT(version_id, 3) != 'znd'  # ignore znds
GROUP BY client, build, route
ORDER BY client DESC, build DESC, request_count DESC;
""" % {
        'ios_user_agent_regex': ios_user_agent_regex,
        'date_format': yyyymmdd,
    }
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "client_api_usage", yyyymmdd)

    _ORDER = ('client', 'build', 'route', 'request_count')
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    subject = 'API usage by client - '
    heading = 'API usage by client for %s' % _pretty_date(yyyymmdd)
    # Full report to infrastructure.
    _send_email({heading: all_data}, None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)
    # Per-initiative reports, filtered by route ownership.
    for initiative_id, initiative_data in _by_initiative(data, key='route'):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        _send_email({heading: table}, None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
def email_applog_sizes(date, dry_run=False):
    """Email app-log report for the given datetime.date object.

    This report says how much we are logging (via logging.info() and
    friends), grouped by the first word of the log message.  (Which
    usually, but not always, is a good proxy for a single log-message
    in our app.)  Since we pay per byte logged, we want to make sure
    we're not accidentally logging a single log message a ton, which
    is really easy to do.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    query = """\
SELECT REGEXP_EXTRACT(app_logs.message, r'^([a-zA-Z0-9_-]*)') AS firstword,
       FIRST(app_logs.message) as sample_logline,
       SUM(LENGTH(app_logs.message)) / 1024 / 1024 AS size_mb,
       -- This cost comes from https://cloud.google.com/stackdriver/pricing_v2:
       -- "Stackdriver Logging: $0.50/GB". But it seems like app-log messages
       -- are actually encoded *twice* in the logging data, based on our
       -- best model of app-log sizes vs num-requests vs costs in the billing
       -- reports, so we assume our cost is $1/GB.
       SUM(LENGTH(app_logs.message)) / 1024 / 1024 / 1024 AS cost_usd
FROM logs.requestlogs_%s
GROUP BY firstword
ORDER BY cost_usd DESC
""" % (yyyymmdd)
    data = bq_util.query_bigquery(query)
    # Drop rows where no first word could be extracted.
    data = [row for row in data if row['firstword'] not in (None, '(None)')]
    bq_util.save_daily_data(data, "log_bytes", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "log_bytes", date, 14, lambda row: row['firstword'])

    # Munge the table by adding a few columns: percent-of-total and a
    # two-week history sparkline per firstword.
    total_mb = sum(row['size_mb'] for row in data)
    for row in data:
        row['%% of total'] = row['size_mb'] / total_mb * 100
        past_rows = [old.get(row['firstword']) for old in historical_data]
        row['last 2 weeks'] = [old['size_mb'] if old else None
                               for old in past_rows]
        # While we're here, truncate the sample-logline, since it can get
        # really long.
        row['sample_logline'] = row['sample_logline'][:80]

    _ORDER = ('%% of total', 'size_mb', 'cost_usd', 'last 2 weeks',
              'firstword', 'sample_logline')
    subject = 'Log-bytes by first word of log-message - '
    heading = ('Cost-normalized log-bytes by firstword for %s'
               % _pretty_date(yyyymmdd))
    all_data = _convert_table_rows_to_lists(data, _ORDER)

    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: all_data[:50]}, None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # As of 1 Jun 2018, the most expensive firstword costs about
    # $2/day.  More than $20 a day and we should be very suspicious.
    # (Column 2 is cost_usd; row 0 is the header, so skip it.)
    is_suspicious = any(row[2] > 20 for row in all_data[1:])
    if is_suspicious:
        _send_email({heading: all_data[:75]}, None,
                    to=['*****@*****.**'],
                    subject=('WARNING: some very expensive loglines on %s!'
                             % _pretty_date(yyyymmdd)),
                    dry_run=dry_run)
def email_rrs_stats(date, dry_run=False):
    """Emails stats about rrs requests that are too slow or have errors.

    rrs == React Render Server.  Requests that take longer than one second
    are timed out and thus simply add a second to the reponse, rather than
    speeding it up.

    Arguments:
        date: the datetime.date to report on.
        dry_run: if True, pass dry_run through to _send_email rather than
            really sending.

    Sends one email per initiative (with only that initiative's urls) and
    one "All" email to infrastructure.  Returns early, with no email, if
    the RRS log table for the day does not exist.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # Per-url average latency, request count, and timeout rate.  A request
    # counts as a timeout when its latency is >= 1.0 seconds.
    latency_q = """
SELECT REPLACE(REGEXP_EXTRACT(
           httpRequest.requestUrl, r'/render\?path=\.(.*)'), '%2F', '/')
           AS url,
       AVG(FLOAT(jsonPayload.latencyseconds)) AS average_latency,
       COUNT(1) AS count,
       SUM(CASE WHEN FLOAT(jsonPayload.latencyseconds) >= 1.0
           THEN 1 ELSE 0 END) AS timeouts,
       SUM(CASE WHEN FLOAT(jsonPayload.latencyseconds) >= 1.0
           THEN 100 ELSE 0 END) / count(1) as timeout_percent
FROM [khan-academy:react_render_logs.appengine_googleapis_com_nginx_request_{}]
GROUP BY url
ORDER BY timeout_percent DESC
""".format(yyyymmdd)
    # Per-url count and percentage of HTTP-500 responses.
    error_q = """
SELECT REPLACE(REGEXP_EXTRACT(
           httpRequest.requestUrl, r'/render\?path=\.(.*)'), '%2F', '/')
           AS url,
       SUM(httpRequest.status == 500) AS error_count,
       (SUM(httpRequest.status == 500) / COUNT(1)) * 100 AS error_percent
FROM [khan-academy:react_render_logs.appengine_googleapis_com_nginx_request_{}]
GROUP BY url
ORDER BY error_percent DESC
""".format(yyyymmdd)
    # Reuse previously-saved results when we have them; otherwise query
    # BigQuery and save the results for next time.
    latency_data = bq_util.get_daily_data('rrs_latency', yyyymmdd)
    if latency_data is None:
        table_name = (
            "khan-academy:react_render_logs"
            + ".appengine_googleapis_com_nginx_request_{}").format(yyyymmdd)
        table_exists = bq_util.does_table_exist(table_name)
        if not table_exists:
            # No source table for the day: nothing to report on.
            print "The RRS logs were not generated. No email will be sent."
            print "Returning..."
            return
        latency_data = bq_util.query_bigquery(latency_q)
        bq_util.save_daily_data(latency_data, 'rrs_latency', yyyymmdd)
    error_data = bq_util.get_daily_data('rrs_errors', yyyymmdd)
    if error_data is None:
        error_data = bq_util.query_bigquery(error_q)
        bq_util.save_daily_data(error_data, 'rrs_errors', yyyymmdd)

    subject = 'React render server errors and timeouts - '
    error_heading = 'React component errors'
    latency_heading = 'React compontent render latency (> 1 sec = timeout)'
    error_order = ('url', 'error_count', 'error_percent')
    latency_order = ('url', 'average_latency', 'count', 'timeouts',
                     'timeout_percent')
    # Maps initiative_id -> {heading: table-as-lists}.
    tables = collections.defaultdict(dict)
    # Put data in tables by initiative
    for initiative_id, initiative_data in _by_initiative(
            error_data, key='url', by_package=True):
        tables[initiative_id][error_heading] = _convert_table_rows_to_lists(
            initiative_data, error_order)
    for initiative_id, initiative_data in _by_initiative(
            latency_data, key='url', by_package=True):
        tables[initiative_id][latency_heading] = _convert_table_rows_to_lists(
            initiative_data, latency_order)
    # Send email to initiatives
    for initiative_id, initiative_tables in tables.items():
        _send_email(initiative_tables, graph=None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
    # Send all data to infra
    tables = {}
    tables[error_heading] = _convert_table_rows_to_lists(
        error_data, error_order)
    tables[latency_heading] = _convert_table_rows_to_lists(
        latency_data, latency_order)
    _send_email(tables, graph=None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)
def email_out_of_memory_errors(date, dry_run=False):
    """Email out-of-memory (OOM) error reports for the given date.

    Arguments:
        date: the datetime.date to report on.
        dry_run: if True, pass dry_run through to _send_email rather
            than really sending.

    Builds two tables from the request logs -- OOM errors grouped by
    module, and OOM errors grouped by (module, route) -- and emails
    both to infrastructure, plus per-initiative reports of the
    by-route table.
    """
    # This sends two emails, for two different ways of seeing the data.
    # But we'll have them share the same subject so they thread together.
    yyyymmdd = date.strftime("%Y%m%d")
    subject = 'OOM errors - '

    # Out-of-memory errors for python look like:
    # Exceeded soft memory limit of 2048 MB with 2078 MB after servicing 1497 requests total. Consider setting a larger instance class in app.yaml.  #@Nolint
    # Out-of-memory errors for kotlin look like:
    # java.lang.OutOfMemoryError: <reason>
    # (where <reason> is some text that may take on a number of values
    # depending on whether the problem is lack of heap space, the garbage
    # collector taking too long to stay ahead of garbage accumulation, etc.)
    # Note that older messages (before the gVisor sandboxs) started with
    # "Exceeded soft private memory limit" instead.
    # SQL fragment extracting the number of requests served before the OOM
    # (only present in the python-style message).
    numreqs = r"REGEXP_EXTRACT(app_logs.message, r'servicing (\d+) requests')"
    # First report: OOM counts by module, with 10th/50th/90th percentiles
    # of requests-served-before-OOM.
    query = """\
SELECT COUNT(1) AS count_,
       IFNULL(module_id, 'default') AS module_id,
       NTH(10, QUANTILES(INTEGER(%s), 101)) as numserved_10th,
       NTH(50, QUANTILES(INTEGER(%s), 101)) as numserved_50th,
       NTH(90, QUANTILES(INTEGER(%s), 101)) as numserved_90th
FROM [logs.requestlogs_%s]
WHERE (app_logs.message CONTAINS 'Exceeded soft memory limit'
       OR app_logs.message CONTAINS 'OutOfMemoryError')
      AND LEFT(version_id, 3) != 'znd'  # ignore znds
GROUP BY module_id
ORDER BY count_ DESC
""" % (numreqs, numreqs, numreqs, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_module", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_module", date, 14,
        lambda row: row['module_id'])
    # Add a two-week sparkline of OOM counts for each module.
    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['module_id'])
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this module, then it just
                # didn't OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data
    _ORDER = [
        'count_', 'last 2 weeks', 'module_id', 'numserved_10th',
        'numserved_50th', 'numserved_90th'
    ]
    data = _convert_table_rows_to_lists(data, _ORDER)
    heading = 'OOM errors by module for %s' % _pretty_date(yyyymmdd)
    email_content = {heading: data}

    # Second report: OOM counts grouped by (module, route).  The inner
    # select collapses the multiple log lines of a single request into
    # one row and counts the OOM messages it contains.
    query = """\
SELECT COUNT(1) AS count_,
       module_id,
       elog_url_route AS url_route
FROM (
    SELECT IFNULL(FIRST(module_id), 'default') AS module_id,
           FIRST(elog_url_route) AS elog_url_route,
           SUM(IF(app_logs.message CONTAINS 'Exceeded soft memory limit'
                  OR app_logs.message CONTAINS 'OutOfMemoryError',
                  1, 0)) AS oom_message_count
    FROM [logs.requestlogs_%s]
    WHERE LEFT(version_id, 3) != 'znd'  # ignore znds
    GROUP BY request_id
    HAVING oom_message_count > 0
)
GROUP BY module_id, url_route
ORDER BY count_ DESC
""" % yyyymmdd
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_route", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_route", date, 14,
        lambda row: (row['module_id'], row['url_route']))
    # Same sparkline treatment, keyed by (module, route).
    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get((row['module_id'], row['url_route']))
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this route/module, then it just
                # didn't OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data
    _ORDER = ['count_', 'last 2 weeks', 'module_id', 'url_route']
    heading = 'OOM errors by route for %s' % _pretty_date(yyyymmdd)
    email_content[heading] = _convert_table_rows_to_lists(data, _ORDER)
    # Infrastructure gets both tables in one email.
    _send_email(email_content, None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)
    # Per-initiative reports (by-route table only, since `data` and
    # `heading` now refer to the by-route report).
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        email_content = {heading: table}
        _send_email(email_content, None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
def email_rpcs(date, dry_run=False):
    """Email RPCs-per-route report for the given datetime.date object.

    Also email a more urgent message if one of the RPCs is too
    expensive.  This indicates a bug that is costing us money.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # The datastore RPC types we break out into their own columns.
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')
    # SELECT-list fragments: one per-RPC count column, plus total cost.
    inits = [
        "IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (name, name, name)
        for name in rpc_fields
    ]
    inits.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    # One LEFT OUTER JOIN per RPC type, each summing that RPC's op count
    # per route; plus one more join for the summed RPC cost per route.
    joins = [
        "LEFT OUTER JOIN ( "
        "SELECT elog_url_route AS url_route, "
        " SUM(elog_stats_rpc_ops.value) as rpc_%s "
        "FROM [logs.requestlogs_%s] "
        "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
        "GROUP BY url_route) AS t%s "
        "ON t1.url_route = t%s.url_route"
        % (name, yyyymmdd, name, name, name)
        for name in rpc_fields
    ]
    joins.append("LEFT OUTER JOIN ( "
                 "SELECT elog_url_route AS url_route, "
                 " SUM(elog_stats_rpc_ops.value) AS rpc_cost "
                 "FROM [logs.requestlogs_%s] "
                 "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
                 "GROUP BY url_route) AS tcost "
                 "ON t1.url_route = tcost.url_route" % yyyymmdd)
    # t1 is the per-route request count (one row per request_id, to
    # collapse split log entries); the joins attach the RPC stats.
    query = """\
SELECT t1.url_route AS url_route,
       t1.url_requests AS requests,
       %s
FROM (
    SELECT elog_url_route AS url_route,
           COUNT(1) AS url_requests
    FROM (
        SELECT FIRST(elog_url_route) AS elog_url_route
        FROM [logs.requestlogs_%s]
        WHERE LEFT(version_id, 3) != 'znd'  # ignore znds
        GROUP BY request_id
    )
    GROUP BY url_route
) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(inits), yyyymmdd, '\n'.join(joins))
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "rpcs", date, 14, lambda row: row['url_route'])

    # Munge the table by getting per-request counts for every RPC stat.
    micropennies = 'μ¢'
    for row in data:
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / row['requests']
        row[micropennies + '/req'] = row['rpc_cost'] * 1.0 / row['requests']
        # rpc_cost appears to be in units of 1e-8 dollars (micropennies);
        # convert to whole dollars here.
        row['$'] = row['rpc_cost'] * 1.0e-8
        # Two-week sparkline of cost-per-request for this route.
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row and 'rpc_cost' in old_row:
                sparkline_data.append(
                    old_row['rpc_cost'] * 1.0 / old_row['requests'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks (%s/req)' % micropennies] = sparkline_data
        del row['rpc_cost']

    # Convert each row from a dict to a list, in a specific order.
    _ORDER = (['url_route', 'requests', '$', micropennies + '/req',
               'last 2 weeks (%s/req)' % micropennies]
              + ['rpc_%s' % f for f in rpc_fields]
              + ['%s/req' % f for f in rpc_fields])
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    subject = 'RPC calls by route - '
    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    _send_email({heading: all_data[:75]}, None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)
    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        # Let's just send the top most expensive routes, not all of them.
        _send_email({heading: table[:75]}, None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(all_data[:20],
                               'webapp.routes.rpc_cost.week_over_week',
                               'url_route', metric_label_col='url_route',
                               data_col=('last 2 weeks (%s/req)'
                                         % micropennies),
                               dry_run=dry_run)
    # As of 1 Feb 2016, the most expensive RPC route is about $300 a
    # day.  More than $750 a day and we should be very suspcious.
    # TODO(csilvers): do this check more frequently.
    # TODO(csilvers): send to slack and/or 911 as well as emailing
    # (Column 2 is the '$' column per _ORDER above.)
    if any(row[2] > 750 for row in all_data[1:]):   # ignore the header line
        _send_email({heading: all_data[:75]}, None,
                    to=['*****@*****.**'],
                    subject=('WARNING: some very expensive RPC calls on %s!'
                             % _pretty_date(yyyymmdd)),
                    dry_run=dry_run)
def email_instance_hours(date, dry_run=False):
    """Email instance hours report for the given datetime.date object.

    Computes CPU-weighted instance hours per url route from the request
    logs, then emails the top routes to infrastructure and to each
    initiative, and pushes the top 20 to stackdriver.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # One CASE arm per module, weighting latency by that module's CPU count.
    cost_fn = '\n'.join(
        "WHEN module_id == '%s' THEN latency * %s" % module_and_cpus
        for module_and_cpus in _MODULE_CPU_COUNT.iteritems())
    query = """\
SELECT COUNT(1) as count_,
       elog_url_route as url_route,
       SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM (
    -- When logs get split into multiple entries, each has latency calculated
    -- from the start of the request to the point where the log line was
    -- emitted. This means the total latency is the maximum value that
    -- appears, not the sum.
    SELECT FIRST(elog_url_route) AS elog_url_route,
           IFNULL(FIRST(module_id), 'default') AS module_id,
           MAX(latency) AS latency,
           FIRST(url_map_entry) AS url_map_entry
    FROM [logs.requestlogs_%s]
    WHERE LEFT(version_id, 3) != 'znd'  # ignore znds
    GROUP BY request_id
)
WHERE url_map_entry != ""  # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "instance_hours", date, 14, lambda row: row['url_route'])

    # Munge the table by adding a few columns: percent of total, hours
    # per 1k requests, and a two-week per-request history sparkline.
    total_instance_hours = sum(row['instance_hours'] for row in data)
    for row in data:
        row['%% of total'] = row['instance_hours'] / total_instance_hours * 100
        row['per 1k requests'] = row['instance_hours'] / row['count_'] * 1000
        past_rows = [old.get(row['url_route']) for old in historical_data]
        row['last 2 weeks (per request)'] = [
            old['instance_hours'] / old['count_'] if old else None
            for old in past_rows]

    _ORDER = ('%% of total', 'instance_hours', 'count_', 'per 1k requests',
              'last 2 weeks (per request)', 'url_route')
    subject = 'Instance Hours by Route - '
    heading = ('Cost-normalized instance hours by route for %s'
               % _pretty_date(yyyymmdd))
    all_data = _convert_table_rows_to_lists(data, _ORDER)

    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: all_data[:50]}, None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)
    # Per-initiative reports.
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        _send_email({heading: table[:50]}, None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(all_data[:20],
                               'webapp.routes.instance_hours.week_over_week',
                               'url_route', metric_label_col='url_route',
                               data_col='last 2 weeks (per request)',
                               dry_run=dry_run)