コード例 #1
0
def email_client_api_usage(date, dry_run=False):
    """Emails a report of API usage, segmented by client and build version."""
    yyyymmdd = date.strftime("%Y%m%d")

    ios_user_agent_regex = '^Khan%20Academy\.(.*)/(.*) CFNetwork/([.0-9]*)' \
                           ' Darwin/([.0-9]*)$'

    # We group all non-ios user agents into a single bucket to keep this
    # report down to a reasonable size.
    query = """\
SELECT IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
      REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'iOS \1'),
      'Web Browsers/other') as client,
      IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
      REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'\2'),
      '') as build,
    elog_url_route as route,
    count(elog_url_route) as request_count
FROM logs.requestlogs_%(date_format)s
WHERE REGEXP_MATCH(elog_url_route, '^api.main:/api/internal')
    AND user_agent IS NOT NULL
    AND elog_url_route IS NOT NULL
    AND LEFT(version_id, 3) != 'znd' # ignore znds
GROUP BY client, build, route
ORDER BY client DESC, build DESC, request_count DESC;
""" % {
        'ios_user_agent_regex': ios_user_agent_regex,
        'date_format': yyyymmdd
    }
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "client_api_usage", yyyymmdd)

    _ORDER = ('client', 'build', 'route', 'request_count')
    all_data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'API usage by client - '
    heading = 'API usage by client for %s' % _pretty_date(yyyymmdd)
    _send_email({heading: all_data},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data, key='route'):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        _send_email({heading: table},
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
コード例 #2
0
def email_applog_sizes(date, dry_run=False):
    """Email app-log report for the given datetime.date object.

    This report says how much we are logging (via logging.info()
    and friends), grouped by the first word of the log message.
    (Which usually, but not always, is a good proxy for a single
    log-message in our app.)  Since we pay per byte logged, we
    want to make sure we're not accidentally logging a single
    log message a ton, which is really easy to do.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    query = """\
SELECT
  REGEXP_EXTRACT(app_logs.message, r'^([a-zA-Z0-9_-]*)') AS firstword,
  FIRST(app_logs.message) as sample_logline,
  SUM(LENGTH(app_logs.message)) / 1024 / 1024 AS size_mb,
  -- This cost comes from https://cloud.google.com/stackdriver/pricing_v2:
  -- "Stackdriver Logging: $0.50/GB".  But it seems like app-log messages
  -- are actually encoded *twice* in the logging data, based on our
  -- best model of app-log sizes vs num-requests vs costs in the billing
  -- reports, so we assume our cost is $1/GB.
  SUM(LENGTH(app_logs.message)) / 1024 / 1024 / 1024 AS cost_usd
FROM
  logs.requestlogs_%s
GROUP BY
  firstword
ORDER BY
  cost_usd DESC
""" % (yyyymmdd)
    data = bq_util.query_bigquery(query)
    data = [row for row in data if row['firstword'] not in (None, '(None)')]
    bq_util.save_daily_data(data, "log_bytes", yyyymmdd)
    historical_data = bq_util.process_past_data("log_bytes", date, 14,
                                                lambda row: row['firstword'])

    # Munge the table by adding a few columns.
    total_bytes = 0.0
    for row in data:
        total_bytes += row['size_mb']

    for row in data:
        row['%% of total'] = row['size_mb'] / total_bytes * 100
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['firstword'])
            if old_row:
                sparkline_data.append(old_row['size_mb'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data
        # While we're here, truncate the sample-logline, since it can get
        # really long.
        row['sample_logline'] = row['sample_logline'][:80]

    _ORDER = ('%% of total', 'size_mb', 'cost_usd', 'last 2 weeks',
              'firstword', 'sample_logline')

    subject = 'Log-bytes by first word of log-message - '
    heading = 'Cost-normalized log-bytes by firstword for %s' % (
        _pretty_date(yyyymmdd))
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: all_data[:50]},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # As of 1 Jun 2018, the most expensive firstword costs about
    # $2/day.  More than $20 a day and we should be very suspicious.
    if any(row[2] > 20 for row in all_data[1:]):  # ignore the header line
        _send_email({heading: all_data[:75]},
                    None,
                    to=['*****@*****.**'],
                    subject=('WARNING: some very expensive loglines on %s!' %
                             _pretty_date(yyyymmdd)),
                    dry_run=dry_run)
コード例 #3
0
def email_rrs_stats(date, dry_run=False):
    """Emails stats about rrs requests that are too slow or have errors.

    rrs == React Render Server.

    Requests that take longer than one second are timed out and thus simply add
    a second to the reponse, rather than speeding it up.
    """
    yyyymmdd = date.strftime("%Y%m%d")

    latency_q = """
SELECT
  REPLACE(REGEXP_EXTRACT(
    httpRequest.requestUrl, r'/render\?path=\.(.*)'), '%2F', '/') AS url,
  AVG(FLOAT(jsonPayload.latencyseconds)) AS average_latency,
  COUNT(1) AS count,
  SUM(CASE
      WHEN FLOAT(jsonPayload.latencyseconds) >= 1.0 THEN 1
      ELSE 0 END) AS timeouts,
  SUM(CASE
      WHEN FLOAT(jsonPayload.latencyseconds) >= 1.0 THEN 100
      ELSE 0 END) / count(1) as timeout_percent
FROM
  [khan-academy:react_render_logs.appengine_googleapis_com_nginx_request_{}]
GROUP BY
  url
ORDER BY
  timeout_percent DESC
""".format(yyyymmdd)

    error_q = """
SELECT
  REPLACE(REGEXP_EXTRACT(
    httpRequest.requestUrl, r'/render\?path=\.(.*)'), '%2F', '/') AS url,
  SUM(httpRequest.status == 500) AS error_count,
  (SUM(httpRequest.status == 500) / COUNT(1)) * 100 AS error_percent
FROM
  [khan-academy:react_render_logs.appengine_googleapis_com_nginx_request_{}]
GROUP BY
  url
ORDER BY
  error_percent DESC
""".format(yyyymmdd)

    latency_data = bq_util.get_daily_data('rrs_latency', yyyymmdd)
    if latency_data is None:
        table_name = (
            "khan-academy:react_render_logs" +
            ".appengine_googleapis_com_nginx_request_{}").format(yyyymmdd)
        table_exists = bq_util.does_table_exist(table_name)
        if not table_exists:
            print "The RRS logs were not generated. No email will be sent."
            print "Returning..."
            return
        latency_data = bq_util.query_bigquery(latency_q)
        bq_util.save_daily_data(latency_data, 'rrs_latency', yyyymmdd)

    error_data = bq_util.get_daily_data('rrs_errors', yyyymmdd)
    if error_data is None:
        error_data = bq_util.query_bigquery(error_q)
        bq_util.save_daily_data(error_data, 'rrs_errors', yyyymmdd)

    subject = 'React render server errors and timeouts - '
    error_heading = 'React component errors'
    latency_heading = 'React compontent render latency (> 1 sec = timeout)'
    error_order = ('url', 'error_count', 'error_percent')
    latency_order = ('url', 'average_latency', 'count', 'timeouts',
                     'timeout_percent')
    tables = collections.defaultdict(dict)

    # Put data in tables by initiative
    for initiative_id, initiative_data in _by_initiative(error_data,
                                                         key='url',
                                                         by_package=True):
        tables[initiative_id][error_heading] = _convert_table_rows_to_lists(
            initiative_data, error_order)
    for initiative_id, initiative_data in _by_initiative(latency_data,
                                                         key='url',
                                                         by_package=True):
        tables[initiative_id][latency_heading] = _convert_table_rows_to_lists(
            initiative_data, latency_order)

    # Send email to initiatives
    for initiative_id, initiative_tables in tables.items():
        _send_email(initiative_tables,
                    graph=None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # Send all data to infra
    tables = {}
    tables[error_heading] = _convert_table_rows_to_lists(
        error_data, error_order)
    tables[latency_heading] = _convert_table_rows_to_lists(
        latency_data, latency_order)
    _send_email(tables,
                graph=None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)
コード例 #4
0
def email_out_of_memory_errors(date, dry_run=False):
    # This sends two emails, for two different ways of seeing the data.
    # But we'll have them share the same subject so they thread together.
    yyyymmdd = date.strftime("%Y%m%d")
    subject = 'OOM errors - '

    # Out-of-memory errors for python look like:
    #   Exceeded soft memory limit of 2048 MB with 2078 MB after servicing 1497 requests total. Consider setting a larger instance class in app.yaml.  #@Nolint
    # Out-of-memory errors for kotlin look like:
    #   java.lang.OutOfMemoryError: <reason>
    #   (where <reason> is some text that may take on a number of values
    #   depending on whether the problem is lack of heap space, the garbage
    #   collector taking too long to stay ahead of garbage accumulation, etc.)
    # Note that older messages (before the gVisor sandboxs) started with
    # "Exceeded soft private memory limit" instead.
    numreqs = r"REGEXP_EXTRACT(app_logs.message, r'servicing (\d+) requests')"
    query = """\
SELECT COUNT(1) AS count_,
       IFNULL(module_id, 'default') AS module_id,
       NTH(10, QUANTILES(INTEGER(%s), 101)) as numserved_10th,
       NTH(50, QUANTILES(INTEGER(%s), 101)) as numserved_50th,
       NTH(90, QUANTILES(INTEGER(%s), 101)) as numserved_90th
FROM [logs.requestlogs_%s]
WHERE (app_logs.message CONTAINS 'Exceeded soft memory limit'
       OR app_logs.message CONTAINS 'OutOfMemoryError')
  AND LEFT(version_id, 3) != 'znd' # ignore znds
GROUP BY module_id
ORDER BY count_ DESC
""" % (numreqs, numreqs, numreqs, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_module", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_module", date, 14,
        lambda row: row['module_id'])

    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['module_id'])
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this module, then it just didn't
                # OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = [
        'count_', 'last 2 weeks', 'module_id', 'numserved_10th',
        'numserved_50th', 'numserved_90th'
    ]
    data = _convert_table_rows_to_lists(data, _ORDER)

    heading = 'OOM errors by module for %s' % _pretty_date(yyyymmdd)
    email_content = {heading: data}

    query = """\
SELECT COUNT(1) AS count_,
       module_id,
       elog_url_route AS url_route
FROM (
    SELECT IFNULL(FIRST(module_id), 'default') AS module_id,
           FIRST(elog_url_route) AS elog_url_route,
           SUM(IF(
               app_logs.message CONTAINS 'Exceeded soft memory limit'
               OR app_logs.message CONTAINS 'OutOfMemoryError',
               1, 0)) AS oom_message_count
    FROM [logs.requestlogs_%s]
    WHERE LEFT(version_id, 3) != 'znd' # ignore znds
    GROUP BY request_id
    HAVING oom_message_count > 0
)
GROUP BY module_id, url_route
ORDER BY count_ DESC
""" % yyyymmdd
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_route", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_route", date, 14, lambda row:
        (row['module_id'], row['url_route']))

    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get((row['module_id'], row['url_route']))
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this route/module, then it just
                # didn't OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = ['count_', 'last 2 weeks', 'module_id', 'url_route']
    heading = 'OOM errors by route for %s' % _pretty_date(yyyymmdd)

    email_content[heading] = _convert_table_rows_to_lists(data, _ORDER)
    _send_email(email_content,
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        email_content = {heading: table}
        _send_email(email_content,
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
コード例 #5
0
def email_rpcs(date, dry_run=False):
    """Email RPCs-per-route report for the given datetime.date object.

    Also email a more urgent message if one of the RPCs is too expensive.
    This indicates a bug that is costing us money.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')

    inits = [
        "IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (name, name, name)
        for name in rpc_fields
    ]
    inits.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    joins = [
        "LEFT OUTER JOIN ( "
        "SELECT elog_url_route AS url_route, "
        "       SUM(elog_stats_rpc_ops.value) as rpc_%s "
        "FROM [logs.requestlogs_%s] "
        "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
        "GROUP BY url_route) AS t%s "
        "ON t1.url_route = t%s.url_route" % (name, yyyymmdd, name, name, name)
        for name in rpc_fields
    ]
    joins.append("LEFT OUTER JOIN ( "
                 "SELECT elog_url_route AS url_route, "
                 "       SUM(elog_stats_rpc_ops.value) AS rpc_cost "
                 "FROM [logs.requestlogs_%s] "
                 "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
                 "GROUP BY url_route) AS tcost "
                 "ON t1.url_route = tcost.url_route" % yyyymmdd)
    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
    SELECT elog_url_route AS url_route, COUNT(1) AS url_requests
    FROM (
        SELECT FIRST(elog_url_route) AS elog_url_route
        FROM [logs.requestlogs_%s]
        WHERE LEFT(version_id, 3) != 'znd' # ignore znds
        GROUP BY request_id
    )
    GROUP BY url_route
) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(inits), yyyymmdd, '\n'.join(joins))
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data("rpcs", date, 14,
                                                lambda row: row['url_route'])

    # Munge the table by getting per-request counts for every RPC stat.
    micropennies = '&mu;&cent;'
    for row in data:
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / row['requests']
        row[micropennies + '/req'] = row['rpc_cost'] * 1.0 / row['requests']
        row['$'] = row['rpc_cost'] * 1.0e-8
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row and 'rpc_cost' in old_row:
                sparkline_data.append(old_row['rpc_cost'] * 1.0 /
                                      old_row['requests'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks (%s/req)' % micropennies] = sparkline_data

        del row['rpc_cost']

    # Convert each row from a dict to a list, in a specific order.
    _ORDER = ([
        'url_route', 'requests', '$', micropennies + '/req',
        'last 2 weeks (%s/req)' % micropennies
    ] + ['rpc_%s' % f
         for f in rpc_fields] + ['%s/req' % f for f in rpc_fields])
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    subject = 'RPC calls by route - '
    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    _send_email({heading: all_data[:75]},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        # Let's just send the top most expensive routes, not all of them.
        _send_email({heading: table[:75]},
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(all_data[:20],
                               'webapp.routes.rpc_cost.week_over_week',
                               'url_route',
                               metric_label_col='url_route',
                               data_col='last 2 weeks (%s/req)' % micropennies,
                               dry_run=dry_run)

    # As of 1 Feb 2016, the most expensive RPC route is about $300 a
    # day.  More than $750 a day and we should be very suspcious.
    # TODO(csilvers): do this check more frequently.
    # TODO(csilvers): send to slack and/or 911 as well as emailing
    if any(row[2] > 750 for row in all_data[1:]):  # ignore the header line
        _send_email({heading: all_data[:75]},
                    None,
                    to=['*****@*****.**'],
                    subject=('WARNING: some very expensive RPC calls on %s!' %
                             _pretty_date(yyyymmdd)),
                    dry_run=dry_run)
コード例 #6
0
def email_instance_hours(date, dry_run=False):
    """Email instance hours report for the given datetime.date object."""
    yyyymmdd = date.strftime("%Y%m%d")
    cost_fn = '\n'.join("WHEN module_id == '%s' THEN latency * %s" % kv
                        for kv in _MODULE_CPU_COUNT.iteritems())
    query = """\
SELECT COUNT(1) as count_,
elog_url_route as url_route,
SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM (
  -- When logs get split into multiple entries, each has latency calculated
  -- from the start of the request to the point where the log line was emitted.
  -- This means the total latency is the maximum value that appears, not the
  -- sum.
  SELECT FIRST(elog_url_route) AS elog_url_route,
         IFNULL(FIRST(module_id), 'default') AS module_id,
         MAX(latency) AS latency,
         FIRST(url_map_entry) AS url_map_entry
  FROM [logs.requestlogs_%s]
  WHERE LEFT(version_id, 3) != 'znd' # ignore znds
  GROUP BY request_id
)
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data("instance_hours", date, 14,
                                                lambda row: row['url_route'])

    # Munge the table by adding a few columns.
    total_instance_hours = 0.0
    for row in data:
        total_instance_hours += row['instance_hours']

    for row in data:
        row['%% of total'] = row['instance_hours'] / total_instance_hours * 100
        row['per 1k requests'] = row['instance_hours'] / row['count_'] * 1000
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row:
                sparkline_data.append(old_row['instance_hours'] /
                                      old_row['count_'])
            else:
                sparkline_data.append(None)
        row['last 2 weeks (per request)'] = sparkline_data

    _ORDER = ('%% of total', 'instance_hours', 'count_', 'per 1k requests',
              'last 2 weeks (per request)', 'url_route')

    subject = 'Instance Hours by Route - '
    heading = 'Cost-normalized instance hours by route for %s' % (
        _pretty_date(yyyymmdd))
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: all_data[:50]},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        _send_email({heading: table[:50]},
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(all_data[:20],
                               'webapp.routes.instance_hours.week_over_week',
                               'url_route',
                               metric_label_col='url_route',
                               data_col='last 2 weeks (per request)',
                               dry_run=dry_run)