# Example 1
def _run_bigquery(config, start_time_t, time_interval_seconds,
        time_table_expiration):
    """config is as described in logs_bridge.config.json.

    Materializes the relevant loglines into a temporary table, then
    runs the per-config-entry subqueries against it and returns the
    combined bigquery results.
    """
    # Stage the loglines for [start_time_t, start_time_t +
    # time_interval_seconds) into a randomly-named temp table, so that
    # several copies of this script can run at the same time.
    tmp_table = (
        'khan-academy:logs_streaming_tmp_analysis.logs_bridge_%d_%04d'
        % (start_time_t, random.randint(0, 9999)))
    # We assume this script finishes within time_interval_seconds; if
    # it didn't, it would continually be falling behind!
    staging_query = _query_for_rows_in_time_range(
        config, start_time_t, time_interval_seconds)
    # The commandline apparently dislikes newlines, so flatten the query.
    staging_query = staging_query.replace('\n', ' ')

    logging.debug("Creating the temporary table for querying over by running "
                  + staging_query)
    bq_util.call_bq(['mk', '--expiration', str(time_table_expiration),
                     tmp_table],
                    project='khan-academy',
                    return_output=False,
                    stdout=open(os.devnull, 'w'))
    bq_util.call_bq(['query', '--destination_table', tmp_table,
                     '--allow_large_results', staging_query],
                    project='khanacademy.org:deductive-jet-827',
                    return_output=False)

    logging.info("Step 1: Created temp table with all loglines. "
            "View it on bigquery:")
    logging.info("  https://bigquery.cloud.google.com/table/%s?tab=preview"
            % (tmp_table))

    per_entry_selects = [
        _create_subquery(entry, start_time_t, time_interval_seconds,
                         tmp_table)
        for entry in config]

    # num_requests is the total number of requests in the specified
    # time period (either `now` or some other time): the per-field
    # request counts (e.g. per-browser counts) are summed, partitioned
    # by the time column `when`.
    combined_query = (
        'SELECT *, SUM(num_requests_by_field) OVER(PARTITION BY when)'
        ' as num_requests FROM %s' % ',\n'.join(per_entry_selects))
    logging.debug('BIGQUERY QUERY: %s' % combined_query)

    job = 'logs_bridge_query_%s' % random.randint(0, sys.maxint)
    results = bq_util.query_bigquery(combined_query, job_name=job)
    logging.info('Step 2: Counting # of logs that match metric:')
    logging.info('  https://bigquery.cloud.google.com/results/khanacademy.org:deductive-jet-827:%s' % job)
    logging.debug('BIGQUERY RESULTS: %s' % results)

    return results
# Example 2
def email_instance_hours(date, dry_run=False):
    """Email instance hours report for the given datetime.date object.

    Queries bigquery for per-route instance hours on `date`, saves the
    raw table for later trend reports, adds percent-of-total,
    per-1k-request, and two-week-sparkline columns, then emails the 50
    most expensive routes and pushes the top 20 to stackdriver.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # One CASE arm per module: weight latency by the module's CPU count.
    cost_fn = '\n'.join("WHEN module_id == '%s' THEN latency * %s" % kv
                        for kv in _MODULE_CPU_COUNT.iteritems())
    query = """\
SELECT COUNT(*) as count_,
elog_url_route as url_route,
SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM [logs.requestlogs_%s]
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "instance_hours", date, 14, lambda row: row['url_route'])

    # Munge the table by adding a few columns.  (sum() with a 0.0 start
    # replaces the original manual accumulation loop; same result.)
    total_instance_hours = sum((row['instance_hours'] for row in data), 0.0)

    for row in data:
        row['%% of total'] = row['instance_hours'] / total_instance_hours * 100
        row['per 1k requests'] = row['instance_hours'] / row['count_'] * 1000
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row:
                sparkline_data.append(
                    old_row['instance_hours'] / old_row['count_'])
            else:
                # No historical data for this route on that day: gap.
                sparkline_data.append(None)
        row['last 2 weeks (per request)'] = sparkline_data

    _ORDER = ('%% of total', 'instance_hours', 'count_', 'per 1k requests',
              'last 2 weeks (per request)', 'url_route')
    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'Instance Hours by Route'
    heading = 'Instance hours by route for %s' % _pretty_date(yyyymmdd)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: data[:50]}, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(data[:20],
                               'webapp.routes.instance_hours.week_over_week',
                               'url_route', metric_label_col='url_route',
                               data_col='last 2 weeks (per request)',
                               dry_run=dry_run)
# Example 3
def email_client_api_usage(date, dry_run=False):
    """Emails a report of API usage, segmented by client and build version."""
    yyyymmdd = date.strftime("%Y%m%d")

    # Raw string so the '\.' reaches bigquery as a literal backslash-dot.
    ios_user_agent_regex = r'^Khan%20Academy\.(.*)/(.*) CFNetwork/([.0-9]*)' \
                           ' Darwin/([.0-9]*)$'

    # We group all non-ios user agents into a single bucket to keep this
    # report down to a reasonable size.
    # NOTE: the \\1 and \\2 below must reach bigquery as the literal
    # backreferences \1 and \2.  Left unescaped, python treats them as
    # octal escapes (control characters \x01/\x02), which silently
    # breaks the REGEXP_REPLACE backreference substitution.
    query = """\
SELECT IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
      REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'iOS \\1'),
      'Web Browsers/other') as client,
      IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
      REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'\\2'),
      '') as build,
    elog_url_route as route,
    count(elog_url_route) as request_count
FROM logs.requestlogs_%(date_format)s
WHERE REGEXP_MATCH(elog_url_route, '^api.main:/api/internal')
    AND user_agent IS NOT NULL
    AND elog_url_route IS NOT NULL
    AND LEFT(version_id, 3) != 'znd' # ignore znds
GROUP BY client, build, route
ORDER BY client DESC, build DESC, request_count DESC;
""" % {
        'ios_user_agent_regex': ios_user_agent_regex,
        'date_format': yyyymmdd
    }
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "client_api_usage", yyyymmdd)

    _ORDER = ('client', 'build', 'route', 'request_count')
    all_data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'API usage by client - '
    heading = 'API usage by client for %s' % _pretty_date(yyyymmdd)
    # Full report goes to infrastructure...
    _send_email({heading: all_data},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # ...and each initiative gets the slice of routes it owns.
    for initiative_id, initiative_data in _by_initiative(data, key='route'):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        _send_email({heading: table},
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
# Example 4
def email_instance_hours(date):
    """Email instance hours report for the given datetime.date object."""
    day = date.strftime("%Y%m%d")
    # One CASE arm per module, weighting latency by the module's CPUs.
    case_arms = '\n'.join("WHEN module_id == '%s' THEN latency * %s" % kv
                          for kv in _MODULE_CPU_COUNT.iteritems())
    sql = """\
SELECT COUNT(*) as count_,
elog_url_route as url_route,
SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM [logs.requestlogs_%s]
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (case_arms, day)
    table = bq_util.query_bigquery(sql)
    bq_util.save_daily_data(table, "instance_hours", day)
    past = bq_util.process_past_data("instance_hours", date, 14,
                                     lambda row: row['url_route'])

    # Grand total across all routes, for the percent-of-total column.
    overall = 0.0
    for entry in table:
        overall += entry['instance_hours']

    # Annotate every route with its share, per-request cost, and a
    # two-week per-request history.
    for entry in table:
        entry['%% of total'] = entry['instance_hours'] / overall * 100
        entry['per 1k requests'] = (
            entry['instance_hours'] / entry['count_'] * 1000)
        trend = []
        for day_data in past:
            prior = day_data.get(entry['url_route'])
            trend.append(prior['instance_hours'] / prior['count_']
                         if prior else None)
        entry['last 2 weeks (per request)'] = trend

    columns = ('%% of total', 'instance_hours', 'count_', 'per 1k requests',
               'last 2 weeks (per request)', 'url_route')
    table = _convert_table_rows_to_lists(table, columns)

    heading = 'Instance hours by route for %s' % _pretty_date(day)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: table[:50]},
                None,
                to=['*****@*****.**'],
                subject='Instance Hours by Route')
# Example 5
def report_out_of_memory_errors(yyyymmdd_hh, graphite_host):
    """Send per-OOM-logline datapoints for one hour to graphite.

    yyyymmdd_hh names the hourly request-log table to scan.  If
    graphite_host is empty/None, the records are printed instead of
    sent (a dry run).
    """
    # This bigquery query is modified from email_bq_data.py
    query = """\
SELECT module_id, integer(app_logs.time) as time_t
FROM [logs_hourly.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
      AND module_id IS NOT NULL
""" % (yyyymmdd_hh)
    data = bq_util.query_bigquery(query)

    # One graphite datapoint (value 1) per OOM logline, keyed by module.
    records = [('webapp.%(module_id)s_module.stats.oom' % row,
                (row['time_t'], 1)) for row in data]

    if not graphite_host:
        # Parenthesized print: identical output in python 2, and valid
        # syntax under python 3 as well.
        print('Would send to graphite: %s' % records)
    else:
        graphite_util.send_to_graphite(graphite_host, records)
# Example 6
def report_out_of_memory_errors(yyyymmdd_hh, graphite_host):
    """Report the hour's soft-memory-limit OOMs per module to graphite."""
    # This bigquery query is modified from email_bq_data.py
    oom_query = """\
SELECT module_id, integer(app_logs.time) as time_t
FROM [logs_hourly.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
      AND module_id IS NOT NULL
""" % (yyyymmdd_hh)
    rows = bq_util.query_bigquery(oom_query)

    # Build one (metric-name, (timestamp, 1)) datapoint per OOM logline.
    datapoints = []
    for row in rows:
        metric = 'webapp.%(module_id)s_module.stats.oom' % row
        datapoints.append((metric, (row['time_t'], 1)))

    if graphite_host:
        graphite_util.send_to_graphite(graphite_host, datapoints)
    else:
        print('Would send to graphite: %s' % datapoints)
def email_instance_hours(date):
    """Email instance hours report for the given datetime.date object."""
    yyyymmdd = date.strftime("%Y%m%d")
    cost_fn = "\n".join("WHEN module_id == '%s' THEN latency * %s" % kv
                        for kv in _MODULE_CPU_COUNT.iteritems())
    query = """\
SELECT COUNT(*) as count_,
elog_url_route as url_route,
SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM [logs.requestlogs_%s]
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical = bq_util.process_past_data(
        "instance_hours", date, 14, lambda row: row["url_route"])

    # Total over every route, needed for the percent-of-total column.
    grand_total = 0.0
    for row in data:
        grand_total += row["instance_hours"]

    for row in data:
        row["%% of total"] = row["instance_hours"] / grand_total * 100
        row["per 1k requests"] = row["instance_hours"] / row["count_"] * 1000
        spark = []
        for prior_day in historical:
            prior = prior_day.get(row["url_route"])
            spark.append(prior["instance_hours"] / prior["count_"]
                         if prior else None)
        row["last 2 weeks (per request)"] = spark

    columns = ("%% of total", "instance_hours", "count_", "per 1k requests",
               "last 2 weeks (per request)", "url_route")
    data = _convert_table_rows_to_lists(data, columns)

    heading = "Instance hours by route for %s" % _pretty_date(yyyymmdd)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: data[:50]}, None,
                to=["*****@*****.**"],
                subject="Instance Hours by Route")
def check(date, dry_run=False):
    """Alert on routes that served no 2xx responses on `date`.

    Runs the module-level QUERY for the day, drops routes that are in
    ROUTES_EXPECTED_TO_FAIL, annotates each remaining row with its
    owners, then either prints the findings (dry_run) or calls
    notify() when there is anything to report.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    q = QUERY.format(yyyymmdd)
    data = bq_util.query_bigquery(q)
    # `not in` is the idiomatic form of `not row[...] in ...`.
    route_data = [row for row in data
                  if row['route'] not in ROUTES_EXPECTED_TO_FAIL]

    for row in route_data:
        route = row['route']
        row['owners'] = initiatives.route_owners(route)
    if dry_run:
        # Prints are parenthesized: identical output in python 2,
        # valid syntax under python 3 too.
        if not route_data:
            print('No routes with no 2xx requests for {}'.format(
                date.strftime('%x')))
        else:
            print('Routes with no 2xx requests for {}:\n{}'.format(
                date.strftime('%x'), '\n'.join(_errors(route_data))))
        return

    if route_data:
        notify(route_data, date)
# Example 9
def email_client_api_usage(date, dry_run=False):
    """Emails a report of API usage, segmented by client and build version."""
    yyyymmdd = date.strftime("%Y%m%d")

    # Raw string so the '\.' reaches bigquery as a literal backslash-dot.
    ios_user_agent_regex = r'^Khan%20Academy\.(.*)/(.*) CFNetwork/([.0-9]*)' \
                           ' Darwin/([.0-9]*)$'

    # We group all non-ios user agents into a single bucket to keep this
    # report down to a reasonable size.
    # NOTE: the \\1 and \\2 below must reach bigquery as the literal
    # backreferences \1 and \2.  Left unescaped, python treats them as
    # octal escapes (control characters \x01/\x02), which silently
    # breaks the REGEXP_REPLACE backreference substitution.
    query = """\
SELECT IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
      REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'iOS \\1'),
      'Web Browsers/other') as client,
      IF(REGEXP_MATCH(user_agent, r'%(ios_user_agent_regex)s'),
      REGEXP_REPLACE(user_agent, r'%(ios_user_agent_regex)s', r'\\2'),
      '') as build,
    elog_url_route as route,
    count(elog_url_route) as request_count
FROM logs.requestlogs_%(date_format)s
WHERE REGEXP_MATCH(elog_url_route, '^api.main:/api/internal')
    AND user_agent IS NOT NULL
    AND elog_url_route IS NOT NULL
GROUP BY client, build, route
ORDER BY client DESC, build DESC, request_count DESC;
""" % {'ios_user_agent_regex': ios_user_agent_regex, 'date_format': yyyymmdd}
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "client_api_usage", yyyymmdd)

    _ORDER = ('client', 'build', 'route', 'request_count')

    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'API usage by client'
    heading = 'API usage by client for %s' % _pretty_date(yyyymmdd)
    _send_email({heading: data}, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)
# Example 10
def get_google_services_costs(start_time_t, end_time_t):
    """Return a list of dicts of GCP costs from start_time_t to end_time_t.

    start_time_t and end_time_t should be unix times represented as ints.

    Each dict in the list contains a daily_cost_so_far_usd, project_name,
    product, and resource_type, e.g.:
    {u'daily_cost_so_far_usd': 7.396287000000001,
     u'product': u'Cloud Pub/Sub',
     u'project_name': u'khan-academy',
     u'resource_type': u'Message Operations'}
    """
    # Sum nonzero costs within the window, grouped by project/product/
    # resource-type, straight out of the billing export table.
    billing_query = """\
SELECT SUM(cost) AS daily_cost_so_far_usd, project.name, product, resource_type
FROM [1_gae_billing.gcp_billing_export_00A183_1C5C74_24422A]
WHERE cost > 0
AND start_time >= timestamp('%s')
AND end_time <= timestamp('%s')
GROUP BY project.name, product, resource_type
""" % (start_time_t, end_time_t)
    return bq_util.query_bigquery(billing_query)
# Example 11
def email_rpcs(date, dry_run=False):
    """Email RPCs-per-route report for the given datetime.date object.

    Also email a more urgent message if one of the RPCs is too expensive.
    This indicates a bug that is costing us money.

    The full report goes to infrastructure, per-initiative slices go to
    each initiative, and the top rows are also pushed to stackdriver
    for week-over-week monitoring.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # The datastore RPC operations that get their own count column.
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')

    # SELECT expressions: one rpc_<op> column per op.  IFNULL maps the
    # NULLs from the outer joins (routes that never issued that op) to 0.
    inits = [
        "IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (name, name, name)
        for name in rpc_fields
    ]
    inits.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    # One LEFT OUTER JOIN per op: each subselect sums that op's count
    # per route, joined back to the base per-route table t1 on url_route.
    joins = [
        "LEFT OUTER JOIN ( "
        "SELECT elog_url_route AS url_route, "
        "       SUM(elog_stats_rpc_ops.value) as rpc_%s "
        "FROM [logs.requestlogs_%s] "
        "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
        "GROUP BY url_route) AS t%s "
        "ON t1.url_route = t%s.url_route" % (name, yyyymmdd, name, name, name)
        for name in rpc_fields
    ]
    # Same join pattern once more for the aggregate rpc cost.
    joins.append("LEFT OUTER JOIN ( "
                 "SELECT elog_url_route AS url_route, "
                 "       SUM(elog_stats_rpc_ops.value) AS rpc_cost "
                 "FROM [logs.requestlogs_%s] "
                 "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
                 "GROUP BY url_route) AS tcost "
                 "ON t1.url_route = tcost.url_route" % yyyymmdd)
    # Base table t1 counts requests per route (deduplicated by
    # request_id, znd deploys excluded); rows come back costliest-first.
    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
    SELECT elog_url_route AS url_route, COUNT(1) AS url_requests
    FROM (
        SELECT FIRST(elog_url_route) AS elog_url_route
        FROM [logs.requestlogs_%s]
        WHERE LEFT(version_id, 3) != 'znd' # ignore znds
        GROUP BY request_id
    )
    GROUP BY url_route
) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(inits), yyyymmdd, '\n'.join(joins))
    data = bq_util.query_bigquery(query)
    # Persist today's numbers and pull the prior 14 days for trends.
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data("rpcs", date, 14,
                                                lambda row: row['url_route'])

    # Munge the table by getting per-request counts for every RPC stat.
    micropennies = '&mu;&cent;'
    for row in data:
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / row['requests']
        row[micropennies + '/req'] = row['rpc_cost'] * 1.0 / row['requests']
        # rpc_cost appears to be in micropennies (1e-8 dollars), hence
        # the scale factor here.
        row['$'] = row['rpc_cost'] * 1.0e-8
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row and 'rpc_cost' in old_row:
                sparkline_data.append(old_row['rpc_cost'] * 1.0 /
                                      old_row['requests'])
            else:
                # No data (or no cost column) for this route that day.
                sparkline_data.append(None)
        row['last 2 weeks (%s/req)' % micropennies] = sparkline_data

        # The raw cost is now redundant with the '$' and /req columns.
        del row['rpc_cost']

    # Convert each row from a dict to a list, in a specific order.
    _ORDER = ([
        'url_route', 'requests', '$', micropennies + '/req',
        'last 2 weeks (%s/req)' % micropennies
    ] + ['rpc_%s' % f
         for f in rpc_fields] + ['%s/req' % f for f in rpc_fields])
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    subject = 'RPC calls by route - '
    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    _send_email({heading: all_data[:75]},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        # Let's just send the top most expensive routes, not all of them.
        _send_email({heading: table[:75]},
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(all_data[:20],
                               'webapp.routes.rpc_cost.week_over_week',
                               'url_route',
                               metric_label_col='url_route',
                               data_col='last 2 weeks (%s/req)' % micropennies,
                               dry_run=dry_run)

    # As of 1 Feb 2016, the most expensive RPC route is about $300 a
    # day.  More than $750 a day and we should be very suspicious.
    # TODO(csilvers): do this check more frequently.
    # TODO(csilvers): send to slack and/or 911 as well as emailing
    # Column index 2 is '$' (see _ORDER); row 0 is the header row.
    if any(row[2] > 750 for row in all_data[1:]):  # ignore the header line
        _send_email({heading: all_data[:75]},
                    None,
                    to=['*****@*****.**'],
                    subject=('WARNING: some very expensive RPC calls on %s!' %
                             _pretty_date(yyyymmdd)),
                    dry_run=dry_run)
# Example 12
def email_memory_increases(date, window_length=20, min_increase_in_mb=1):
    """Emails the increases in memory caused by particular routes.

    It attempts to compute the amount of memory ignoring memory which is
    reclaimed in the next few requests.  (The number of requests which are
    checked is specified by the window_length parameter.  Routes with a total
    increase less than the min_increase_in_mb parameter are ignored.)

    Arguments:
        date: a datetime.date naming which day's request logs to analyze.
        window_length: how many subsequent requests on the same instance
            to inspect when deciding how much of a request's memory
            growth actually "stuck".
        min_increase_in_mb: routes whose total added memory does not
            exceed this many MB are omitted from the report.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # One LEAD() column per lookahead distance 1..window_length: the
    # instance's total memory 1, 2, ... requests later.
    lead_lengths = range(1, window_length + 1)
    lead_selects = '\n'.join(
        "LEAD(total, %s) OVER (PARTITION BY instance_key ORDER BY start_time) "
        "AS lead_total_%s," % (i, i) for i in lead_lengths)

    fields = ['total'] + ['lead_total_%s' % i for i in lead_lengths]
    # We want to compute the minimal value of added + field - total, where
    # field is one of "total" or one of the "lead_total_i".  BigQuery
    # unfortunately doesn't give us a nice way to do this (at least that I know
    # of, without doing a CROSS JOIN of the table to itself).  One way to do
    # this would be a gigantic nested IF, but this could be exponentially
    # large, and queries have a fixed maximum size.  Instead we use a CASE
    # expression which could be O(n^2); since we don't pay for BigQuery
    # execution time, and n shouldn't be too huge, this seems like a better
    # approach.  In theory it might be better to do O(n) nested queries (each
    # of which does a single pairwise min), but this seems like it could in
    # practice be even slower, depending on the implementation.
    # TODO(benkraft): If I get a useful answer to
    # http://stackoverflow.com/questions/24923101/computing-a-moving-maximum-in-bigquery
    # we should use that instead.  Or, once we have user-defined functions in
    # BigQuery, we can probably do something actually reasonable.
    case_expr = '\n'.join(
        "WHEN %s THEN added + %s - total" % (
            ' AND '.join("%s <= %s" % (field1, field2)
                         for field2 in fields if field2 != field1),
            field1)
        for field1 in fields)

    # This is a kind of large query, so here's what it's doing, from inside to
    # out:
    #   First, extract the memory data from the logs.
    #   Second, compute the appropriate LEAD() columns, which tell us what the
    #       total will be a few requests later on the same instance
    #   Third, compute "real_added", which is the amount we think the request
    #       actually added to the heap, not counting memory which was soon
    #       reclaimed.  This might come out negative, so
    #   Fourth, make sure the memory added is at least zero, since if memory
    #       usage went down, this request probably shouldn't get the credit.
    #   Fifth, group by route and do whatever aggregation we want.  Ignore the
    #       first 25 requests to each module, since those are probably all
    #       loading things that each module has to load, and the request that
    #       does the loading shouldn't be blamed.
    query = """\
SELECT
    COUNT(*) AS count_,
    elog_url_route AS url_route,
    module_id AS module,
    AVG(real_added) AS added_avg,
    NTH(99, QUANTILES(real_added, 101)) AS added_98th,
    SUM(real_added) AS added_total,
FROM (
    SELECT
        IF(real_added > 0, real_added, 0) AS real_added,
        elog_url_route, module_id, num,
    FROM (
        SELECT
            (CASE %s ELSE added END) AS real_added,
            elog_url_route, module_id, num,
        FROM (
            SELECT
                %s
                RANK() OVER (PARTITION BY instance_key
                             ORDER BY start_time) AS num,
                added, total, elog_url_route, module_id,
            FROM (
                SELECT
                    FLOAT(REGEXP_EXTRACT(
                        app_logs.message,
                        "This request added (.*) MB to the heap.")) AS added,
                    FLOAT(REGEXP_EXTRACT(
                        app_logs.message,
                        "Total memory now used: (.*) MB")) AS total,
                    instance_key, start_time, elog_url_route, module_id,
                FROM [logs.requestlogs_%s]
                WHERE app_logs.message CONTAINS 'This request added'
            )
        )
    )
)
WHERE num > 25
GROUP BY url_route, module
ORDER BY added_total DESC
""" % (case_expr, lead_selects, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "memory_increases", yyyymmdd)
    # Last 14 days of saved results, keyed by (module, route), for the
    # sparkline column below.
    historical_data = bq_util.process_past_data(
        "memory_increases", date, 14,
        lambda row: (row['module'], row['url_route']))

    # Bucket the rows into one report section per module, dropping
    # routes at or below the min_increase_in_mb threshold.
    by_module = collections.defaultdict(list)
    for row in data:
        if row['added_total'] > min_increase_in_mb:
            heading = "Memory increases by route for %s module on %s" % (
                row['module'], _pretty_date(yyyymmdd))
            by_module[heading].append(row)

    _ORDER = ['count_', 'added_avg', 'last 2 weeks (avg)', 'added_98th',
              'added_total', 'added %%', 'url_route']
    for heading in by_module:
        # Each route's share of its module's total added memory.
        total = sum(row['added_total'] for row in by_module[heading])
        for row in by_module[heading]:
            row['added %%'] = row['added_total'] / total * 100
            sparkline_data = []
            for old_data in historical_data:
                old_row = old_data.get((row['module'], row['url_route']))
                if old_row:
                    sparkline_data.append(old_row['added_avg'])
                else:
                    # No saved data for this (module, route) that day.
                    sparkline_data.append(None)
            row['last 2 weeks (avg)'] = sparkline_data
            # The module is already implied by the section heading.
            del row['module']
        # Only the top 50 routes per module make it into the email.
        by_module[heading] = _convert_table_rows_to_lists(
            by_module[heading][:50], _ORDER)
    subject = "Memory Increases by Route"
    _send_email(by_module, None,
                to=['*****@*****.**'],
                subject=subject)
# Example 13
def email_out_of_memory_errors(date):
    """Email two views of the day's out-of-memory errors.

    This sends two emails, for two different ways of seeing the data
    (counts by module, then counts by module+route).  Both share the
    same subject so they thread together in the recipient's mailbox.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    subject = 'OOM errors'

    # Out-of-memory errors look like:
    #   Exceeded soft private memory limit with 260.109 MB after servicing 2406 requests total  #@Nolint
    # with SDK 1.9.7, they changed. Note the double-space after "after":
    #   Exceeded soft private memory limit of 512 MB with 515 MB after  servicing 9964 requests total  #@Nolint
    numreqs = r"REGEXP_EXTRACT(app_logs.message, r'servicing (\d+) requests')"
    # Report 1: OOM counts per module, with 10th/50th/90th percentiles
    # of how many requests an instance served before OOMing.
    query = """\
SELECT COUNT(module_id) AS count_,
       module_id,
       NTH(10, QUANTILES(INTEGER(%s), 101)) as numserved_10th,
       NTH(50, QUANTILES(INTEGER(%s), 101)) as numserved_50th,
       NTH(90, QUANTILES(INTEGER(%s), 101)) as numserved_90th
FROM [logs.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
      AND module_id IS NOT NULL
GROUP BY module_id
ORDER BY count_ DESC
""" % (numreqs, numreqs, numreqs, yyyymmdd)
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_module", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_module", date, 14,
        lambda row: row['module_id'])

    # Add a two-week sparkline column of daily OOM counts per module.
    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['module_id'])
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this module, then it just didn't
                # OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = ['count_', 'last 2 weeks', 'module_id',
              'numserved_10th', 'numserved_50th', 'numserved_90th']
    data = _convert_table_rows_to_lists(data, _ORDER)

    heading = 'OOM errors by module for %s' % _pretty_date(yyyymmdd)
    email_content = {heading: data}

    # Report 2: OOM counts broken down by (module, route).
    query = """\
SELECT COUNT(*) as count_,
       module_id,
       elog_url_route as url_route
FROM [logs.requestlogs_%s]
WHERE app_logs.message CONTAINS 'Exceeded soft private memory limit'
GROUP BY module_id, url_route
ORDER BY count_ DESC
""" % yyyymmdd
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_route", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_route", date, 14,
        lambda row: (row['module_id'], row['url_route']))

    # Same sparkline treatment, keyed on (module, route).
    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get((row['module_id'], row['url_route']))
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this route/module, then it just
                # didn't OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = ['count_', 'last 2 weeks', 'module_id', 'url_route']
    data = _convert_table_rows_to_lists(data, _ORDER)

    heading = 'OOM errors by route for %s' % _pretty_date(yyyymmdd)
    email_content[heading] = data

    # Both tables go out in a single email body.
    _send_email(email_content, None,
                to=['*****@*****.**'],
                subject=subject)
# Example 14
def email_rpcs(date):
    """Email RPCs-per-route report for the given datetime.date object.

    Builds a single BigQuery query that, for each url route, joins the
    route's total request count against per-RPC-type op counts (Get, Put,
    Next, RunQuery, Delete, Commit) and the route's total RPC cost, then
    emails the 75 most expensive routes along with a two-week
    cost-per-request sparkline.

    Arguments:
        date: a datetime.date object for the day to report on.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # The datastore RPC types we break out; each gets its own join below.
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')

    # SELECT-clause fragments.  Routes missing from a join come back NULL,
    # which we coerce to 0.
    inits = ["IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (name, name, name)
             for name in rpc_fields]
    inits.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    # One LEFT OUTER JOIN per RPC type, summing that type's op count by route.
    joins = ["LEFT OUTER JOIN ( "
             "SELECT elog_url_route AS url_route, "
             "       SUM(elog_stats_rpc_ops.value) as rpc_%s "
             "FROM [logs.requestlogs_%s] "
             "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
             "GROUP BY url_route) AS t%s "
             "ON t1.url_route = t%s.url_route"
             % (name, yyyymmdd, name, name, name)
             for name in rpc_fields]
    # A final join for the aggregate RPC cost per route.
    joins.append("LEFT OUTER JOIN ( "
                 "SELECT elog_url_route AS url_route, "
                 "       SUM(elog_stats_rpc_ops.value) AS rpc_cost "
                 "FROM [logs.requestlogs_%s] "
                 "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
                 "GROUP BY url_route) AS tcost "
                 "ON t1.url_route = tcost.url_route"
                 % yyyymmdd)
    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
SELECT elog_url_route AS url_route, COUNT(*) AS url_requests
FROM [logs.requestlogs_%s]
GROUP BY url_route) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(inits), yyyymmdd, '\n'.join(joins))
    data = bq_util.query_bigquery(query)
    # Save today's numbers, then pull the last 14 days for sparklines.
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "rpcs", date, 14, lambda row: row['url_route'])

    # Munge the table by getting per-request counts for every RPC stat.
    micropennies = '&mu;&cent;'
    for row in data:
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / row['requests']
        row[micropennies + '/req'] = row['rpc_cost'] * 1.0 / row['requests']
        # rpc_cost appears to be in units of 1e-8 dollars (micro-cents,
        # matching the &mu;&cent; label), so this scales to dollars.
        row['$'] = row['rpc_cost'] * 1.0e-8
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row and 'rpc_cost' in old_row:
                sparkline_data.append(
                    old_row['rpc_cost'] * 1.0 / old_row['requests'])
            else:
                # No data for this route that day: leave a gap.
                sparkline_data.append(None)
        row['last 2 weeks (%s/req)' % micropennies] = sparkline_data

        # The raw cost is presented via '$' and the per-request columns.
        del row['rpc_cost']

    # Convert each row from a dict to a list, in a specific order.
    _ORDER = (['url_route', 'requests', '$', micropennies + '/req',
               'last 2 weeks (%s/req)' % micropennies] +
              ['rpc_%s' % f for f in rpc_fields] +
              ['%s/req' % f for f in rpc_fields])
    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'RPC calls by route'
    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: data[:75]}, None,
                to=['*****@*****.**'],
                subject=subject)
예제 #15
0
def email_applog_sizes(date, dry_run=False):
    """Email app-log report for the given datetime.date object.

    This report says how much we are logging (via logging.info()
    and friends), grouped by the first word of the log message.
    (Which usually, but not always, is a good proxy for a single
    log-message in our app.)  Since we pay per byte logged, we
    want to make sure we're not accidentally logging a single
    log message a ton, which is really easy to do.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    query = """\
SELECT
  REGEXP_EXTRACT(app_logs.message, r'^([a-zA-Z0-9_-]*)') AS firstword,
  FIRST(app_logs.message) as sample_logline,
  SUM(LENGTH(app_logs.message)) / 1024 / 1024 AS size_mb,
  -- This cost comes from https://cloud.google.com/stackdriver/pricing_v2:
  -- "Stackdriver Logging: $0.50/GB".  But it seems like app-log messages
  -- are actually encoded *twice* in the logging data, based on our
  -- best model of app-log sizes vs num-requests vs costs in the billing
  -- reports, so we assume our cost is $1/GB.
  SUM(LENGTH(app_logs.message)) / 1024 / 1024 / 1024 AS cost_usd
FROM
  logs.requestlogs_%s
GROUP BY
  firstword
ORDER BY
  cost_usd DESC
""" % yyyymmdd
    rows = bq_util.query_bigquery(query)
    # Discard entries where no first word could be extracted.
    rows = [r for r in rows if r['firstword'] not in (None, '(None)')]
    bq_util.save_daily_data(rows, "log_bytes", yyyymmdd)
    historical_data = bq_util.process_past_data("log_bytes", date, 14,
                                                lambda r: r['firstword'])

    # Annotate each row with its share of the total and a 14-day sparkline.
    total_bytes = sum((r['size_mb'] for r in rows), 0.0)

    for row in rows:
        row['%% of total'] = row['size_mb'] / total_bytes * 100
        history = []
        for day_data in historical_data:
            prior = day_data.get(row['firstword'])
            history.append(prior['size_mb'] if prior else None)
        row['last 2 weeks'] = history
        # The sample-logline can get really long, so truncate it.
        row['sample_logline'] = row['sample_logline'][:80]

    _ORDER = ('%% of total', 'size_mb', 'cost_usd', 'last 2 weeks',
              'firstword', 'sample_logline')

    subject = 'Log-bytes by first word of log-message - '
    heading = 'Cost-normalized log-bytes by firstword for %s' % (
        _pretty_date(yyyymmdd))
    all_data = _convert_table_rows_to_lists(rows, _ORDER)
    # Only send the most expensive firstwords, not all of them.
    _send_email({heading: all_data[:50]},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # As of 1 Jun 2018, the most expensive firstword costs about
    # $2/day.  More than $20 a day and we should be very suspicious.
    too_expensive = [r for r in all_data[1:] if r[2] > 20]  # skip header line
    if too_expensive:
        _send_email({heading: all_data[:75]},
                    None,
                    to=['*****@*****.**'],
                    subject=('WARNING: some very expensive loglines on %s!' %
                             _pretty_date(yyyymmdd)),
                    dry_run=dry_run)
예제 #16
0
def email_rrs_stats(date, dry_run=False):
    """Emails stats about rrs requests that are too slow or have errors.

    rrs == React Render Server.

    Requests that take longer than one second are timed out and thus simply add
    a second to the response, rather than speeding it up.

    Sends each initiative an email covering its own urls, plus one email
    with all the data to infrastructure.
    """
    yyyymmdd = date.strftime("%Y%m%d")

    # Per-url latency stats: average latency, request count, and how many
    # requests hit the one-second timeout (latency >= 1.0 sec).
    latency_q = """
SELECT
  REPLACE(REGEXP_EXTRACT(
    httpRequest.requestUrl, r'/render\?path=\.(.*)'), '%2F', '/') AS url,
  AVG(FLOAT(jsonPayload.latencyseconds)) AS average_latency,
  COUNT(1) AS count,
  SUM(CASE
      WHEN FLOAT(jsonPayload.latencyseconds) >= 1.0 THEN 1
      ELSE 0 END) AS timeouts,
  SUM(CASE
      WHEN FLOAT(jsonPayload.latencyseconds) >= 1.0 THEN 100
      ELSE 0 END) / count(1) as timeout_percent
FROM
  [khan-academy:react_render_logs.appengine_googleapis_com_nginx_request_{}]
GROUP BY
  url
ORDER BY
  timeout_percent DESC
""".format(yyyymmdd)

    # Per-url error stats: count and percentage of HTTP 500 responses.
    error_q = """
SELECT
  REPLACE(REGEXP_EXTRACT(
    httpRequest.requestUrl, r'/render\?path=\.(.*)'), '%2F', '/') AS url,
  SUM(httpRequest.status == 500) AS error_count,
  (SUM(httpRequest.status == 500) / COUNT(1)) * 100 AS error_percent
FROM
  [khan-academy:react_render_logs.appengine_googleapis_com_nginx_request_{}]
GROUP BY
  url
ORDER BY
  error_percent DESC
""".format(yyyymmdd)

    # Reuse saved daily data if this already ran today; otherwise query
    # BigQuery and save the results for future runs.
    latency_data = bq_util.get_daily_data('rrs_latency', yyyymmdd)
    if latency_data is None:
        table_name = (
            "khan-academy:react_render_logs" +
            ".appengine_googleapis_com_nginx_request_{}").format(yyyymmdd)
        table_exists = bq_util.does_table_exist(table_name)
        if not table_exists:
            # The day's logs table was never exported; bail out rather
            # than sending an empty report.
            print "The RRS logs were not generated. No email will be sent."
            print "Returning..."
            return
        latency_data = bq_util.query_bigquery(latency_q)
        bq_util.save_daily_data(latency_data, 'rrs_latency', yyyymmdd)

    error_data = bq_util.get_daily_data('rrs_errors', yyyymmdd)
    if error_data is None:
        error_data = bq_util.query_bigquery(error_q)
        bq_util.save_daily_data(error_data, 'rrs_errors', yyyymmdd)

    subject = 'React render server errors and timeouts - '
    error_heading = 'React component errors'
    latency_heading = 'React compontent render latency (> 1 sec = timeout)'
    error_order = ('url', 'error_count', 'error_percent')
    latency_order = ('url', 'average_latency', 'count', 'timeouts',
                     'timeout_percent')
    # Maps initiative-id -> {heading: table-rows-as-lists}.
    tables = collections.defaultdict(dict)

    # Put data in tables by initiative
    for initiative_id, initiative_data in _by_initiative(error_data,
                                                         key='url',
                                                         by_package=True):
        tables[initiative_id][error_heading] = _convert_table_rows_to_lists(
            initiative_data, error_order)
    for initiative_id, initiative_data in _by_initiative(latency_data,
                                                         key='url',
                                                         by_package=True):
        tables[initiative_id][latency_heading] = _convert_table_rows_to_lists(
            initiative_data, latency_order)

    # Send email to initiatives
    for initiative_id, initiative_tables in tables.items():
        _send_email(initiative_tables,
                    graph=None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # Send all data to infra
    tables = {}
    tables[error_heading] = _convert_table_rows_to_lists(
        error_data, error_order)
    tables[latency_heading] = _convert_table_rows_to_lists(
        latency_data, latency_order)
    _send_email(tables,
                graph=None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)
예제 #17
0
def email_rpcs(date, dry_run=False):
    """Email RPCs-per-route report for the given datetime.date object.

    Also email a more urgent message if one of the RPCs is too expensive.
    This indicates a bug that is costing us money.

    Builds a single BigQuery query that, for each url route, joins the
    route's total request count against per-RPC-type op counts (Get, Put,
    Next, RunQuery, Delete, Commit) and the route's total RPC cost.  The
    most expensive routes are emailed, and week-over-week cost data is
    also pushed to stackdriver.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # The datastore RPC types we break out; each gets its own join below.
    rpc_fields = ('Get', 'Put', 'Next', 'RunQuery', 'Delete', 'Commit')

    # SELECT-clause fragments.  Routes missing from a join come back NULL,
    # which we coerce to 0.
    inits = ["IFNULL(INTEGER(t%s.rpc_%s), 0) AS rpc_%s" % (name, name, name)
             for name in rpc_fields]
    inits.append("IFNULL(tcost.rpc_cost, 0) AS rpc_cost")
    # One LEFT OUTER JOIN per RPC type, summing that type's op count by route.
    joins = ["LEFT OUTER JOIN ( "
             "SELECT elog_url_route AS url_route, "
             "       SUM(elog_stats_rpc_ops.value) as rpc_%s "
             "FROM [logs.requestlogs_%s] "
             "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.%s.count' "
             "GROUP BY url_route) AS t%s "
             "ON t1.url_route = t%s.url_route"
             % (name, yyyymmdd, name, name, name)
             for name in rpc_fields]
    # A final join for the aggregate RPC cost per route.
    joins.append("LEFT OUTER JOIN ( "
                 "SELECT elog_url_route AS url_route, "
                 "       SUM(elog_stats_rpc_ops.value) AS rpc_cost "
                 "FROM [logs.requestlogs_%s] "
                 "WHERE elog_stats_rpc_ops.key = 'stats.rpc_ops.cost' "
                 "GROUP BY url_route) AS tcost "
                 "ON t1.url_route = tcost.url_route"
                 % yyyymmdd)
    query = """\
SELECT t1.url_route AS url_route,
t1.url_requests AS requests,
%s
FROM (
SELECT elog_url_route AS url_route, COUNT(*) AS url_requests
FROM [logs.requestlogs_%s]
GROUP BY url_route) AS t1
%s
ORDER BY tcost.rpc_cost DESC;
""" % (',\n'.join(inits), yyyymmdd, '\n'.join(joins))
    data = bq_util.query_bigquery(query)
    # Save today's numbers, then pull the last 14 days for sparklines.
    bq_util.save_daily_data(data, "rpcs", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "rpcs", date, 14, lambda row: row['url_route'])

    # Munge the table by getting per-request counts for every RPC stat.
    micropennies = '&mu;&cent;'
    for row in data:
        for stat in rpc_fields:
            row['%s/req' % stat] = row['rpc_%s' % stat] * 1.0 / row['requests']
        row[micropennies + '/req'] = row['rpc_cost'] * 1.0 / row['requests']
        # rpc_cost appears to be in units of 1e-8 dollars (micro-cents,
        # matching the &mu;&cent; label), so this scales to dollars.
        row['$'] = row['rpc_cost'] * 1.0e-8
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row and 'rpc_cost' in old_row:
                sparkline_data.append(
                    old_row['rpc_cost'] * 1.0 / old_row['requests'])
            else:
                # No data for this route that day: leave a gap.
                sparkline_data.append(None)
        row['last 2 weeks (%s/req)' % micropennies] = sparkline_data

        # The raw cost is presented via '$' and the per-request columns.
        del row['rpc_cost']

    # Convert each row from a dict to a list, in a specific order.
    _ORDER = (['url_route', 'requests', '$', micropennies + '/req',
               'last 2 weeks (%s/req)' % micropennies] +
              ['rpc_%s' % f for f in rpc_fields] +
              ['%s/req' % f for f in rpc_fields])
    data = _convert_table_rows_to_lists(data, _ORDER)

    subject = 'RPC calls by route'
    heading = 'RPC calls by route for %s' % _pretty_date(yyyymmdd)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: data[:75]}, None,
                to=['*****@*****.**'],
                subject=subject,
                dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(data[:20],
                               'webapp.routes.rpc_cost.week_over_week',
                               'url_route', metric_label_col='url_route',
                               data_col='last 2 weeks (%s/req)' % micropennies,
                               dry_run=dry_run)

    # As of 1 Feb 2016, the most expensive RPC route is about $300 a
    # day.  More than $750 a day and we should be very suspicious.
    # TODO(csilvers): do this check more frequently.
    # TODO(csilvers): send to slack and/or 911 as well as emailing
    # NOTE: row[2] is the '$' column; we must skip the header row emitted
    # by _convert_table_rows_to_lists (as email_applog_sizes does), since
    # in python2 the header string '$' compares greater than 750 and the
    # warning would otherwise fire every day.
    if any(row[2] > 750 for row in data[1:]):
        _send_email({heading: data[:75]}, None,
                    to=['*****@*****.**'],
                    subject=('WARNING: some very expensive RPC calls on %s!'
                             % _pretty_date(yyyymmdd)),
                    dry_run=dry_run)
예제 #18
0
def email_instance_hours(date, dry_run=False):
    """Email instance hours report for the given datetime.date object.

    For each url route, reports how many instance-hours were spent serving
    it (request latency scaled per _MODULE_CPU_COUNT for the serving
    module), with a two-week per-request sparkline.  Sends a full report
    to infrastructure, a filtered report to each initiative, and pushes
    week-over-week numbers to stackdriver.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    # One CASE arm per module: scale latency by that module's CPU count.
    cost_fn = '\n'.join("WHEN module_id == '%s' THEN latency * %s" % kv
                        for kv in _MODULE_CPU_COUNT.iteritems())
    query = """\
SELECT COUNT(1) as count_,
elog_url_route as url_route,
SUM(CASE %s ELSE 0 END) / 3600 as instance_hours
FROM (
  -- When logs get split into multiple entries, each has latency calculated
  -- from the start of the request to the point where the log line was emitted.
  -- This means the total latency is the maximum value that appears, not the
  -- sum.
  SELECT FIRST(elog_url_route) AS elog_url_route,
         IFNULL(FIRST(module_id), 'default') AS module_id,
         MAX(latency) AS latency,
         FIRST(url_map_entry) AS url_map_entry
  FROM [logs.requestlogs_%s]
  WHERE LEFT(version_id, 3) != 'znd' # ignore znds
  GROUP BY request_id
)
WHERE url_map_entry != "" # omit static files
GROUP BY url_route
ORDER BY instance_hours DESC
""" % (cost_fn, yyyymmdd)
    data = bq_util.query_bigquery(query)
    # Save today's numbers, then pull the last 14 days for sparklines.
    bq_util.save_daily_data(data, "instance_hours", yyyymmdd)
    historical_data = bq_util.process_past_data("instance_hours", date, 14,
                                                lambda row: row['url_route'])

    # Munge the table by adding a few columns.
    total_instance_hours = 0.0
    for row in data:
        total_instance_hours += row['instance_hours']

    for row in data:
        row['%% of total'] = row['instance_hours'] / total_instance_hours * 100
        row['per 1k requests'] = row['instance_hours'] / row['count_'] * 1000
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['url_route'])
            if old_row:
                sparkline_data.append(old_row['instance_hours'] /
                                      old_row['count_'])
            else:
                # No data for this route that day: leave a gap.
                sparkline_data.append(None)
        row['last 2 weeks (per request)'] = sparkline_data

    _ORDER = ('%% of total', 'instance_hours', 'count_', 'per 1k requests',
              'last 2 weeks (per request)', 'url_route')

    subject = 'Instance Hours by Route - '
    heading = 'Cost-normalized instance hours by route for %s' % (
        _pretty_date(yyyymmdd))
    all_data = _convert_table_rows_to_lists(data, _ORDER)
    # Let's just send the top most expensive routes, not all of them.
    _send_email({heading: all_data[:50]},
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        _send_email({heading: table[:50]},
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)

    # We'll also send the most-most expensive ones to stackdriver.
    _send_table_to_stackdriver(all_data[:20],
                               'webapp.routes.instance_hours.week_over_week',
                               'url_route',
                               metric_label_col='url_route',
                               data_col='last 2 weeks (per request)',
                               dry_run=dry_run)
예제 #19
0
def get_weekly_page_load_data_from_bigquery(page_load_table_name):
    """Given the temp table to query, calculate weekly page load perf data.

    We calculate the 90th percentile of page load performance by country,
    by page, and for US server navigation load time.

    This is similar to what is displayed in this stackdriver dashboard:
    https://app.google.stackdriver.com/monitoring/1037059/
    page-load-performance-90th-percentile?project=khan-academy
    """

    by_country_query = """\
SELECT
    FLOAT(NTH(91, QUANTILES(page_load_time, 101))) AS page_load_time,
    elog_country
FROM
    [%s]
WHERE
    elog_country IN ('US', 'ID', 'CN', 'IN', 'BR', 'MX')
GROUP BY
    elog_country
""" % page_load_table_name

    by_page_query = """\
SELECT
    FLOAT(NTH(91, QUANTILES(page_load_time, 101))) AS page_load_time,
    page_load_page
FROM
    [%s]
GROUP BY
    page_load_page
""" % page_load_table_name

    server_usa_query = """\
SELECT
    FLOAT(NTH(91, QUANTILES(page_load_time, 101))) AS page_load_time,
    page_load_page
FROM
    [%s]
WHERE
    page_load_nav_type = 'server'
    AND elog_country = 'US'
GROUP BY
    page_load_page
""" % page_load_table_name

    # Run the three queries in the same order as before.
    by_page_rows = bq_util.query_bigquery(by_page_query)
    by_country_rows = bq_util.query_bigquery(by_country_query)
    server_usa_rows = bq_util.query_bigquery(server_usa_query)

    def _summarize(report_type, key_field, rows):
        # Map each row's key column to its 90th-percentile load time.
        return {
            "type": report_type,
            "data": {row[key_field]: row["page_load_time"] for row in rows},
        }

    return [
        _summarize("page_load.by_page", "page_load_page", by_page_rows),
        _summarize("page_load.by_country", "elog_country", by_country_rows),
        _summarize("page_load.server_usa", "page_load_page", server_usa_rows),
    ]
예제 #20
0
def email_out_of_memory_errors(date, dry_run=False):
    """Email out-of-memory error reports for the given datetime.date object.

    This sends two emails, for two different ways of seeing the data
    (OOM counts by module, and OOM counts by module + url route).  But
    we'll have them share the same subject so they thread together.
    The by-route report is also broken out per initiative.
    """
    yyyymmdd = date.strftime("%Y%m%d")
    subject = 'OOM errors - '

    # Out-of-memory errors for python look like:
    #   Exceeded soft memory limit of 2048 MB with 2078 MB after servicing 1497 requests total. Consider setting a larger instance class in app.yaml.  #@Nolint
    # Out-of-memory errors for kotlin look like:
    #   java.lang.OutOfMemoryError: <reason>
    #   (where <reason> is some text that may take on a number of values
    #   depending on whether the problem is lack of heap space, the garbage
    #   collector taking too long to stay ahead of garbage accumulation, etc.)
    # Note that older messages (before the gVisor sandboxs) started with
    # "Exceeded soft private memory limit" instead.
    numreqs = r"REGEXP_EXTRACT(app_logs.message, r'servicing (\d+) requests')"
    # Report #1: OOM counts per module, plus percentiles of how many
    # requests an instance had served before it OOMed.
    query = """\
SELECT COUNT(1) AS count_,
       IFNULL(module_id, 'default') AS module_id,
       NTH(10, QUANTILES(INTEGER(%s), 101)) as numserved_10th,
       NTH(50, QUANTILES(INTEGER(%s), 101)) as numserved_50th,
       NTH(90, QUANTILES(INTEGER(%s), 101)) as numserved_90th
FROM [logs.requestlogs_%s]
WHERE (app_logs.message CONTAINS 'Exceeded soft memory limit'
       OR app_logs.message CONTAINS 'OutOfMemoryError')
  AND LEFT(version_id, 3) != 'znd' # ignore znds
GROUP BY module_id
ORDER BY count_ DESC
""" % (numreqs, numreqs, numreqs, yyyymmdd)
    data = bq_util.query_bigquery(query)
    # Save today's numbers, then pull the last 14 days for sparklines.
    bq_util.save_daily_data(data, "out_of_memory_errors_by_module", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_module", date, 14,
        lambda row: row['module_id'])

    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get(row['module_id'])
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this module, then it just didn't
                # OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = [
        'count_', 'last 2 weeks', 'module_id', 'numserved_10th',
        'numserved_50th', 'numserved_90th'
    ]
    data = _convert_table_rows_to_lists(data, _ORDER)

    heading = 'OOM errors by module for %s' % _pretty_date(yyyymmdd)
    email_content = {heading: data}

    # Report #2: OOM counts per (module, url-route).  The inner query
    # groups log lines by request so each OOMing request counts once.
    query = """\
SELECT COUNT(1) AS count_,
       module_id,
       elog_url_route AS url_route
FROM (
    SELECT IFNULL(FIRST(module_id), 'default') AS module_id,
           FIRST(elog_url_route) AS elog_url_route,
           SUM(IF(
               app_logs.message CONTAINS 'Exceeded soft memory limit'
               OR app_logs.message CONTAINS 'OutOfMemoryError',
               1, 0)) AS oom_message_count
    FROM [logs.requestlogs_%s]
    WHERE LEFT(version_id, 3) != 'znd' # ignore znds
    GROUP BY request_id
    HAVING oom_message_count > 0
)
GROUP BY module_id, url_route
ORDER BY count_ DESC
""" % yyyymmdd
    data = bq_util.query_bigquery(query)
    bq_util.save_daily_data(data, "out_of_memory_errors_by_route", yyyymmdd)
    historical_data = bq_util.process_past_data(
        "out_of_memory_errors_by_route", date, 14, lambda row:
        (row['module_id'], row['url_route']))

    for row in data:
        sparkline_data = []
        for old_data in historical_data:
            old_row = old_data.get((row['module_id'], row['url_route']))
            if old_row:
                sparkline_data.append(old_row['count_'])
            elif old_data:
                # If we have data, just not on this route/module, then it just
                # didn't OOM.
                sparkline_data.append(0)
            else:
                # On the other hand, if we don't have data at all, we should
                # show a gap.
                sparkline_data.append(None)
        row['last 2 weeks'] = sparkline_data

    _ORDER = ['count_', 'last 2 weeks', 'module_id', 'url_route']
    heading = 'OOM errors by route for %s' % _pretty_date(yyyymmdd)

    # NOTE: `data` here is still the list of row-dicts; only the converted
    # copy goes into the email, so _by_initiative below can reuse the dicts.
    email_content[heading] = _convert_table_rows_to_lists(data, _ORDER)
    _send_email(email_content,
                None,
                to=[initiatives.email('infrastructure')],
                subject=subject + 'All',
                dry_run=dry_run)

    # Per-initiative reports
    for initiative_id, initiative_data in _by_initiative(data):
        table = _convert_table_rows_to_lists(initiative_data, _ORDER)
        email_content = {heading: table}
        _send_email(email_content,
                    None,
                    to=[initiatives.email(initiative_id)],
                    subject=subject + initiatives.title(initiative_id),
                    dry_run=dry_run)
def get_bigquery_costs(date,
                       projects=('khanacademy.org:deductive-jet-827',
                                 'khan-academy')):
    """Return the aggregate cost of bigquery queries on the given date.

    date is expected to be given as a YYYYMMDD string.

    projects is an iterable of bigquery project names to compute costs for.
    (The default is a tuple, not a list: a mutable default argument would
    be one shared object across all calls.)

    Returns a dict which maps the project name to cost for the given date.

    Note that project 'khanacademy.org:deductive-jet-827' is renamed 'khan'
    for consistency with other metrics. Example return value:
    {'khan_academy': 0,
     'khan': 63.15}
    """

    field_name = ("protopayload_auditlog.servicedata_v1_bigquery."
                  "jobCompletedEvent.job.jobStatistics.totalBilledBytes")

    table_name = ("exported_stackdriver_bigquery_logs"
                  ".cloudaudit_googleapis_com_data_access_%s") % date

    data = {}

    # Sometimes when this runs, we haven't yet had a completed job for the day,
    # and when that happens, the field we're trying to query is missing.
    # We record those errors here.  Then at the end, if all (both) of our
    # projects had this error, we raise an exception, under the assumption that
    # something more serious is wrong and we need to alert.
    field_missing_errors = {}

    # TODO(nabil): figure out how to deal with tiered billing.
    # See https://cloud.google.com/bigquery/pricing#high-compute
    # It seems likely that we simply multiply by this field when non-null:
    # `protopayload_google_cloud_audit_auditlog.
    # servicedata_google_cloud_bigquery_logging_v1_auditdata.
    # jobCompletedEvent.job.jobStatistics.billingTier`
    # However, there are no non-null examples, so it would be good to confirm.
    for project in projects:
        query = ("SELECT SUM(%s) / %s as daily_query_cost_so_far_usd "
                 "FROM [%s]") % (field_name, _COST_PER_BYTE_INVERSE,
                                 table_name)

        try:
            raw_data = bq_util.query_bigquery(query, project=project)
        except bq_util.BQException as e:
            # This particular error is sometimes expected when we haven't had a
            # successful query yet today (see comment above on
            # `field_missing_errors`).
            field_missing_message = (
                "Field 'protopayload_auditlog.servicedata_v1_bigquery"
                ".jobCompletedEvent.job.jobStatistics.totalBilledBytes' "
                "not found")
            # The error message from bigquery is hard line-wrapped
            exception_message = e.message.replace('\n', '')
            if field_missing_message in exception_message:
                logging.warn('totalBilledBytes missing in %s' % project)
                field_missing_errors[project] = exception_message
                data[project] = 0
                continue
            raise
        else:
            field_missing_errors[project] = None

        cost = raw_data[0]['daily_query_cost_so_far_usd']

        # if we didn't find anything, send a cost of 0
        # see bq_util.query_bigquery for why this is not just None
        if cost == '(None)':
            cost = 0

        # rename this 'khan' for consistency with other metrics
        # attempting to look up bigquery data under 'khan' fails, however
        if project == 'khanacademy.org:deductive-jet-827':
            project = 'khan'

        # replace characters that may interfere with searching for metrics at:
        # https://www.hostedgraphite.com/app/metrics/#
        project = re.sub(r'[^A-Za-z0-9_.]', '_', project)

        data[project] = cost

    # Guard against an empty `projects`: all() of an empty sequence is
    # True, which would spuriously raise here instead of returning {}.
    if field_missing_errors and all(field_missing_errors.values()):
        raise RuntimeError(
            'All projects were missing the totalBilledBytes field.  Messages: '
            + '\n'.join(field_missing_errors.values()))

    return data