Example #1
def main():
    parser = argparse.ArgumentParser(description=__doc__.split('\n\n', 1)[0])
    parser.add_argument('--graphite_host',
                        default='www.hostedgraphite.com',
                        help=('host of the graphite Render URL API '
                              '[default: %(default)s]'))
    parser.add_argument('--project_id',
                        default='khan-academy',
                        help=('project ID of a Google Cloud Platform project '
                              'with the Cloud Monitoring API enabled '
                              '[default: %(default)s]'))
    # A 24-hour window in graphite produces 5-minute buckets, and the bucket
    # size stays the same even when comparing metrics week-over-week, so it
    # makes a consistent default.
    parser.add_argument('--window-seconds',
                        default='86400',
                        type=int,
                        help=('window of time to read from graphite. '
                              'The most recent datapoint is sent to Cloud '
                              'Monitoring [default: %(default)s]'))
    parser.add_argument('-v',
                        '--verbose',
                        action='count',
                        default=0,
                        help=('enable verbose logging (-vv for very verbose '
                              'logging)'))
    parser.add_argument('-n',
                        '--dry-run',
                        action='store_true',
                        default=False,
                        help='do not write metrics to Cloud Monitoring')
    parser.add_argument('-t',
                        '--test-write',
                        action='store_true',
                        default=False,
                        help=('write a datapoint to the Cloud Monitoring '
                              'timeseries named "write_test"'))
    args = parser.parse_args()

    # -v for INFO, -vv for DEBUG.
    if args.verbose >= 2:
        logging.basicConfig(level=logging.DEBUG)
    elif args.verbose == 1:
        logging.basicConfig(level=logging.INFO)

    if args.test_write:
        data = [('write_test', {}, math.sin(time.time()), int(time.time()))]
        cloudmonitoring_util.send_timeseries_to_cloudmonitoring(
            args.project_id, data, dry_run=args.dry_run)
    else:
        data = _graphite_to_cloudmonitoring(args.graphite_host,
                                            args.project_id,
                                            _default_metrics(),
                                            dry_run=args.dry_run,
                                            window_seconds=args.window_seconds)
    if args.dry_run:
        print "Would send %d datapoint(s)" % len(data)
    else:
        print "Sent %d datapoint(s)" % len(data)
Example #2
def main(project_id, dry_run):
    now = time.time()

    service = cloudmonitoring_util.get_cloud_service('compute', 'v1')

    # Get the number of failed GCE instances for each module of interest
    # via the cloud compute API and put it into Stackdriver
    instance_list_response = _get_instances_list_from_cloud_compute(
        service, project_id)

    # Map module_id as used in Stackdriver to the identifying substring for
    # that module in GCE instance names.
    module_id_to_name_substring = {
        'react-render': 'gae-react--render',
        'vm': 'gae-vm-'
    }
    for module_id, name_substring in module_id_to_name_substring.iteritems():
        instances = _get_instances_matching_name_from_response(
            instance_list_response, name_substring)

        serial_port_output_lines = [
            _get_serial_port_output_lines_from_cloud_compute(
                service, project_id, instance) for instance in instances
        ]

        # Number of consecutive "unhealthy" instance statuses required to
        # consider that instance "failed".
        unhealthy_count_threshold = 5

        num_failed_instances = len([
            l for l in serial_port_output_lines
            if _instance_is_failed(l, unhealthy_count_threshold)
        ])

        if dry_run:
            print('module=%s, num_failed_instances=%s' %
                  (module_id, num_failed_instances))
            continue

        # Send metric to Stackdriver.
        data = ('gce.failed_instance_count', {
            'module_id': module_id
        }, num_failed_instances, now)
        cloudmonitoring_util.send_timeseries_to_cloudmonitoring(
            project_id, [data])
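
The helper _instance_is_failed is not included in this example. A rough sketch of what it could look like, assuming (hypothetically) that every health-check result shows up in the serial-port output as a line containing 'healthy' or 'unhealthy':

def _instance_is_failed(serial_port_output_lines, unhealthy_count_threshold):
    # Hypothetical marker: treat any line mentioning a health status as a
    # health-check result, and look for a long enough unbroken run of
    # "unhealthy" results.
    status_lines = [line for line in serial_port_output_lines
                    if 'healthy' in line]      # also matches 'unhealthy'
    consecutive_unhealthy = 0
    for line in status_lines:
        if 'unhealthy' in line:
            consecutive_unhealthy += 1
            if consecutive_unhealthy >= unhealthy_count_threshold:
                return True
        else:
            consecutive_unhealthy = 0
    return False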
Example #3
def main(project_id, dry_run):
    now = time.time()

    service = cloudmonitoring_util.get_cloud_service('compute', 'v1')

    # Get the number of failed GCE instances for each module of interest
    # via the cloud compute API and put it into Stackdriver
    instance_list_response = _get_instances_list_from_cloud_compute(
        service, project_id)

    # Map module_id as used in Stackdriver to the identifying substring for
    # that module in GCE instance names.
    module_id_to_name_substring = {'react-render': 'gae-react--render',
                                   'vm': 'gae-vm-'}
    for module_id, name_substring in module_id_to_name_substring.iteritems():
        instances = _get_instances_matching_name_from_response(
            instance_list_response, name_substring)

        serial_port_output_lines = [
            _get_serial_port_output_lines_from_cloud_compute(
                service, project_id, instance)
            for instance in instances
        ]

        # Number of consecutive "unhealthy" instance statuses required to
        # consider that instance "failed".
        unhealthy_count_threshold = 5

        num_failed_instances = len(
            [l for l in serial_port_output_lines
             if _instance_is_failed(l, unhealthy_count_threshold)])

        if dry_run:
            print ('module=%s, num_failed_instances=%s'
                   % (module_id, num_failed_instances))
            continue

        # Send metric to Stackdriver.
        data = ('gce.failed_instance_count',
                {'module_id': module_id},
                num_failed_instances,
                now)
        cloudmonitoring_util.send_timeseries_to_cloudmonitoring(project_id,
                                                                [data])
Example #4
def _send_to_stackdriver(google_project_id, bigquery_values,
                         start_time_t, time_interval_seconds, dry_run):
    """bigquery_values is a list of triples via get_values_from_bigquery."""
    # send_timeseries_to_cloudmonitoring() wants data in a particular
    # format, including the timestamp.  We give all these datapoints the
    # timestamp at the *end* of our time-range: start_time_t +
    # time_interval_seconds.
    time_t = start_time_t + time_interval_seconds

    # Get the data in the format needed by cloudmonitoring_util.
    data = [(metric_name, metric_labels, value, time_t)
            for (metric_name, metric_labels, value) in bigquery_values]

    return cloudmonitoring_util.send_timeseries_to_cloudmonitoring(
        google_project_id, data, dry_run)
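
A hypothetical call, with made-up metric names and label values, to show the triple-to-4-tuple conversion:

bigquery_values = [
    ('webapp.request_count', {'module_id': 'default'}, 1234),
    ('webapp.error_count', {'module_id': 'default'}, 7),
]
# With start_time_t=1500000000 and time_interval_seconds=3600, both
# datapoints get the timestamp 1500003600 (the end of the interval).
_send_to_stackdriver('khan-academy', bigquery_values,
                     start_time_t=1500000000,
                     time_interval_seconds=3600,
                     dry_run=True)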
Example #5
def _graphite_to_cloudmonitoring(graphite_host,
                                 google_project_id,
                                 metrics,
                                 window_seconds=300,
                                 dry_run=False):
    targets = [m.target for m in metrics]
    from_str = '-%ss' % window_seconds
    response = graphite_util.fetch(graphite_host, targets, from_str=from_str)

    outbound = []
    assert len(response) == len(metrics), (len(response), len(metrics))
    for metric, item in zip(metrics, response):
        datapoints = item['datapoints']

        # Figure out each target's bucket size returned by graphite. This
        # requires 2 or more datapoints, so we ignore entries without
        # enough data, instead of exporting inaccurate timestamps.
        if len(datapoints) < 2:
            logging.info('Ignoring target with too little data: %s %s' %
                         (item['target'], datapoints))
            continue

        bucket_seconds = datapoints[1][1] - datapoints[0][1]
        logging.debug('Detected bucket size of %ss for %s' %
                      (bucket_seconds, metric.name))

        # Extract valid data with two filters:
        #
        # 1) Ignore the newest bucket. The current bucket might still
        # be collecting data, so it's incomplete and we don't want to
        # persist it to Cloud Monitoring, which won't let us
        # subsequently update it.
        #
        # 2) Ignore empty buckets. The Render URL API divides the
        # window of data into equally-sized buckets. If there isn't a
        # datapoint in a timespan bucket (e.g., a 5-second period) the
        # value is None (null in JSON).
        datapoints = [p for p in datapoints[:-1] if p[0] is not None]

        # Ignore metrics with no datapoints at all, i.e., only empty buckets.
        if not datapoints:
            logging.info('Ignoring target with no data: %s' % item['target'])
            continue

        # We'll only send the youngest complete data point for each
        # target. We threw out the youngest, possibly-incomplete
        # bucket above, so we know we're good here.
        value, timestamp = datapoints[-1]

        # Graphite buckets line up depending on when the API call is
        # made. We don't choose how to align them when using a relative
        # time like "all data in the last 5 minutes, i.e., -5min". Since
        # we use the bucket timestamp as the datapoint timestamp, we want
        # it to be stable across script executions. We normalize the
        # buckets by rounding to the next-oldest bucket's beginning,
        # assuming that the first-ever bucket began at the UNIX epoch.
        if timestamp % bucket_seconds != 0:
            timestamp = timestamp - timestamp % bucket_seconds

        # Use a friendly name in place of a (possibly complex) graphite target.
        # The '{}' is because we don't use stackdriver metric-labels yet.
        outbound.append((metric.name, {}, value, timestamp))

    # Load data to Cloud Monitoring.
    cloudmonitoring_util.send_timeseries_to_cloudmonitoring(google_project_id,
                                                            outbound,
                                                            dry_run=dry_run)
    return outbound
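
The response shape this function expects from graphite_util.fetch() mirrors the graphite Render URL API's JSON output: one entry per target, each with a list of [value, timestamp] pairs where empty buckets have a value of None. A representative (made-up) payload and the resulting datapoint, assuming 300-second buckets:

response = [
    {'target': 'stats.gauges.webapp.some_metric',   # hypothetical target
     'datapoints': [
         [None, 1500000100],    # empty bucket
         [42.0, 1500000400],
         [43.5, 1500000700],    # newest bucket, possibly still filling
     ]},
]
# bucket_seconds = 1500000400 - 1500000100 = 300.  The newest bucket is
# dropped, the None bucket is filtered out, and the remaining point is
# sent with its timestamp rounded down to a bucket boundary:
# 1500000400 - (1500000400 % 300) == 1500000300, giving
# (metric.name, {}, 42.0, 1500000300).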
Example #6
def _send_table_to_stackdriver(table,
                               metric_name,
                               metric_label_name,
                               metric_label_col,
                               data_col,
                               dry_run=False):
    """Send week-over-week data to stackdriver.

    For tables that have sparklines, we take the ratio of the most
    recent point on the sparkline to the oldest point (which is
    probably two weeks ago), and send that ratio as a datapoint to
    stackdriver (Google Cloud Monitoring), using the given metric-name
    and metric-label.

    Arguments:
       table: A list of lists of the form:
              [[HEADING_A, HEADING_B], [1A, 1B], [2A, 2B], ...].
           If a table cell value is itself a list, it is interpreted
           as a sparkline.
       metric_name: The name of the metric to use in stackdriver,
           e.g. "webapp.routes.daily_cost".  Think of it as the name
           of a graph in stackdriver.
       metric_label_name: The name of the label to be used with this
           metric, e.g. "url_route".  Think of it as a description of
           what the lines in the stackdriver graph represent.
       metric_label_col: the column of the table that holds the
           label value for a particular row, e.g. "url_route".  This
           should be a string that matches one of the HEADING_X fields.
       data_col: the column of the table that holds the sparkline
           historical data for this row, e.g. "last 2 weeks (per request)".
           This should be a string that matches one of the HEADING_X fields.
       dry_run: if True, say what we would send to stackdriver but don't
           actually send it.
    """
    # Stackdriver doesn't let you send a time more than an hour in the
    # past, so we just set the time to be the start of the current
    # hour.  Hopefully, if the script runs at the same time each day,
    # this will end up with us having the datapoints be at the same
    # time each day.
    time_t = int(time.time() / 3600) * 3600  # round down to the hour

    headings = table[0]
    rows = table[1:]

    metric_label_index = headings.index(metric_label_col)
    data_index = headings.index(data_col)

    stackdriver_input = []
    for row in rows:
        metric_label_value = row[metric_label_index]
        data = row[data_index]
        if data[-1] is None or data[0] is None:
            continue  # we don't have historical data, just bail
        num = data[-1] / data[0]

        # send_timeseries_to_cloudmonitoring() wants 4-tuples:
        #    (metric-name, metric-labels, value, time).
        stackdriver_input.append((metric_name, {
            metric_label_name: metric_label_value
        }, num, time_t))

    if dry_run:
        print "WOULD SEND TO STACKDRIVER:"
        print stackdriver_input
        print "------------------------------------------------"
    else:
        cloudmonitoring_util.send_timeseries_to_cloudmonitoring(
            _GOOGLE_PROJECT_ID, stackdriver_input)
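
A hypothetical invocation with made-up route names and cost figures, to show how the sparkline ratio is computed and which rows are skipped:

table = [['url_route', 'last 2 weeks (per request)'],
         ['/profile', [0.020, 0.018, 0.025]],
         ['/login', [None, 0.011, 0.012]]]    # oldest point missing: skipped
_send_table_to_stackdriver(table,
                           metric_name='webapp.routes.daily_cost',
                           metric_label_name='url_route',
                           metric_label_col='url_route',
                           data_col='last 2 weeks (per request)',
                           dry_run=True)
# Only '/profile' survives the None check; its ratio is 0.025 / 0.020 = 1.25,
# so the dry run would print:
#   [('webapp.routes.daily_cost', {'url_route': '/profile'}, 1.25, <hour>)]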
Example #7
def _send_table_to_stackdriver(table, metric_name, metric_label_name,
                               metric_label_col, data_col, dry_run=False):
    """Send week-over-week data to stackdriver.

    For tables that have sparklines, we take the ratio of the most
    recent point on the sparkline to the oldest point (which is
    probably two weeks ago), and send that ratio as a datapoint to
    stackdriver (Google Cloud Monitoring), using the given metric-name
    and metric-label.

    Arguments:
       table: A list of lists of the form:
              [[HEADING_A, HEADING_B], [1A, 1B], [2A, 2B], ...].
           If a table cell value is itself a list, it is interpreted
           as a sparkline.
       metric_name: The name of the metric to use in stackdriver,
           e.g. "webapp.routes.daily_cost".  Think of it as the name
           of a graph in stackdriver.
       metric_label_name: The name of the label to be used with this
           metric, e.g. "url_route".  Think of it as a description of
           what the lines in the stackdriver graph represent.
       metric_label_col: the column of the table that holds the
           label value for a particular row, e.g. "url_route".  This
           should be a string that matches one of the HEADING_X fields.
       data_col: the column of the table that holds the sparkline
           historical data for this row, e.g. "last 2 weeks (per request)".
           This should be a string that matches one of the HEADING_X fields.
       dry_run: if True, say what we would send to stackdriver but don't
           actually send it.
    """
    # Stackdriver doesn't let you send a time more than an hour in the
    # past, so we just set the time to be the start of the current
    # hour.  Hopefully, if the script runs at the same time each day,
    # this will end up with us having the datapoints be at the same
    # time each day.
    time_t = int(time.time() / 3600) * 3600   # round down to the hour

    headings = table[0]
    rows = table[1:]

    metric_label_index = headings.index(metric_label_col)
    data_index = headings.index(data_col)

    stackdriver_input = []
    for row in rows:
        metric_label_value = row[metric_label_index]
        data = row[data_index]
        if data[-1] is None or data[0] is None:
            continue      # we don't have historical data, just bail
        num = data[-1] / data[0]

        # send_timeseries_to_cloudmonitoring() wants 4-tuples:
        #    (metric-name, metric-labels, value, time).
        stackdriver_input.append((metric_name,
                                  {metric_label_name: metric_label_value},
                                  num,
                                  time_t))

    if dry_run:
        print "WOULD SEND TO STACKDRIVER:"
        print stackdriver_input
        print "------------------------------------------------"
    else:
        cloudmonitoring_util.send_timeseries_to_cloudmonitoring(
            _GOOGLE_PROJECT_ID, stackdriver_input)