def do_request(minutes):
    global req_count
    client = _get_client()

    q = query.Query(
        client,
        _PROJECT_ID,
        metric_type=METRIC_TYPE,
        end_time=None,
        days=0,
        hours=0,
        minutes=minutes)

    # Align into 5-minute buckets once, then materialize as a dataframe.
    df = q.align(enums.Aggregation.Aligner.ALIGN_SUM,
                 minutes=5).as_dataframe()
    df_ = df.unstack().reset_index()
    df__ = df_[df_.topic_id == _TOPIC_ID][['level_5', 0]].set_index('level_5')

    if len(df__) == 0:
        req_count = 0
        return
    r = df__.iloc[-1]
    if len(r.values) == 0:
        req_count = 0
        return
    v = r.values[0]
    req_count = int(v)
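The snippet above leans on module-level names it never defines (_get_client, _PROJECT_ID, _TOPIC_ID, METRIC_TYPE, req_count). A minimal sketch of plausible definitions, assuming google-cloud-monitoring < 2.0, where monitoring_v3.query and monitoring_v3.enums are importable; the project, topic, and metric values are placeholders:

# Hypothetical module-level setup for do_request; all values are placeholders.
from google.cloud import monitoring_v3
from google.cloud.monitoring_v3 import enums, query

_PROJECT_ID = 'my-project'
_TOPIC_ID = 'my-topic'
METRIC_TYPE = 'pubsub.googleapis.com/topic/send_request_count'
req_count = 0

def _get_client():
    # A single MetricServiceClient can be shared across queries.
    return monitoring_v3.MetricServiceClient()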
def dataframe_cpu_df(host_name):
    instance_app_df = instance_app_df1()
    from google.cloud.monitoring_v3 import query
    q = query.Query(client, project,
                    'compute.googleapis.com/instance/cpu/utilization', None,
                    90, 0, 0)
    try:
        instance_id = instance_app_df.loc[instance_app_df['instance_name'] ==
                                          host_name, 'instance_id'].item()
        q = q.align(enums.Aggregation.Aligner.ALIGN_MAX, minutes=1440)
        # Restrict the query to the requested host before materializing it;
        # the resource_type/instance_name labels become the column hierarchy.
        q = q.select_metrics(instance_name=host_name)
        dataframe_cpu = q.as_dataframe(
            labels=['resource_type', 'instance_name'])

        dataframe_cpu = pd.Series(
            dataframe_cpu['gce_instance'][host_name]).to_frame()
        dataframe_cpu.rename(columns={host_name: 'cpu'}, inplace=True)
        dataframe_cpu['time'] = dataframe_cpu.index
        dataframe_cpu['time'] = pd.Series(dataframe_cpu["time"]).dt.round("H")
        dataframe_cpu = dataframe_cpu.set_index('time')
        dataframe_cpu['cpu'] = dataframe_cpu['cpu'] * 100
        return dataframe_cpu
    except ValueError:
        print("No data available")
Example #3
def _AddCpuUtilization(samples, instance_id):
  """Add cpu utilization to the metadata of relevant metric samples.

  Note that the utilization only covers the run stage.

  Args:
    samples: list of sample.Sample. The expected ordering is: (1) table loading
      metrics, (2) table read/write metrics.
    instance_id: the bigtable instance id.

  Returns:
    a list of updated sample.Sample.
  """
  # Check the pre-requisite
  if (len(samples) < 2 or
      samples[0].metadata.get('stage') != 'load' or
      samples[-1].metadata.get('stage') != 'run'):
    return None

  # pylint: disable=g-import-not-at-top
  from google.cloud import monitoring_v3
  from google.cloud.monitoring_v3 import query

  # Query the cpu utilization, which is gauged at each minute in the
  # time window.
  client = monitoring_v3.MetricServiceClient()
  start_timestamp = samples[0].timestamp
  end_timestamp = samples[-1].timestamp
  cpu_query = query.Query(
      client, project=(FLAGS.project or _GetDefaultProject()),
      metric_type='bigtable.googleapis.com/cluster/cpu_load',
      end_time=datetime.datetime.utcfromtimestamp(end_timestamp),
      minutes=int((end_timestamp - start_timestamp) / 60))
  cpu_query = cpu_query.select_resources(instance=instance_id)
  time_series = list(cpu_query)
  if not time_series:
    return None

  # Build the dict to be added to samples.
  utilization_data = []
  for cluster_number, cluster_time_series in enumerate(time_series):
    utilization = numpy.array(
        [point.value.double_value for point in cluster_time_series.points])

    for percentile in CPU_UTILIZATION_PERCENTILES:
      utilization_data.append(
          {'cluster_number': cluster_number,
           'percentile': percentile,
           'utilization_percentage': (
               '%.2f' % (numpy.percentile(utilization, percentile) * 100))})

  additional_metadata = {'cpu_utilization': json.dumps(utilization_data)}

  # Update the samples.
  for sample in samples:
    if sample.metadata.get('stage') == 'run':
      sample.metadata.update(additional_metadata)

  return samples
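_AddCpuUtilization reads a module-level CPU_UTILIZATION_PERCENTILES list that is not shown. A plausible definition follows, with the caveat that the exact percentiles are an assumption:

# Assumed percentile list for _AddCpuUtilization; the benchmark's actual
# values may differ.
CPU_UTILIZATION_PERCENTILES = [50, 75, 90, 95, 99]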
Example #4
def monitor_vms():
    logging.info("Checking active time of the vms")
    client = monitoring_v3.MetricServiceClient()
    cpu_query = query.Query(
        client,
        project=config.project,
        metric_type='compute.googleapis.com/instance/cpu/utilization',
        minutes=config.inactive_time_minutes)
    return cpu_query
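monitor_vms returns the Query object itself; iterating it is what actually issues the API call. A minimal consumption sketch, assuming the standard gce_instance resource labels and relying on the Monitoring API returning points newest-first:

# Sketch: pull the most recent utilization per instance from monitor_vms().
for time_series in monitor_vms():
    instance_id = time_series.resource.labels['instance_id']
    latest = time_series.points[0].value.double_value  # newest point first
    logging.info('instance %s cpu utilization: %.1f%%', instance_id,
                 latest * 100)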
def get_cpu(cluster_payload):
    client = monitoring_v3.MetricServiceClient()
    #THIS NEEDS A FILTER!!!!
    cpu_query = query.Query(
        client,
        project=gcproject,
        metric_type='bigtable.googleapis.com/'
        'cluster/cpu_load',
        minutes=5).select_resources(
            cluster=cluster_payload['bigtable'][0]['cluster'])
    time_series = list(cpu_query)
    recent_time_series = time_series[0]
    return recent_time_series.points[0].value.double_value
def memory_df(host_name):
    instance_id = instance_id_func(host_name)
    from google.cloud.monitoring_v3 import query
    q = query.Query(client, project,
                    'agent.googleapis.com/memory/percent_used', None, 90, 0, 0)
    q = q.align(enums.Aggregation.Aligner.ALIGN_MAX, minutes=1440)
    dataframe = q.as_dataframe(
        labels=['resource_type', 'instance_id', 'state'])
    dataframe_mem = pd.Series(
        dataframe['gce_instance'][instance_id]['used']).to_frame()
    dataframe_mem['time'] = dataframe_mem.index
    dataframe_mem['time'] = pd.Series(dataframe_mem["time"]).dt.round("H")
    dataframe_mem = dataframe_mem.set_index('time')
    dataframe_mem['instance_id'] = instance_id
    return dataframe_mem
def get_metric():
    query_result = query.Query(client,
                               project=project_id,
                               metric_type=metric_type,
                               minutes=period)
    values = []
    for result in query_result:
        if result.resource.labels[resource_label_key] == resource_label_value:
            for point in result.points:
                if value_type == "int64":
                    values.append(point.value.int64_value)
                elif value_type == "double":
                    values.append(point.value.double_value)
    if not values:
        return None
    if value_type == "int64":
        return int(sum(values) / len(values))
    return "%0.4f" % (sum(values) / len(values))
Example #8
def get_storage_utilization():
    """Returns the most recent Cloud Bigtable storage utilization measurement.

    Returns:
          float: The most recent Cloud Bigtable storage utilization metric
    """
    # [START bigtable_metric_scaler_storage_utilization]
    client = monitoring_v3.MetricServiceClient()
    utilization_query = query.Query(client,
                                    project=PROJECT,
                                    metric_type='bigtable.googleapis.com/'
                                    'cluster/storage_utilization',
                                    minutes=5)
    utilization = next(utilization_query.iter())
    return utilization.points[0].value.double_value
Example #9
def get_cpu_load():
    """Returns the most recent Cloud Bigtable CPU load measurement.

    Returns:
          float: The most recent Cloud Bigtable CPU usage metric
    """
    # [START bigtable_cpu]
    client = monitoring_v3.MetricServiceClient()
    cpu_query = query.Query(client,
                            project=PROJECT,
                            metric_type='bigtable.googleapis.com/'
                            'cluster/cpu_load',
                            minutes=5)
    cpu = next(cpu_query.iter())
    return cpu.points[0].value.double_value
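get_cpu_load and get_storage_utilization (Example #8) typically feed a threshold check in the metric-scaler pattern they come from. A sketch of that check; the thresholds and the scale_bigtable helper are hypothetical:

# Hypothetical thresholds and scaler; only the two probe functions above
# come from the examples.
CPU_HIGH, CPU_LOW, STORAGE_HIGH = 0.6, 0.2, 0.6

def maybe_scale():
    if get_cpu_load() > CPU_HIGH or get_storage_utilization() > STORAGE_HIGH:
        scale_bigtable(scale_up=True)   # hypothetical helper
    elif get_cpu_load() < CPU_LOW:
        scale_bigtable(scale_up=False)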
Example #10
    def qsize(self, sub_list: list = None) -> dict:
        response = {'gcp': {}}
        if not sub_list:
            sub_list = self._sub_list

        query_results = query.Query(
            client=MetricServiceClient(),
            project=self._project,
            metric_type=self.METRIC_TYPE,
            end_time=datetime.now(),
            minutes=2
            # with only 1 minute we may get nothing while the
            # latest metrics are still being written.
        )

        for result in self.__read_metric(query_results=query_results):
            response['gcp'][result['subscription']] = result['value']

        return response
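The __read_metric helper used by qsize is not shown. A plausible sketch, assuming the metric is Pub/Sub num_undelivered_messages, each time series carries a subscription_id resource label, and points arrive newest-first:

    def __read_metric(self, query_results):
        # Sketch only: yields one dict per subscription with its newest value.
        for time_series in query_results:
            sub = time_series.resource.labels['subscription_id']
            if sub in self._sub_list:
                yield {'subscription': sub,
                       'value': time_series.points[0].value.int64_value}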
Example #11
def perform_query(client, project, metric_id, minutes, lbnref):
    """Perform a query."""
    if minutes == 0:
        error('No time interval specified. Please specify the number of minutes')

    req = query.Query(client, project, metric_type=metric_id, end_time=None, days=0, hours=0, minutes=minutes)

    filt = req._filter
    filt = str(filt) + ' AND metadata.user_labels.lbnref="' + lbnref + '"'
    req._filter = filt

    delta = datetime.timedelta(days=0, hours=0, minutes=minutes)
    seconds = int(delta.total_seconds())
    req = req.align('ALIGN_MEAN', seconds=seconds)

    try:
        dataframe = req.as_dataframe()
    except Exception:
        return json.dumps({'error': 'problem aligning'})

    return dataframe.unstack(level=0).to_json(orient='table')
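Query exposes no public helper for a metadata.user_labels clause, which is why perform_query rewrites the private _filter attribute. The same trick extracted into a small helper; this is a sketch that depends on a private attribute and may break between library versions:

def add_user_label_filter(req, key, value):
    # Sketch of the trick used in perform_query above: str() renders the
    # query's current filter, and assigning the extended string back to the
    # private _filter attribute appends the extra clause.
    req._filter = '{} AND metadata.user_labels.{}="{}"'.format(
        str(req._filter), key, value)
    return req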
 def GetAverageCpuUsage(self, duration_minutes: int) -> float:
     """Gets the average high priority CPU usage through the time duration."""
     client = monitoring_v3.MetricServiceClient()
     # It takes up to 3 minutes for CPU metrics to appear.
     end_timestamp = time.time() - CPU_API_DELAY_SECONDS
     cpu_query = query.Query(
         client,
         project=self.project,
         metric_type=
         'spanner.googleapis.com/instance/cpu/utilization_by_priority',
         end_time=datetime.datetime.utcfromtimestamp(end_timestamp),
         minutes=duration_minutes)
     # Filter by high priority
     cpu_query = cpu_query.select_metrics(database=self.database,
                                          priority='high')
     # Filter by the Spanner instance
     cpu_query = cpu_query.select_resources(instance_id=self.name,
                                            project_id=self.project)
     # Aggregate user and system high priority by the minute
     time_series = list(cpu_query)
     # Expect 2 metrics: user and system high-priority CPU
     if len(time_series) != 2:
         raise errors.Benchmarks.RunError(
             'Expected 2 metrics (user and system) for Spanner high-priority CPU '
             f'utilization query, got {len(time_series)}')
     cpu_aggregated = [
         user.value.double_value + system.value.double_value for user,
         system in zip(time_series[0].points, time_series[1].points)
     ]
     average_cpu = statistics.mean(cpu_aggregated)
     logging.info('CPU aggregated: %s', cpu_aggregated)
     logging.info('Average CPU for the %s minutes ending at %s: %s',
                  duration_minutes,
                  datetime.datetime.fromtimestamp(end_timestamp),
                  average_cpu)
     return average_cpu
Example #13
def _GetCpuUtilizationSample(samples, instance_id):
    """Gets a list of cpu utilization samples - one per cluster.

  Note that the utilization only covers the run stage.

  Args:
    samples: list of sample.Sample. Used to find the load and run samples for
             computing the run time.
    instance_id: the bigtable instance id.

  Returns:
    a list of cpu utilization samples, one per cluster per metric.

  Raises:
    Exception: if the time for running cannot be found or if
               querying the cpu sampling fails.
  """
    load_sample = MaxWithDefault(
        (cur_sample for cur_sample in samples
         if cur_sample.metadata.get('stage') == 'load'),
        key=lambda sample: sample.timestamp,
        default=None)

    # get the last sample recorded in the run stage
    last_run_sample = MaxWithDefault(
        (cur_sample for cur_sample in samples
         if cur_sample.metadata.get('stage') == 'run'),
        key=lambda sample: sample.timestamp,
        default=None)

    if not load_sample or not last_run_sample:
        raise Exception('Could not find the load or run sample, '
                        "so can't get the time for cpu utilization")

    # pylint: disable=g-import-not-at-top
    from google.cloud import monitoring_v3
    from google.cloud.monitoring_v3 import query

    # Query the cpu utilization, which is gauged at each minute in the
    # time window.
    client = monitoring_v3.MetricServiceClient()
    start_timestamp = load_sample.timestamp
    end_timestamp = last_run_sample.timestamp
    samples = []
    for metric in ['cpu_load', 'cpu_load_hottest_node']:
        cpu_query = query.Query(
            client,
            project=(FLAGS.project or _GetDefaultProject()),
            metric_type='bigtable.googleapis.com/cluster/{}'.format(metric),
            end_time=datetime.datetime.utcfromtimestamp(end_timestamp),
            minutes=int((end_timestamp - start_timestamp) / 60))
        cpu_query = cpu_query.select_resources(instance=instance_id)
        time_series = list(cpu_query)
        if not time_series:
            raise Exception(
                'Time series for computing {} could not be found.'.format(
                    metric))

        # Build the dict to be added to samples.
        for cluster_number, cluster_time_series in enumerate(time_series):
            utilization = [
                round(point.value.double_value, 3)
                for point in cluster_time_series.points
            ]

            metadata = {
                'cluster_number': cluster_number,
                'cpu_utilization_per_minute': utilization,
            }

            cpu_utilization_sample = sample.Sample('{}_array'.format(metric),
                                                   -1, metric, metadata)

            samples.append(cpu_utilization_sample)
    return samples
def callback(message):  # pylint: disable=too-many-statements
    log.debug('Message received')

    # TODO: optimize how often we query the metrics API -> there is a quota
    # we might hit under high load; we should cache this and re-query only
    # once every 2 minutes or so
    result = query.Query(
        client,
        project_id,
        'pubsub.googleapis.com/subscription/num_undelivered_messages',
        minutes=SUBSCRIPTION_TIME_INTERVAL)

    query_data_monitor = result.select_resources(
        resource_type="pubsub_subscription",
        subscription_id=f"{MONITOR_TOPIC}-sub")

    query_data_target = result.select_resources(
        resource_type="pubsub_subscription",
        subscription_id=f"{publisher_topic_name}-sub")

    current_undelivered_count_monitor = extract_metric_data(query_data_monitor)

    current_undelivered_count_target = extract_metric_data(query_data_target)

    log.info(
        f'Current undelivered count monitor {current_undelivered_count_monitor}'
    )
    log.info(
        f'Current undelivered count target {current_undelivered_count_target}')

    if (current_undelivered_count_target and  # noqa: W504 - caused by yapf
            current_undelivered_count_target > SUBSCRIPTION_MESSAGE_LIMIT
        ) or (current_undelivered_count_monitor
              and  # noqa: W504 - caused by yapf
              current_undelivered_count_monitor > SUBSCRIPTION_MESSAGE_LIMIT):
        log.info("Message queue length limit reached - skipping message")
        message.nack()
    else:
        data = json.loads(
            message.data
        )  # TODO: How expensive is it to do this? Can we use attributes instead?

        object_id = data["id"]
        object_id_parts = object_id.split("/")

        # staging-image-data-predicted -> staging-image-data
        app = "-".join(object_id_parts[0].split('-')[:-1]) \
            if object_id_parts[0].endswith('-raw') or \
               object_id_parts[0].endswith('-predicted') else object_id_parts[0]

        user_id = object_id_parts[1]
        folder = object_id_parts[2]

        extra_log = {'app': app, 'user_id': user_id, 'folder': folder}

        if not redis_client.is_processing_key(
                app, user_id, folder) and not redis_client.can_process_more():
            log.debug(
                f"Concurrent user processing limit reached, message skipped "
                f"{app} {user_id} "
                f"{folder}",
                extra=extra_log)
            # redis_client.unack_counter()
            message.nack()
        else:
            publish(publisher,
                    publisher_topic_path,
                    data=message.data,
                    meta=message.attributes)
            message.ack()
Example #15
def _GetCpuUtilizationSample(samples: List[sample.Sample],
                             instance_id: str) -> List[sample.Sample]:
  """Gets a list of cpu utilization samples - one per cluster per workload.

  Note that the utilization only covers the workload run stage.

  Args:
    samples: list of sample.Sample. Used to find the timestamp information to
             determine the time windows for the cpu metrics.
    instance_id: the bigtable instance id.

  Returns:
    a list of samples for metrics "cpu_load" and "cpu_load_hottest_node".
  """
  runtime_samples = [
      s for s in samples
      if s.metadata.get('stage') == 'run' and s.metric == 'overall RunTime'
  ]

  # pylint: disable=g-import-not-at-top
  from google.cloud import monitoring_v3
  from google.cloud.monitoring_v3 import query
  from google.cloud.monitoring_v3.gapic.transports import metric_service_grpc_transport

  client = monitoring_v3.MetricServiceClient(
      transport=metric_service_grpc_transport.MetricServiceGrpcTransport(
          address=_MONITORING_ADDRESS.value))

  cpu_samples = []
  time_units_in_secs = {'s': 1, 'ms': 0.001, 'us': 0.000001}
  for runtime_sample in runtime_samples:
    if runtime_sample.unit not in time_units_in_secs:
      logging.warning('The unit of overall RunTime is not supported: %s',
                      runtime_sample.unit)
      continue

    duration_sec = runtime_sample.value * time_units_in_secs.get(
        runtime_sample.unit)
    workload_duration_minutes = max(1, int(duration_sec / 60))

    # workload_index helps associate the cpu metrics with the current run stage.
    workload_index = runtime_sample.metadata.get('workload_index')

    # Query the cpu utilization, which is gauged at each minute in the time
    # window determined by end_timestamp and workload_duration_minutes.
    end_timestamp = runtime_sample.timestamp
    for metric in ['cpu_load', 'cpu_load_hottest_node']:
      cpu_query = query.Query(
          client, project=(FLAGS.project or _GetDefaultProject()),
          metric_type=f'bigtable.googleapis.com/cluster/{metric}',
          end_time=datetime.datetime.utcfromtimestamp(end_timestamp),
          minutes=workload_duration_minutes)
      cpu_query = cpu_query.select_resources(instance=instance_id)
      time_series = list(cpu_query)
      if not time_series:
        logging.debug(
            'Time series for computing %s could not be found.', metric)
        continue

      # Build and add the cpu samples from the query results.
      for cluster_number, cluster_time_series in enumerate(time_series):
        utilization = [
            round(point.value.double_value, 3)
            for point in cluster_time_series.points]

        average_utilization = round(sum(utilization) / len(utilization), 3)
        metadata = {
            'cluster_number': cluster_number,
            'workload_index': workload_index,
            'cpu_utilization_per_minute': utilization,
            'cpu_average_utilization': average_utilization,
        }

        cpu_utilization_sample = sample.Sample(
            f'{metric}_array', -1, '', metadata)

        cpu_samples.append(cpu_utilization_sample)
  return cpu_samples
Example #16
def perform_query(client, project_id, metric_id, days, hours, minutes,
                  resource_filter, metric_filter, align, align_period_seconds,
                  reduce, reduce_grouping, iloc00):

    if (days + hours + minutes) == 0:
        error(
            'No time interval specified. Please use --infinite or --days, --hours, --minutes'
        )

    if not metric_id:
        error('Metric ID is required for query, please use --metric')

    query = gcm_v3_query.Query(client=client,
                               project=project_id,
                               metric_type=metric_id,
                               days=days,
                               hours=hours,
                               minutes=minutes)

    if resource_filter:
        query = query.select_resources(**resource_filter)

    if metric_filter:
        query = query.select_metrics(**metric_filter)

    if align:
        if not iloc00:
            print('ALIGN: {} seconds: {}'.format(align, align_period_seconds))
        query = query.align(align, seconds=align_period_seconds)

    if reduce:
        if not iloc00:
            print('REDUCE: {} grouping: {}'.format(reduce, reduce_grouping))
        if reduce_grouping:
            query = query.reduce(reduce, *reduce_grouping)
        else:
            query = query.reduce(reduce)

    if not iloc00:
        print('QUERY: {}'.format(query.filter))

    dataframe = query.as_dataframe()

    if iloc00:
        if len(dataframe) == 0:
            # No dataset = zero
            print('0')

        else:
            # Print the "top left" element of the table only, assuming it's
            # the only one left; see
            # http://pandas.pydata.org/pandas-docs/stable/10min.html for details.

            # RO 16-01-2020: I disabled these asserts. If you query for
            # cloudsql.googleapis.com/database/state there's no way to set a
            # timeframe that always returns a single value; with a 2-minute
            # window it will occasionally return 0 or 2 results.
            # assert len(dataframe) == 1
            # assert len(dataframe.iloc[0]) == 1
            print(dataframe.iloc[0, 0])

    else:
        # print the whole dataset
        print(dataframe.to_string())
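For reference, a hedged invocation sketch for perform_query; the client construction matches the other examples, but every argument value below is illustrative:

client = monitoring_v3.MetricServiceClient()
perform_query(
    client, 'my-project',
    metric_id='compute.googleapis.com/instance/cpu/utilization',
    days=0, hours=1, minutes=0,
    resource_filter={'zone': 'us-central1-a'},
    metric_filter={'instance_name': 'web-1'},
    align='ALIGN_MEAN', align_period_seconds=300,
    reduce='REDUCE_MEAN', reduce_grouping=['resource.zone'],
    iloc00=False)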
def _GetCpuUtilizationSample(samples, instance_id, workload_start_time):
    """Gets a list of cpu utilization samples - one per cluster.

  Note that the utilization only covers the run stage.

  Args:
    samples: list of sample.Sample. Used to find the load and run samples for
             computing the run time.
    instance_id: the bigtable instance id.
    workload_start_time: the timestamp (in seconds) when the workload starts.

  Returns:
    a list of two samples for metrics "cpu_load" and "cpu_load_hottest_node".
  """
    load_sample = MaxWithDefault(
        (cur_sample for cur_sample in samples
         if cur_sample.metadata.get('stage') == 'load'),
        key=lambda sample: sample.timestamp,
        default=None)

    # get the last sample recorded in the run stage
    last_run_sample = MaxWithDefault(
        (cur_sample for cur_sample in samples
         if cur_sample.metadata.get('stage') == 'run'),
        key=lambda sample: sample.timestamp,
        default=None)

    if not last_run_sample:
        logging.debug('Could not find the run samples.')
        return []

    # pylint: disable=g-import-not-at-top
    from google.cloud import monitoring_v3
    from google.cloud.monitoring_v3 import query

    # Query the cpu utilization, which is gauged at each minute in the
    # time window of [start_timestamp, end_timestamp].
    client = monitoring_v3.MetricServiceClient()
    start_timestamp = workload_start_time
    if load_sample:
        # Adjust the start timestamp to skip the load phase.
        start_timestamp = load_sample.timestamp
    end_timestamp = last_run_sample.timestamp
    samples = []
    for metric in ['cpu_load', 'cpu_load_hottest_node']:
        cpu_query = query.Query(
            client,
            project=(FLAGS.project or _GetDefaultProject()),
            metric_type='bigtable.googleapis.com/cluster/{}'.format(metric),
            end_time=datetime.datetime.utcfromtimestamp(end_timestamp),
            minutes=int((end_timestamp - start_timestamp) / 60))
        cpu_query = cpu_query.select_resources(instance=instance_id)
        time_series = list(cpu_query)
        if not time_series:
            logging.debug('Time series for computing %s could not be found.',
                          metric)
            continue

        # Build the dict to be added to samples.
        for cluster_number, cluster_time_series in enumerate(time_series):
            utilization = [
                round(point.value.double_value, 3)
                for point in cluster_time_series.points
            ]

            metadata = {
                'cluster_number': cluster_number,
                'cpu_utilization_per_minute': utilization,
            }

            cpu_utilization_sample = sample.Sample('{}_array'.format(metric),
                                                   -1, '', metadata)

            samples.append(cpu_utilization_sample)
    return samples