def do_request(minutes):
    """Refresh the module-level ``req_count`` from the latest metric value.

    Queries ``METRIC_TYPE`` for the last ``minutes`` minutes, sums it into
    5-minute buckets, and stores the last value reported for ``_TOPIC_ID``
    in the global ``req_count``. ``req_count`` is set to 0 when no row
    matches the topic.

    Args:
        minutes: length of the query window, in minutes.

    Returns:
        None. The result is published through the ``req_count`` global.
    """
    global req_count

    client = _get_client()
    windowed_query = query.Query(
        client,
        _PROJECT_ID,
        metric_type=METRIC_TYPE,
        end_time=None,
        days=0,
        hours=0,
        minutes=minutes,
    )
    summed_query = windowed_query.align(
        enums.Aggregation.Aligner.ALIGN_SUM, minutes=5)
    # NOTE(review): the alignment is applied a second time with identical
    # arguments, exactly as the original code did; it looks redundant.
    frame = summed_query.align(
        enums.Aggregation.Aligner.ALIGN_SUM, minutes=5).as_dataframe()

    # Flatten the labelled columns into rows; 'level_5' is presumably the
    # timestamp level produced by reset_index() -- TODO confirm.
    flat = frame.unstack().reset_index()
    topic_rows = flat[flat.topic_id == _TOPIC_ID][['level_5', 0]]
    topic_rows = topic_rows.set_index('level_5')

    latest_values = topic_rows.iloc[-1].values if len(topic_rows) != 0 else ()
    req_count = int(latest_values[0]) if len(latest_values) != 0 else 0
def dataframe_cpu_df(host_name):
    """Return the daily peak CPU utilization (in percent) of one instance.

    Queries ``compute.googleapis.com/instance/cpu/utilization`` for the last
    90 days, aligned with ALIGN_MAX over 1440-minute periods and filtered to
    the metric label ``instance_name == host_name``.

    Args:
        host_name: instance name; must match exactly one row of the
            inventory frame returned by ``instance_app_df1()``.

    Returns:
        A DataFrame indexed by ``time`` (timestamps rounded to the hour) with
        a single ``cpu`` column holding utilization * 100, or None (after
        printing "No data available") when a ValueError is raised, e.g.
        because the host is not uniquely present in the inventory.
    """
    instance_app_df = instance_app_df1()
    from google.cloud.monitoring_v3 import query

    q = query.Query(
        client, project,
        'compute.googleapis.com/instance/cpu/utilization',
        None, 90, 0, 0)
    try:
        # Validation only: .item() raises ValueError unless exactly one
        # inventory row matches host_name. The id itself is not needed.
        instance_app_df.loc[
            instance_app_df['instance_name'] == host_name,
            'instance_id'].item()

        q = q.align(enums.Aggregation.Aligner.ALIGN_MAX, minutes=1440)
        # select_metrics() returns a filtered copy; the original code dropped
        # that copy and therefore fetched every instance in the project.
        q = q.select_metrics(instance_name=host_name)

        # Single fetch; the earlier as_dataframe() calls only discarded
        # their results while costing one API round-trip each.
        dataframe_cpu = q.as_dataframe(
            labels=['resource_type', 'instance_name'])
        dataframe_cpu = pd.Series(
            dataframe_cpu['gce_instance'][host_name]).to_frame()
        dataframe_cpu.rename(columns={host_name: 'cpu'}, inplace=True)
        dataframe_cpu['time'] = dataframe_cpu.index
        dataframe_cpu['time'] = pd.Series(dataframe_cpu["time"]).dt.round("H")
        dataframe_cpu = dataframe_cpu.set_index('time')
        # Scale the raw utilization value to a percentage.
        dataframe_cpu['cpu'] = dataframe_cpu['cpu'] * 100
        return dataframe_cpu
    except ValueError:
        print("No data available")
def _AddCpuUtilization(samples, instance_id): """Add cpu utilization to the metadata of relevant metric samples. Note that the utilization only covers the run stage. Args: samples: list of sample.Sample. The expected ordering is: (1) table loading metrics, (2) table read/write metrics. instance_id: the bigtable instance id. Returns: a list of updated sample.Sample. """ # Check the pre-requisite if (len(samples) < 2 or samples[0].metadata.get('stage') != 'load' or samples[-1].metadata.get('stage') != 'run'): return None # pylint: disable=g-import-not-at-top from google.cloud import monitoring_v3 from google.cloud.monitoring_v3 import query # Query the cpu utilization, which are gauged values at each minute in the # time window. client = monitoring_v3.MetricServiceClient() start_timestamp = samples[0].timestamp end_timestamp = samples[-1].timestamp cpu_query = query.Query( client, project=(FLAGS.project or _GetDefaultProject()), metric_type='bigtable.googleapis.com/cluster/cpu_load', end_time=datetime.datetime.utcfromtimestamp(end_timestamp), minutes=int((end_timestamp - start_timestamp) / 60)) cpu_query = cpu_query.select_resources(instance=instance_id) time_series = list(cpu_query) if not time_series: return None # Build the dict to be added to samples. utilization_data = [] for cluster_number, cluster_time_series in enumerate(time_series): utilization = numpy.array( [point.value.double_value for point in cluster_time_series.points]) for percentile in CPU_UTILIZATION_PERCENTILES: utilization_data.append( {'cluster_number': cluster_number, 'percentile': percentile, 'utilization_percentage': ( '%.2f' % (numpy.percentile(utilization, percentile) * 100))}) additional_metadata = {'cpu_utilization': json.dumps(utilization_data)} # Update the samples. for sample in samples: if sample.metadata.get('stage') == 'run': sample.metadata.update(additional_metadata) return samples
def monitor_vms():
    """Build the CPU-utilization query used to check VM activity.

    Logs a progress message, creates a ``MetricServiceClient`` and returns a
    ``query.Query`` for ``compute.googleapis.com/instance/cpu/utilization``
    over the last ``config.inactive_time_minutes`` minutes in
    ``config.project``. The query is returned as-is; this function does not
    iterate it.
    """
    logging.info("Checking active time of the vms")
    metrics_client = monitoring_v3.MetricServiceClient()
    return query.Query(
        metrics_client,
        project=config.project,
        metric_type='compute.googleapis.com/instance/cpu/utilization',
        minutes=config.inactive_time_minutes,
    )
def get_cpu(cluster_payload):
    """Return a CPU load reading for the Bigtable cluster in the payload.

    Args:
        cluster_payload: mapping read as
            ``cluster_payload['bigtable'][0]['cluster']`` to get the cluster
            id used as the resource filter.

    Returns:
        The ``double_value`` of the first point of the first time series
        returned for ``bigtable.googleapis.com/cluster/cpu_load`` over the
        last 5 minutes.

    Raises:
        IndexError: if the query returns no time series (or no points).
    """
    metrics_client = monitoring_v3.MetricServiceClient()
    # The original author flagged "THIS NEEDS A FILTER"; the query is
    # narrowed to the payload's cluster right below.
    load_query = query.Query(
        metrics_client,
        project=gcproject,
        metric_type='bigtable.googleapis.com/cluster/cpu_load',
        minutes=5,
    )
    cluster_id = cluster_payload['bigtable'][0]['cluster']
    cluster_series = list(load_query.select_resources(cluster=cluster_id))
    first_series = cluster_series[0]
    first_point = first_series.points[0]
    return first_point.value.double_value
def memory_df(host_name):
    """Return the daily peak 'used' memory percentage of one instance.

    Resolves ``host_name`` to an instance id via ``instance_id_func``, then
    queries ``agent.googleapis.com/memory/percent_used`` for the last 90
    days, aligned with ALIGN_MAX over 1440-minute periods.

    Args:
        host_name: name passed to ``instance_id_func``.

    Returns:
        A DataFrame indexed by ``time`` (timestamps rounded to the hour)
        holding the ``gce_instance`` / instance id / ``used`` series plus an
        ``instance_id`` column.
    """
    instance_id = instance_id_func(host_name)
    from google.cloud.monitoring_v3 import query

    memory_query = query.Query(
        client,
        project,
        'agent.googleapis.com/memory/percent_used',
        end_time=None,
        days=90,
        hours=0,
        minutes=0,
    ).align(enums.Aggregation.Aligner.ALIGN_MAX, minutes=1440)

    all_series = memory_query.as_dataframe(
        labels=['resource_type', 'instance_id', 'state'])
    # Columns are keyed by the three labels requested above.
    used_series = all_series['gce_instance'][instance_id]['used']

    result = pd.Series(used_series).to_frame()
    result['time'] = all_series.index
    result['time'] = pd.Series(result["time"]).dt.round("H")
    result = result.set_index('time')
    result['instance_id'] = instance_id
    return result
def get_metric():
    """Return the average value of the configured metric over the window.

    Runs a query for the module-level ``metric_type`` in ``project_id`` over
    the last ``period`` minutes, keeps the time series whose resource label
    ``resource_label_key`` equals ``resource_label_value``, and averages all
    of their points.

    The original code returned from inside the per-point loop, so the
    "average" was always just the first point; the return now happens after
    every matching point has been collected.

    Returns:
        int: truncated average when ``value_type`` is ``"int64"``.
        str: average formatted with ``"%0.4f"`` when ``value_type`` is
            ``"double"``.
        None: when no value was collected (no matching series, no points,
            or an unsupported ``value_type``).
    """
    query_result = query.Query(client, project=project_id,
                               metric_type=metric_type, minutes=period)
    values = []
    for result in query_result:
        if result.resource.labels[resource_label_key] != resource_label_value:
            continue
        for time_serie in result.points:
            if value_type == "int64":
                values.append(time_serie.value.int64_value)
            elif value_type == "double":
                values.append(time_serie.value.double_value)

    # Nothing collected: keep the original "no result" behaviour (None)
    # instead of dividing by zero.
    if not values:
        return None

    average = sum(values) / len(values)
    if value_type == "int64":
        return int(average)
    return "%0.4f" % average
def get_storage_utilization():
    """Returns a recent Cloud Bigtable storage utilization measurement.

    Queries ``bigtable.googleapis.com/cluster/storage_utilization`` for
    ``PROJECT`` over the last 5 minutes and reads the first point of the
    first time series yielded (presumably the most recent one -- TODO
    confirm ordering).

    Returns:
          float: the ``double_value`` of that point.
    """
    # [START bigtable_metric_scaler_storage_utilization]
    metrics_client = monitoring_v3.MetricServiceClient()
    storage_query = query.Query(
        metrics_client,
        project=PROJECT,
        metric_type='bigtable.googleapis.com/cluster/storage_utilization',
        minutes=5,
    )
    first_series = next(storage_query.iter())
    first_point = first_series.points[0]
    return first_point.value.double_value
def get_cpu_load():
    """Returns a recent Cloud Bigtable CPU load measurement.

    Queries ``bigtable.googleapis.com/cluster/cpu_load`` for ``PROJECT``
    over the last 5 minutes and reads the first point of the first time
    series yielded (presumably the most recent one -- TODO confirm
    ordering).

    Returns:
          float: the ``double_value`` of that point.
    """
    # [START bigtable_cpu]
    metrics_client = monitoring_v3.MetricServiceClient()
    load_query = query.Query(
        metrics_client,
        project=PROJECT,
        metric_type='bigtable.googleapis.com/cluster/cpu_load',
        minutes=5,
    )
    first_series = next(load_query.iter())
    first_point = first_series.points[0]
    return first_point.value.double_value
def qsize(self, sub_list: list = None) -> dict:
    """Return the metric values per subscription under a ``'gcp'`` key.

    Builds a 2-minute query for ``self.METRIC_TYPE`` in ``self._project``
    ending at ``datetime.now()`` and passes it to ``self.__read_metric``.

    Args:
        sub_list: optional list; falls back to ``self._sub_list`` when
            falsy. NOTE(review): the resolved list is not used anywhere
            below -- confirm whether it was meant to filter the results.

    Returns:
        ``{'gcp': {subscription: value, ...}}`` built from the
        ``'subscription'`` and ``'value'`` keys of each item yielded by
        ``self.__read_metric``.
    """
    if not sub_list:
        sub_list = self._sub_list

    # A 1-minute window returns nothing while the latest metrics are still
    # being created, hence 2 minutes.
    recent_metrics = query.Query(
        client=MetricServiceClient(),
        project=self._project,
        metric_type=self.METRIC_TYPE,
        end_time=datetime.now(),
        minutes=2,
    )
    per_subscription = {
        entry['subscription']: entry['value']
        for entry in self.__read_metric(query_results=recent_metrics)
    }
    return {'gcp': per_subscription}
def perform_query(client, project, metric_id, minutes, lbnref):
    """Run a mean-aligned metric query restricted to one ``lbnref`` label.

    Args:
        client: monitoring client handed to ``query.Query``.
        project: project handed to ``query.Query``.
        metric_id: metric type to query.
        minutes: window length in minutes; also used as the alignment
            period. ``error`` is called when it is 0.
        lbnref: value required for ``metadata.user_labels.lbnref``.

    Returns:
        str: the unstacked result as table-oriented JSON, or the JSON
        document ``{"error": "problem aligning"}`` if fetching the
        dataframe raises.
    """
    if minutes == 0:
        error('No time interval specified. Please specify the number of minutes')

    metric_query = query.Query(
        client,
        project,
        metric_type=metric_id,
        end_time=None,
        days=0,
        hours=0,
        minutes=minutes,
    )
    # The label condition is appended by overwriting the query's private
    # filter with its string form plus an extra AND clause.
    metric_query._filter = ''.join((
        str(metric_query._filter),
        ' AND metadata.user_labels.lbnref="',
        lbnref,
        '"',
    ))

    # Align over the whole window so each series collapses to its mean.
    period_seconds = int(datetime.timedelta(minutes=minutes).total_seconds())
    metric_query = metric_query.align('ALIGN_MEAN', seconds=period_seconds)

    try:
        frame = metric_query.as_dataframe()
    except Exception:
        return json.dumps({'error': 'problem aligning'})
    return frame.unstack(level=0).to_json(orient='table')
def GetAverageCpuUsage(self, duration_minutes: int) -> float:
    """Gets the average high priority CPU usage through the time duration.

    Args:
        duration_minutes: length of the query window, in minutes.

    Returns:
        The mean, over the window, of the point-wise sum of the two returned
        time series (expected to be user and system high-priority CPU).

    Raises:
        errors.Benchmarks.RunError: if the query does not return exactly two
            time series.
    """
    client = monitoring_v3.MetricServiceClient()
    # CPU metrics show up with a delay (original note: up to 3 minutes), so
    # the window ends CPU_API_DELAY_SECONDS in the past.
    end_timestamp = time.time() - CPU_API_DELAY_SECONDS
    window_end = datetime.datetime.utcfromtimestamp(end_timestamp)

    base_query = query.Query(
        client,
        project=self.project,
        metric_type=(
            'spanner.googleapis.com/instance/cpu/utilization_by_priority'),
        end_time=window_end,
        minutes=duration_minutes)
    # Keep only high-priority usage of this database...
    by_priority = base_query.select_metrics(
        database=self.database, priority='high')
    # ...on this Spanner instance.
    by_instance = by_priority.select_resources(
        instance_id=self.name, project_id=self.project)

    time_series = list(by_instance)
    # Expect 2 metrics: user and system high-priority CPU
    if len(time_series) != 2:
        raise errors.Benchmarks.RunError(
            'Expected 2 metrics (user and system) for Spanner high-priority CPU '
            f'utilization query, got {len(time_series)}')

    first_series, second_series = time_series
    cpu_aggregated = [
        first_point.value.double_value + second_point.value.double_value
        for first_point, second_point in zip(first_series.points,
                                             second_series.points)
    ]
    average_cpu = statistics.mean(cpu_aggregated)

    logging.info('CPU aggregated: %s', cpu_aggregated)
    logging.info('Average CPU for the %s minutes ending at %s: %s',
                 duration_minutes,
                 datetime.datetime.fromtimestamp(end_timestamp), average_cpu)
    return average_cpu
def _GetCpuUtilizationSample(samples, instance_id):
    """Gets a list of cpu utilization samples - one per cluster per metric.

    Note that the utilization only covers the run stage.

    Args:
        samples: list of sample.Sample. Used to find the load and run samples
            for computing the run time.
        instance_id: the bigtable instance id.

    Returns:
        A list of sample.Sample named ``cpu_load_array`` and
        ``cpu_load_hottest_node_array`` (one per returned time series), each
        carrying the per-point utilization list in its metadata.

    Raises:
        Exception: if the load or run sample can not be found or if a cpu
            query returns no time series.
    """
    def _LatestOfStage(stage):
        # Sample with the largest timestamp in the given stage, else None.
        return MaxWithDefault(
            (cur_sample for cur_sample in samples
             if cur_sample.metadata.get('stage') == stage),
            key=lambda s: s.timestamp,
            default=None)

    load_sample = _LatestOfStage('load')
    # get the last sample recorded in the run stage
    last_run_sample = _LatestOfStage('run')
    if not load_sample or not last_run_sample:
        raise Exception('Could not find the load or run sample, '
                        'so cant get the time for cpu utilization')

    # pylint: disable=g-import-not-at-top
    from google.cloud import monitoring_v3
    from google.cloud.monitoring_v3 import query

    # Query the cpu utilization, which are gauged values at each minute in
    # the time window.
    client = monitoring_v3.MetricServiceClient()
    start_timestamp = load_sample.timestamp
    end_timestamp = last_run_sample.timestamp
    # A run shorter than one minute would truncate to minutes=0, which
    # leaves the query without a start time; always cover at least 1 minute.
    window_minutes = max(1, int((end_timestamp - start_timestamp) / 60))

    cpu_samples = []
    for metric in ['cpu_load', 'cpu_load_hottest_node']:
        cpu_query = query.Query(
            client,
            project=(FLAGS.project or _GetDefaultProject()),
            metric_type='bigtable.googleapis.com/cluster/{}'.format(metric),
            end_time=datetime.datetime.utcfromtimestamp(end_timestamp),
            minutes=window_minutes)
        cpu_query = cpu_query.select_resources(instance=instance_id)
        time_series = list(cpu_query)
        if not time_series:
            raise Exception(
                'Time series for computing {} could not be found.'.format(
                    metric))

        # Build one sample per returned time series; cluster_number is the
        # series' position in the query result.
        for cluster_number, cluster_time_series in enumerate(time_series):
            utilization = [
                round(point.value.double_value, 3)
                for point in cluster_time_series.points
            ]
            metadata = {
                'cluster_number': cluster_number,
                'cpu_utilization_per_minute': utilization,
            }
            cpu_samples.append(
                sample.Sample('{}_array'.format(metric), -1, metric, metadata))
    return cpu_samples
def callback(message):  # pylint: disable=too-many-statements
    """Forward a received message unless a back-pressure limit is hit.

    The message is nack'ed when either subscription backlog (monitor or
    target) exceeds ``SUBSCRIPTION_MESSAGE_LIMIT``, or when its
    (app, user_id, folder) key is not already being processed and
    ``redis_client.can_process_more()`` is false. Otherwise it is
    re-published through ``publish`` and ack'ed.

    Args:
        message: received message exposing ``data``, ``attributes``,
            ``ack()`` and ``nack()``. ``data`` must be JSON with an ``id`` of
            the form ``<app>/<user_id>/<folder>/...``.
    """
    log.debug('Message received')

    # TODO: optimize how often we query metrics API -> there is quota we might hit in high load
    # we should cache this and re-query only once in a 2 minutes or so
    backlog_query = query.Query(
        client,
        project_id,
        'pubsub.googleapis.com/subscription/num_undelivered_messages',
        minutes=SUBSCRIPTION_TIME_INTERVAL)
    query_data_monitor = backlog_query.select_resources(
        resource_type="pubsub_subscription",
        subscription_id=f"{MONITOR_TOPIC}-sub")
    query_data_target = backlog_query.select_resources(
        resource_type="pubsub_subscription",
        subscription_id=f"{publisher_topic_name}-sub")

    current_undelivered_count_monitor = extract_metric_data(query_data_monitor)
    current_undelivered_count_target = extract_metric_data(query_data_target)
    log.info(
        f'Current undelivered count monitor {current_undelivered_count_monitor}'
    )
    log.info(
        f'Current undelivered count target {current_undelivered_count_target}')

    def _over_limit(count):
        # Falsy counts (None / 0) never trip the limit.
        return count and count > SUBSCRIPTION_MESSAGE_LIMIT

    # Target is checked first; the monitor count is only compared when the
    # target is within limits.
    if (_over_limit(current_undelivered_count_target)
            or _over_limit(current_undelivered_count_monitor)):
        log.info("Message queue length limit reached - skipping message")
        message.nack()
        return

    # TODO: How expensive is to do this? Can we use attributes instead?
    data = json.loads(message.data)
    object_id_parts = data["id"].split("/")

    # staging-image-data-predicted -> staging-image-data
    bucket = object_id_parts[0]
    if bucket.endswith(('-raw', '-predicted')):
        app = bucket.rsplit('-', 1)[0]
    else:
        app = bucket
    user_id = object_id_parts[1]
    folder = object_id_parts[2]
    extra_log = {'app': app, 'user_id': user_id, 'folder': folder}

    already_processing = redis_client.is_processing_key(app, user_id, folder)
    if not already_processing and not redis_client.can_process_more():
        log.debug(
            f"Concurrent user processing limit reached, message skipped "
            f"{app} {user_id} "
            f"{folder}",
            extra=extra_log)
        # redis_client.unack_counter()
        message.nack()
        return

    publish(publisher,
            publisher_topic_path,
            data=message.data,
            meta=message.attributes)
    message.ack()
def _GetCpuUtilizationSample(samples: List[sample.Sample],
                             instance_id: str) -> List[sample.Sample]:
    """Gets a list of cpu utilization samples - one per cluster per workload.

    Note that the utilization only covers the workload run stage.

    Args:
        samples: list of sample.Sample. The run-stage 'overall RunTime'
            samples supply the end timestamp and duration of each cpu query
            window.
        instance_id: the bigtable instance id.

    Returns:
        a list of samples for metrics "cpu_load" and "cpu_load_hottest_node";
        runtime samples with an unsupported unit and metrics without any
        time series are skipped.
    """
    runtime_samples = [
        candidate for candidate in samples
        if candidate.metadata.get('stage') == 'run'
        and candidate.metric == 'overall RunTime'
    ]

    # pylint: disable=g-import-not-at-top
    from google.cloud import monitoring_v3
    from google.cloud.monitoring_v3 import query
    from google.cloud.monitoring_v3.gapic.transports import metric_service_grpc_transport

    transport = metric_service_grpc_transport.MetricServiceGrpcTransport(
        address=_MONITORING_ADDRESS.value)
    client = monitoring_v3.MetricServiceClient(transport=transport)

    seconds_per_unit = {'s': 1, 'ms': 0.001, 'us': 0.000001}
    cpu_samples = []
    for runtime_sample in runtime_samples:
        unit = runtime_sample.unit
        if unit not in seconds_per_unit:
            logging.warning('The unit of overall RunTime is not supported: %s',
                            unit)
            continue

        duration_sec = runtime_sample.value * seconds_per_unit[unit]
        # Never query a window shorter than one minute.
        workload_duration_minutes = max(1, int(duration_sec / 60))
        # workload_index helps associate the cpu metrics with the current
        # run stage.
        workload_index = runtime_sample.metadata.get('workload_index')

        # Query the cpu utilization, which are gauged values at each minute
        # in the time window determined by end_timestamp and
        # workload_duration_minutes.
        end_timestamp = runtime_sample.timestamp
        for metric in ['cpu_load', 'cpu_load_hottest_node']:
            base_query = query.Query(
                client,
                project=(FLAGS.project or _GetDefaultProject()),
                metric_type=f'bigtable.googleapis.com/cluster/{metric}',
                end_time=datetime.datetime.utcfromtimestamp(end_timestamp),
                minutes=workload_duration_minutes)
            time_series = list(
                base_query.select_resources(instance=instance_id))
            if not time_series:
                logging.debug(
                    'Time series for computing %s could not be found.', metric)
                continue

            # One sample per returned time series; cluster_number is the
            # series' position in the query result.
            for cluster_number, cluster_time_series in enumerate(time_series):
                utilization = [
                    round(point.value.double_value, 3)
                    for point in cluster_time_series.points
                ]
                average_utilization = round(
                    sum(utilization) / len(utilization), 3)
                cpu_samples.append(
                    sample.Sample(
                        f'{metric}_array', -1, '',
                        {
                            'cluster_number': cluster_number,
                            'workload_index': workload_index,
                            'cpu_utilization_per_minute': utilization,
                            'cpu_average_utilization': average_utilization,
                        }))
    return cpu_samples
def perform_query(client, project_id, metric_id, days, hours, minutes,
                  resource_filter, metric_filter, align, align_period_seconds,
                  reduce, reduce_grouping, iloc00):
    """Run a metric query and print its result.

    Args:
        client: monitoring client handed to the query.
        project_id: project to query.
        metric_id: metric type; ``error`` is called when it is empty.
        days: window length component, in days.
        hours: window length component, in hours.
        minutes: window length component, in minutes. ``error`` is called
            when days + hours + minutes is 0.
        resource_filter: optional mapping expanded into ``select_resources``.
        metric_filter: optional mapping expanded into ``select_metrics``.
        align: optional aligner passed to ``Query.align``.
        align_period_seconds: alignment period, in seconds.
        reduce: optional reducer passed to ``Query.reduce``.
        reduce_grouping: optional iterable of fields to group the reduction
            by.
        iloc00: when truthy, print only the top-left cell of the result
            ('0' if the result is empty) and suppress the diagnostic lines;
            otherwise print diagnostics and the whole table.
    """
    if (days + hours + minutes) == 0:
        error(
            'No time interval specified. Please use --infinite or --days, --hours, --minutes'
        )
    if not metric_id:
        error('Metric ID is required for query, please use --metric')

    verbose = not iloc00
    metric_query = gcm_v3_query.Query(client=client,
                                      project=project_id,
                                      metric_type=metric_id,
                                      days=days,
                                      hours=hours,
                                      minutes=minutes)

    if resource_filter:
        metric_query = metric_query.select_resources(**resource_filter)
    if metric_filter:
        metric_query = metric_query.select_metrics(**metric_filter)

    if align:
        if verbose:
            print('ALIGN: {} seconds: {}'.format(align, align_period_seconds))
        metric_query = metric_query.align(align, seconds=align_period_seconds)

    if reduce:
        if verbose:
            print('REDUCE: {} grouping: {}'.format(reduce, reduce_grouping))
        # An empty/None grouping reduces without any group-by fields.
        metric_query = metric_query.reduce(reduce, *(reduce_grouping or ()))

    if verbose:
        print('QUERY: {}'.format(metric_query.filter))

    dataframe = metric_query.as_dataframe()

    if verbose:
        # Print the whole dataset.
        print(dataframe.to_string())
        return

    if len(dataframe) == 0:
        # No dataset = zero
        print('0')
        return

    # Print only the "top left" element of the table, assuming it is the one
    # of interest. The result is deliberately not asserted to be 1x1: some
    # metrics (e.g. cloudsql.googleapis.com/database/state) occasionally
    # return 0 or 2 results for any fixed window.
    print(dataframe.iloc[0, 0])
def _GetCpuUtilizationSample(samples, instance_id, workload_start_time):
    """Gets a list of cpu utilization samples - one per cluster per metric.

    Note that the utilization only covers the run stage.

    Args:
        samples: list of sample.Sample. Used to find the load and run samples
            for computing the run time.
        instance_id: the bigtable instance id.
        workload_start_time: the timestamp (in seconds) when the workload
            starts. Used as the window start when no load-stage sample
            exists.

    Returns:
        a list of samples for metrics "cpu_load" and "cpu_load_hottest_node"
        (one per returned time series). Empty when no run-stage sample is
        found; a metric without any time series is skipped.
    """
    def _LatestOfStage(stage):
        # Sample with the largest timestamp in the given stage, else None.
        return MaxWithDefault(
            (cur_sample for cur_sample in samples
             if cur_sample.metadata.get('stage') == stage),
            key=lambda s: s.timestamp,
            default=None)

    load_sample = _LatestOfStage('load')
    # get the last sample recorded in the run stage
    last_run_sample = _LatestOfStage('run')
    if not last_run_sample:
        logging.debug('Could not find the run samples.')
        return []

    # pylint: disable=g-import-not-at-top
    from google.cloud import monitoring_v3
    from google.cloud.monitoring_v3 import query

    # Query the cpu utilization, which are gauged values at each minute in
    # the time window of [start_timestamp, end_timestamp].
    client = monitoring_v3.MetricServiceClient()
    start_timestamp = workload_start_time
    if load_sample:
        # Adjust the start timestamp to skip the load phase.
        start_timestamp = load_sample.timestamp
    end_timestamp = last_run_sample.timestamp
    # A run shorter than one minute would truncate to minutes=0, which
    # leaves the query without a start time; always cover at least 1 minute.
    window_minutes = max(1, int((end_timestamp - start_timestamp) / 60))

    cpu_samples = []
    for metric in ['cpu_load', 'cpu_load_hottest_node']:
        cpu_query = query.Query(
            client,
            project=(FLAGS.project or _GetDefaultProject()),
            metric_type='bigtable.googleapis.com/cluster/{}'.format(metric),
            end_time=datetime.datetime.utcfromtimestamp(end_timestamp),
            minutes=window_minutes)
        cpu_query = cpu_query.select_resources(instance=instance_id)
        time_series = list(cpu_query)
        if not time_series:
            logging.debug('Time series for computing %s could not be found.',
                          metric)
            continue

        # Build one sample per returned time series; cluster_number is the
        # series' position in the query result.
        for cluster_number, cluster_time_series in enumerate(time_series):
            utilization = [
                round(point.value.double_value, 3)
                for point in cluster_time_series.points
            ]
            metadata = {
                'cluster_number': cluster_number,
                'cpu_utilization_per_minute': utilization,
            }
            cpu_samples.append(
                sample.Sample('{}_array'.format(metric), -1, '', metadata))
    return cpu_samples