def get_metrics_for_signal(
    cluster: str,
    pool: str,
    scheduler: str,
    app: str,
    metrics_client: ClustermanMetricsBotoClient,
    required_metrics: List[MetricsConfigDict],
    end_time: arrow.Arrow,
) -> MetricsValuesDict:
    """ Get the metrics required for a signal

    Every entry of ``required_metrics`` is queried from ``metrics_client`` between
    ``end_time`` shifted back by the entry's ``minute_range`` minutes and ``end_time``.
    System metrics are queried with the cluster/pool/scheduler dimensions (and, for
    mesos pools, additionally with the scheduler-less dimensions); app metrics are
    queried with no extra dimensions.

    The entries of ``required_metrics`` are only read, never modified; an entry without
    a ``regex`` key is queried with ``is_regex=False``.

    :param cluster: the cluster name used to build the system-metric dimensions
    :param pool: the pool name used to build the system-metric dimensions
    :param scheduler: the scheduler name used to build the system-metric dimensions
    :param app: forwarded to the metrics client as ``app_identifier``
    :param metrics_client: the client whose ``get_metric_values`` is called
    :param required_metrics: the metric configs (``name``, ``type``, ``minute_range``, optional ``regex``)
    :param end_time: the end of the query window
    :raises MetricsError: if a metric's type is neither SYSTEM_METRICS nor APP_METRICS
    :returns: a mapping from metric name to the combined, sorted timeseries for that name
    """
    metrics: MetricsValuesDict = defaultdict(list)
    for metric_dict in required_metrics:
        metric_type = metric_dict['type']
        if metric_type not in (SYSTEM_METRICS, APP_METRICS):
            raise MetricsError(f"Metrics of type {metric_type} cannot be queried by signals.")

        # Need to add the cluster/pool to get the right system metrics
        # TODO (CLUSTERMAN-126) this should probably be cluster/pool/app eventually
        # TODO (CLUSTERMAN-446) if a mesos pool and a k8s pool share the same app_name,
        #      APP_METRICS will be used for both
        if metric_type == SYSTEM_METRICS:
            dims_list = [get_cluster_dimensions(cluster, pool, scheduler)]
            if scheduler == 'mesos':  # handle old (non-scheduler-aware) metrics
                dims_list.insert(0, get_cluster_dimensions(cluster, pool, None))
        else:
            dims_list = [{}]

        # We only support regex expressions for APP_METRICS.  Read the flag with a default
        # instead of writing it back into the caller's config dict.
        is_regex = metric_dict.get('regex', False)

        start_time = end_time.shift(minutes=-metric_dict['minute_range'])
        for dims in dims_list:
            query_results = metrics_client.get_metric_values(
                metric_dict['name'],
                metric_type,
                start_time.timestamp,
                end_time.timestamp,
                is_regex=is_regex,
                extra_dimensions=dims,
                app_identifier=app,
            )
            for metric_name, timeseries in query_results.items():
                metrics[metric_name].extend(timeseries)
                # safeguard; the metrics _should_ already be sorted since we inserted the old
                # (non-scheduler-aware) metrics before the new metrics above, so this should be fast
                metrics[metric_name].sort()
    return metrics
def _get_smoothed_non_zero_metadata(
    self,
    metric_name: str,
    time_start: arrow.Arrow,
    time_end: arrow.Arrow,
    smoothing: int = 5,
) -> Optional[Tuple[int, int, float]]:
    """ Average the most recent positive values of a metadata metric

    :param metric_name: the metadata metric to query
    :param time_start: the beginning of the historical time window to query
    :param time_end: the end of the historical time window to query
    :param smoothing: how many of the latest values greater than zero to average together
    :returns: a tuple of (timestamp of the first averaged point, timestamp of the last averaged
        point, mean of the averaged values), or None if the window holds no value greater than zero
    """
    # NOTE(review): time_start/time_end are forwarded to the metrics client unchanged, whereas
    # other callers of get_metric_values in this file pass `.timestamp` values -- confirm what
    # this method's callers actually supply.
    cluster_dims = get_cluster_dimensions(self.cluster, self.pool, self.scheduler)
    query_result = self.metrics_client.get_metric_values(
        metric_name,
        METADATA,
        time_start,
        time_end,
        extra_dimensions=cluster_dims,
    )
    series = query_result[metric_name]

    # Drop everything that is not strictly positive, then keep only the tail of what remains
    positive_points = [(timestamp, value) for timestamp, value in series if value > 0]
    window = positive_points[-smoothing:]
    if not window:
        return None

    first_timestamp = window[0][0]
    last_timestamp = window[-1][0]
    total = sum(float(value) for __, value in window)
    return first_timestamp, last_timestamp, total / len(window)
def _populate_cluster_size_events(simulator, start_time, end_time):
    """ Turn historical 'fulfilled_capacity' metadata into cluster-size events on the simulator

    The metric is queried (bypassing the cache) between ``start_time`` and ``end_time`` for the
    simulator's cluster/pool/scheduler.  For every data point, each market's value is divided by
    that market's weight (its cpus floor-divided by the ``cpus_per_weight`` config value) and a
    ModifyClusterSizeEvent carrying the per-market results is added to the simulator.  The markets
    seen are also merged into ``simulator.markets``.

    :param simulator: the simulator to query metrics from and add events to
    :param start_time: the beginning of the time window to query
    :param end_time: the end of the time window to query
    """
    metric_name = 'fulfilled_capacity'
    capacity_metrics = simulator.metrics_client.get_metric_values(
        metric_name,
        METADATA,
        start_time.timestamp,
        end_time.timestamp,
        use_cache=False,
        extra_dimensions=get_cluster_dimensions(
            simulator.metadata.cluster,
            simulator.metadata.pool,
            simulator.metadata.scheduler,
        ),
    )
    for i, (timestamp, data) in enumerate(capacity_metrics[metric_name]):
        market_data = {}
        for market_str, value in data.items():
            market = InstanceMarket.parse(market_str)
            # NOTE(review): a market with fewer cpus than cpus_per_weight yields weight 0 and the
            # division below raises ZeroDivisionError -- confirm that cannot happen in practice.
            weight = get_market_resources(market).cpus // staticconf.read_int('cpus_per_weight')
            market_data[market] = int(value) // weight
        simulator.markets |= set(market_data.keys())
        use_join_delay = (i != 0)  # Want to start the cluster out at the expected capacity
        simulator.add_event(ModifyClusterSizeEvent(arrow.get(timestamp), market_data, use_join_delay))
def generate_simple_metadata(manager: PoolManager) -> Generator[ClusterMetric, None, None]:
    """ Lazily produce one ClusterMetric for every entry of SIMPLE_METADATA

    Each entry's callable is invoked with ``manager`` to obtain the metric value, and every
    metric carries the dimensions built from the manager's cluster, pool and scheduler.

    :param manager: the pool manager the metadata values are computed from
    :returns: a generator of ClusterMetric objects, in SIMPLE_METADATA iteration order
    """
    # NOTE(review): this file contains a second `generate_simple_metadata` further down; if both
    # live in the same module the later definition wins -- confirm which one is intended.
    cluster_dims = get_cluster_dimensions(manager.cluster, manager.pool, manager.scheduler)
    yield from (
        ClusterMetric(name, compute_value(manager), dimensions=cluster_dims)
        for name, compute_value in SIMPLE_METADATA.items()
    )
def generate_system_metrics(manager: PoolManager) -> Generator[ClusterMetric, None, None]:
    """ Lazily produce one ClusterMetric for every entry of SYSTEM_METRICS

    :param manager: the pool manager each metric's value callable is invoked with
    :returns: a generator of ClusterMetric objects tagged with the manager's
        cluster/pool/scheduler dimensions
    """
    dims = get_cluster_dimensions(manager.cluster, manager.pool, manager.scheduler)
    for name, compute_value in SYSTEM_METRICS.items():
        metric = ClusterMetric(name, compute_value(manager), dimensions=dims)
        yield metric
def generate_simple_metadata(manager: PoolManager) -> Generator[ClusterMetric, None, None]:
    """ Lazily produce ClusterMetrics for the entries of SIMPLE_METADATA, skipping unavailable ones

    Each entry's callable is invoked with ``manager``.  If it raises NoResourceGroupsFoundError,
    a warning is logged and no metric is produced for that entry; any other exception propagates.

    :param manager: the pool manager the metadata values are computed from
    :returns: a generator of ClusterMetric objects tagged with the manager's
        cluster/pool/scheduler dimensions
    """
    cluster_dims = get_cluster_dimensions(manager.cluster, manager.pool, manager.scheduler)
    for name, compute_value in SIMPLE_METADATA.items():
        try:
            value = compute_value(manager)
        except NoResourceGroupsFoundError:
            logger.warning(f'Resources for metric {name} cluster {manager.cluster} not found')
        else:
            yield ClusterMetric(name, value, dimensions=cluster_dims)
def _populate_allocated_resources(simulator, start_time, end_time):
    """ Load historical 'cpus_allocated' system metrics into the simulator's allocation timeseries

    :param simulator: the simulator to query metrics from and whose ``mesos_cpus_allocated``
        timeseries receives one breakpoint per data point
    :param start_time: the beginning of the time window to query
    :param end_time: the end of the time window to query
    """
    metric_name = 'cpus_allocated'
    metadata = simulator.metadata
    cluster_dims = get_cluster_dimensions(metadata.cluster, metadata.pool, metadata.scheduler)
    query_result = simulator.metrics_client.get_metric_values(
        metric_name,
        SYSTEM_METRICS,
        start_time.timestamp,
        end_time.timestamp,
        use_cache=False,
        extra_dimensions=cluster_dims,
    )

    # Writing the timeseries directly (rather than going through events) is fine: an autoscaler
    # that depends on these values re-reads them from the metrics client anyway.
    #
    # A smarter simulator might react when cpus_allocated exceeds the simulated total cpus,
    # but that is deliberately out of scope for now (CLUSTERMAN-145).
    for point_time, cpus in query_result[metric_name]:
        simulator.mesos_cpus_allocated.add_breakpoint(arrow.get(point_time), float(cpus))
def _make_autoscaler(self, autoscaler_config_file: str) -> None:
    """ Build the simulated pool manager and the Autoscaler, storing the latter on ``self.autoscaler``

    In order, this method: sets up the signals environment and launches the signal fetch/run
    subprocesses; loads the resource-group configs from ``autoscaler_config_file``; creates a
    SimulatedPoolManager from them; sets its target capacity from the recorded 'target_capacity'
    metadata near ``self.start_time``; records the instance markets of every launch specification
    in ``self.markets``; and finally constructs the Autoscaler.

    :param autoscaler_config_file: path to a YAML file; its optional ``configs`` list and optional
        ``sfrs`` list of spot fleet request ids are read
    """
    # NOTE(review): presumably these are the number of signals to fetch and to run -- confirm
    # against setup_signals_environment.
    fetch_count, signal_count = setup_signals_environment(
        self.metadata.pool, self.metadata.scheduler)
    signal_dir = os.path.join(os.path.expanduser('~'), '.cache', 'clusterman')
    endpoint_url = staticconf.read_string('aws.endpoint_url', '').format(svc='s3')
    env = os.environ.copy()
    # Only pass the endpoint override to the subprocesses when one is configured
    if endpoint_url:
        env['AWS_ENDPOINT_URL_ARGS'] = f'--endpoint-url {endpoint_url}'

    # Fetches run one at a time and must succeed (check=True raises on a non-zero exit)
    for i in range(fetch_count):
        subprocess.run(['fetch_clusterman_signal', str(i), signal_dir], check=True, env=env)
    # Signal processes are started without being waited on; the Popen handles are not kept
    for i in range(signal_count):
        subprocess.Popen(['run_clusterman_signal', str(i), signal_dir], env=env)

    with open(autoscaler_config_file) as f:
        autoscaler_config = yaml.safe_load(f)
    configs = autoscaler_config.get('configs', [])
    # Spot fleet requests listed by id are resolved through the EC2 API and appended to the
    # configs given inline in the file
    if 'sfrs' in autoscaler_config:
        aws_configs = ec2.describe_spot_fleet_requests(
            SpotFleetRequestIds=autoscaler_config['sfrs'])
        configs.extend([
            config['SpotFleetRequestConfig']
            for config in aws_configs['SpotFleetRequestConfigs']
        ])
    pool_manager = SimulatedPoolManager(self.metadata.cluster, self.metadata.pool, configs, self)
    metric_values = self.metrics_client.get_metric_values(
        'target_capacity',
        METADATA,
        self.start_time.timestamp,
        # metrics collector runs 1x/min, but we'll try to get five data points in case some data is missing
        self.start_time.shift(minutes=5).timestamp,
        use_cache=False,
        extra_dimensions=get_cluster_dimensions(self.metadata.cluster, self.metadata.pool,
                                                self.metadata.scheduler),
    )
    # take the earliest data point available - this is a Decimal, which doesn't play nicely, so convert to an int
    # NOTE(review): if no 'target_capacity' data exists in the window, the [0] lookup below raises
    # IndexError -- confirm whether that case needs a clearer error.
    with patch_join_delay():
        actual_target_capacity = int(
            metric_values['target_capacity'][0][1])
        pool_manager.modify_target_capacity(actual_target_capacity, force=True, prune=False)

    # Register the market of every launch specification with the simulator
    for config in configs:
        for spec in config['LaunchSpecifications']:
            self.markets |= {get_instance_market(spec)}
    self.autoscaler = Autoscaler(
        self.metadata.cluster,
        self.metadata.pool,
        self.metadata.scheduler,
        [self.metadata.pool],
        pool_manager=pool_manager,
        metrics_client=self.metrics_client,
        monitoring_enabled=False,  # no sensu alerts during simulations
    )