async def run_fetch_metric(context: MetricsContext, project_id: str, service: GCPService, metric: Metric):
    """Fetch one metric, swallowing any failure.

    Returns whatever fetch_metric yields, or an empty list when the task
    raised — so a single failing metric never aborts the whole gather().
    """
    try:
        return await fetch_metric(context, project_id, service, metric)
    except Exception as e:
        # Log and degrade to "no lines" instead of propagating.
        context.log(
            project_id,
            f"Failed to finish task for [{metric.google_metric}], reason is {type(e).__name__} {e}"
        )
        return []
async def check_x_goog_user_project_header_permissions(context: MetricsContext, project_id: str):
    """Best-effort wrapper around the 'x-goog-user-project' permission probe.

    Any exception from the underlying check is logged and suppressed so one
    project's failure does not stop processing of the others.
    """
    try:
        await _check_x_goog_user_project_header_permissions(context, project_id)
    except Exception as e:
        context.log(
            project_id,
            f"Unexpected exception when checking 'x-goog-user-project' header: {e}"
        )
async def get_and_upload(ctx: MetricsContext, project_id: str, svc_def: GCPService) -> Iterable[Entity]:
    """Run the entity extractor `fun` for one service definition.

    Returns the extracted entities, or an empty list when the extractor
    raised (the failure is logged, never propagated).
    """
    try:
        return await fun(ctx, project_id, svc_def)
    except Exception as e:
        ctx.log(
            f"Failed to finish entity extractor task, reason is {type(e).__name__} {e}"
        )
        return []
async def log_invalid_lines(context: MetricsContext, ingest_response_json: Dict, lines_batch: List[IngestLine]):
    """Log every line the ingest endpoint rejected, together with its reason.

    The response's invalidLines entries carry a 1-based "line" number; entries
    without a positive line number are ignored.
    """
    error = ingest_response_json.get("error", None)
    if error is None:
        return
    for invalid_line_error_message in error.get("invalidLines", []):
        line_index = invalid_line_error_message.get("line", 0) - 1
        if line_index < 0:
            continue
        invalid_line_error_message = invalid_line_error_message.get("error", "")
        context.log(
            f"INVALID LINE: '{lines_batch[line_index].to_string()}', reason: '{invalid_line_error_message}'"
        )
async def fetch_ingest_lines_task(
        context: MetricsContext, project_id: str,
        services: List[GCPService]) -> List[IngestLine]:
    """Fetch topology and metrics for all services of one project and return
    the enriched ingest lines."""
    fetch_metric_tasks = []
    topology_tasks = []
    topology_task_services = []

    # Kick off topology extraction for every service that has an extractor.
    for service in services:
        if service.name in entities_extractors:
            topology_tasks.append(entities_extractors[service.name](context, project_id, service))
            topology_task_services.append(service)
    fetch_topology_results = await asyncio.gather(*topology_tasks, return_exceptions=True)

    skipped_services = []
    for service in services:
        if service in topology_task_services:
            service_topology = fetch_topology_results[topology_task_services.index(service)]
            if not service_topology:
                skipped_services.append(service.name)
                continue  # skip fetching the metrics because there are no instances
        for metric in service.metrics:
            fetch_metric_tasks.append(
                run_fetch_metric(context=context,
                                 project_id=project_id,
                                 service=service,
                                 metric=metric))

    if skipped_services:
        skipped_services_string = ', '.join(skipped_services)
        context.log(
            project_id,
            f"Skipped fetching metrics for {skipped_services_string} due to no instances detected"
        )

    fetch_metric_results = await asyncio.gather(*fetch_metric_tasks, return_exceptions=True)
    entity_id_map = build_entity_id_map(fetch_topology_results)
    return flatten_and_enrich_metric_results(context, fetch_metric_results, entity_id_map)
async def get_all_disabled_apis(context: MetricsContext, project_id: str):
    """Return the set of API/service names disabled for the project.

    Pages through the Service Usage `services` listing (filter state:DISABLED)
    and collects each entry's config.name. On any error the APIs collected so
    far are returned, so callers always get a (possibly partial) set.

    Fix: the first-page fetch and the pagination loop duplicated the same
    request/parse/update logic; unified into a single loop.
    """
    base_url = f"{GCP_SERVICE_USAGE_URL}{project_id}/services?filter=state:DISABLED"
    headers = context.create_gcp_request_headers(project_id)
    disabled_apis = set()
    try:
        url = base_url
        while True:
            response = await context.gcp_session.get(url, headers=headers, raise_for_status=True)
            disabled_services_json = await response.json()
            disabled_services = disabled_services_json.get("services", [])
            disabled_apis.update({disable_service.get("config", {}).get("name", "")
                                  for disable_service in disabled_services})
            next_page_token = disabled_services_json.get("nextPageToken")
            if not next_page_token:
                break
            url = f"{base_url}&pageToken={next_page_token}"
        return disabled_apis
    except ClientResponseError as e:
        context.log(project_id, f'Disabled APIs call returned failed status code. {e}')
        return disabled_apis
    except Exception as e:
        context.log(project_id, f'Cannot get disabled APIs: {GCP_SERVICE_USAGE_URL}/projects/{project_id}/services?filter=state:DISABLED. {e}')
        return disabled_apis
async def process_project_metrics(context: MetricsContext, project_id: str,
                                  services: List[GCPService],
                                  disabled_apis: Set[str]):
    """Fetch all ingest lines for one project and push them to Dynatrace.

    Records the fetch duration on the context; failures are reported via
    context.t_exception and never propagate to the caller.
    """
    try:
        context.log(project_id, f"Starting processing...")
        ingest_lines = await fetch_ingest_lines_task(context, project_id, services, disabled_apis)
        fetch_data_time = time.time() - context.start_processing_timestamp
        context.fetch_gcp_data_execution_time[project_id] = fetch_data_time
        context.log(project_id, f"Finished fetching data in {fetch_data_time}")
        await push_ingest_lines(context, project_id, ingest_lines)
    except Exception as e:
        context.t_exception(f"Failed to finish processing due to {e}")
async def process_project_metrics(context: MetricsContext, project_id: str,
                                  services: List[GCPService]):
    """Check header permissions, then fetch all ingest lines for one project
    and push them to Dynatrace.

    Exceptions are logged (with a printed traceback) and never propagate.
    """
    try:
        context.log(project_id, f"Starting processing...")
        await check_x_goog_user_project_header_permissions(context, project_id)
        ingest_lines = await fetch_ingest_lines_task(context, project_id, services)
        fetch_data_time = time.time() - context.start_processing_timestamp
        context.fetch_gcp_data_execution_time[project_id] = fetch_data_time
        context.log(project_id, f"Finished fetching data in {fetch_data_time}")
        await push_ingest_lines(context, project_id, ingest_lines)
    except Exception as e:
        context.log(f"Failed to finish processing due to {e}")
        traceback.print_exc()
async def fetch_zones(context: MetricsContext, project_id: str) -> List[str]:
    """Return the names of all compute zones available to the project."""
    headers = context.create_gcp_request_headers(project_id)
    url = f"{_GCP_COMPUTE_ENDPOINT}/compute/v1/projects/{project_id}/zones"
    resp = await context.gcp_session.request(
        "GET",
        params={},
        url=url,
        headers=headers,
        raise_for_status=True)
    response_json = await resp.json()
    # raise_for_status covers >=400; guard any other non-200 explicitly.
    if resp.status != 200:
        raise Exception(
            f"Failed to fetch available zones, response is {response_json}")
    return [zone["name"] for zone in response_json.get("items", [])]
async def generic_paging(
        project_id: str, url: Text, ctx: MetricsContext,
        mapper: Callable[[Dict[Any, Any]], List[Entity]]) -> List[Entity]:
    """Apply mapper function on any page returned by gcp api url.

    Follows nextPageToken pagination; on any failure (bad JSON, HTTP error,
    mapper exception) logs the problem and returns the entities gathered so far.
    """
    headers = ctx.create_gcp_request_headers(project_id)
    params: Dict[Text, Text] = {}
    entities: List[Entity] = []
    while True:
        resp = await ctx.gcp_session.request("GET", params=params, url=url, headers=headers)
        try:
            page = await resp.json()
        except Exception:
            error_message = await resp.text()
            error_message = ' '.join(error_message.split())
            ctx.log(f'Failed to decode JSON. {url} {error_message}')
            return entities
        if resp.status >= 400:
            ctx.log(
                project_id,
                f'Failed to retrieve information from googleapis. {url} {page}'
            )
            return entities
        try:
            entities.extend(mapper(page))
        except Exception as ex:
            ctx.log(project_id, f"Failed to map response from googleapis. {url} {ex}")
            return entities
        if "nextPageToken" not in page:
            break
        params["pageToken"] = page.get("nextPageToken", None)
    return entities
async def handle_event(event: Dict,
                       event_context,
                       projects_ids: Optional[List[str]] = None,
                       services: Optional[List[GCPService]] = None):
    """Entry point: resolve configuration and credentials, filter out projects
    with the monitoring API disabled, then fan out metric processing.

    Fix: `disabled_apis` was rebound to a fresh one-entry dict on every loop
    iteration, so only the last project's disabled-API set survived; it is now
    accumulated per project.
    """
    if isinstance(event_context, Dict):
        # for k8s installation
        context = LoggingContext(event_context.get("execution_id", None))
    else:
        context = LoggingContext(None)

    if not services:
        # load services for GCP Function
        services = load_supported_services(context)

    async with init_gcp_client_session() as gcp_session, init_dt_client_session() as dt_session:
        setup_start_time = time.time()
        token = await create_token(context, gcp_session)
        if token is None:
            context.log(
                "Cannot proceed without authorization token, stopping the execution"
            )
            return
        if not isinstance(token, str):
            raise Exception(
                f"Failed to fetch access token, got non string value: {token}")

        context.log("Successfully obtained access token")

        project_id_owner = get_project_id_from_environment()

        dynatrace_api_key = await fetch_dynatrace_api_key(
            gcp_session=gcp_session, project_id=project_id_owner, token=token)
        dynatrace_url = await fetch_dynatrace_url(gcp_session=gcp_session,
                                                  project_id=project_id_owner,
                                                  token=token)
        check_version(logging_context=context)
        await check_dynatrace(logging_context=context,
                              project_id=project_id_owner,
                              dt_session=dt_session,
                              dynatrace_url=dynatrace_url,
                              dynatrace_access_key=dynatrace_api_key)

        query_interval_min = get_query_interval_minutes()
        print_metric_ingest_input = os.environ.get(
            "PRINT_METRIC_INGEST_INPUT", "FALSE").upper() in ["TRUE", "YES"]
        self_monitoring_enabled = os.environ.get(
            'SELF_MONITORING_ENABLED', "FALSE").upper() in ["TRUE", "YES"]

        context = MetricsContext(
            gcp_session=gcp_session,
            dt_session=dt_session,
            project_id_owner=project_id_owner,
            token=token,
            execution_time=datetime.utcnow(),
            execution_interval_seconds=60 * query_interval_min,
            dynatrace_api_key=dynatrace_api_key,
            dynatrace_url=dynatrace_url,
            print_metric_ingest_input=print_metric_ingest_input,
            self_monitoring_enabled=self_monitoring_enabled,
            scheduled_execution_id=context.scheduled_execution_id)

        if not projects_ids:
            projects_ids = await get_all_accessible_projects(
                context, gcp_session, token)

        disabled_apis = {}
        disabled_projects = []
        for project_id in projects_ids:
            await check_x_goog_user_project_header_permissions(context, project_id)
            # BUG FIX: accumulate per-project results instead of rebinding the
            # whole dict each iteration (which kept only the last project).
            disabled_apis[project_id] = await get_all_disabled_apis(context, project_id)
            if 'monitoring.googleapis.com' in disabled_apis[project_id]:
                disabled_projects.append(project_id)

        if disabled_projects:
            context.log(
                f"monitoring.googleapis.com API disabled in the projects: " +
                ", ".join(disabled_projects) +
                ", that projects will not be monitored")
            for disabled_project in disabled_projects:
                projects_ids.remove(disabled_project)

        setup_time = (time.time() - setup_start_time)
        context.setup_execution_time = {
            project_id: setup_time
            for project_id in projects_ids
        }

        context.start_processing_timestamp = time.time()

        process_project_metrics_tasks = [
            process_project_metrics(context, project_id, services,
                                    disabled_apis.get(project_id, set()))
            for project_id in projects_ids
        ]
        await asyncio.gather(*process_project_metrics_tasks,
                             return_exceptions=True)

        context.log(
            f"Fetched and pushed GCP data in {time.time() - context.start_processing_timestamp} s"
        )

        log_self_monitoring_data(context)
        if context.self_monitoring_enabled:
            await push_self_monitoring(context)

        await gcp_session.close()
        await dt_session.close()
async def _check_x_goog_user_project_header_permissions(
        context: MetricsContext, project_id: str):
    """Decide whether metric requests for this project should carry the
    'x-goog-user-project' header (i.e. SERVICE_USAGE_BOOKING = destination).

    The decision is cached per project in context.use_x_goog_user_project_header.
    """
    if project_id in context.use_x_goog_user_project_header:
        return  # already resolved for this project

    service_usage_booking = os.environ.get('SERVICE_USAGE_BOOKING', 'source')
    if service_usage_booking.casefold().strip() != 'destination':
        context.log(project_id, "Using SERVICE_USAGE_BOOKING = source")
        context.use_x_goog_user_project_header[project_id] = False
        return

    # Probe the monitoring API with the header set; a 403 mentioning
    # 'serviceusage.services.use' means the required permission is missing.
    url = f"https://monitoring.googleapis.com/v3/projects/{project_id}/metricDescriptors"
    params = [('pageSize', 1)]
    headers = {
        "Authorization": "Bearer {token}".format(token=context.token),
        "x-goog-user-project": project_id
    }
    resp = await context.gcp_session.get(url=url, params=params, headers=headers)
    page = await resp.json()

    if resp.status == 200:
        context.use_x_goog_user_project_header[project_id] = True
        context.log(project_id, "Using SERVICE_USAGE_BOOKING = destination")
    elif resp.status == 403 and 'serviceusage.services.use' in page['error']['message']:
        context.use_x_goog_user_project_header[project_id] = False
        context.log(
            project_id,
            "Ignoring destination SERVICE_USAGE_BOOKING. Missing permission: 'serviceusage.services.use'"
        )
    else:
        context.log(
            project_id,
            f"Unexpected response when checking 'x-goog-user-project' header: {str(page)}"
        )
async def _push_to_dynatrace(context: MetricsContext, project_id: str,
                             lines_batch: List[IngestLine]):
    """POST one batch of lines to the Dynatrace metric ingest API and update
    the self-monitoring counters from its response.

    Raises on auth (401/403) or URL (404/405) problems, after recording the
    corresponding connectivity state on the context.
    """
    ingest_input = "\n".join([line.to_string() for line in lines_batch])
    if context.print_metric_ingest_input:
        context.log("Ingest input is: ")
        context.log(ingest_input)

    dt_url = f"{context.dynatrace_url.rstrip('/')}/api/v2/metrics/ingest"
    ingest_response = await context.dt_session.post(
        url=dt_url,
        headers={
            "Authorization": f"Api-Token {context.dynatrace_api_key}",
            "Content-Type": "text/plain; charset=utf-8"
        },
        data=ingest_input,
        verify_ssl=context.require_valid_certificate)

    status = ingest_response.status
    if status == 401:
        context.dynatrace_connectivity = DynatraceConnectivity.ExpiredToken
        raise Exception("Expired token")
    elif status == 403:
        context.dynatrace_connectivity = DynatraceConnectivity.WrongToken
        raise Exception(
            "Wrong token - missing 'Ingest metrics using API V2' permission")
    elif status in (404, 405):
        context.dynatrace_connectivity = DynatraceConnectivity.WrongURL
        raise Exception(f"Wrong URL {dt_url}")

    ingest_response_json = await ingest_response.json()
    context.dynatrace_request_count[status] = \
        context.dynatrace_request_count.get(status, 0) + 1
    context.dynatrace_ingest_lines_ok_count[project_id] = \
        context.dynatrace_ingest_lines_ok_count.get(project_id, 0) + ingest_response_json.get("linesOk", 0)
    context.dynatrace_ingest_lines_invalid_count[project_id] = \
        context.dynatrace_ingest_lines_invalid_count.get(project_id, 0) + ingest_response_json.get("linesInvalid", 0)
    context.log(project_id, f"Ingest response: {ingest_response_json}")
    await log_invalid_lines(context, ingest_response_json, lines_batch)
async def push_ingest_lines(context: MetricsContext, project_id: str,
                            fetch_metric_results: List[IngestLine]):
    """Push metric ingest lines to Dynatrace in batches.

    Batches of context.metric_ingest_batch_size lines are sent until
    context.maximum_metric_data_points_per_minute lines have gone out; any
    remainder beyond that cap is dropped and counted. Push failures are
    logged (and connectivity recorded for InvalidURL) instead of raised.

    Fix: the "no data to push" branch logged a skip message but then fell
    through, emitting a misleading "Finished uploading..." log; it now returns.
    """
    if context.dynatrace_connectivity != DynatraceConnectivity.Ok:
        context.log(project_id, f"Skipping push due to detected connectivity error")
        return
    if not fetch_metric_results:
        context.log(project_id, "Skipping push due to no data to push")
        # BUG FIX: return here; previously execution fell through and logged a
        # misleading "Finished uploading..." message for an empty push.
        return

    lines_sent = 0
    maximum_lines_threshold = context.maximum_metric_data_points_per_minute
    start_time = time.time()
    try:
        lines_batch = []
        for result in fetch_metric_results:
            lines_batch.append(result)
            lines_sent += 1
            if len(lines_batch) >= context.metric_ingest_batch_size:
                await _push_to_dynatrace(context, project_id, lines_batch)
                lines_batch = []
            if lines_sent >= maximum_lines_threshold:
                # Flush what we have, then drop everything beyond the cap.
                await _push_to_dynatrace(context, project_id, lines_batch)
                lines_dropped_count = len(fetch_metric_results) - maximum_lines_threshold
                context.dynatrace_ingest_lines_dropped_count[project_id] = \
                    context.dynatrace_ingest_lines_dropped_count.get(project_id, 0) + lines_dropped_count
                context.log(
                    project_id,
                    f"Number of metric lines exceeded maximum {maximum_lines_threshold}, dropped {lines_dropped_count} lines"
                )
                return
        if lines_batch:
            await _push_to_dynatrace(context, project_id, lines_batch)
    except Exception as e:
        if isinstance(e, InvalidURL):
            context.dynatrace_connectivity = DynatraceConnectivity.WrongURL
        context.log(
            project_id,
            f"Failed to push ingest lines to Dynatrace due to {type(e).__name__} {e}"
        )
    finally:
        push_data_time = time.time() - start_time
        context.push_to_dynatrace_execution_time[project_id] = push_data_time
        context.log(
            project_id,
            f"Finished uploading metric ingest lines to Dynatrace in {push_data_time} s"
        )
async def fetch_metric(context: MetricsContext, project_id: str,
                       service: GCPService, metric: Metric) -> List[IngestLine]:
    # Fetch one metric's time series from the GCP Monitoring API for the
    # current execution window and convert every point into ingest lines.
    # The window ends at execution_time minus the metric's ingest delay.
    end_time = (context.execution_time - metric.ingest_delay)
    start_time = (end_time - context.execution_interval)

    reducer = 'REDUCE_SUM'
    aligner = 'ALIGN_SUM'
    # Booleans are counted; cumulative kinds are converted to deltas.
    if metric.value_type.lower() == 'bool':
        aligner = 'ALIGN_COUNT_TRUE'
    elif metric.google_metric_kind.lower().startswith('cumulative'):
        aligner = 'ALIGN_DELTA'

    params = [('filter', f'metric.type = "{metric.google_metric}"'),
              ('interval.startTime', start_time.isoformat() + "Z"),
              ('interval.endTime', end_time.isoformat() + "Z"),
              ('aggregation.alignmentPeriod', f"{metric.sample_period_seconds.total_seconds()}s"),
              ('aggregation.perSeriesAligner', aligner),
              ('aggregation.crossSeriesReducer', reducer)]

    # Group by every configured dimension; default source is a metric label.
    all_dimensions = (service.dimensions + metric.dimensions)
    for dimension in all_dimensions:
        source = dimension.source or f'metric.labels.{dimension.dimension}'
        params.append(('aggregation.groupByFields', source))

    headers = {"Authorization": "Bearer {token}".format(token=context.token)}
    if context.use_x_goog_user_project_header.get(project_id, False):
        headers["x-goog-user-project"] = project_id

    should_fetch = True
    lines = []
    while should_fetch:
        # Count every request for self-monitoring.
        context.gcp_metric_request_count[project_id] = \
            context.gcp_metric_request_count.get(project_id, 0) + 1

        url = f"https://monitoring.googleapis.com/v3/projects/{project_id}/timeSeries"
        resp = await context.gcp_session.request('GET', url=url, params=params, headers=headers)
        page = await resp.json()
        # response body is https://cloud.google.com/monitoring/api/ref_v3/rest/v3/projects.timeSeries/list#response-body

        if 'error' in page:
            raise Exception(str(page))
        if 'timeSeries' not in page:
            break

        for time_serie in page['timeSeries']:
            typed_value_key = extract_typed_value_key(time_serie)
            dimensions = create_dimensions(context, time_serie)
            entity_id = create_entity_id(service, time_serie)

            for point in time_serie['points']:
                line = convert_point_to_ingest_line(dimensions, metric, point,
                                                    typed_value_key, entity_id)
                if line:
                    lines.append(line)

        # Follow pagination until nextPageToken is absent.
        next_page_token = page.get('nextPageToken', None)
        if next_page_token:
            update_params(next_page_token, params)
        else:
            should_fetch = False

    return lines
async def fetch_metric(context: MetricsContext, project_id: str,
                       service: GCPService, metric: Metric) -> List[IngestLine]:
    # Fetch one metric's time series (with the service's monitoring filter
    # applied) and convert every point into ingest lines.
    # The window ends at execution_time minus the metric's ingest delay.
    end_time = (context.execution_time - metric.ingest_delay)
    start_time = (end_time - context.execution_interval)

    reducer = 'REDUCE_SUM'
    aligner = 'ALIGN_SUM'
    # Booleans are counted; cumulative kinds are converted to deltas.
    if metric.value_type.lower() == 'bool':
        aligner = 'ALIGN_COUNT_TRUE'
    elif metric.google_metric_kind.lower().startswith('cumulative'):
        aligner = 'ALIGN_DELTA'

    params = [
        ('filter',
         f'metric.type = "{metric.google_metric}" {service.monitoring_filter}'.strip()),
        ('interval.startTime', start_time.isoformat() + "Z"),
        ('interval.endTime', end_time.isoformat() + "Z"),
        ('aggregation.alignmentPeriod', f"{metric.sample_period_seconds.total_seconds()}s"),
        ('aggregation.perSeriesAligner', aligner),
        ('aggregation.crossSeriesReducer', reducer)
    ]

    # Group by every configured dimension; remember any rename to apply when
    # fetched labels are mapped to Dynatrace dimension keys.
    all_dimensions = (service.dimensions + metric.dimensions)
    dt_dimensions_mapping = DtDimensionsMap()
    for dimension in all_dimensions:
        if dimension.key_for_send_to_dynatrace:
            dt_dimensions_mapping.add_label_mapping(dimension.key_for_fetch_metric,
                                                    dimension.key_for_send_to_dynatrace)
        params.append(('aggregation.groupByFields', dimension.key_for_fetch_metric))

    headers = context.create_gcp_request_headers(project_id)

    should_fetch = True
    lines = []
    while should_fetch:
        # Count every request for self-monitoring.
        context.gcp_metric_request_count[project_id] = \
            context.gcp_metric_request_count.get(project_id, 0) + 1

        url = f"{_MONITORING_ROOT}/projects/{project_id}/timeSeries"
        resp = await context.gcp_session.request('GET', url=url, params=params, headers=headers)
        page = await resp.json()
        # response body is https://cloud.google.com/monitoring/api/ref_v3/rest/v3/projects.timeSeries/list#response-body

        if 'error' in page:
            raise Exception(str(page))
        if 'timeSeries' not in page:
            break

        for time_serie in page['timeSeries']:
            typed_value_key = extract_typed_value_key(time_serie)
            dimensions = create_dimensions(context, service.name, time_serie,
                                           dt_dimensions_mapping)
            entity_id = create_entity_id(service, time_serie)

            for point in time_serie['points']:
                line = convert_point_to_ingest_line(dimensions, metric, point,
                                                    typed_value_key, entity_id)
                if line:
                    lines.append(line)

        # Follow pagination until nextPageToken is absent.
        next_page_token = page.get('nextPageToken', None)
        if next_page_token:
            update_params(next_page_token, params)
        else:
            should_fetch = False

    return lines
async def fetch_ingest_lines_task(context: MetricsContext, project_id: str,
                                  services: List[GCPService],
                                  disabled_apis: Set[str]) -> List[IngestLine]:
    # Fetch topology and metrics for all services of one project, skipping
    # anything whose backing GCP API is disabled, and return enriched lines.
    fetch_metric_tasks = []
    topology_tasks = []
    topology_task_services = []
    skipped_topology_services = set()

    for service in services:
        if service.name in entities_extractors:
            # Skip topology for services whose extractor's API is disabled.
            if entities_extractors[service.name].used_api in disabled_apis:
                skipped_topology_services.add(service.name)
                continue
            topology_task = entities_extractors[service.name].extractor(
                context, project_id, service)
            topology_tasks.append(topology_task)
            topology_task_services.append(service)

    if skipped_topology_services:
        skipped_topology_services_string = ", ".join(skipped_topology_services)
        context.log(
            project_id,
            f"Skipped fetching topology for disabled services: {skipped_topology_services_string}"
        )

    fetch_topology_results = await asyncio.gather(*topology_tasks,
                                                  return_exceptions=True)

    skipped_services_no_instances = []
    skipped_disabled_apis = set()
    for service in services:
        if service in topology_task_services:
            service_topology = fetch_topology_results[
                topology_task_services.index(service)]
            if not service_topology:
                skipped_services_no_instances.append(
                    f"{service.name}/{service.feature_set}")
                continue  # skip fetching the metrics because there are no instances
        for metric in service.metrics:
            # The API prefix of a metric type, e.g. "compute.googleapis.com".
            gcp_api_last_index = metric.google_metric.find("/")
            api = metric.google_metric[:gcp_api_last_index]
            if api in disabled_apis:
                skipped_disabled_apis.add(api)
                continue  # skip fetching the metrics because service API is disabled
            fetch_metric_task = run_fetch_metric(context=context,
                                                 project_id=project_id,
                                                 service=service,
                                                 metric=metric)
            fetch_metric_tasks.append(fetch_metric_task)

    if skipped_services_no_instances:
        skipped_services_string = ', '.join(skipped_services_no_instances)
        context.log(
            project_id,
            f"Skipped fetching metrics for {skipped_services_string} due to no instances detected"
        )
    if skipped_disabled_apis:
        skipped_disabled_apis_string = ", ".join(skipped_disabled_apis)
        context.log(
            project_id,
            f"Skipped fetching metrics for disabled APIs: {skipped_disabled_apis_string}"
        )

    fetch_metric_results = await asyncio.gather(*fetch_metric_tasks,
                                                return_exceptions=True)
    entity_id_map = build_entity_id_map(fetch_topology_results)
    flat_metric_results = flatten_and_enrich_metric_results(
        context, fetch_metric_results, entity_id_map)
    return flat_metric_results
async def handle_event(event: Dict, event_context,
                       project_id_owner: Optional[str],
                       projects_ids: Optional[List[str]] = None):
    # Entry point: resolve configuration (GCP_SERVICES selection, tokens,
    # Dynatrace credentials), then fan out metric processing per project.
    if isinstance(event_context, Dict):
        context = LoggingContext(event_context.get("execution_id", None))
    else:
        context = LoggingContext(None)

    selected_services = None
    if "GCP_SERVICES" in os.environ:
        selected_services_string = os.environ.get("GCP_SERVICES", "")
        selected_services = selected_services_string.split(",") if selected_services_string else []
        # set default featureset if featureset not present in env variable
        for i, service in enumerate(selected_services):
            if "/" not in service:
                selected_services[i] = f"{service}/default"

    services = load_supported_services(context, selected_services)

    async with init_gcp_client_session() as gcp_session, init_dt_client_session() as dt_session:
        setup_start_time = time.time()
        token = await create_token(context, gcp_session)
        if token is None:
            context.log(
                "Cannot proceed without authorization token, stopping the execution"
            )
            return
        if not isinstance(token, str):
            raise Exception(
                f"Failed to fetch access token, got non string value: {token}")

        context.log("Successfully obtained access token")
        if not project_id_owner:
            project_id_owner = get_project_id_from_environment()

        dynatrace_api_key = await fetch_dynatrace_api_key(
            gcp_session=gcp_session, project_id=project_id_owner, token=token)
        dynatrace_url = await fetch_dynatrace_url(gcp_session=gcp_session,
                                                  project_id=project_id_owner,
                                                  token=token)

        print_metric_ingest_input = \
            "PRINT_METRIC_INGEST_INPUT" in os.environ and os.environ["PRINT_METRIC_INGEST_INPUT"].upper() == "TRUE"
        self_monitoring_enabled = os.environ.get('SELF_MONITORING_ENABLED',
                                                 "False").upper() == "TRUE"

        context = MetricsContext(
            gcp_session=gcp_session,
            dt_session=dt_session,
            project_id_owner=project_id_owner,
            token=token,
            execution_time=datetime.utcnow(),
            execution_interval_seconds=60 * 1,
            dynatrace_api_key=dynatrace_api_key,
            dynatrace_url=dynatrace_url,
            print_metric_ingest_input=print_metric_ingest_input,
            self_monitoring_enabled=self_monitoring_enabled,
            scheduled_execution_id=context.scheduled_execution_id)

        if not projects_ids:
            projects_ids = await get_all_accessible_projects(
                context, gcp_session, token)

        # Setup time is the same for every project in this execution.
        setup_time = (time.time() - setup_start_time)
        context.setup_execution_time = {
            project_id: setup_time
            for project_id in projects_ids
        }

        context.start_processing_timestamp = time.time()

        process_project_metrics_tasks = [
            process_project_metrics(context, project_id, services)
            for project_id in projects_ids
        ]
        await asyncio.gather(*process_project_metrics_tasks,
                             return_exceptions=True)

        context.log(
            f"Fetched and pushed GCP data in {time.time() - context.start_processing_timestamp} s"
        )

        log_self_monitoring_data(context)
        if context.self_monitoring_enabled:
            await push_self_monitoring(context)

        await gcp_session.close()
        await dt_session.close()
def log_self_monitoring_data(context: MetricsContext):
    """Emit the collected self-monitoring counters and timings ("SFM" tag)."""
    # Setup time is identical for every project, so report the owner's value.
    setup_time = context.setup_execution_time.get(context.project_id_owner, None)
    sfm_messages = [
        f"GCP Monitoring API request count [per project]: {context.gcp_metric_request_count}",
        f"Dynatrace MINT API request count [per response code]: {context.dynatrace_request_count}",
        f"Dynatrace MINT accepted lines count [per project]: {context.dynatrace_ingest_lines_ok_count}",
        f"Dynatrace MINT invalid lines count [per project]: {context.dynatrace_ingest_lines_invalid_count}",
        f"Dynatrace MINT dropped lines count [per project]: {context.dynatrace_ingest_lines_dropped_count}",
        f"Setup execution time: {setup_time}",
        f"Fetch GCP data execution time [per project]: {context.fetch_gcp_data_execution_time}",
        f"Push data to Dynatrace execution time [per project]: {context.push_to_dynatrace_execution_time}",
    ]
    for message in sfm_messages:
        context.log("SFM", message)