def load_supported_services(context: LoggingContext, selected_services: List[str]) -> List[GCPService]:
    working_directory = os.path.dirname(os.path.realpath(__file__))
    config_directory = os.path.join(working_directory, "config")
    config_files = [
        file for file in listdir(config_directory)
        if isfile(os.path.join(config_directory, file)) and is_yaml_file(file)
    ]

    services = []
    for file in config_files:
        config_file_path = os.path.join(config_directory, file)
        try:
            with open(config_file_path, encoding="utf-8") as config_file:
                config_yaml = yaml.safe_load(config_file)
                technology_name = extract_technology_name(config_yaml)

                for service_yaml in config_yaml.get("gcp", {}):
                    # If whitelist of services exists and current service is not present in it, skip
                    should_skip = selected_services and \
                                  (service_yaml.get("service", "None") not in selected_services)
                    if should_skip:
                        continue
                    services.append(GCPService(tech_name=technology_name, **service_yaml))
        except Exception as error:
            context.log(f"Failed to load configuration file: '{config_file_path}'. Error details: {error}")
            continue

    services_names = [service.name for service in services]
    context.log("Selected services: " + ",".join(services_names))
    return services

def _load_configs(self):
    context = LoggingContext("ME startup")
    working_directory = os.path.dirname(os.path.realpath(__file__))
    config_directory = os.path.join(working_directory, "../../config_logs")
    config_files = [
        file for file in listdir(config_directory)
        if isfile(os.path.join(config_directory, file)) and _is_json_file(file)
    ]

    for file in config_files:
        config_file_path = os.path.join(config_directory, file)
        try:
            with open(config_file_path) as config_file:
                config_json = json.load(config_file)
                if config_json.get("name", "") == DEFAULT_RULE_NAME:
                    self.default_rule = _create_config_rules(context, config_json)[0]
                elif config_json.get("name", "") == COMMON_RULE_NAME:
                    self.common_rule = _create_config_rules(context, config_json)[0]
                elif config_json.get("name", "").startswith(AUDIT_LOGS_RULE):
                    self.audit_logs_rules = _create_config_rules(context, config_json)
                else:
                    self.rules.extend(_create_config_rules(context, config_json))
        except Exception:
            context.exception(f"Failed to load configuration file: '{config_file_path}'")

async def get_dynatrace_token_metadata(dt_session: ClientSession, context: LoggingContext, dynatrace_url: str,
                                        dynatrace_api_key: str, timeout: Optional[int] = 2) -> dict:
    try:
        response = await dt_session.post(
            url=f"{dynatrace_url.rstrip('/')}/api/v1/tokens/lookup",
            headers={
                "Authorization": f"Api-Token {dynatrace_api_key}",
                "Content-Type": "application/json; charset=utf-8"
            },
            json={"token": dynatrace_api_key},
            verify_ssl=get_should_require_valid_certificate(),
            timeout=timeout)
        if response.status != 200:
            context.log(
                f'Unable to get Dynatrace token metadata: {response.status}, url: {response.url}, reason: {response.reason}'
            )
            return {}
        return await response.json()
    except Exception as e:
        context.log(f'Unable to get Dynatrace token metadata. Error details: {e}')
        return {}

def _apply_rule(context: LoggingContext, rule: ConfigRule, record: Dict, parsed_record: Dict):
    for attribute in rule.attributes:
        try:
            value = jmespath.search(attribute.pattern, record, JMESPATH_OPTIONS)
            if value:
                parsed_record[attribute.key] = value
        except Exception:
            context.exception(f"Encountered exception when evaluating attribute {attribute} of rule for {rule.entity_type_name}")

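# Minimal sketch (not part of the original sources) of the jmespath lookup performed by
# _apply_rule above; the pattern and record shape are illustrative assumptions.
import jmespath

sample_record = {"resource": {"labels": {"project_id": "my-project"}}}
assert jmespath.search("resource.labels.project_id", sample_record) == "my-project"
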
def _check_configuration_flags(logging_context: LoggingContext, flags_to_check: List[str]):
    configuration_flag_values = []
    for key in flags_to_check:
        value = os.environ.get(key, None)
        if value is None:
            configuration_flag_values.append(f"{key} is None")
        else:
            configuration_flag_values.append(f"{key} = '{value}'")
    logging_context.log(f"Found configuration flags: {', '.join(configuration_flag_values)}")

def _create_config_rule(context: LoggingContext, entity_name: str, rule_json: Dict) -> Optional[ConfigRule]:
    sources_json = rule_json.get("sources", [])
    if entity_name not in SPECIAL_RULE_NAMES and not sources_json:
        context.log(f"Encountered invalid rule with missing sources for config entry named {entity_name}")
        return None
    sources = _create_sources(context, sources_json)
    if entity_name not in SPECIAL_RULE_NAMES and not sources:
        context.log(f"Encountered invalid rule with invalid sources for config entry named {entity_name}: {sources_json}")
        return None
    attributes = _create_attributes(context, rule_json.get("attributes", []))
    return ConfigRule(entity_type_name=entity_name, source_matchers=sources, attributes=attributes)

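# Illustrative only: an assumed shape of a single log-routing config entry consumed by
# _create_config_rule above. The key names follow the accessors in this listing
# ("name", "sources"/"source"/"condition", "attributes"/"key"/"pattern"); the concrete values,
# including the "$eq(...)" condition macro, are made-up examples rather than shipped configuration.
EXAMPLE_RULE_JSON = {
    "name": "gce_instance",
    "sources": [
        {"source": "resourceType", "condition": "$eq('gce_instance')"}
    ],
    "attributes": [
        {"key": "gcp.instance.name", "pattern": "resource.labels.instance_id"}
    ]
}
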
async def get_all_accessible_projects(context: LoggingContext, session: ClientSession, token: str):
    url = "https://cloudresourcemanager.googleapis.com/v1/projects"
    headers = {"Authorization": "Bearer {token}".format(token=token)}
    response = await session.get(url, headers=headers)
    response_json = await response.json()
    all_projects = [project["projectId"] for project in response_json.get("projects", [])]
    context.log("Access to following projects: " + ", ".join(all_projects))
    return all_projects

async def create_sfm_worker_loop(sfm_queue: Queue, logging_context: LoggingContext, instance_metadata: InstanceMetadata):
    while True:
        try:
            await asyncio.sleep(SFM_WORKER_EXECUTION_PERIOD_SECONDS)
            self_monitoring = LogSelfMonitoring()
            asyncio.get_event_loop().create_task(
                _loop_single_period(self_monitoring, sfm_queue, logging_context, instance_metadata))
        except Exception:
            logging_context.exception("Logs Self Monitoring Worker Loop Exception:")

def apply(self, context: LoggingContext, record: Dict, parsed_record: Dict):
    try:
        if self.common_rule:
            _apply_rule(context, self.common_rule, record, parsed_record)
        for rule in self.rules:
            if _check_if_rule_applies(rule, record, parsed_record):
                _apply_rule(context, rule, record, parsed_record)
                return
        # No matching rule has been found, applying the default rule
        if self.default_rule:
            _apply_rule(context, self.default_rule, record, parsed_record)
    except Exception:
        context.exception("Encountered exception when running Rule Engine")

def _create_attributes(context: LoggingContext, attributes_json: List[Dict]) -> List[Attribute]:
    result = []
    for source_json in attributes_json:
        key = source_json.get("key", None)
        pattern = source_json.get("pattern", None)
        if key and pattern:
            result.append(Attribute(key, pattern))
        else:
            context.log(f"Encountered invalid rule attribute with missing parameter, parameters were: key = {key}, pattern = {pattern}")
    return result

def load_activated_feature_sets(logging_context: LoggingContext, activation_yaml) -> List[str]:
    services_whitelist = []
    for service in activation_yaml.get("services", []):
        feature_sets = service.get("featureSets", [])
        for feature_set in feature_sets:
            services_whitelist.append(f"{service.get('service')}/{feature_set}")
        if not feature_sets:
            logging_context.error(f"No feature sets defined for service entry: {service}")
    return services_whitelist

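# Illustrative only: an assumed activation YAML shape and the whitelist that
# load_activated_feature_sets above would produce for it; the real activation config may carry
# additional fields per service.
import yaml

example_activation_yaml = yaml.safe_load("""
services:
  - service: gce_instance
    featureSets: [default_metrics, agent]
  - service: cloudsql_database
    featureSets: [default_metrics]
""")
# Expected result:
# ["gce_instance/default_metrics", "gce_instance/agent", "cloudsql_database/default_metrics"]
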
async def get_all_accessible_projects(context: LoggingContext, session: ClientSession, token: str):
    url = _CLOUD_RESOURCE_MANAGER_ROOT + "/projects?filter=lifecycleState%3AACTIVE"
    headers = {"Authorization": "Bearer {token}".format(token=token)}
    response = await session.get(url, headers=headers)
    response_json = await response.json()
    all_projects = [project["projectId"] for project in response_json.get("projects", [])]
    if all_projects:
        context.log("Access to following projects: " + ", ".join(all_projects))
    else:
        context.log("There is no access to any projects. Check service account configuration.")
    return all_projects

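# Assumed constant for the URL root used above; the value is inferred from the earlier variant of
# get_all_accessible_projects in this listing, which inlines the full Cloud Resource Manager URL.
_CLOUD_RESOURCE_MANAGER_ROOT = "https://cloudresourcemanager.googleapis.com/v1"
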
def generate_metadata():
    toc = []
    units = set()
    unmapped_units = set()
    # some metrics are used for multiple services and script will encounter them multiple times
    visited_metric_keys = set()

    supported_services = load_supported_services(LoggingContext(None), [])
    prepare_metric_metadata_dir()

    for supported_service in supported_services:
        print(f"\n => {supported_service.name}")
        for metric in supported_service.metrics:
            print(f"{metric.dynatrace_name}")
            if metric.dynatrace_name in visited_metric_keys:
                print(" - Already mapped, skipping")
                continue
            else:
                visited_metric_keys.add(metric.dynatrace_name)
            metadata = create_metadata(metric, unmapped_units)
            if not metadata:
                continue
            filename = write_metadata(metadata, metric)
            units.add(metric.unit)
            toc.append(filename)

    write_toc(toc)
    print(f"\nFound units: {units}")
    print(f"\nFailed to map units: {unmapped_units}")

async def create_token(context: LoggingContext, session: ClientSession):
    credentials_path = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', "")

    if credentials_path:
        context.log(f"Using credentials from {credentials_path}")
        with open(credentials_path) as key_file:
            credentials_data = json.load(key_file)
        return await get_token(key=credentials_data['private_key'],
                               service=credentials_data['client_email'],
                               uri=credentials_data['token_uri'],
                               session=session)
    else:
        context.log("Trying to use default service account")
        return await create_default_service_account_token(context, session)

def create_dimension(name: str, value: Any, context: LoggingContext = LoggingContext(None)) -> DimensionValue:
    string_value = str(value)

    if len(name) > MAX_DIMENSION_NAME_LENGTH:
        context.log(
            f'MINT rejects dimension names longer than {MAX_DIMENSION_NAME_LENGTH} chars. '
            f'Dimension name "{name}" has been truncated')
        name = name[:MAX_DIMENSION_NAME_LENGTH]

    if len(string_value) > MAX_DIMENSION_VALUE_LENGTH:
        context.log(
            f'MINT rejects dimension values longer than {MAX_DIMENSION_VALUE_LENGTH} chars. '
            f'Dimension value "{string_value}" has been truncated')
        string_value = string_value[:MAX_DIMENSION_VALUE_LENGTH]

    return DimensionValue(name, string_value)

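# Usage sketch. The MINT limits are assumptions here (e.g. MAX_DIMENSION_NAME_LENGTH = 100,
# MAX_DIMENSION_VALUE_LENGTH = 250); only the truncation behaviour comes from create_dimension above.
dimension = create_dimension("gcp.project.id", "x" * 300)
# The returned DimensionValue holds the value truncated to MAX_DIMENSION_VALUE_LENGTH characters,
# and the truncation has been logged through the provided LoggingContext.
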
def __init__(self, context: LoggingContext, source: str, condition: str):
    self.source = source
    self.condition = condition
    self.valid = True      # assume the matcher is valid until a parsing problem is found below
    self._evaluator = None  # set below if the condition starts with a known comparator prefix

    for key in _CONDITION_COMPARATOR_MAP:
        if condition.startswith(key):
            self._evaluator = _CONDITION_COMPARATOR_MAP[key]
            break

    operands = re.findall(r"'(.*?)'", condition, re.DOTALL)
    self._operand = operands[0] if operands else None
    self._source_value_extractor = _SOURCE_VALUE_EXTRACTOR_MAP.get(source.casefold(), None)

    if not self._source_value_extractor:
        context.log(f"Unsupported source type: '{source}'")
        self.valid = False

    if not self._evaluator or not self._operand:
        context.log(f"Failed to parse condition macro for expression: '{condition}'")
        self.valid = False

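# Illustrative sketch only: assumed shapes of _CONDITION_COMPARATOR_MAP and
# _SOURCE_VALUE_EXTRACTOR_MAP consumed by SourceMatcher.__init__ above. The real project defines
# its own comparators and source extractors; the keys, lambdas, and call signatures below are
# examples, not the shipped implementation.
_CONDITION_COMPARATOR_MAP = {
    "$eq": lambda value, operand: value == operand,
    "$prefix": lambda value, operand: isinstance(value, str) and value.startswith(operand),
    "$contains": lambda value, operand: isinstance(value, str) and operand in value,
}

_SOURCE_VALUE_EXTRACTOR_MAP = {
    "resourcetype": lambda record, parsed_record: record.get("resource", {}).get("type"),
    "severity": lambda record, parsed_record: record.get("severity"),
}
# With these maps, source "resourceType" and condition "$eq('gce_instance')" would select the
# "$eq" comparator, while the regex in __init__ extracts the operand 'gce_instance'.
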
def apply(self, context: LoggingContext, record: Dict, parsed_record: Dict):
    try:
        if self.common_rule:
            _apply_rule(context, self.common_rule, record, parsed_record)
        any_rule_applied = self._apply_rules(context, self.rules, record, parsed_record)
        any_audit_rule_applied = self._apply_rules(context, self.audit_logs_rules, record, parsed_record)
        # No matching rule has been found, applying the default rule
        no_rule_applied = not (any_rule_applied or any_audit_rule_applied)
        if no_rule_applied and self.default_rule:
            _apply_rule(context, self.default_rule, record, parsed_record)
    except Exception as e:
        context.t_exception(f"Encountered exception when running Rule Engine. {e}")

def _create_sources(context: LoggingContext, sources_json: List[Dict]) -> List[SourceMatcher]:
    result = []

    for source_json in sources_json:
        source = source_json.get("source", None)
        condition = source_json.get("condition", None)
        source_matcher = None

        if source and condition:
            source_matcher = SourceMatcher(context, source, condition)

        if source_matcher and source_matcher.valid:
            result.append(source_matcher)
        else:
            context.log(f"Encountered invalid rule source, parameters were: source = {source}, condition = {condition}")
            return []

    return result

async def async_dynatrace_gcp_extension(project_ids: Optional[List[str]] = None,
                                        services: Optional[List[GCPService]] = None):
    """
    Used in docker or for tests
    """
    timestamp_utc = datetime.utcnow()
    timestamp_utc_iso = timestamp_utc.isoformat()
    execution_identifier = hashlib.md5(timestamp_utc_iso.encode("UTF-8")).hexdigest()
    logging_context = LoggingContext(execution_identifier)
    logging_context.log(f'Starting execution for project(s): {project_ids}' if project_ids else "Starting execution")

    event_context = {
        'timestamp': timestamp_utc_iso,
        'event_id': timestamp_utc.timestamp(),
        'event_type': 'test',
        'execution_id': execution_identifier
    }
    data = {'data': '', 'publishTime': timestamp_utc_iso}

    start_time = time.time()
    await handle_event(data, event_context, project_ids, services)
    elapsed_time = time.time() - start_time
    logging_context.log(f"Execution took {elapsed_time}\n")

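# Usage sketch, assuming this coroutine is importable and GCP/Dynatrace credentials are already
# configured in the environment: trigger a single ad-hoc execution for selected projects.
import asyncio

asyncio.run(async_dynatrace_gcp_extension(project_ids=["my-gcp-project"]))
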
async def _loop_single_period(self_monitoring: LogSelfMonitoring, sfm_queue: Queue,
                              context: LoggingContext, instance_metadata: InstanceMetadata):
    try:
        sfm_list = _pull_sfm(sfm_queue)
        if sfm_list:
            async with init_gcp_client_session() as gcp_session:
                context = await _create_sfm_logs_context(sfm_queue, context, gcp_session, instance_metadata)
                self_monitoring = aggregate_self_monitoring_metrics(self_monitoring, sfm_list)
                _log_self_monitoring_data(self_monitoring, context)
                if context.self_monitoring_enabled:
                    if context.token is None:
                        context.log("Cannot proceed without authorization token, failed to send log self monitoring")
                        return
                    if not isinstance(context.token, str):
                        context.log(f"Failed to fetch access token, got non string value: {context.token}")
                        return
                    time_series = create_self_monitoring_time_series(self_monitoring, context)
                    await push_self_monitoring_time_series(context, time_series)
                for _ in sfm_list:
                    sfm_queue.task_done()
    except Exception:
        context.exception("Log SFM Loop Exception:")

def generate_ddu_estimation():
    supported_services = load_supported_services(LoggingContext(None), [])

    print("|| name || data points rate (/min) || estimated DDU rate (/min) (1 data point = 0.001 DDU)||")
    for supported_service in supported_services:
        data_points_per_minute = 0
        for metric in supported_service.metrics:
            dimensions_multiplier = (ASSUMED_AVG_DIMENSION_VALUES ** len(metric.dimensions))
            rate_per_minute = (metric.sample_period_seconds.seconds / 60.0)
            data_points_per_minute += rate_per_minute * dimensions_multiplier

        ddu_estimation = round(data_points_per_minute * DATA_POINT_WEIGHT, DECIMAL_PLACES)
        data_points_rate_estimation = round(data_points_per_minute, 0)
        print(f"| {supported_service.name} | {data_points_rate_estimation} | {ddu_estimation} |")

async def test_execution_expired_token():
    expected_cluster_response_code = 401
    expected_sent_requests = 3

    response(expected_cluster_response_code, "Expired token")

    ack_queue = Queue()
    sfm_queue = Queue()
    mock_subscriber_client = MockSubscriberClient(ack_queue)

    expected_ack_ids = [f"ACK_ID_{i}" for i in range(0, 10)]
    message_data_json = json.loads(LOG_MESSAGE_DATA)
    message_data_json["timestamp"] = datetime.utcnow().isoformat() + "Z"
    fresh_message_data = json.dumps(message_data_json)
    for ack_id in expected_ack_ids:
        message = create_fake_message(ack_id=ack_id, message_data=fresh_message_data)
        mock_subscriber_client.add_message(message)

    worker_state = WorkerState("TEST")
    perform_pull(worker_state, sfm_queue, mock_subscriber_client, "")
    # Flush down rest of messages
    perform_flush(worker_state, sfm_queue, mock_subscriber_client, "")

    metadata = InstanceMetadata(
        project_id="",
        container_name="",
        token_scopes="",
        service_account="",
        audience="",
        hostname="local deployment 2",
        zone="us-east1"
    )
    self_monitoring = LogSelfMonitoring()
    await log_self_monitoring._loop_single_period(self_monitoring, sfm_queue, LoggingContext("TEST"), metadata)
    sfm_queue.join()

    assert ack_queue.qsize() == 0
    verify_requests(expected_cluster_response_code, expected_sent_requests)

    assert self_monitoring.too_old_records == 0
    assert self_monitoring.parsing_errors == 0
    assert self_monitoring.records_with_too_long_content == 0
    assert Counter(self_monitoring.dynatrace_connectivity) == {DynatraceConnectivity.ExpiredToken: 3}
    assert self_monitoring.processing_time > 0
    assert self_monitoring.sending_time > 0

async def create_default_service_account_token(context: LoggingContext, session: ClientSession):
    """
    For reference check out https://github.com/googleapis/google-auth-library-python/tree/master/google/auth/compute_engine
    :param context:
    :param session:
    :return:
    """
    url = _METADATA_ROOT + "/instance/service-accounts/{0}/token".format("default")
    try:
        response = await session.get(url, headers=_METADATA_HEADERS)
        if response.status >= 300:
            body = await response.text()
            context.log(f"Failed to authorize with Service Account from Metadata Service, response is {response.status} => {body}")
            return None
        response_json = await response.json()
        return response_json["access_token"]
    except Exception as e:
        context.log(f"Failed to authorize with Service Account from Metadata Service due to '{e}'")
        return None

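# Assumed values for the metadata-service constants referenced above (they are not shown in this
# listing); these match the publicly documented GCE metadata endpoint and its required header.
_METADATA_ROOT = "http://metadata.google.internal/computeMetadata/v1"
_METADATA_HEADERS = {"Metadata-Flavor": "Google"}
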
async def test_empty_activation_config(mocker: MockerFixture, monkeypatch: MonkeyPatchFixture):
    # NO filestore/default configured
    monkeypatch.setenv("ACTIVATION_CONFIG", "{services: []}")
    dt_session = ClientSession()
    mocker.patch.object(dt_session, 'get', side_effect=mocked_get)
    extensions_fetcher = ExtensionsFetcher(dt_session, "", "", LoggingContext("TEST"))

    result = await extensions_fetcher.execute()

    assert_that(result).is_not_none()
    feature_sets_to_filter_conditions = {
        f"{gcp_service_config.name}/{gcp_service_config.feature_set}": gcp_service_config.monitoring_filter
        for gcp_service_config in result.services
    }
    assert_that(feature_sets_to_filter_conditions).is_equal_to({})

def load_supported_services(context: LoggingContext) -> List[GCPService]:
    activation_yaml = read_activation_yaml()
    activation_config_per_service = get_activation_config_per_service(activation_yaml)
    feature_sets_from_activation_config = load_activated_feature_sets(context, activation_yaml)

    working_directory = os.path.dirname(os.path.realpath(__file__))
    config_directory = os.path.join(working_directory, "config")
    config_files = [
        file for file in listdir(config_directory)
        if isfile(os.path.join(config_directory, file)) and is_yaml_file(file)
    ]

    services = []
    for file in config_files:
        config_file_path = os.path.join(config_directory, file)
        try:
            with open(config_file_path, encoding="utf-8") as config_file:
                config_yaml = yaml.safe_load(config_file)
                technology_name = extract_technology_name(config_yaml)

                for service_yaml in config_yaml.get("gcp", {}):
                    service_name = service_yaml.get("service", "None")
                    feature_set = service_yaml.get("featureSet", "default_metrics")
                    # If a whitelist of feature sets exists and the current one is not in it, skip it.
                    # If the whitelist is empty (no services explicitly selected), load all available ones.
                    whitelist_exists = len(feature_sets_from_activation_config) > 0
                    if f'{service_name}/{feature_set}' in feature_sets_from_activation_config or not whitelist_exists:
                        activation = activation_config_per_service.get(service_name, {})
                        services.append(GCPService(tech_name=technology_name, **service_yaml, activation=activation))
        except Exception as error:
            context.log(f"Failed to load configuration file: '{config_file_path}'. Error details: {error}")
            continue

    feature_sets = [f"{service.name}/{service.feature_set}" for service in services]
    if feature_sets:
        context.log("Selected feature sets: " + ", ".join(feature_sets))
    else:
        context.log("Empty feature sets. GCP services not monitored.")
    return services

def run_ack_logs(worker_name: str, sfm_queue: Queue):
    logging_context = LoggingContext(worker_name)
    subscriber_client = pubsub.SubscriberClient()
    subscription_path = subscriber_client.subscription_path(LOGS_SUBSCRIPTION_PROJECT, LOGS_SUBSCRIPTION_ID)
    logging_context.log("Starting processing")

    worker_state = WorkerState(worker_name)
    while True:
        try:
            perform_pull(worker_state, sfm_queue, subscriber_client, subscription_path)
        except Exception as e:
            if isinstance(e, Forbidden):
                logging_context.error(f"{e} Please check whether assigned service account has permission to fetch Pub/Sub messages.")
            else:
                logging_context.exception("Failed to pull messages")
            # Backoff for 1 minute to avoid spamming requests and logs
            time.sleep(60)

def run_ack_logs(worker_name: str, sfm_queue: Queue):
    logging_context = LoggingContext(worker_name)
    subscriber_client = pubsub.SubscriberClient()
    subscription_path = subscriber_client.subscription_path(LOGS_SUBSCRIPTION_PROJECT, LOGS_SUBSCRIPTION_ID)
    logging_context.log("Starting processing")

    worker_state = WorkerState(worker_name)
    while True:
        try:
            perform_pull(worker_state, sfm_queue, subscriber_client, subscription_path)
        except Exception:
            logging_context.exception("Failed to pull messages")

def generate_ddu_estimation():
    supported_services = load_supported_services(LoggingContext(None), [])

    print("|| Service name || Configuration || DDU per minute per instance ||")
    for supported_service in supported_services:
        data_points_per_minute = 0
        for metric in supported_service.metrics:
            dimensions_multiplier = (ASSUMED_AVG_DIMENSION_VALUES ** len(metric.dimensions))
            rate_per_minute = (metric.sample_period_seconds.seconds / 60.0)
            data_points_per_minute += rate_per_minute * dimensions_multiplier

        ddu_estimation = round(data_points_per_minute * DATA_POINT_WEIGHT, DECIMAL_PLACES)
        data_points_rate_estimation = round(data_points_per_minute, 0)
        feature_set = "/" + supported_service.feature_set
        print(f"| {supported_service.technology_name} | {supported_service.name}{feature_set} | {ddu_estimation} |")

async def test_execute(mocker: MockerFixture, monkeypatch: MonkeyPatchFixture):
    # NO filestore/default configured
    monkeypatch.setenv("ACTIVATION_CONFIG", ACTIVATION_CONFIG)
    dt_session = ClientSession()
    mocker.patch.object(dt_session, 'get', side_effect=mocked_get)
    extensions_fetcher = ExtensionsFetcher(dt_session, "", "", LoggingContext("TEST"))

    result = await extensions_fetcher.execute()

    assert_that(result).is_not_none()
    feature_sets_to_filter_conditions = {
        f"{gcp_service_config.name}/{gcp_service_config.feature_set}": gcp_service_config.monitoring_filter
        for gcp_service_config in result.services
    }
    assert_that(feature_sets_to_filter_conditions).is_equal_to({
        "cloudsql_database/default_metrics": "",
        "gce_instance/default_metrics": "resource.labels.instance_name=starts_with(\"test\")",
        "gce_instance/agent": "resource.labels.instance_name=starts_with(\"test\")"
    })

async def async_dynatrace_gcp_extension():
    timestamp_utc = datetime.utcnow()
    timestamp_utc_iso = timestamp_utc.isoformat()
    execution_identifier = hashlib.md5(timestamp_utc_iso.encode("UTF-8")).hexdigest()
    logging_context = LoggingContext(execution_identifier)
    logging_context.log("Starting execution")

    event_context = {
        'timestamp': timestamp_utc_iso,
        'event_id': timestamp_utc.timestamp(),
        'event_type': 'test',
        'execution_id': execution_identifier
    }
    data = {'data': '', 'publishTime': timestamp_utc_iso}

    start_time = time.time()
    await handle_event(data, event_context, "dynatrace-gcp-extension")
    elapsed_time = time.time() - start_time
    logging_context.log(f"Execution took {elapsed_time}\n")
