def test_it_leaves_other_values_untouched(self):
    """Check that disabling profiling keeps the interval and depth settings unchanged."""
    self.agent_config_merger.disable_profiling()

    # Read the configuration only after the merger has been asked to disable profiling.
    current = AgentConfiguration.get()
    assert current.sampling_interval == timedelta(milliseconds=1)
    assert current.minimum_time_reporting == timedelta(seconds=1)
    assert current.reporting_interval == timedelta(minutes=1)
    assert current.max_stack_depth == 998
def assert_init_values(self):
    """Assert that the active configuration still holds the initial values used by these tests."""
    current = AgentConfiguration.get()
    assert current.should_profile is True
    assert current.sampling_interval == timedelta(milliseconds=1)
    assert current.minimum_time_reporting == timedelta(seconds=1)
    assert current.reporting_interval == timedelta(minutes=1)
    assert current.max_stack_depth == 998
def test_agent_configuration_when_configure_agent_throws_error(self):
    """Stub an HTTP 500 on configure_agent and check the values still reported by the configuration."""
    simulated_failure = {
        'http_status_code': 500,
        'service_message': 'Simulated error in configure_agent call',
    }
    self.client_stubber.add_client_error('configure_agent', **simulated_failure)

    with self.client_stubber:
        self.subject.refresh_configuration()
        current = AgentConfiguration.get()
        assert current.should_profile is True
        assert current.sampling_interval == timedelta(seconds=13)
def _refresh_configuration(self):
    """
    Refresh the configuration through the collector and re-point the scheduler delay.

    After the refresh, ``is_profiling_in_progress`` mirrors ``should_profile`` from the
    current AgentConfiguration, and the scheduler gets a delay provider that matches it.
    """

    def sampling_delay():
        return AgentConfiguration.get().sampling_interval

    def reporting_delay():
        return AgentConfiguration.get().reporting_interval

    self.collector.refresh_configuration()
    should_profile = AgentConfiguration.get().should_profile
    self.is_profiling_in_progress = should_profile

    # If we should not profile we can simply wait for the reporting interval
    # and call again at that time.
    self.scheduler.update_delay_provider(
        sampling_delay if should_profile else reporting_delay)
def test_default_values_are_overridden_at_merge_with(self):
    """Check that merging a configure_agent response replaces values that came from defaults."""
    merger = AgentConfigurationMerger(default=self.config)
    self.assert_init_values()

    merger.merge_with(configure_agent_response=self.configure_agent_response)

    # Fetch the configuration again: it is read only after the merge has happened.
    merged = AgentConfiguration.get()
    assert merged.should_profile is False
    assert merged.sampling_interval == timedelta(milliseconds=2000)
    assert merged.minimum_time_reporting == timedelta(milliseconds=21000)
    assert merged.reporting_interval == timedelta(seconds=123)
    assert merged.max_stack_depth == 1001
def __init__(self, environment=None):
    """
    Wire the runner together from the dependency container of the current profiler.

    :param environment: dependency container dictionary for the current profiler.
        Keys read directly here:
        - "timer" (optional): stored as ``self.timer``.
        - "sampler" (optional): sampler to use; when missing or falsy a ``Sampler`` is built
          from this same environment.
        - "initial_sampling_interval" (required): initial delay handed to the scheduler.
        - "profiler_thread_name" (required): thread name handed to the scheduler.
        - "collector" (required): collector object to handle sample processing.
        - "profiler_disabler" (required): stored as ``self.profiler_disabler``.
        NOTE(review): earlier documentation also listed "sampling_interval" and
        "killswitch_filepath" as required; they are not read in this constructor and are
        presumably consumed by collaborators built from the same environment - confirm.
    :raises KeyError: when one of the required keys is missing.
    """
    # A fresh dict per call: a ``dict()`` default would be created once and shared by every
    # instance constructed without an explicit environment.
    if environment is None:
        environment = {}

    self.timer = environment.get("timer")
    self.sampler = environment.get("sampler") or Sampler(environment=environment)
    self.scheduler = Scheduler(
        command=self._profiling_command,
        # Looked up lazily so a refreshed configuration is picked up on the next delay.
        delay_provider=lambda: AgentConfiguration.get().sampling_interval,
        initial_delay=environment["initial_sampling_interval"],
        thread_name=environment["profiler_thread_name"])
    self.collector = environment["collector"]
    self.profiler_disabler = environment["profiler_disabler"]
    self.is_profiling_in_progress = False
    self._first_execution = True
def is_overall_cpu_usage_limit_reached(self, profile=None):
    """
    This function carries out an overall cpu limit check that covers the cpu overhead caused for the full
    sampling cycle: refresh config -> (sample -> aggregate) * n -> profile submission. We expect this function to
    be called after profile submission.

    :param profile: profile whose active time is used as the denominator; when missing the check is skipped.
    :return: True when the "runProfiler" time relative to the profile's active time is at or above the
        configured cpu limit percentage; False otherwise, including when there is not enough data
        (no profile, no metric, too few measures, or no active time recorded yet).
    """
    profiler_metric = self.timer.metrics.get("runProfiler")
    if not profile or not profiler_metric \
            or profiler_metric.counter < MINIMUM_MEASURES_IN_DURATION_METRICS:
        return False

    # Read the active time once so the computed ratio and the debug log agree.
    active_seconds = profile.get_active_millis_since_start() / 1000
    if active_seconds <= 0:
        # Without any active time a usage ratio cannot be computed; treat it as
        # "limit not reached" instead of failing with a ZeroDivisionError.
        return False

    used_time_percentage = 100 * profiler_metric.total / active_seconds
    cpu_limit_percentage = AgentConfiguration.get().cpu_limit_percentage

    if used_time_percentage >= cpu_limit_percentage:
        logger.debug(self.timer.metrics)
        logger.debug("Profile active seconds since start: {:.2f} s".format(active_seconds))
        logger.info(
            "Profiler overall cpu usage limit reached: {:.2f} % (limit: {:.2f} %), will stop CodeGuru Profiler."
            .format(used_time_percentage, cpu_limit_percentage))
        return True
    return False
def test_it_sets_all_parameters(self):
    """Build a ProfilerDisabler from the test environment and check what it exposes."""
    self.env['memory_limit_bytes'] = 42
    self.disabler = ProfilerDisabler(self.env)

    disabler = self.disabler
    assert disabler.memory_limit_bytes == 42
    assert disabler.killswitch.killswitch_filepath == 'path_to_my_kill_switch'
    assert disabler.cpu_usage_check.timer == self.timer
    assert AgentConfiguration.get().cpu_limit_percentage == DEFAULT_CPU_LIMIT_PERCENTAGE
def reset(self):
    """Replace the current profile with a new one from the profile factory, then reset the timer."""
    profile_arguments = dict(
        profiling_group_name=self.profiling_group_name,
        sampling_interval_seconds=AgentConfiguration.get().sampling_interval.total_seconds(),
        host_weight=self.host_weight,
        start=current_milli_time(clock=self.clock),
        clock=self.clock,
    )
    self.profile = self.profile_factory(**profile_arguments)
    self.timer.reset()
def test_when_backend_sends_validation_exception_it_stops_the_profiling(self):
    """Stub a ValidationException on configure_agent and check that should_profile becomes False."""
    simulated_failure = {
        'service_error_code': 'ValidationException',
        'service_message': 'Simulated error in configure_agent call',
    }
    self.client_stubber.add_client_error('configure_agent', **simulated_failure)

    with self.client_stubber:
        self.subject.refresh_configuration()
        should_profile = AgentConfiguration.get().should_profile
        assert should_profile is False
def test_when_backends_sends_resource_not_found_it_stops_the_profiling_in_non_lambda_case(self):
    """Stub a ResourceNotFoundException on configure_agent and check that should_profile becomes False."""
    simulated_failure = {
        'service_error_code': 'ResourceNotFoundException',
        'service_message': 'Simulated error in configure_agent call',
    }
    self.client_stubber.add_client_error('configure_agent', **simulated_failure)

    with self.client_stubber:
        self.subject.refresh_configuration()
        should_profile = AgentConfiguration.get().should_profile
        assert should_profile is False
def test_configure_agent_calls_the_client(self):
    """Stub a configure_agent response and check that the refreshed configuration reflects it."""
    agent_parameters = {
        'SamplingIntervalInMilliseconds': '91000',
        'MinimumTimeForReportingInMilliseconds': '60000',
        'MaxStackDepth': '1001'
    }
    configuration = {
        'agentParameters': agent_parameters,
        'periodInSeconds': 123,
        'shouldProfile': False
    }
    self.client_stubber.add_response('configure_agent', {'configuration': configuration})

    with self.client_stubber:
        self.subject.refresh_configuration()
        refreshed = AgentConfiguration.get()
        assert refreshed.should_profile is False
        assert refreshed.sampling_interval.total_seconds() == 91
def is_cpu_usage_limit_reached(self, profile=None):
    """
    Check whether the average "runProfiler" duration exceeds the configured cpu limit.

    :param profile: optional profile used to derive the average sampling interval.
    :return: True when the average "runProfiler" time relative to the average sampling interval is at or
        above the configured cpu limit percentage; False otherwise, including when there is not enough
        data (no metric, too few measures, or a non-positive sampling interval).
    """
    profiler_metric = self.timer.metrics.get("runProfiler")
    if not profiler_metric or profiler_metric.counter < MINIMUM_MEASURES_IN_DURATION_METRICS:
        return False

    sampling_interval_seconds = self._get_average_sampling_interval_seconds(profile)
    if sampling_interval_seconds <= 0:
        # A usage ratio cannot be computed against an empty interval; treat it as
        # "limit not reached" instead of failing with a ZeroDivisionError.
        return False

    used_time_percentage = 100 * profiler_metric.average() / sampling_interval_seconds
    # Read the limit once so the value compared and the value logged cannot diverge
    # if the configuration is replaced in between.
    cpu_limit_percentage = AgentConfiguration.get().cpu_limit_percentage

    if used_time_percentage >= cpu_limit_percentage:
        logger.debug(self.timer.metrics)
        logger.info(
            "Profiler cpu usage limit reached: {:.2f} % (limit: {:.2f} %), will stop CodeGuru Profiler."
            .format(used_time_percentage, cpu_limit_percentage))
        return True
    return False
def test_when_backends_sends_resource_not_found_it_does_not_stop_the_profiling_in_lambda_case(self):
    """
    Stub a ResourceNotFoundException on configure_agent while the Lambda environment variables
    are set, and check that should_profile stays True.
    """
    # Imported locally so this test does not rely on the module importing it at file level.
    from unittest import mock

    self.client_stubber.add_client_error(
        'configure_agent',
        service_error_code='ResourceNotFoundException',
        service_message='Simulated error in configure_agent call')

    lambda_environment = {
        LAMBDA_TASK_ROOT: 'test-task-root',
        LAMBDA_RUNTIME_DIR: 'test-dir',
    }
    # patch.dict restores os.environ on exit, so the Lambda markers cannot leak
    # into tests that expect a non-Lambda environment.
    with mock.patch.dict(os.environ, lambda_environment):
        with self.client_stubber:
            self.subject.refresh_configuration()
            assert AgentConfiguration.get().should_profile is True
def reset(self):
    """Reset the error metadata and timer, then replace the current profile with a new one."""
    self.errors_metadata.reset()
    self.timer.reset()

    profile_arguments = dict(
        profiling_group_name=self.profiling_group_name,
        sampling_interval_seconds=AgentConfiguration.get().sampling_interval.total_seconds(),
        host_weight=self.host_weight,
        start=current_milli_time(clock=self.clock),
        agent_debug_info=AgentDebugInfo(self.errors_metadata, self.agent_start_time, self.timer),
        clock=self.clock,
    )
    self.profile = self.profile_factory(**profile_arguments)
def test_a_user_override_is_not_overridden_at_merge(self):
    """Check that a user-overridden sampling interval survives a merge that changes the other values."""
    merger = AgentConfigurationMerger(
        default=self.config, user_overrides=self.overide_config)

    before_merge = AgentConfiguration.get()
    assert before_merge.should_profile is True
    assert before_merge.sampling_interval == timedelta(seconds=9)
    assert before_merge.minimum_time_reporting == timedelta(seconds=1)
    assert before_merge.reporting_interval == timedelta(minutes=1)
    assert before_merge.max_stack_depth == 998

    merger.merge_with(configure_agent_response=self.configure_agent_response)

    # Fetch again: the configuration is read only after the merge has happened.
    after_merge = AgentConfiguration.get()
    assert after_merge.should_profile is False
    assert after_merge.sampling_interval == timedelta(seconds=9)
    assert after_merge.minimum_time_reporting == timedelta(milliseconds=21000)
    assert after_merge.reporting_interval == timedelta(seconds=123)
    assert after_merge.max_stack_depth == 1001
def _setup_final_environment(self, environment, environment_override):
    """
    Apply the overrides to ``environment``, fill in the derived entries and freeze the result.

    :param environment: dictionary to complete; it is updated in place.
    :param environment_override: entries that replace or extend those in ``environment``.
    :return: the completed environment wrapped in an UnmodifiableDict.
    """
    environment.update(environment_override)

    # set additional parameters if needed (costly default init or depend on other parameters)
    if environment.get('initial_sampling_interval') is None:
        rng = SystemRandom()
        upper_bound_seconds = AgentConfiguration.get().sampling_interval.total_seconds()
        environment['initial_sampling_interval'] = datetime.timedelta(
            seconds=rng.uniform(0, upper_bound_seconds))

    # The profiler's own thread is always part of the excluded set.
    own_thread = {environment['profiler_thread_name']}
    environment['excluded_threads'] = frozenset(own_thread.union(environment['excluded_threads']))

    # TODO delay metadata lookup until we need it
    if not environment.get('agent_metadata'):
        environment['agent_metadata'] = AgentMetadata()
    # The collector is selected from the environment built so far, so it must come after the metadata.
    if not environment.get('collector'):
        environment['collector'] = self._select_collector(environment)
    if not environment.get('profiler_disabler'):
        environment["profiler_disabler"] = ProfilerDisabler(environment)

    return UnmodifiableDict(environment)
def test_when_orchestrator_says_no_to_profiler(self):
    """Check that with should_profile False the scheduler waits for the reporting interval and nothing is collected."""
    self.agent_configuration = AgentConfiguration(
        should_profile=False,
        sampling_interval=timedelta(seconds=2),
        reporting_interval=timedelta(seconds=151))

    def reporting_interval_is_applied():
        return AgentConfiguration.get().reporting_interval.total_seconds() == 151

    def scheduler_uses_reporting_interval():
        return self.profiler_runner.scheduler._get_next_delay_seconds() == 151

    # calling start in this test, it will start the scheduler and because initial delay is 0 it will execute now
    self.profiler_runner.start()

    # still it is safer to wait until the new config has been applied
    wait_for(reporting_interval_is_applied)
    wait_for(scheduler_uses_reporting_interval)

    assert self.profiler_runner.scheduler._get_next_delay_seconds() == 151
    self.mock_collector.add.assert_not_called()
def assert_initial_values():
    """Assert that the active configuration holds the initial values used by these tests."""
    current = AgentConfiguration.get()
    assert current.should_profile is True
    assert current.sampling_interval == timedelta(seconds=1)
    assert current.reporting_interval == timedelta(minutes=13)
    assert current.minimum_time_reporting == timedelta(minutes=6)
    assert current.max_stack_depth == 2345
    assert current.cpu_limit_percentage == 29
def sample(self):
    """
    Samples stack traces of running threads (up to max_threads, and excluding excluded_threads) running in the
    current Python instance. Any exception encountered during sampling process will be propagated.

    :return: a Sample holding the collected stacks together with the number of threads seen and the number
        of threads that were attempted to be sampled.
    """
    every_thread = self._get_all_threads()
    seen_count = len(every_thread)
    candidates = self._threads_to_sample_from(every_thread)
    attempted_count = len(candidates)

    collected_stacks = self._get_stacks(
        threads_to_sample=candidates,
        excluded_threads=self._excluded_threads,
        max_depth=AgentConfiguration.get().max_stack_depth)

    # Memory usage optimization
    del every_thread
    del candidates

    return Sample(
        stacks=collected_stacks,
        attempted_sample_threads_count=attempted_count,
        seen_threads_count=seen_count)
def is_sampling_cpu_usage_limit_reached(self, profile=None):
    """
    Check whether the average "sampleAndAggregate" duration exceeds the configured cpu limit.

    :param profile: optional profile used to derive the average sampling interval.
    :return: True when the average "sampleAndAggregate" time relative to the average sampling interval is
        at or above the configured cpu limit percentage; False otherwise, including when there is not
        enough data (no metric, too few measures, or a non-positive sampling interval).
    """
    sample_and_aggregate_metric = self.timer.metrics.get("sampleAndAggregate")
    if not sample_and_aggregate_metric or \
            sample_and_aggregate_metric.counter < MINIMUM_MEASURES_IN_DURATION_METRICS:
        return False

    sampling_interval_seconds = self._get_average_sampling_interval_seconds(profile)
    if sampling_interval_seconds <= 0:
        # A usage ratio cannot be computed against an empty interval; treat it as
        # "limit not reached" instead of failing with a ZeroDivisionError.
        return False

    used_time_percentage = 100 * sample_and_aggregate_metric.average() / sampling_interval_seconds
    cpu_limit_percentage = AgentConfiguration.get().cpu_limit_percentage

    if used_time_percentage >= cpu_limit_percentage:
        logger.debug(self.timer.metrics)
        logger.debug("Sampling interval seconds: {:.2f} s".format(sampling_interval_seconds))
        logger.info(
            "Profiler sampling cpu usage limit reached: {:.2f} % (limit: {:.2f} %), will stop CodeGuru Profiler."
            .format(used_time_percentage, cpu_limit_percentage))
        return True
    return False
def test_beta_endpoint_call_report_and_refresh_and_overrides_default_agent_configuration(self):
    """Report a profile and refresh the configuration through SdkReporter, then check the resulting values."""
    self.environment["agent_config_merger"] = AgentConfigurationMerger(default=self.agent_config)
    reporter = SdkReporter(self.environment)
    reporter.setup()
    self.assert_initial_values()

    assert reporter.report(self.profile) is True

    reporter.refresh_configuration()

    # Read the configuration only after the refresh has completed.
    refreshed = AgentConfiguration.get()
    assert refreshed.should_profile is True
    assert refreshed.sampling_interval == timedelta(seconds=1)
    assert refreshed.reporting_interval == timedelta(minutes=5)
    assert refreshed.minimum_time_reporting == timedelta(seconds=60)
    assert refreshed.max_stack_depth == 1000
    assert refreshed.cpu_limit_percentage == 10
def test_live_profiling(self):
    """
    Run a Profiler for a few seconds against wrapped client calls and check that it
    sampled, reported and refreshed its configuration at least the expected number of times.

    The test is timing dependent: it sleeps for 4 seconds while the profiler runs.
    """
    # Phase 1: build the profiler while the reporting-time checks and the create-profiling-group
    # check are patched to return False.
    with \
            patch(
                "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration.is_under_min_reporting_time",
                return_value=False), \
            patch(
                "codeguru_profiler_agent.sdk_reporter.sdk_reporter.SdkReporter.check_create_pg_called_during_submit_profile",
                return_value=False), \
            patch(
                "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration._is_reporting_interval_smaller_than_minimum_allowed",
                return_value=False):
        profiler = Profiler(
            profiling_group_name=DUMMY_TEST_PROFILING_GROUP_NAME,
            region_name='eu-west-2',
            environment_override={
                "initial_sampling_interval": timedelta(),
                "sampling_interval": timedelta(seconds=1),
                "reporting_interval": timedelta(seconds=2),
                'agent_metadata': AgentMetadata(fleet_info=DefaultFleetInfo())
            })
        # Reach into the profiler to get the objects whose calls are counted below.
        client = profiler._profiler_runner.collector.reporter.codeguru_client_builder.codeguru_client
        aggregator = profiler._profiler_runner.collector
        assert AgentConfiguration.get().sampling_interval == timedelta(
            seconds=1)
        assert AgentConfiguration.get().reporting_interval == timedelta(
            seconds=2)
    # Phase 2: wrap the client and aggregator calls so they can be counted, and patch the
    # reporting-time checks again for the duration of the run.
    with \
            patch.object(client, "post_agent_profile", wraps=client.post_agent_profile) as wrapped_post_agent_profile, \
            patch.object(client, "configure_agent", wraps=client.configure_agent) as wrapped_configure_agent, \
            patch.object(aggregator, "add", wraps=aggregator.add) as wrapped_add, \
            patch(
                "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration.is_under_min_reporting_time",
                return_value=False), \
            patch(
                "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration._is_reporting_interval_smaller_than_minimum_allowed",
                return_value=False):
        # With an explicit return_value, the wrapping mock answers with this canned response
        # instead of delegating to the wrapped configure_agent (unittest.mock `wraps` semantics).
        wrapped_configure_agent.return_value = {
            "configuration": {
                "agentParameters": {
                    "SamplingIntervalInMilliseconds": "100",
                    "MinimumTimeForReportingInMilliseconds": "1000",
                    "MaxStackDepth": "1000",
                    "MemoryUsageLimitPercent": "29"
                },
                "periodInSeconds": 2,
                "shouldProfile": True
            }
        }
        try:
            start_status = profiler.start()
            assert start_status
            assert profiler.is_running()
            time.sleep(4)
        finally:
            # Always stop the profiler, even when one of the assertions above fails.
            profiler.stop()
        # We should see at least 2 samples in 4 seconds, as the sequence should happen in this order:
        # initial delay (1 second)
        # After 1 second, no flush -> sample
        # After 2 seconds, it attempts to flush (possibly succeeds) -> sample / no sample
        # After 3 seconds, it attempts to flush (must succeed if it did not flush before) -> no sample / sample
        # After 4 seconds, no flush -> sample (if the profiler has not stopped yet)
        # NOTE(review): the override above sets initial_sampling_interval to timedelta(), so the
        # "1 second" initial delay mentioned here may be stale - confirm.
        assert wrapped_add.call_count >= 2
        assert wrapped_post_agent_profile.call_count >= 1
        assert wrapped_configure_agent.call_count >= 1
        # The intervals still match the environment_override values, not the canned response;
        # presumably user overrides take precedence at merge time - confirm.
        assert AgentConfiguration.get().sampling_interval == timedelta(
            seconds=1)
        assert AgentConfiguration.get(
        ).reporting_interval == timedelta(seconds=2)
def test_given_override_is_used(self):
    """Invoke the handler once and check that the cpu limit in the configuration is 42."""
    empty_event = {}
    self.handler(empty_event, self.context)

    cpu_limit = AgentConfiguration.get().cpu_limit_percentage
    assert cpu_limit == 42
def test_it_throws_error_at_calling_get_when_singleton_is_none(self):
    """Check that AgentConfiguration.get() raises ValueError when the module-level singleton is None."""
    # Clear the module-level singleton directly on the agent_configuration module.
    configuration_module = codeguru_profiler_agent.reporter.agent_configuration
    configuration_module._singleton = None

    with pytest.raises(ValueError):
        AgentConfiguration.get()
def _get_average_sampling_interval_seconds(profile):
    """
    Return the sampling interval, in seconds, to use for cpu usage checks.

    :param profile: profile to derive the average from; may be None.
    :return: the configured sampling interval when there is no profile or it holds fewer than
        MINIMUM_SAMPLES_IN_PROFILE samples; otherwise the profile's active time divided by its
        sample count, converted from milliseconds to seconds.
    """
    use_configured_interval = \
        profile is None or profile.total_sample_count < MINIMUM_SAMPLES_IN_PROFILE
    if use_configured_interval:
        return AgentConfiguration.get().sampling_interval.total_seconds()

    millis_per_sample = profile.get_active_millis_since_start() / profile.total_sample_count
    return millis_per_sample / 1000
def _is_over_reporting_interval(self, now):
    """
    Ask the current configuration whether the time since the last report attempt
    is over the reporting interval.

    :param now: current time, in the same unit as ``self.last_report_attempted``.
    """
    configuration = AgentConfiguration.get()
    elapsed_since_last_attempt = now - self.last_report_attempted
    return configuration.is_over_reporting_interval(elapsed_since_last_attempt)
def _is_under_min_reporting_time(self, now):
    """
    Ask the current configuration whether the time since the last report attempt
    is under the minimum reporting time.

    :param now: current time, in the same unit as ``self.last_report_attempted``.
    """
    configuration = AgentConfiguration.get()
    elapsed_since_last_attempt = now - self.last_report_attempted
    return configuration.is_under_min_reporting_time(elapsed_since_last_attempt)
def test_it_sets_should_profile_to_false(self):
    """Check that disabling profiling through the merger turns should_profile off."""
    self.agent_config_merger.disable_profiling()

    should_profile = AgentConfiguration.get().should_profile
    assert should_profile is False
def test_live_profiling(self):
    """
    Run a Profiler for a few seconds against wrapped client calls and check that it
    sampled, reported and refreshed its configuration at least the expected number of times.

    The test is timing dependent: it sleeps for 3 seconds while the profiler runs.
    """
    # Phase 1: build the profiler while the reporting-time checks are patched to return False.
    with \
            patch(
                "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration.is_under_min_reporting_time",
                return_value=False), \
            patch(
                "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration._is_reporting_interval_smaller_than_minimum_allowed",
                return_value=False):
        profiler = Profiler(
            profiling_group_name=DUMMY_TEST_PROFILING_GROUP_NAME,
            region_name='eu-west-2',
            environment_override={
                "initial_sampling_interval": timedelta(),
                "sampling_interval": timedelta(seconds=1),
                "reporting_interval": timedelta(seconds=2),
                'agent_metadata': AgentMetadata(fleet_info=DefaultFleetInfo())
            })
        # Reach into the profiler to get the objects whose calls are counted below.
        client = profiler._profiler_runner.collector.reporter.codeguru_client_builder.codeguru_client
        aggregator = profiler._profiler_runner.collector
        assert AgentConfiguration.get().sampling_interval == timedelta(
            seconds=1)
        assert AgentConfiguration.get().reporting_interval == timedelta(
            seconds=2)
    # Phase 2: wrap the client and aggregator calls so they can be counted, and patch the
    # reporting-time checks again for the duration of the run.
    with \
            patch.object(client, "post_agent_profile", wraps=client.post_agent_profile) as wrapped_post_agent_profile, \
            patch.object(client, "configure_agent", wraps=client.configure_agent) as wrapped_configure_agent, \
            patch.object(aggregator, "add", wraps=aggregator.add) as wrapped_add, \
            patch(
                "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration.is_under_min_reporting_time",
                return_value=False), \
            patch(
                "codeguru_profiler_agent.reporter.agent_configuration.AgentConfiguration._is_reporting_interval_smaller_than_minimum_allowed",
                return_value=False):
        # With an explicit return_value, the wrapping mock answers with this canned response
        # instead of delegating to the wrapped configure_agent (unittest.mock `wraps` semantics).
        wrapped_configure_agent.return_value = {
            "configuration": {
                "agentParameters": {
                    "SamplingIntervalInMilliseconds": "100",
                    "MinimumTimeForReportingInMilliseconds": "1000",
                    "MaxStackDepth": "1000",
                    "MemoryUsageLimitPercent": "29"
                },
                "periodInSeconds": 2,
                "shouldProfile": True
            }
        }
        try:
            start_status = profiler.start()
            assert start_status
            assert profiler.is_running()
            time.sleep(3)
        finally:
            # Always stop the profiler, even when one of the assertions above fails.
            profiler.stop()
        # Minimum activity expected from the 3 second run.
        assert wrapped_add.call_count >= 3
        assert wrapped_post_agent_profile.call_count >= 1
        assert wrapped_configure_agent.call_count >= 1
        # The intervals still match the environment_override values, not the canned response;
        # presumably user overrides take precedence at merge time - confirm.
        assert AgentConfiguration.get().sampling_interval == timedelta(
            seconds=1)
        assert AgentConfiguration.get(
        ).reporting_interval == timedelta(seconds=2)