def test_success_outside_grace_period(self):
    '''Failures inside the grace period are tolerated; a later pass yields success.

    Two failing results and one passing result are queued. The status is
    expected to stay TASK_STARTING for the first two rounds and to become
    TASK_RUNNING on the third.
    '''
    self.append_health_checks(False, num_calls=2)
    self.append_health_checks(True)
    starting = StatusResult(None, TaskState.Value('TASK_STARTING'))
    healthy = StatusResult('Task is healthy.', TaskState.Value('TASK_RUNNING'))

    checker = HealthChecker(
        self._checker.health,
        interval_secs=self.interval_secs,
        clock=self._clock)
    checker.start()

    # Rounds one and two: the failing result leaves the task in "starting".
    for _ in range(2):
        assert self._clock.converge(threads=[checker.threaded_health_checker])
        self._clock.assert_waiting(checker.threaded_health_checker, self.interval_secs)
        assert checker.status == starting
        assert checker.threaded_health_checker.running is False
        self._clock.tick(self.interval_secs)

    # Round three: the passing result flips the task to running.
    assert self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, self.interval_secs)
    assert checker.status == healthy
    assert checker.threaded_health_checker.running is True

    checker.stop()
    assert self._checker.health.call_count == 3
def test_grace_period_2x_failure(self):
    '''Grace period is 2 x interval and every health check fails.

    The first two failures are expected to be ignored (status remains
    TASK_STARTING); the third is expected to produce TASK_FAILED.
    '''
    self.append_health_checks(False, num_calls=3)
    starting = StatusResult(None, TaskState.Value('TASK_STARTING'))
    failed = StatusResult('Failed health check! reason', TaskState.Value('TASK_FAILED'))

    checker = HealthChecker(
        self._checker.health,
        interval_secs=self.interval_secs,
        clock=self._clock)
    checker.start()

    # Rounds one and two: failures do not change the reported status.
    for _ in range(2):
        assert self._clock.converge(threads=[checker.threaded_health_checker])
        self._clock.assert_waiting(checker.threaded_health_checker, self.interval_secs)
        assert checker.status == starting
        assert checker.threaded_health_checker.running is False
        self._clock.tick(self.interval_secs)

    # Round three: the failure is now reported.
    assert self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, self.interval_secs)
    assert checker.status == failed
    assert checker.threaded_health_checker.running is False

    checker.stop()
    assert self._checker.health.call_count == 3
def test_consecutive_failures_failfast(self):
    '''Verify that the health check is failed fast.

    Three failing results are queued, but the test expects TASK_FAILED after
    only two of them have been consumed.
    '''
    grace = self.initial_interval_secs
    period = self.interval_secs
    self.append_health_checks(False, num_calls=3)

    checker = HealthChecker(
        self._checker.health,
        interval_secs=period,
        grace_period_secs=grace,
        max_consecutive_failures=2,
        min_consecutive_successes=2,
        clock=self._clock)
    checker.start()

    # First failing check: ignored inside grace_period_secs, so the status is
    # still "starting" and the failure counter has not moved.
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, period)
    assert checker.status == StatusResult(None, TaskState.Value('TASK_STARTING'))
    assert checker.metrics.sample()['consecutive_failures'] == 0

    # Second failing check: the task is reported as failed with the counter at 1.
    self._clock.tick(period)
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, period)
    assert checker.status == StatusResult(
        'Failed health check! reason', TaskState.Value('TASK_FAILED'))
    assert checker.metrics.sample()['consecutive_failures'] == 1

    checker.stop()
    assert self._checker.health.call_count == 2
def test_consecutive_failures(self):
    '''Verify that a task is unhealthy only after max_consecutive_failures is exceeded.'''
    initial_interval_secs = 2
    interval_secs = 1

    # Queued results: fail, fail, pass, then fail, fail, fail.
    self.append_health_checks(False, num_calls=2)
    self.append_health_checks(True)
    self.append_health_checks(False, num_calls=3)

    checker = HealthChecker(
        self._checker.health,
        interval_secs=interval_secs,
        initial_interval_secs=initial_interval_secs,
        max_consecutive_failures=2,
        clock=self._clock)
    checker.start()

    # The first five ticks cover two failures, one success and two more
    # failures; no status is expected after any of them.
    quiet_ticks = (initial_interval_secs,) + (interval_secs,) * 4
    for delay in quiet_ticks:
        self._clock.tick(delay)
        assert checker.status is None

    # The sixth tick delivers the third consecutive failure.
    self._clock.tick(interval_secs)
    thread_yield()
    assert checker.status.status == TaskState.Value('TASK_FAILED')

    checker.stop()
    assert self._checker.health.call_count == 6
def test_initial_interval_whatev(self):
    '''A failing check with initial_interval_secs=0 is reported right after start().'''
    self.append_health_checks(False)

    checker = HealthChecker(
        self._checker.health,
        interval_secs=5,
        initial_interval_secs=0,
        clock=self._clock)
    checker.start()

    # No clock tick is issued: the failure is expected to be visible already.
    expected_state = TaskState.Value('TASK_FAILED')
    assert checker.status.status == expected_state

    checker.stop()
    assert self._checker.health.call_count == 1
def test_consecutive_failures(self):
    '''Verify that a task is unhealthy only after max_consecutive_failures is exceeded.'''
    initial_interval_secs = 2
    interval_secs = 1
    epsilon = 0.001

    # Queued results: fail, fail, pass, then fail, fail, fail.
    self.append_health_checks(False, num_calls=2)
    self.append_health_checks(True)
    self.append_health_checks(False, num_calls=3)

    checker = HealthChecker(
        self._checker.health,
        interval_secs=interval_secs,
        initial_interval_secs=initial_interval_secs,
        max_consecutive_failures=2,
        clock=self._clock)
    checker.start()
    self._clock.converge(threads=[checker.threaded_health_checker])

    # (delay before the check, consecutive_failures expected afterwards).
    # The counter climbs to 2, drops to 0 on the passing check, then climbs again.
    quiet_rounds = (
        (initial_interval_secs, 1),
        (interval_secs, 2),
        (interval_secs, 0),
        (interval_secs, 1),
        (interval_secs, 2),
    )
    for delay, expected_failures in quiet_rounds:
        self._clock.tick(delay + epsilon)
        self._clock.converge(threads=[checker.threaded_health_checker])
        self._clock.assert_waiting(checker.threaded_health_checker, amount=1)
        assert checker.status is None
        assert checker.metrics.sample()['consecutive_failures'] == expected_failures

    # Third consecutive failure: the task is now reported as failed.
    self._clock.tick(interval_secs + epsilon)
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, amount=1)
    assert checker.status.status == TaskState.Value('TASK_FAILED')
    assert checker.metrics.sample()['consecutive_failures'] == 3

    checker.stop()
    assert self._checker.health.call_count == 6
def test_initial_interval_whatev(self):
    '''With grace_period_secs=0 the first failing check is reported as TASK_FAILED.'''
    # Two failures are queued; only one is expected to be consumed.
    self.append_health_checks(False, 2)
    failed = StatusResult('Failed health check! reason', TaskState.Value('TASK_FAILED'))

    checker = HealthChecker(
        self._checker.health,
        interval_secs=self.interval_secs,
        grace_period_secs=0,
        clock=self._clock)
    checker.start()

    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, self.interval_secs)
    assert checker.status == failed

    checker.stop()
    assert self._checker.health.call_count == 1
def test_initial_interval_2x(self):
    '''No status is reported until 14 seconds of ticks have elapsed (interval_secs=5).'''
    self.append_health_checks(False)

    checker = HealthChecker(self._checker.health, interval_secs=5, clock=self._clock)
    checker.start()

    thread_yield()
    assert checker.status is None

    # 6s and then 9s in total: still no status expected.
    for elapsed in (6, 3):
        self._clock.tick(elapsed)
        assert checker.status is None

    # 14s in total: the single queued failure is expected to surface.
    self._clock.tick(5)
    thread_yield()
    assert checker.status.status == TaskState.Value('TASK_FAILED')

    checker.stop()
    assert self._checker.health.call_count == 1
def test_grace_period_2x_success(self):
    '''Grace period is 2 x interval and health checks succeed.

    The first passing check is expected to move the task to TASK_RUNNING
    without any clock tick.
    '''
    # Two passes are queued; only one is expected to be consumed.
    self.append_health_checks(True, num_calls=2)
    healthy = StatusResult('Task is healthy.', TaskState.Value('TASK_RUNNING'))

    checker = HealthChecker(
        self._checker.health,
        interval_secs=self.interval_secs,
        clock=self._clock)
    checker.start()

    assert self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, self.interval_secs)
    assert checker.status == healthy
    assert checker.threaded_health_checker.running is True

    checker.stop()
    assert self._checker.health.call_count == 1
def test_initial_interval_whatev(self):
    '''With initial_interval_secs=0 a failing check is reported once the thread settles.'''
    self.append_health_checks(False, 2)

    checker = HealthChecker(
        self._checker.health,
        interval_secs=5,
        initial_interval_secs=0,
        clock=self._clock)
    checker.start()

    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, amount=5)
    expected_state = TaskState.Value('TASK_FAILED')
    assert checker.status.status == expected_state

    checker.stop()

    # This is an implementation detail -- we healthcheck in the initializer and
    # healthcheck in the run loop. If we ever change the implementation, expect
    # this to break.
    assert self._checker.health.call_count == 2
def test_initial_interval_2x(self):
    '''The checker first waits 10s (interval_secs=5) and reports nothing before 14s of ticks.'''
    self.append_health_checks(False)

    checker = HealthChecker(self._checker.health, interval_secs=5, clock=self._clock)
    checker.start()

    assert self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, 10)
    assert checker.status is None

    # 6s and then 9s in total: still no status expected.
    for elapsed in (6, 3):
        self._clock.tick(elapsed)
        assert self._clock.converge(threads=[checker.threaded_health_checker])
        assert checker.status is None

    # 14s in total: the single queued failure is expected to surface.
    self._clock.tick(5)
    assert self._clock.converge(threads=[checker.threaded_health_checker])
    assert checker.status.status == TaskState.Value('TASK_FAILED')

    checker.stop()
    assert self._checker.health.call_count == 1
def test_health_checker_metrics(self):
    '''Latency and check-count metrics advance only once a slow check has finished.'''
    def delayed_check():
        # Each check spends 0.5s on the test clock before reporting success.
        self._clock.sleep(0.5)
        return True, None

    epsilon = 0.001
    checker = HealthChecker(
        delayed_check,
        interval_secs=1,
        initial_interval_secs=1,
        clock=self._clock)
    checker.start()

    # Before the first check starts, nothing has been recorded.
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, amount=1)
    assert checker._total_latency == 0
    assert checker.metrics.sample()['total_latency_secs'] == 0

    # Enter the first check; while it is sleeping the metrics are still zero.
    self._clock.tick(1.0 + epsilon)
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, amount=0.5)
    assert checker._total_latency == 0
    assert checker.metrics.sample()['total_latency_secs'] == 0
    assert checker.metrics.sample()['checks'] == 0

    # Let the first check complete; the thread then waits for interval_secs.
    self._clock.tick(0.5 + epsilon)
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, amount=1)
    assert checker._total_latency == 0.5
    assert checker.metrics.sample()['total_latency_secs'] == 0.5
    assert checker.metrics.sample()['checks'] == 1

    # Run a second full cycle: wait out the interval, then the check itself.
    self._clock.tick(1.0 + epsilon)
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.tick(0.5 + epsilon)
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, amount=1)
    assert checker._total_latency == 1.0
    assert checker.metrics.sample()['total_latency_secs'] == 1.0
    assert checker.metrics.sample()['checks'] == 2
def test_consecutive_failures_max_failures(self):
    '''Verify that a task is unhealthy after max_consecutive_failures is exceeded.'''
    grace = self.initial_interval_secs
    period = self.interval_secs

    # Queued results: pass, pass, then fail, fail, fail.
    self.append_health_checks(True, num_calls=2)
    self.append_health_checks(False, num_calls=3)

    starting = StatusResult(None, TaskState.Value('TASK_STARTING'))
    healthy = StatusResult('Task is healthy.', TaskState.Value('TASK_RUNNING'))
    failed = StatusResult('Failed health check! reason', TaskState.Value('TASK_FAILED'))

    checker = HealthChecker(
        self._checker.health,
        interval_secs=period,
        grace_period_secs=grace,
        max_consecutive_failures=2,
        min_consecutive_successes=2,
        clock=self._clock)
    checker.start()

    # First pass: one success is not enough to leave "starting".
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, period)
    assert checker.status == starting
    assert checker.metrics.sample()['consecutive_failures'] == 0

    # Second pass: the task is now reported as running.
    self._clock.tick(period)
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, period)
    assert checker.status == healthy
    assert checker.metrics.sample()['consecutive_failures'] == 0
    assert checker.threaded_health_checker.running is True

    # First and second failures: counted, but the task is still healthy.
    for expected_failures in (1, 2):
        self._clock.tick(period)
        self._clock.converge(threads=[checker.threaded_health_checker])
        self._clock.assert_waiting(checker.threaded_health_checker, period)
        assert checker.status == healthy
        assert checker.metrics.sample()['consecutive_failures'] == expected_failures

    # Third failure: the task is reported as failed.
    self._clock.tick(period)
    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, period)
    assert checker.status == failed
    assert checker.metrics.sample()['consecutive_failures'] == 3

    checker.stop()
    assert self._checker.health.call_count == 5
def setUp(self):
    '''Build mocked collaborators and two HealthChecker fixtures.

    One checker is created with no sandbox (None) and one with a mocked
    sandbox whose exists() returns True; both share the same mocked health
    callable, timing settings and clock.
    '''
    # Timing and threshold settings shared by both fixtures.
    self.initial_interval_secs = 1
    self.interval_secs = 5
    self.max_consecutive_failures = 2

    # Health callable that always reports success.
    self.health = mock.Mock(return_value=(True, 'Fake'))

    # Sandbox double restricted to the SandboxInterface attributes.
    self.sandbox = mock.Mock(spec_set=SandboxInterface)
    self.sandbox.exists.return_value = True
    self.sandbox.root = '/root'

    # Clock double whose time() is pinned to a constant.
    self.clock = mock.Mock(spec=time)
    self.clock.time.return_value = 1.0

    # Trailing positional arguments common to both constructions.
    shared_args = (
        self.interval_secs,
        self.initial_interval_secs,
        self.max_consecutive_failures,
        self.clock,
    )
    self.health_checker = HealthChecker(self.health, None, *shared_args)
    self.health_checker_sandbox_exists = HealthChecker(
        self.health, self.sandbox, *shared_args)
def test_initial_interval_whatev(self):
    '''A single failing check with initial_interval_secs=0 fails the task immediately.'''
    self.append_health_checks(False)

    options = dict(interval_secs=5, initial_interval_secs=0, clock=self._clock)
    checker = HealthChecker(self._checker.health, **options)
    checker.start()

    # The status is inspected straight after start(), with no clock tick.
    reported = checker.status.status
    assert reported == TaskState.Value('TASK_FAILED')

    checker.stop()
    assert self._checker.health.call_count == 1
def test_consecutive_failures(self):
    '''Verify that a task is unhealthy only after max_consecutive_failures is exceeded.'''
    initial_interval_secs = 2
    interval_secs = 1
    epsilon = 0.001

    # Queued results: fail, fail, pass, then fail, fail, fail.
    self.append_health_checks(False, num_calls=2)
    self.append_health_checks(True)
    self.append_health_checks(False, num_calls=3)

    checker = HealthChecker(
        self._checker.health,
        interval_secs=interval_secs,
        initial_interval_secs=initial_interval_secs,
        max_consecutive_failures=2,
        clock=self._clock)
    checker.start()
    self._clock.converge(threads=[checker.threaded_health_checker])

    def run_round(delay):
        # Advance just past `delay`, let the checker thread settle, and confirm
        # it is waiting one second for the next round.
        self._clock.tick(delay + epsilon)
        self._clock.converge(threads=[checker.threaded_health_checker])
        self._clock.assert_waiting(checker.threaded_health_checker, amount=1)

    def failure_count():
        return checker.metrics.sample()['consecutive_failures']

    # Two failures then a success: the counter rises to 2 and resets to 0.
    # Two further failures: it rises to 2 again, still with no status.
    delays = [initial_interval_secs] + [interval_secs] * 4
    expected_counts = [1, 2, 0, 1, 2]
    for delay, expected in zip(delays, expected_counts):
        run_round(delay)
        assert checker.status is None
        assert failure_count() == expected

    # Third consecutive failure: the task is reported as failed.
    run_round(interval_secs)
    assert checker.status.status == TaskState.Value('TASK_FAILED')
    assert failure_count() == 3

    checker.stop()
    assert self._checker.health.call_count == 6
def test_initial_interval_whatev(self):
    '''A failing check with initial_interval_secs=0 is reported after the thread settles.'''
    self.append_health_checks(False, 2)

    options = dict(interval_secs=5, initial_interval_secs=0, clock=self._clock)
    checker = HealthChecker(self._checker.health, **options)
    checker.start()

    self._clock.converge(threads=[checker.threaded_health_checker])
    self._clock.assert_waiting(checker.threaded_health_checker, amount=5)
    reported = checker.status.status
    assert reported == TaskState.Value('TASK_FAILED')

    checker.stop()

    # This is an implementation detail -- we healthcheck in the initializer and
    # healthcheck in the run loop. If we ever change the implementation, expect
    # this to break.
    assert self._checker.health.call_count == 2