def test_failed_master(self):
    """
    A replica must keep serving packages after its master goes down.

    https://bitbucket.org/hpk42/devpi/issues/353/non-available-mirrors-can-abort-index
    """
    users = {'user': {'password': NATIVE_PASSWORD}}
    indices = OrderedDict([
        ('user/baseindex', {}),
        ('user/index', {'bases': 'root/pypi,user/baseindex'}),
    ])
    master_context = TestServer(
        users,
        indices,
        config={'role': 'master', 'port': 2414, 'request-timeout': 30})
    with master_context as master:
        # Upload packages to baseindex.
        master.use('user', 'baseindex')
        master.login('user', NATIVE_PASSWORD)
        master.upload(DIST_DIR, directory=True)
        replica_config = {
            'master-url': master.server_url,
            'port': 2413,
            'request-timeout': 30,
        }
        with TestServer(config=replica_config, fail_on_output=[]) as replica:
            replica.use('user', 'index')
            # Request the package on the replica while the master is up.
            wait_until(lambda: download(PACKAGE_NAME, replica.url) is True)
            # Terminate the master; the download should still succeed.
            master_context.__exit__(None, None, None)
            wait_until(lambda: download(PACKAGE_NAME, replica.url) is True)
def test_failing_jobs(self):
    """All jobs run with a failing script end up in JOB_QUEUE_FAILED."""
    with TestAreaContext("job_queue_test_add") as _area:
        queue = create_queue(failing_script, max_submit=1)
        assert queue.queue_size == 10
        assert queue.is_active()

        semaphore = BoundedSemaphore(value=10)
        start_all(queue, semaphore)

        # Wait for every job to finish (all are expected to fail).
        wait_until(func=(lambda: self.assertFalse(queue.is_active())))
        for job in queue.job_list:
            job.wait_for()

        queue._transition()
        assert queue.fetch_next_waiting() is None

        # Both the job objects and the queue snapshot must agree on FAILED.
        for q_index, job in enumerate(queue.job_list):
            assert job.status == JobStatusType.JOB_QUEUE_FAILED
            iens = queue._qindex_to_iens[q_index]
            assert queue.snapshot()[iens] == str(
                JobStatusType.JOB_QUEUE_FAILED)
def test_unknown_workload_type_label(self):
    """An event with an unrecognised workload type is counted as an error
    while a subsequent valid event is still processed."""
    registry = Registry()
    test_context = TestContext()
    unknown_event = get_event(
        CONTAINER, CREATE, uuid.uuid4(), {
            NAME: "container-name",
            APP_NAME_LABEL_KEY: DEFAULT_TEST_APP_NAME,
            CPU_LABEL_KEY: "1",
            MEM_LABEL_KEY: str(DEFAULT_TEST_MEM),
            DISK_LABEL_KEY: str(DEFAULT_TEST_DISK),
            NETWORK_LABEL_KEY: str(DEFAULT_TEST_NETWORK),
            JOB_TYPE_LABEL_KEY: DEFAULT_TEST_JOB_TYPE,
            WORKLOAD_TYPE_LABEL_KEY: "unknown",
            OWNER_EMAIL_LABEL_KEY: DEFAULT_TEST_OWNER_EMAIL,
            IMAGE_LABEL_KEY: DEFAULT_TEST_IMAGE,
        })
    valid_event = get_container_create_event(1)
    provider = MockEventProvider([unknown_event, valid_event])
    manager = EventManager(provider, test_context.get_event_handlers(),
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)
    manager.set_registry(registry)
    manager.start_processing_events()

    # Exactly one failure, but both events counted as processed.
    wait_until(lambda: manager.get_error_count() == 1)
    wait_until(lambda: manager.get_processed_count() == 2)
    self.assertEqual(0, manager.get_queue_depth())
    manager.stop_processing_events()

    manager.report_metrics({})
    self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
    self.assertTrue(gauge_value_equals(registry, EVENT_SUCCEEDED_KEY, 5))
    self.assertTrue(gauge_value_equals(registry, EVENT_FAILED_KEY, 1))
    self.assertTrue(gauge_value_equals(registry, EVENT_PROCESSED_KEY, 2))
def test_free_cpu_on_container_die(self):
    """A die event frees all threads claimed by the preceding create event."""
    workload_name = str(uuid.uuid4())
    workload = Workload(workload_name, DEFAULT_CPU_COUNT, STATIC)
    docker_client = MockDockerClient([MockContainer(workload)])

    events = [
        get_container_create_event(DEFAULT_CPU_COUNT, STATIC, workload_name,
                                   workload_name),
        get_container_die_event(workload_name),
    ]
    event_count = len(events)
    # Queue depth of 1 forces in-order event processing for the test.
    provider = MockEventProvider(events, 1)

    test_context = TestContext(docker_client)
    manager = EventManager(provider, test_context.get_event_handlers(),
                          get_mock_file_manager(),
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)

    wait_until(lambda: event_count == manager.get_processed_count())
    self.assertEqual(0, manager.get_queue_depth())

    # After the die event every thread must be empty again.
    self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT,
                     len(test_context.get_cpu().get_empty_threads()))
    self.assertEqual(
        1,
        test_context.get_create_event_handler().get_handled_event_count())
    self.assertEqual(
        1,
        test_context.get_free_event_handler().get_handled_event_count())
    manager.stop_processing_events()
def test_absent_workload_type_label(self):
    """A create event without a workload type label is ignored, not failed."""
    registry = Registry()
    test_context = TestContext()
    name = str(uuid.uuid4())
    unknown_event = get_event(CONTAINER, CREATE, name, {
        CPU_LABEL_KEY: "1",
        NAME: name,
    })
    handlers = test_context.get_event_handlers()
    provider = MockEventProvider([unknown_event])
    manager = EventManager(provider, handlers,
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)
    manager.set_registry(registry)
    manager.start_processing_events()

    wait_until(lambda: test_context.get_create_event_handler()
               .get_ignored_event_count() == 1)
    self.assertEqual(0, manager.get_queue_depth())
    manager.stop_processing_events()

    manager.report_metrics({})
    self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
    # The ignored event still succeeds in every handler.
    self.assertTrue(
        gauge_value_equals(registry, EVENT_SUCCEEDED_KEY,
                           len(test_context.get_event_handlers())))
    self.assertTrue(gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
    self.assertTrue(gauge_value_equals(registry, EVENT_PROCESSED_KEY, 1))
def test_cross_replica_synchronization(self):
    """
    Any change performed by a replica should be observable by another one.
    """
    users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
    indices = {NATIVE_USER + '/index': {}}
    master_config = {'role': 'master', 'port': 2414}
    with TestServer(users, indices, config=master_config) as master:
        replica1_config = {'master-url': master.server_url, 'port': 2413}
        replica2_config = {'master-url': master.server_url, 'port': 2412}
        with TestServer(config=replica1_config,
                        fail_on_output=[]) as replica1:
            with TestServer(config=replica2_config,
                            fail_on_output=[]) as replica2:
                replica1.use(NATIVE_USER, 'index')
                replica2.use(NATIVE_USER, 'index')
                replica1.login(NATIVE_USER, NATIVE_PASSWORD)

                # An upload via replica1 becomes visible on replica2 ...
                replica1.upload(DIST_DIR, directory=True)
                wait_until(
                    lambda: download(PACKAGE_NAME, replica2.url) is True)

                # ... and so does a removal.
                replica1.remove(PACKAGE_NAME)
                wait_until(
                    lambda: download(PACKAGE_NAME, replica2.url) is False)
def test_kill_jobs(self):
    """Stopping never-ending jobs moves them all to JOB_QUEUE_IS_KILLED."""
    with TestAreaContext("job_queue_test_kill") as _area:
        queue = create_queue(never_ending_script)
        assert queue.queue_size == 10
        assert queue.is_active()

        semaphore = BoundedSemaphore(value=10)
        start_all(queue, semaphore)

        # Make sure the never-ending jobs are actually running first.
        wait_until(lambda: self.assertTrue(queue.is_active()))
        for job in queue.job_list:
            job.stop()
        wait_until(lambda: self.assertFalse(queue.is_active()))

        queue._transition()
        for q_index, job in enumerate(queue.job_list):
            assert job.status == JobStatusType.JOB_QUEUE_IS_KILLED
            iens = queue._qindex_to_iens[q_index]
            assert queue.snapshot()[iens] == str(
                JobStatusType.JOB_QUEUE_IS_KILLED)

        for job in queue.job_list:
            job.wait_for()
def test_workflow_thread_cancel_external(self):
    """Cancelling a running workflow stops it before later jobs execute."""
    with TestAreaContext(
            "python/job_queue/workflow_runner_external") as _area:
        WorkflowCommon.createWaitJob()

        joblist = WorkflowJoblist()
        self.assertTrue(joblist.addJobFromFile("WAIT", "external_wait_job"))
        self.assertTrue("WAIT" in joblist)

        workflow = Workflow("wait_workflow", joblist)
        self.assertEqual(len(workflow), 3)

        runner = WorkflowRunner(workflow, ert=None,
                                context=SubstitutionList())
        self.assertFalse(runner.isRunning())

        with runner:
            # Let the first job start and finish, and the second one start.
            wait_until(lambda: self.assertTrue(runner.isRunning()))
            wait_until(lambda: self.assertFileExists("wait_started_0"))
            wait_until(lambda: self.assertFileExists("wait_finished_0"))
            wait_until(lambda: self.assertFileExists("wait_started_1"))

            runner.cancel()
            self.assertTrue(runner.isCancelled())

        # Nothing past the cancellation point may have run.
        self.assertFileDoesNotExist("wait_finished_1")
        self.assertFileDoesNotExist("wait_started_2")
        self.assertFileDoesNotExist("wait_cancelled_2")
        self.assertFileDoesNotExist("wait_finished_2")
def test_unknown_action(self):
    """An event with an unrecognised action is ignored by the create handler."""
    test_context = TestContext()
    unknown_event = get_event(CONTAINER, "unknown", uuid.uuid4(), {})
    provider = MockEventProvider([unknown_event])
    manager = EventManager(provider, test_context.get_event_handlers(),
                          get_mock_file_manager(),
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)
    wait_until(lambda: test_context.get_create_event_handler()
               .get_ignored_event_count() == 1)
    self.assertEqual(0, manager.get_queue_depth())
    manager.stop_processing_events()
def test_absent_cpu_label(self):
    """A create event missing the CPU label is ignored by the create handler."""
    test_context = TestContext()
    unknown_event = get_event(CONTAINER, CREATE, "unknown", {
        WORKLOAD_TYPE_LABEL_KEY: STATIC,
        NAME: str(uuid.uuid4()),
    })
    provider = MockEventProvider([unknown_event])
    manager = EventManager(provider, test_context.get_event_handlers(),
                          get_mock_file_manager(),
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)
    wait_until(lambda: test_context.get_create_event_handler()
               .get_ignored_event_count() == 1)
    self.assertEqual(0, manager.get_queue_depth())
    manager.stop_processing_events()
def test_absent_workload_type_label(self):
    """A create event without a workload type label is ignored."""
    test_context = TestContext()
    name = str(uuid.uuid4())
    unknown_event = get_event(CONTAINER, CREATE, name, {
        CPU_LABEL_KEY: "1",
        NAME: name,
    })
    handlers = test_context.get_event_handlers()
    provider = MockEventProvider([unknown_event])
    manager = EventManager(provider, handlers, get_mock_file_manager(),
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)
    wait_until(lambda: test_context.get_create_event_handler()
               .get_ignored_event_count() == 1)
    self.assertEqual(0, manager.get_queue_depth())
    manager.stop_processing_events()
def test_add_jobs(self):
    """Jobs can be queued, started, stopped and waited for."""
    with TestAreaContext("job_queue_test_add") as _area:
        queue = create_queue(simple_script)
        assert queue.queue_size == 10
        assert queue.is_active()
        assert queue.fetch_next_waiting() is not None

        semaphore = BoundedSemaphore(value=10)
        start_all(queue, semaphore)

        for job in queue.job_list:
            job.stop()
        wait_until(lambda: self.assertFalse(queue.is_active()))
        for job in queue.job_list:
            job.wait_for()
def test_free_cpu_on_container_die(self):
    """A die event frees the CPU threads claimed by the create event,
    and the metrics registry reflects both processed events."""
    registry = Registry()
    test_pod = get_simple_test_pod()
    get_pod_manager().set_pod(test_pod)
    workload_name = test_pod.metadata.name

    events = [
        get_container_create_event(DEFAULT_CPU_COUNT, STATIC, workload_name,
                                   workload_name),
        get_container_die_event(workload_name),
    ]
    event_count = len(events)
    # Queue depth of 1 forces in-order event processing for the test.
    provider = MockEventProvider(events, 1)

    test_context = TestContext()
    manager = EventManager(provider, test_context.get_event_handlers(),
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)
    manager.set_registry(registry, {})
    manager.start_processing_events()

    wait_until(lambda: event_count == manager.get_processed_count())
    self.assertEqual(0, manager.get_queue_depth())
    self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT,
                     len(test_context.get_cpu().get_empty_threads()))
    self.assertEqual(
        1,
        test_context.get_create_event_handler().get_handled_event_count())
    self.assertEqual(
        1,
        test_context.get_free_event_handler().get_handled_event_count())
    manager.stop_processing_events()

    manager.report_metrics({})
    self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
    # Every handler succeeds for every event.
    self.assertTrue(
        counter_value_equals(
            registry, EVENT_SUCCEEDED_KEY,
            event_count * len(test_context.get_event_handlers())))
    self.assertTrue(counter_value_equals(registry, EVENT_FAILED_KEY, 0))
    self.assertTrue(
        counter_value_equals(registry, EVENT_PROCESSED_KEY, event_count))
def test_cross_replica_synchronization(self):
    """
    Any change performed by a replica should be observable by another one.
    """
    users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
    indices = {NATIVE_USER + '/index': {}}
    with TestServer(users, indices,
                    config={'role': 'master', 'port': 2414}) as master:
        with TestServer(config={'master-url': master.server_url,
                                'port': 2413},
                        fail_on_output=[]) as replica1:
            with TestServer(config={'master-url': master.server_url,
                                    'port': 2412},
                            fail_on_output=[]) as replica2:
                replica1.use(NATIVE_USER, 'index')
                replica2.use(NATIVE_USER, 'index')
                replica1.login(NATIVE_USER, NATIVE_PASSWORD)

                # Upload through replica1, observe through replica2.
                replica1.upload(DIST_DIR, directory=True)
                wait_until(
                    lambda: download(PACKAGE_NAME, replica2.url) is True)

                # Remove through replica1, observe through replica2.
                replica1.remove(PACKAGE_NAME)
                wait_until(
                    lambda: download(PACKAGE_NAME, replica2.url) is False)
def test_unknown_workload_type_label(self):
    """An unknown workload type is an error; a following valid event still
    gets processed."""
    test_context = TestContext()
    unknown_event = get_event(
        CONTAINER, CREATE, uuid.uuid4(), {
            NAME: "container-name",
            CPU_LABEL_KEY: "1",
            WORKLOAD_TYPE_LABEL_KEY: "unknown",
        })
    valid_event = get_container_create_event(1)
    provider = MockEventProvider([unknown_event, valid_event])
    manager = EventManager(provider, test_context.get_event_handlers(),
                          get_mock_file_manager(),
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)
    wait_until(lambda: manager.get_error_count() == 1)
    wait_until(lambda: manager.get_processed_count() == 2)
    self.assertEqual(0, manager.get_queue_depth())
    manager.stop_processing_events()
def test_upload(self):
    """Upload a package with docs and verify the docs are served.

    Bug fix: ``doc_present`` previously ended with a trailing comma, so it
    returned a one-element tuple ``(bool,)``. A non-empty tuple is always
    truthy, which made ``wait_until`` and both ``assertTrue`` checks pass
    even when the documentation was missing. It now returns the bare bool.
    """
    users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
    indices = {NATIVE_USER + '/index': {}}
    with TestServer(users=users, indices=indices) as devpi:
        devpi.use(NATIVE_USER, 'index')
        devpi.login(NATIVE_USER, NATIVE_PASSWORD)
        with pushd(SOURCE_DIR):
            devpi.upload(path=None, with_docs=True)

        def doc_present(version=PACKAGE_VERSION):
            # True when the rendered docs index for `version` is reachable.
            url = (devpi.server_url +
                   "/{}/index/test-package/{}/+d/index.html".format(
                       NATIVE_USER, version))
            return requests.get(url).status_code == 200

        wait_until(doc_present, maxloop=300)
        self.assertTrue(doc_present('+latest'))
        self.assertTrue(doc_present('+stable'))
def test_upload(self):
    """Upload a package with docs and verify the docs are served.

    Bug fix: ``doc_present`` previously ended with a trailing comma, so it
    returned a one-element tuple ``(bool,)``. A non-empty tuple is always
    truthy, which made ``wait_until`` and both ``assertTrue`` checks pass
    even when the documentation was missing. It now returns the bare bool.
    """
    users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
    indices = {NATIVE_USER + '/index': {}}
    with TestServer(users=users, indices=indices) as devpi:
        devpi.use(NATIVE_USER, 'index')
        devpi.login(NATIVE_USER, NATIVE_PASSWORD)
        with pushd(SOURCE_DIR):
            devpi.upload(path=None, with_docs=True)

        def doc_present(version=PACKAGE_VERSION):
            # True when the rendered docs index for `version` is reachable.
            url = (devpi.server_url +
                   "/{}/index/test-package/{}/+d/index.html".format(
                       NATIVE_USER, version))
            return requests.get(url).status_code == 200

        wait_until(doc_present, maxloop=300)
        self.assertTrue(doc_present('+latest'))
        self.assertTrue(doc_present('+stable'))
def test_update_mock_container(self):
    """A single create event claims CPU threads and updates the metrics."""
    registry = Registry()
    workload_name = str(uuid.uuid4())
    events = [
        get_container_create_event(DEFAULT_CPU_COUNT, STATIC, workload_name,
                                   workload_name)
    ]
    event_count = len(events)
    provider = MockEventProvider(events)

    test_context = TestContext()
    manager = EventManager(provider, test_context.get_event_handlers(),
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)
    manager.set_registry(registry)
    manager.start_processing_events()

    wait_until(lambda: event_count == manager.get_processed_count())
    self.assertEqual(0, manager.get_queue_depth())
    self.assertEqual(
        event_count,
        test_context.get_workload_manager().get_success_count())
    # The workload occupies DEFAULT_CPU_COUNT threads.
    self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT - DEFAULT_CPU_COUNT,
                     len(test_context.get_cpu().get_empty_threads()))
    self.assertEqual(
        1,
        test_context.get_create_event_handler().get_handled_event_count())
    manager.stop_processing_events()

    manager.report_metrics({})
    self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
    self.assertTrue(
        gauge_value_equals(
            registry, EVENT_SUCCEEDED_KEY,
            event_count * len(test_context.get_event_handlers())))
    self.assertTrue(gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
    self.assertTrue(
        gauge_value_equals(registry, EVENT_PROCESSED_KEY, event_count))
def test_late_replication(self):
    """
    Test that the replicas are properly catching up with changes, even if
    they were not online when the change happened. This sometimes used to
    result in tracebacks.
    """
    users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
    indices = {NATIVE_USER + '/index': {}}
    master_config = {
        'role': 'master',
        'port': 2414,
        'request-timeout': 30,
        'replica-max-retries': 2,
    }
    with TestServer(users, indices, config=master_config) as master:
        master.use(NATIVE_USER, 'index')
        master.login(NATIVE_USER, NATIVE_PASSWORD)
        master.upload(DIST_DIR, directory=True)

        replica1_config = {
            'master-url': master.server_url,
            'port': 2413,
            'request-timeout': 30,
            'replica-max-retries': 2,
        }
        with TestServer(config=replica1_config) as replica1:
            replica1.use(NATIVE_USER, 'index')
            # replica1 was online for both the upload and the removal.
            wait_until(lambda: download(PACKAGE_NAME, replica1.url) is True)
            master.remove(PACKAGE_NAME)
            wait_until(lambda: download(PACKAGE_NAME, replica1.url) is False)

            replica2_config = {
                'master-url': master.server_url,
                'port': 2412,
                'request-timeout': 30,
                'replica-max-retries': 2,
            }
            # replica2 only comes up after the removal and must catch up.
            with TestServer(config=replica2_config) as replica2:
                replica2.use(NATIVE_USER, 'index')
                wait_until(
                    lambda: download(PACKAGE_NAME, replica2.url) is False)
def test_rebalance(self):
    """A rebalance event reaches only the rebalance handler and leaves the
    CPU untouched."""
    registry = Registry()
    events = [REBALANCE_EVENT]
    event_count = len(events)
    provider = MockEventProvider(events)

    test_context = TestContext()
    manager = EventManager(provider, test_context.get_event_handlers(),
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)
    manager.set_registry(registry)
    manager.start_processing_events()

    wait_until(lambda: event_count == manager.get_processed_count())
    self.assertEqual(0, manager.get_queue_depth())
    self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT,
                     len(test_context.get_cpu().get_empty_threads()))
    self.assertEqual(
        0,
        test_context.get_create_event_handler().get_handled_event_count())
    self.assertEqual(
        0,
        test_context.get_free_event_handler().get_handled_event_count())
    self.assertEqual(
        1,
        test_context.get_rebalance_event_handler()
        .get_handled_event_count())
    manager.stop_processing_events()

    manager.report_metrics({})
    self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
    self.assertTrue(
        gauge_value_equals(
            registry, EVENT_SUCCEEDED_KEY,
            event_count * len(test_context.get_event_handlers())))
    self.assertTrue(gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
    self.assertTrue(
        gauge_value_equals(registry, EVENT_PROCESSED_KEY, event_count))
def test_failing_jobs(self):
    """All jobs run with a failing script end up in JOB_QUEUE_FAILED.

    Cleanup: removed the trailing ``assert True``, which is a no-op and
    asserts nothing.
    """
    with TestAreaContext("job_queue_test_add") as _area:
        queue = create_queue(failing_script, max_submit=1)
        assert queue.queue_size == 10
        assert queue.is_active()

        semaphore = BoundedSemaphore(value=10)
        start_all(queue, semaphore)

        # Wait for every job to finish (all are expected to fail).
        wait_until(func=(lambda: self.assertFalse(queue.is_active())))
        for job in queue.job_list:
            job.wait_for()

        assert queue.fetch_next_waiting() is None
        for job in queue.job_list:
            assert job.status == JobStatusType.JOB_QUEUE_FAILED
def test_timeout_jobs(self):
    """Jobs exceeding max_runtime are killed by the queue."""
    with TestAreaContext("job_queue_test_kill") as _area:
        queue = create_queue(never_ending_script,
                             max_submit=1,
                             max_runtime=5)
        assert queue.queue_size == 10
        assert queue.is_active()

        semaphore = BoundedSemaphore(value=10)
        start_all(queue, semaphore)

        # Make sure the never-ending jobs are running, then wait for the
        # runtime limit to kill them all.
        wait_until(lambda: self.assertTrue(queue.is_active()))
        wait_until(lambda: self.assertFalse(queue.is_active()))

        for job in queue.job_list:
            assert job.status == JobStatusType.JOB_QUEUE_IS_KILLED
        for job in queue.job_list:
            job.wait_for()
def test_failed_master(self):
    """
    A replica must keep serving packages even when the master is down.

    https://bitbucket.org/hpk42/devpi/issues/353/non-available-mirrors-can-abort-index
    """
    users = {'user': {'password': NATIVE_PASSWORD}}
    indices = OrderedDict([
        ('user/baseindex', {}),
        ('user/index', {'bases': 'root/pypi,user/baseindex'}),
    ])
    master_context = TestServer(
        users,
        indices,
        config={'role': 'master', 'port': 2414, 'request-timeout': 30})
    with master_context as master:
        # Upload packages to baseindex.
        master.use('user', 'baseindex')
        master.login('user', NATIVE_PASSWORD)
        master.upload(DIST_DIR, directory=True)
        with TestServer(config={'master-url': master.server_url,
                                'port': 2413,
                                'request-timeout': 30},
                        fail_on_output=[]) as replica:
            replica.use('user', 'index')
            # Request the package on the replica while the master is up.
            wait_until(lambda: download(PACKAGE_NAME, replica.url) is True)
            # Terminate the master; downloading should still succeed.
            master_context.__exit__(None, None, None)
            wait_until(lambda: download(PACKAGE_NAME, replica.url) is True)
def test_timeout_jobs(self):
    """Timed-out jobs are killed and the timeout callback fires per job."""
    with TestAreaContext("job_queue_test_kill") as _area:
        timed_out_jobs = set()

        def callback(arg):
            # Record which job number hit its runtime limit.
            nonlocal timed_out_jobs
            timed_out_jobs.add(arg[0]["job_number"])

        queue = create_queue(
            never_ending_script,
            max_submit=1,
            max_runtime=5,
            callback_timeout=callback,
        )
        assert queue.queue_size == 10
        assert queue.is_active()

        semaphore = BoundedSemaphore(value=10)
        start_all(queue, semaphore)

        # Make sure the never-ending jobs are running, then wait for the
        # runtime limit to kill them all.
        wait_until(lambda: self.assertTrue(queue.is_active()))
        wait_until(lambda: self.assertFalse(queue.is_active()))

        queue._transition()
        for q_index, job in enumerate(queue.job_list):
            assert job.status == JobStatusType.JOB_QUEUE_IS_KILLED
            iens = queue._qindex_to_iens[q_index]
            assert queue.snapshot()[iens] == str(
                JobStatusType.JOB_QUEUE_IS_KILLED)

        # Every one of the ten jobs must have reported a timeout.
        assert timed_out_jobs == set(range(10))

        for job in queue.job_list:
            job.wait_for()
def test_unknown_action(self):
    """An unrecognised action is ignored, and metrics still count the event
    as processed and succeeded."""
    registry = Registry()
    test_context = TestContext()
    unknown_event = get_event(CONTAINER, "unknown", uuid.uuid4(), {})
    provider = MockEventProvider([unknown_event])
    manager = EventManager(provider, test_context.get_event_handlers(),
                          DEFAULT_TEST_EVENT_TIMEOUT_SECS)
    manager.set_registry(registry)
    manager.start_processing_events()

    wait_until(lambda: test_context.get_create_event_handler()
               .get_ignored_event_count() == 1)
    self.assertEqual(0, manager.get_queue_depth())
    manager.stop_processing_events()

    manager.report_metrics({})
    self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
    self.assertTrue(
        gauge_value_equals(registry, EVENT_SUCCEEDED_KEY,
                           len(test_context.get_event_handlers())))
    self.assertTrue(gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
    self.assertTrue(gauge_value_equals(registry, EVENT_PROCESSED_KEY, 1))
def test_late_replication(self):
    """
    Test that the replicas are properly catching up with changes, even if
    they were not online when the change happened. This sometimes used to
    result in tracebacks.
    """
    users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
    indices = {NATIVE_USER + '/index': {}}
    with TestServer(users, indices,
                    config={
                        'role': 'master',
                        'port': 2414,
                        'request-timeout': 30,
                        'replica-max-retries': 2,
                    }) as master:
        master.use(NATIVE_USER, 'index')
        master.login(NATIVE_USER, NATIVE_PASSWORD)
        master.upload(DIST_DIR, directory=True)
        with TestServer(config={
                'master-url': master.server_url,
                'port': 2413,
                'request-timeout': 30,
                'replica-max-retries': 2,
        }) as replica1:
            replica1.use(NATIVE_USER, 'index')
            # replica1 sees the upload, then the removal, live.
            wait_until(lambda: download(PACKAGE_NAME, replica1.url) is True)
            master.remove(PACKAGE_NAME)
            wait_until(lambda: download(PACKAGE_NAME, replica1.url) is False)
            # replica2 starts late and must converge to the removed state.
            with TestServer(config={
                    'master-url': master.server_url,
                    'port': 2412,
                    'request-timeout': 30,
                    'replica-max-retries': 2,
            }) as replica2:
                replica2.use(NATIVE_USER, 'index')
                wait_until(
                    lambda: download(PACKAGE_NAME, replica2.url) is False)
def test_simulation_context(self):
    """Run two interleaved simulation contexts (even/odd realizations) and
    verify every realization succeeds and lands in the right file system."""
    config_file = self.createTestPath("local/batch_sim/sleepy_time.ert")
    with ErtTestContext("res/sim/simulation_context",
                        config_file) as test_context:
        ert = test_context.getErt()
        size = 4

        # Complementary masks: even realizations in one, odd in the other.
        even_mask = BoolVector(initial_size=size)
        odd_mask = BoolVector(initial_size=size)
        for half in range(size // 2):
            even_mask[2 * half] = True
            even_mask[2 * half + 1] = False
            odd_mask[2 * half] = False
            odd_mask[2 * half + 1] = True

        fs_manager = ert.getEnkfFsManager()
        even_half = fs_manager.getFileSystem("even_half")
        odd_half = fs_manager.getFileSystem("odd_half")

        # i represents geo_id
        case_data = [(i, {}) for i in range(size)]
        even_ctx = SimulationContext(ert, even_half, even_mask, 0, case_data)
        odd_ctx = SimulationContext(ert, odd_half, odd_mask, 0, case_data)

        for iens in range(size):
            # do we have the proper geo_id in run_args?
            if iens % 2 == 0:
                self.assertFalse(even_ctx.isRealizationFinished(iens))
                self.assertEqual(even_ctx.get_run_args(iens).geo_id, iens)
            else:
                self.assertFalse(odd_ctx.isRealizationFinished(iens))
                self.assertEqual(odd_ctx.get_run_args(iens).geo_id, iens)

        def any_is_running():
            return even_ctx.isRunning() or odd_ctx.isRunning()

        wait_until(func=(lambda: self.assertFalse(any_is_running())),
                   timeout=90)

        self.assertEqual(even_ctx.getNumFailed(), 0)
        self.assertEqual(even_ctx.getNumRunning(), 0)
        self.assertEqual(even_ctx.getNumSuccess(), size / 2)
        self.assertEqual(odd_ctx.getNumFailed(), 0)
        self.assertEqual(odd_ctx.getNumRunning(), 0)
        self.assertEqual(odd_ctx.getNumSuccess(), size / 2)

        even_state_map = even_half.getStateMap()
        odd_state_map = odd_half.getStateMap()
        for iens in range(size):
            if iens % 2 == 0:
                self.assertTrue(even_ctx.didRealizationSucceed(iens))
                self.assertFalse(even_ctx.didRealizationFail(iens))
                self.assertTrue(even_ctx.isRealizationFinished(iens))
                self.assertEqual(even_state_map[iens],
                                 RealizationStateEnum.STATE_HAS_DATA)
            else:
                self.assertTrue(odd_ctx.didRealizationSucceed(iens))
                self.assertFalse(odd_ctx.didRealizationFail(iens))
                self.assertTrue(odd_ctx.isRealizationFinished(iens))
                self.assertEqual(odd_state_map[iens],
                                 RealizationStateEnum.STATE_HAS_DATA)
def test_empty_metrics(self):
    """With no events processed, every metric except RUNNING reports zero."""
    test_context = TestContext()
    event_manager = EventManager(MockEventProvider([]), [],
                                 get_mock_file_manager(), 0.01)
    registry = Registry()
    reporter = InternalMetricsReporter(test_context.get_workload_manager(),
                                       event_manager)
    reporter.set_registry(registry)
    reporter.report_metrics({})

    expected_zero_gauges = [
        ADDED_KEY,
        REMOVED_KEY,
        SUCCEEDED_KEY,
        FAILED_KEY,
        QUEUE_DEPTH_KEY,
        WORKLOAD_COUNT_KEY,
        PACKAGE_VIOLATIONS_KEY,
        CORE_VIOLATIONS_KEY,
        EVENT_SUCCEEDED_KEY,
        EVENT_FAILED_KEY,
        EVENT_PROCESSED_KEY,
        FALLBACK_ALLOCATOR_COUNT,
        IP_ALLOCATOR_TIMEBOUND_COUNT,
    ]
    wait_until(lambda: self.__gauge_value_equals(registry, RUNNING, 1))
    for key in expected_zero_gauges:
        wait_until(
            lambda key=key: self.__gauge_value_equals(registry, key, 0))

    event_manager.stop_processing_events()
def test_edge_case_ip_allocator_metrics(self):
    # this is a specific scenario causing troubles to the solver.
    # we should hit the time-bound limit and report it.
    cpu = get_cpu(2, 16, 2)
    test_context = TestContext(cpu=cpu)
    test_context.get_workload_manager().get_allocator() \
        .set_solver_max_runtime_secs(0.01)

    events = []
    cnt_evts = 0
    for i in range(15):
        events.append(get_container_create_event(2, name=str(i), id=str(i)))
    # NOTE(review): in the mangled original the placement of this increment
    # relative to the loop is ambiguous; counting all 15 creates at once
    # matches the 25-add assertion below — confirm against history.
    cnt_evts += 15
    events.append(get_container_create_event(1, name="15", id="15"))
    cnt_evts += 1
    # Use cnt_evts as an offset so the extra names never collide.
    for i in range(9):
        events.append(
            get_container_create_event(2,
                                       name=str(i + cnt_evts),
                                       id=str(i + cnt_evts)))
    events.append(get_container_die_event(name="15", id="15"))
    event_count = len(events)

    event_manager = EventManager(MockEventProvider(events),
                                 test_context.get_event_handlers(),
                                 get_mock_file_manager(), 5.0)
    wait_until(lambda: event_count == event_manager.get_processed_count(),
               timeout=20)
    log.info("Event manager has processed {} events.".format(
        event_manager.get_processed_count()))

    workload_manager = test_context.get_workload_manager()
    registry = Registry()
    reporter = InternalMetricsReporter(workload_manager, event_manager)
    reporter.set_registry(registry)
    reporter.report_metrics({})

    wait_until(lambda: self.__gauge_value_equals(registry, RUNNING, 1))
    wait_until(lambda: self.__gauge_value_equals(registry, ADDED_KEY, 25))
    wait_until(lambda: self.__gauge_value_equals(registry, REMOVED_KEY, 1))
    wait_until(lambda: self.__gauge_value_equals(registry, SUCCEEDED_KEY, 26))
    wait_until(lambda: self.__gauge_value_equals(registry, FAILED_KEY, 0))
    wait_until(lambda: self.__gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
    wait_until(
        lambda: self.__gauge_value_equals(registry, WORKLOAD_COUNT_KEY, 24))
    wait_until(lambda: self.__gauge_value_equals(registry,
                                                 PACKAGE_VIOLATIONS_KEY, 0))
    wait_until(lambda: self.__gauge_value_equals(registry,
                                                 EVENT_SUCCEEDED_KEY, 3 * 26))
    wait_until(lambda: self.__gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
    wait_until(
        lambda: self.__gauge_value_equals(registry, EVENT_PROCESSED_KEY, 26))
    # The undersized solver budget must trip the time-bound counter.
    wait_until(lambda: self.__gauge_value_reached(
        registry, IP_ALLOCATOR_TIMEBOUND_COUNT, 1))
    wait_until(lambda: self.__gauge_value_reached(
        registry, ALLOCATOR_CALL_DURATION, 0.1))
    event_manager.stop_processing_events()
def test_crash_ip_allocator_metrics(self):
    """When the IP allocator crashes, the fallback allocator takes over and
    the fallback counter is incremented."""
    cpu = get_cpu(2, 16, 2)
    test_context = TestContext(cpu=cpu)
    # now override the cpu seen by the allocator to crash it
    test_context.get_workload_manager().get_allocator().set_cpu(
        get_cpu(2, 2, 2))

    events = [get_container_create_event(10, name="foo", id="bar")]
    event_count = len(events)
    event_manager = EventManager(MockEventProvider(events),
                                 test_context.get_event_handlers(),
                                 get_mock_file_manager(), 5.0)
    wait_until(lambda: event_count == event_manager.get_processed_count())
    log.info("Event manager has processed {} events.".format(
        event_manager.get_processed_count()))

    workload_manager = test_context.get_workload_manager()
    registry = Registry()
    reporter = InternalMetricsReporter(workload_manager, event_manager)
    reporter.set_registry(registry)
    reporter.report_metrics({})

    wait_until(lambda: self.__gauge_value_equals(registry, RUNNING, 1))
    wait_until(lambda: self.__gauge_value_equals(registry, ADDED_KEY, 1))
    wait_until(lambda: self.__gauge_value_equals(registry, REMOVED_KEY, 0))
    wait_until(lambda: self.__gauge_value_equals(registry, SUCCEEDED_KEY, 1))
    wait_until(lambda: self.__gauge_value_equals(registry, FAILED_KEY, 0))
    wait_until(
        lambda: self.__gauge_value_equals(registry, WORKLOAD_COUNT_KEY, 1))
    wait_until(lambda: self.__gauge_value_equals(
        registry, FALLBACK_ALLOCATOR_COUNT, 1))
    event_manager.stop_processing_events()
def test_terminate_jobs(self):
    """Terminating job_dispatch kills its whole recursive process tree."""
    # Executes it self recursively and sleeps for 100 seconds
    with open("dummy_executable", "w") as f:
        f.write("""#!/usr/bin/env python
import sys, os, time
counter = eval(sys.argv[1])
if counter > 0:
    os.fork()
    os.execv(sys.argv[0],[sys.argv[0], str(counter - 1) ])
else:
    time.sleep(100)""")

    executable = os.path.realpath("dummy_executable")
    os.chmod("dummy_executable",
             stat.S_IRWXU | stat.S_IRWXO | stat.S_IRWXG)

    self.job_list = {
        "umask": "0002",
        "DATA_ROOT": "",
        "global_environment": {},
        "global_update_path": {},
        "jobList": [{
            "name": "dummy_executable",
            "executable": executable,
            "target_file": None,
            "error_file": None,
            "start_file": None,
            "stdout": "dummy.stdout",
            "stderr": "dummy.stderr",
            "stdin": None,
            "argList": ["3"],
            "environment": None,
            "exec_env": None,
            "license_path": None,
            "max_running_minutes": None,
            "max_running": None,
            "min_arg": 1,
            "arg_types": [],
            "max_arg": None,
        }],
        "run_id": "",
        "ert_pid": "",
    }
    with open("jobs.json", "w") as f:
        f.write(json.dumps(self.job_list))

    # macOS doesn't provide /usr/bin/setsid, so we roll our own
    with open("setsid", "w") as f:
        f.write(
            dedent("""\
                #!/usr/bin/env python
                import os
                import sys
                os.setsid()
                os.execvp(sys.argv[1], sys.argv[1:])
                """))
    os.chmod("setsid", 0o755)

    job_dispatch_script = importlib.util.find_spec(
        "job_runner.job_dispatch").origin
    job_dispatch_process = Popen([
        os.getcwd() + "/setsid",
        sys.executable,
        job_dispatch_script,
        os.getcwd(),
    ])

    p = psutil.Process(job_dispatch_process.pid)

    # Three levels of processes should spawn 8 children in total
    wait_until(lambda: self.assertEqual(len(p.children(recursive=True)), 8))

    p.terminate()
    wait_until(lambda: self.assertEqual(len(p.children(recursive=True)), 0))

    os.wait()  # allow os to clean up zombie processes
def test_add_metrics(self):
    """A single successful create event is reflected across all gauges."""
    test_context = TestContext()
    workload_name = str(uuid.uuid4())
    events = [
        get_container_create_event(DEFAULT_CPU_COUNT, STATIC, workload_name,
                                   workload_name)
    ]
    event_count = len(events)
    event_manager = EventManager(MockEventProvider(events),
                                 test_context.get_event_handlers(),
                                 get_mock_file_manager(), 5.0)
    wait_until(lambda: event_count == event_manager.get_processed_count())
    log.info("Event manager has processed {} events.".format(
        event_manager.get_processed_count()))

    workload_manager = test_context.get_workload_manager()
    registry = Registry()
    reporter = InternalMetricsReporter(workload_manager, event_manager)
    reporter.set_registry(registry)
    reporter.report_metrics({})

    # (gauge key, expected value) pairs after one successful add.
    expectations = [
        (RUNNING, 1),
        (ADDED_KEY, 1),
        (REMOVED_KEY, 0),
        (SUCCEEDED_KEY, 1),
        (FAILED_KEY, 0),
        (QUEUE_DEPTH_KEY, 0),
        (WORKLOAD_COUNT_KEY, 1),
        (PACKAGE_VIOLATIONS_KEY, 0),
        (CORE_VIOLATIONS_KEY, 0),
        (EVENT_SUCCEEDED_KEY, 3),
        (EVENT_FAILED_KEY, 0),
        (EVENT_PROCESSED_KEY, 1),
        (FALLBACK_ALLOCATOR_COUNT, 0),
        (IP_ALLOCATOR_TIMEBOUND_COUNT, 0),
    ]
    for key, expected in expectations:
        wait_until(lambda key=key, expected=expected: self.
                   __gauge_value_equals(registry, key, expected))

    event_manager.stop_processing_events()
def test_terminate_jobs(self):
    """Verify that terminating job_dispatch reaps its whole process tree.

    Writes a self-forking dummy executable (three recursion levels
    yield 8 descendant processes, each leaf sleeping 100 s), launches
    it via bin/job_dispatch.py in its own session, then asserts that
    terminating the dispatch process leaves zero children behind.
    """
    # The dummy executable re-executes itself recursively, forking at
    # each level; the leaves sleep so the tree stays alive long enough
    # to be counted and terminated.
    # NOTE: parse the counter with int() rather than eval() -- eval()
    # executes arbitrary code, and an integer is all that is expected.
    with open("dummy_executable", "w") as f:
        f.write("""#!/usr/bin/env python
import sys, os, time
counter = int(sys.argv[1])
if counter > 0:
    os.fork()
    os.execv(sys.argv[0],[sys.argv[0], str(counter - 1) ])
else:
    time.sleep(100)""")

    executable = os.path.realpath("dummy_executable")
    os.chmod("dummy_executable",
             stat.S_IRWXU | stat.S_IRWXO | stat.S_IRWXG)

    # Minimal jobs.json payload consumed by job_dispatch; argList "3"
    # drives the three recursion levels above.
    self.job_list = {
        "umask": "0002",
        "DATA_ROOT": "",
        "global_environment": {},
        "global_update_path": {},
        "jobList": [{
            "name": "dummy_executable",
            "executable": executable,
            "target_file": None,
            "error_file": None,
            "start_file": None,
            "stdout": "dummy.stdout",
            "stderr": "dummy.stderr",
            "stdin": None,
            "argList": ["3"],
            "environment": None,
            "exec_env": None,
            "license_path": None,
            "max_running_minutes": None,
            "max_running": None,
            "min_arg": 1,
            "arg_types": [],
            "max_arg": None,
        }],
        "run_id": "",
        "ert_pid": "",
    }

    job_dispatch_script = os.path.realpath(
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "../../../bin/job_dispatch.py",
        ))

    with open("jobs.json", "w") as f:
        f.write(json.dumps(self.job_list))

    # Required to execute job_dispatch in separate process group by
    # os.setsid moves the current process out of the current group
    with open("job_dispatch_executer", "w") as f:
        f.write("#!/usr/bin/env python\n"
                "import os, sys\n"
                "os.setsid()\n"
                "os.execv(sys.argv[1], sys.argv[1:])\n"
                "\n")
    os.chmod("job_dispatch_executer",
             stat.S_IRWXU | stat.S_IRWXO | stat.S_IRWXG)

    current_dir = os.path.realpath(os.curdir)
    job_dispatch_process = Popen([
        os.path.realpath("job_dispatch_executer"),
        job_dispatch_script,
        current_dir,
    ])

    p = psutil.Process(job_dispatch_process.pid)

    # Three levels of processes should spawn 8 children in total
    wait_until(
        lambda: self.assertEqual(len(p.children(recursive=True)), 8))

    p.terminate()

    wait_until(
        lambda: self.assertEqual(len(p.children(recursive=True)), 0))

    os.wait()  # allow os to clean up zombie processes
def test_simulation_context(self):
    """Run two interleaved SimulationContexts — one over the even
    realizations, one over the odd — against separate file systems,
    and verify that both finish with all realizations succeeding."""
    config_file = self.createTestPath(
        "local/snake_oil_no_data/snake_oil.ert")
    with ErtTestContext("ert/server/rpc/simulation_context",
                        config_file) as test_context:
        ert = test_context.getErt()

        size = 4
        # Complementary active-realization masks: mask1 selects the
        # even indices, mask2 the odd ones.
        mask1 = BoolVector(initial_size=size)
        mask2 = BoolVector(initial_size=size)
        for iens_2 in range(size // 2):
            mask1[2 * iens_2] = True
            mask1[2 * iens_2 + 1] = False
            mask2[2 * iens_2] = False
            mask2[2 * iens_2 + 1] = True

        fs_manager = ert.getEnkfFsManager()
        first_half = fs_manager.getFileSystem("first_half")
        other_half = fs_manager.getFileSystem("other_half")

        # i represents geo_id
        case_data = [(i, {}) for i in range(size)]
        simulation_context1 = SimulationContext(ert, first_half, mask1,
                                                0, case_data)
        simulation_context2 = SimulationContext(ert, other_half, mask2,
                                                0, case_data)

        # Before completion: nothing is finished, and each context
        # carries the geo_id matching its realization index.
        for iens in range(size):
            if iens % 2 == 0:
                self.assertFalse(
                    simulation_context1.isRealizationFinished(iens))
                # do we have the proper geo_id in run_args?
                self.assertEqual(
                    simulation_context1.get_run_args(iens).geo_id, iens)
            else:
                self.assertFalse(
                    simulation_context2.isRealizationFinished(iens))
                self.assertEqual(
                    simulation_context2.get_run_args(iens).geo_id, iens)

        # Wait (up to 60 s) until neither context is still running.
        wait_until(
            func=(lambda: self.assertFalse(simulation_context1.isRunning(
            ) or simulation_context2.isRunning())),
            timeout=60)

        # Each context ran half of the realizations, all successfully.
        self.assertEqual(simulation_context1.getNumFailed(), 0)
        self.assertEqual(simulation_context1.getNumRunning(), 0)
        self.assertEqual(simulation_context1.getNumSuccess(), size / 2)
        self.assertEqual(simulation_context2.getNumFailed(), 0)
        self.assertEqual(simulation_context2.getNumRunning(), 0)
        self.assertEqual(simulation_context2.getNumSuccess(), size / 2)

        first_half_state_map = first_half.getStateMap()
        other_half_state_map = other_half.getStateMap()

        # Per-realization bookkeeping: success/finish flags plus the
        # state map recording that data was produced.
        for iens in range(size):
            if iens % 2 == 0:
                self.assertTrue(
                    simulation_context1.didRealizationSucceed(iens))
                self.assertFalse(
                    simulation_context1.didRealizationFail(iens))
                self.assertTrue(
                    simulation_context1.isRealizationFinished(iens))

                self.assertEqual(first_half_state_map[iens],
                                 RealizationStateEnum.STATE_HAS_DATA)
            else:
                self.assertTrue(
                    simulation_context2.didRealizationSucceed(iens))
                self.assertFalse(
                    simulation_context2.didRealizationFail(iens))
                self.assertTrue(
                    simulation_context2.isRealizationFinished(iens))

                self.assertEqual(other_half_state_map[iens],
                                 RealizationStateEnum.STATE_HAS_DATA)