def test_failed_master(self):
        """
        Test that a replica can still serve packages even if the master is down.

        https://bitbucket.org/hpk42/devpi/issues/353/non-available-mirrors-can-abort-index
        """
        users = {'user': {'password': NATIVE_PASSWORD}}
        indices = OrderedDict([
            ('user/baseindex', {}),
            ('user/index', {'bases': 'root/pypi,user/baseindex'})
        ])
        master_context = TestServer(users, indices, config={'role': 'master', 'port': 2414, 'request-timeout': 30})
        with master_context as master:
            # Upload packages to baseindex
            master.use('user', 'baseindex')
            master.login('user', NATIVE_PASSWORD)
            master.upload(DIST_DIR, directory=True)

            with TestServer(config={'master-url': master.server_url, 'port': 2413, 'request-timeout': 30}, fail_on_output=[]) as replica:
                replica.use('user', 'index')

                # Request package on a replica
                wait_until(lambda: download(PACKAGE_NAME, replica.url) is True)

                # Terminate the master. Downloading the package should still succeed
                master_context.__exit__(None, None, None)
                wait_until(lambda: download(PACKAGE_NAME, replica.url) is True)
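Every example on this page polls a condition through a wait_until helper. The projects quoted here each ship their own implementation with slightly different signatures (call sites variously pass func=, timeout=, or maxloop=), so the following is only a minimal sketch of the behaviour they rely on; the names and defaults are illustrative assumptions, not any project's actual helper.

import time

def wait_until(func, interval=0.5, timeout=30, maxloop=None):
    # Illustrative sketch only. Success means the callable neither raises
    # AssertionError nor returns a false value; a bare self.assert* call
    # returning None also counts as success.
    deadline = time.time() + timeout
    loops = 0
    while True:
        try:
            result = func()
            if result is None or result:
                return
        except AssertionError:
            pass
        loops += 1
        if time.time() >= deadline or (maxloop is not None and loops >= maxloop):
            raise AssertionError(
                "wait_until: condition not met within {} seconds".format(timeout))
        time.sleep(interval)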
Example #2
    def test_failing_jobs(self):
        with TestAreaContext("job_queue_test_add") as work_area:
            job_queue = create_queue(failing_script, max_submit=1)

            assert job_queue.queue_size == 10
            assert job_queue.is_active()

            pool_sema = BoundedSemaphore(value=10)
            start_all(job_queue, pool_sema)

            wait_until(func=lambda: self.assertFalse(job_queue.is_active()))

            for job in job_queue.job_list:
                job.wait_for()

            job_queue._transition()

            assert job_queue.fetch_next_waiting() is None

            for q_index, job in enumerate(job_queue.job_list):
                assert job.status == JobStatusType.JOB_QUEUE_FAILED
                iens = job_queue._qindex_to_iens[q_index]
                assert job_queue.snapshot()[iens] == str(
                    JobStatusType.JOB_QUEUE_FAILED)
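The job-queue examples refer to simple_script, failing_script and never_ending_script fixtures whose contents are not shown on this page. Plausible stand-ins (an assumption, not the real fixtures) would be tiny shell scripts along these lines:

# Assumed one-line shell fixtures for the job-queue tests; the real test
# module defines its own equivalents.
simple_script = """#!/bin/sh
echo finished > STATUS
"""

failing_script = """#!/bin/sh
exit 1
"""

never_ending_script = """#!/bin/sh
while true; do sleep 1; done
"""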
Example #3
    def test_unknown_workload_type_label(self):
        registry = Registry()
        test_context = TestContext()
        unknown_event = get_event(
            CONTAINER, CREATE, uuid.uuid4(), {
                NAME: "container-name",
                APP_NAME_LABEL_KEY: DEFAULT_TEST_APP_NAME,
                CPU_LABEL_KEY: "1",
                MEM_LABEL_KEY: str(DEFAULT_TEST_MEM),
                DISK_LABEL_KEY: str(DEFAULT_TEST_DISK),
                NETWORK_LABEL_KEY: str(DEFAULT_TEST_NETWORK),
                JOB_TYPE_LABEL_KEY: DEFAULT_TEST_JOB_TYPE,
                WORKLOAD_TYPE_LABEL_KEY: "unknown",
                OWNER_EMAIL_LABEL_KEY: DEFAULT_TEST_OWNER_EMAIL,
                IMAGE_LABEL_KEY: DEFAULT_TEST_IMAGE,
            })
        valid_event = get_container_create_event(1)
        event_iterable = MockEventProvider([unknown_event, valid_event])
        manager = EventManager(event_iterable,
                               test_context.get_event_handlers(),
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)
        manager.set_registry(registry)
        manager.start_processing_events()

        wait_until(lambda: manager.get_error_count() == 1)
        wait_until(lambda: manager.get_processed_count() == 2)
        self.assertEqual(0, manager.get_queue_depth())

        manager.stop_processing_events()

        manager.report_metrics({})
        self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
        self.assertTrue(gauge_value_equals(registry, EVENT_SUCCEEDED_KEY, 5))
        self.assertTrue(gauge_value_equals(registry, EVENT_FAILED_KEY, 1))
        self.assertTrue(gauge_value_equals(registry, EVENT_PROCESSED_KEY, 2))
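Several of the metric assertions use gauge_value_equals (and, further down, a gauge_value_reached variant). Assuming a spectator-style Registry where gauge(name, tags).get() returns the last value set, a sketch could look like this; the exact registry API is an assumption, not the project's documented interface.

def gauge_value_equals(registry, key, expected_value, tags=None):
    # Assumption: registry.gauge(...).get() returns the last reported value.
    return registry.gauge(key, tags or {}).get() == expected_value

def gauge_value_reached(registry, key, minimum, tags=None):
    # Same idea, but only requires the gauge to have reached a minimum.
    return registry.gauge(key, tags or {}).get() >= minimum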
Example #4
    def test_free_cpu_on_container_die(self):
        workload_name = str(uuid.uuid4())
        workload = Workload(workload_name, DEFAULT_CPU_COUNT, STATIC)
        docker_client = MockDockerClient([MockContainer(workload)])

        events = [
            get_container_create_event(DEFAULT_CPU_COUNT, STATIC,
                                       workload_name, workload_name),
            get_container_die_event(workload_name)
        ]
        event_count = len(events)
        event_iterable = MockEventProvider(
            events, 1)  # Force in order event processing for the test

        test_context = TestContext(docker_client)
        manager = EventManager(event_iterable,
                               test_context.get_event_handlers(),
                               get_mock_file_manager(),
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)

        wait_until(lambda: event_count == manager.get_processed_count())
        self.assertEqual(0, manager.get_queue_depth())
        self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT,
                         len(test_context.get_cpu().get_empty_threads()))
        self.assertEqual(
            1,
            test_context.get_create_event_handler().get_handled_event_count())
        self.assertEqual(
            1,
            test_context.get_free_event_handler().get_handled_event_count())

        manager.stop_processing_events()
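MockEventProvider is a test double that feeds a fixed list of events to the EventManager; the optional second argument used above spaces out delivery so the events are processed in order. A minimal sketch, assuming the manager simply iterates over the provider, might look like this (the real class may differ):

import time

class MockEventProvider:
    def __init__(self, events, delay_seconds=0):
        # Fixed list of canned events, optionally spaced out in time to
        # force in-order processing.
        self.__events = list(events)
        self.__delay = delay_seconds

    def __iter__(self):
        for event in self.__events:
            yield event
            if self.__delay:
                time.sleep(self.__delay)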
Example #5
    def test_absent_workload_type_label(self):
        registry = Registry()
        test_context = TestContext()
        name = str(uuid.uuid4())
        unknown_event = get_event(CONTAINER, CREATE, name, {
            CPU_LABEL_KEY: "1",
            NAME: name
        })
        event_handlers = test_context.get_event_handlers()
        event_iterable = MockEventProvider([unknown_event])
        manager = EventManager(event_iterable, event_handlers,
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)
        manager.set_registry(registry)
        manager.start_processing_events()

        wait_until(lambda: test_context.get_create_event_handler().
                   get_ignored_event_count() == 1)
        self.assertEqual(0, manager.get_queue_depth())

        manager.stop_processing_events()

        manager.report_metrics({})
        self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
        self.assertTrue(
            gauge_value_equals(registry, EVENT_SUCCEEDED_KEY,
                               len(test_context.get_event_handlers())))
        self.assertTrue(gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
        self.assertTrue(gauge_value_equals(registry, EVENT_PROCESSED_KEY, 1))
Example #6
    def test_cross_replica_synchronization(self):
        """
        Any change performed by a replica should be observable by another one.
        """
        users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
        indices = {NATIVE_USER + '/index': {}}

        with TestServer(users,
                        indices,
                        config={
                            'role': 'master',
                            'port': 2414
                        }) as master:
            with TestServer(config={
                    'master-url': master.server_url,
                    'port': 2413
            },
                            fail_on_output=[]) as replica1:
                with TestServer(config={
                        'master-url': master.server_url,
                        'port': 2412
                },
                                fail_on_output=[]) as replica2:
                    replica1.use(NATIVE_USER, 'index')
                    replica2.use(NATIVE_USER, 'index')

                    replica1.login(NATIVE_USER, NATIVE_PASSWORD)
                    replica1.upload(DIST_DIR, directory=True)

                    wait_until(
                        lambda: download(PACKAGE_NAME, replica2.url) is True)
                    replica1.remove(PACKAGE_NAME)
                    wait_until(
                        lambda: download(PACKAGE_NAME, replica2.url) is False)
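download(PACKAGE_NAME, url) is the helper the devpi tests use to check whether a package can actually be fetched from a given index. A self-contained stand-in (an assumption about the helper, not its real implementation) could shell out to pip and report success as a boolean:

import subprocess
import tempfile

def download(package_name, index_url):
    # Try to fetch the package from the given index into a scratch
    # directory; True means the package is currently served there.
    with tempfile.TemporaryDirectory() as dest:
        result = subprocess.run(
            ["pip", "download", "--no-deps", "--no-cache-dir",
             "--index-url", index_url, "--dest", dest, package_name],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return result.returncode == 0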
Example #7
    def test_kill_jobs(self):
        with TestAreaContext("job_queue_test_kill") as work_area:
            job_queue = create_queue(never_ending_script)

            assert job_queue.queue_size == 10
            assert job_queue.is_active()

            pool_sema = BoundedSemaphore(value=10)
            start_all(job_queue, pool_sema)

            # make sure never ending jobs are running
            wait_until(lambda: self.assertTrue(job_queue.is_active()))

            for job in job_queue.job_list:
                job.stop()

            wait_until(lambda: self.assertFalse(job_queue.is_active()))

            job_queue._transition()

            for q_index, job in enumerate(job_queue.job_list):
                assert job.status == JobStatusType.JOB_QUEUE_IS_KILLED
                iens = job_queue._qindex_to_iens[q_index]
                assert job_queue.snapshot()[iens] == str(
                    JobStatusType.JOB_QUEUE_IS_KILLED)

            for job in job_queue.job_list:
                job.wait_for()
Example #8
    def test_workflow_thread_cancel_external(self):
        with TestAreaContext(
                "python/job_queue/workflow_runner_external") as work_area:
            WorkflowCommon.createWaitJob()

            joblist = WorkflowJoblist()
            self.assertTrue(joblist.addJobFromFile("WAIT",
                                                   "external_wait_job"))
            self.assertTrue("WAIT" in joblist)

            workflow = Workflow("wait_workflow", joblist)

            self.assertEqual(len(workflow), 3)

            workflow_runner = WorkflowRunner(workflow,
                                             ert=None,
                                             context=SubstitutionList())

            self.assertFalse(workflow_runner.isRunning())

            with workflow_runner:
                wait_until(
                    lambda: self.assertTrue(workflow_runner.isRunning()))
                wait_until(lambda: self.assertFileExists("wait_started_0"))
                wait_until(lambda: self.assertFileExists("wait_finished_0"))
                wait_until(lambda: self.assertFileExists("wait_started_1"))
                workflow_runner.cancel()
                self.assertTrue(workflow_runner.isCancelled())

            self.assertFileDoesNotExist("wait_finished_1")
            self.assertFileDoesNotExist("wait_started_2")
            self.assertFileDoesNotExist("wait_cancelled_2")
            self.assertFileDoesNotExist("wait_finished_2")
Example #9
    def test_unknown_action(self):
        test_context = TestContext()
        unknown_event = get_event(CONTAINER, "unknown", uuid.uuid4(), {})
        event_iterable = MockEventProvider([unknown_event])
        manager = EventManager(event_iterable,
                               test_context.get_event_handlers(),
                               get_mock_file_manager(),
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)

        wait_until(lambda: test_context.get_create_event_handler().
                   get_ignored_event_count() == 1)
        self.assertEqual(0, manager.get_queue_depth())

        manager.stop_processing_events()
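get_event(CONTAINER, action, container_id, labels) builds a docker-events-style payload for the handlers to consume. Its exact shape is project-specific; assuming it mirrors the docker events JSON, a hypothetical sketch might be:

def get_event(event_type, action, container_id, labels):
    # Assumed shape, modelled on docker's events API: the labels end up as
    # actor attributes alongside the container name.
    return {
        "Type": event_type,
        "Action": action,
        "Actor": {
            "ID": str(container_id),
            "Attributes": dict(labels),
        },
    }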
Example #10
    def test_absent_cpu_label(self):
        test_context = TestContext()
        unknown_event = get_event(CONTAINER, CREATE, "unknown", {
            WORKLOAD_TYPE_LABEL_KEY: STATIC,
            NAME: str(uuid.uuid4())
        })
        event_iterable = MockEventProvider([unknown_event])
        manager = EventManager(event_iterable,
                               test_context.get_event_handlers(),
                               get_mock_file_manager(),
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)

        wait_until(lambda: test_context.get_create_event_handler().
                   get_ignored_event_count() == 1)
        self.assertEqual(0, manager.get_queue_depth())

        manager.stop_processing_events()
Example #11
    def test_absent_workload_type_label(self):
        test_context = TestContext()
        name = str(uuid.uuid4())
        unknown_event = get_event(CONTAINER, CREATE, name, {
            CPU_LABEL_KEY: "1",
            NAME: name
        })
        event_handlers = test_context.get_event_handlers()
        event_iterable = MockEventProvider([unknown_event])
        manager = EventManager(event_iterable, event_handlers,
                               get_mock_file_manager(),
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)

        wait_until(lambda: test_context.get_create_event_handler().
                   get_ignored_event_count() == 1)
        self.assertEqual(0, manager.get_queue_depth())

        manager.stop_processing_events()
Example #12
    def test_add_jobs(self):
        with TestAreaContext("job_queue_test_add") as work_area:
            job_queue = create_queue(simple_script)

            assert job_queue.queue_size == 10
            assert job_queue.is_active()
            assert job_queue.fetch_next_waiting() is not None

            pool_sema = BoundedSemaphore(value=10)
            start_all(job_queue, pool_sema)

            for job in job_queue.job_list:
                job.stop()

            wait_until(lambda: self.assertFalse(job_queue.is_active()))

            for job in job_queue.job_list:
                job.wait_for()
Example #13
    def test_free_cpu_on_container_die(self):
        registry = Registry()
        test_pod = get_simple_test_pod()
        get_pod_manager().set_pod(test_pod)
        workload_name = test_pod.metadata.name

        events = [
            get_container_create_event(DEFAULT_CPU_COUNT, STATIC,
                                       workload_name, workload_name),
            get_container_die_event(workload_name)
        ]
        event_count = len(events)
        event_iterable = MockEventProvider(
            events, 1)  # Force in order event processing for the test

        test_context = TestContext()
        manager = EventManager(event_iterable,
                               test_context.get_event_handlers(),
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)
        manager.set_registry(registry, {})
        manager.start_processing_events()

        wait_until(lambda: event_count == manager.get_processed_count())
        self.assertEqual(0, manager.get_queue_depth())
        self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT,
                         len(test_context.get_cpu().get_empty_threads()))
        self.assertEqual(
            1,
            test_context.get_create_event_handler().get_handled_event_count())
        self.assertEqual(
            1,
            test_context.get_free_event_handler().get_handled_event_count())

        manager.stop_processing_events()

        manager.report_metrics({})
        self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
        self.assertTrue(
            counter_value_equals(
                registry, EVENT_SUCCEEDED_KEY,
                event_count * len(test_context.get_event_handlers())))
        self.assertTrue(counter_value_equals(registry, EVENT_FAILED_KEY, 0))
        self.assertTrue(
            counter_value_equals(registry, EVENT_PROCESSED_KEY, event_count))
Example #14
    def test_cross_replica_synchronization(self):
        """
        Any change performed by a replica should be observable by another one.
        """
        users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
        indices = {NATIVE_USER + '/index': {}}

        with TestServer(users, indices, config={'role': 'master', 'port': 2414}) as master:
            with TestServer(config={'master-url': master.server_url, 'port': 2413}, fail_on_output=[]) as replica1:
                with TestServer(config={'master-url': master.server_url, 'port': 2412}, fail_on_output=[]) as replica2:
                    replica1.use(NATIVE_USER, 'index')
                    replica2.use(NATIVE_USER, 'index')

                    replica1.login(NATIVE_USER, NATIVE_PASSWORD)
                    replica1.upload(DIST_DIR, directory=True)

                    wait_until(lambda: download(PACKAGE_NAME, replica2.url) is True)
                    replica1.remove(PACKAGE_NAME)
                    wait_until(lambda: download(PACKAGE_NAME, replica2.url) is False)
Example #15
    def test_unknown_workload_type_label(self):
        test_context = TestContext()
        unknown_event = get_event(
            CONTAINER, CREATE, uuid.uuid4(), {
                NAME: "container-name",
                CPU_LABEL_KEY: "1",
                WORKLOAD_TYPE_LABEL_KEY: "unknown"
            })
        valid_event = get_container_create_event(1)
        event_iterable = MockEventProvider([unknown_event, valid_event])
        manager = EventManager(event_iterable,
                               test_context.get_event_handlers(),
                               get_mock_file_manager(),
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)

        wait_until(lambda: manager.get_error_count() == 1)
        wait_until(lambda: manager.get_processed_count() == 2)
        self.assertEqual(0, manager.get_queue_depth())

        manager.stop_processing_events()
Example #16
    def test_upload(self):
        users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
        indices = {NATIVE_USER + '/index': {}}

        with TestServer(users=users, indices=indices) as devpi:

            devpi.use(NATIVE_USER, 'index')
            devpi.login(NATIVE_USER, NATIVE_PASSWORD)

            with pushd(SOURCE_DIR):
                devpi.upload(path=None, with_docs=True)

            def doc_present(version=PACKAGE_VERSION):
                # True once the uploaded documentation page is served.
                return requests.get(
                    devpi.server_url + "/{}/index/test-package/{}/+d/index.html".format(NATIVE_USER, version),
                ).status_code == 200

            wait_until(doc_present, maxloop=300)
            self.assertTrue(doc_present('+latest'))
            self.assertTrue(doc_present('+stable'))
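pushd(SOURCE_DIR) temporarily switches the working directory so devpi upload runs from the package source tree. A typical implementation of such a helper (sketched here as an assumption) is a small context manager:

import contextlib
import os

@contextlib.contextmanager
def pushd(path):
    # Change into `path` for the duration of the with-block, then restore
    # the previous working directory even if the body raises.
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield previous
    finally:
        os.chdir(previous)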
Example #17
    def test_upload(self):
        users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
        indices = {NATIVE_USER + '/index': {}}

        with TestServer(users=users, indices=indices) as devpi:

            devpi.use(NATIVE_USER, 'index')
            devpi.login(NATIVE_USER, NATIVE_PASSWORD)

            with pushd(SOURCE_DIR):
                devpi.upload(path=None, with_docs=True)

            def doc_present(version=PACKAGE_VERSION):
                # True once the uploaded documentation page is served.
                return requests.get(
                    devpi.server_url +
                    "/{}/index/test-package/{}/+d/index.html".format(
                        NATIVE_USER, version)).status_code == 200

            wait_until(doc_present, maxloop=300)
            self.assertTrue(doc_present('+latest'))
            self.assertTrue(doc_present('+stable'))
Example #18
    def test_update_mock_container(self):
        registry = Registry()
        workload_name = str(uuid.uuid4())

        events = [
            get_container_create_event(DEFAULT_CPU_COUNT, STATIC,
                                       workload_name, workload_name)
        ]
        event_count = len(events)
        event_iterable = MockEventProvider(events)

        test_context = TestContext()
        manager = EventManager(event_iterable,
                               test_context.get_event_handlers(),
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)
        manager.set_registry(registry)
        manager.start_processing_events()

        wait_until(lambda: event_count == manager.get_processed_count())
        self.assertEqual(0, manager.get_queue_depth())
        self.assertEqual(
            event_count,
            test_context.get_workload_manager().get_success_count())
        self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT - DEFAULT_CPU_COUNT,
                         len(test_context.get_cpu().get_empty_threads()))
        self.assertEqual(
            1,
            test_context.get_create_event_handler().get_handled_event_count())

        manager.stop_processing_events()

        manager.report_metrics({})
        self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
        self.assertTrue(
            gauge_value_equals(
                registry, EVENT_SUCCEEDED_KEY,
                event_count * len(test_context.get_event_handlers())))
        self.assertTrue(gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
        self.assertTrue(
            gauge_value_equals(registry, EVENT_PROCESSED_KEY, event_count))
Example #19
    def test_late_replication(self):
        """
        Test that the replicas are properly catching up with changes, even
        if they were not online when the change happened.

        This sometimes used to result in tracebacks.
        """
        users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
        indices = {NATIVE_USER + '/index': {}}

        with TestServer(users, indices, config={'role': 'master', 'port': 2414, 'request-timeout': 30, 'replica-max-retries': 2}) as master:
            master.use(NATIVE_USER, 'index')
            master.login(NATIVE_USER, NATIVE_PASSWORD)
            master.upload(DIST_DIR, directory=True)

            with TestServer(config={'master-url': master.server_url, 'port': 2413, 'request-timeout': 30, 'replica-max-retries': 2}) as replica1:
                replica1.use(NATIVE_USER, 'index')

                wait_until(lambda: download(PACKAGE_NAME, replica1.url) is True)
                master.remove(PACKAGE_NAME)
                wait_until(lambda: download(PACKAGE_NAME, replica1.url) is False)

                with TestServer(config={'master-url': master.server_url, 'port': 2412, 'request-timeout': 30, 'replica-max-retries': 2}) as replica2:
                    replica2.use(NATIVE_USER, 'index')

                    wait_until(lambda: download(PACKAGE_NAME, replica2.url) is False)
Example #20
    def test_rebalance(self):
        registry = Registry()

        events = [REBALANCE_EVENT]
        event_count = len(events)
        event_iterable = MockEventProvider(events)

        test_context = TestContext()
        manager = EventManager(event_iterable,
                               test_context.get_event_handlers(),
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)
        manager.set_registry(registry)
        manager.start_processing_events()

        wait_until(lambda: event_count == manager.get_processed_count())
        self.assertEqual(0, manager.get_queue_depth())
        self.assertEqual(DEFAULT_TOTAL_THREAD_COUNT,
                         len(test_context.get_cpu().get_empty_threads()))
        self.assertEqual(
            0,
            test_context.get_create_event_handler().get_handled_event_count())
        self.assertEqual(
            0,
            test_context.get_free_event_handler().get_handled_event_count())
        self.assertEqual(
            1,
            test_context.get_rebalance_event_handler().get_handled_event_count())

        manager.stop_processing_events()

        manager.report_metrics({})
        self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
        self.assertTrue(
            gauge_value_equals(
                registry, EVENT_SUCCEEDED_KEY,
                event_count * len(test_context.get_event_handlers())))
        self.assertTrue(gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
        self.assertTrue(
            gauge_value_equals(registry, EVENT_PROCESSED_KEY, event_count))
Example #21
    def test_failing_jobs(self):
        with TestAreaContext("job_queue_test_add") as work_area:
            job_queue = create_queue(failing_script, max_submit=1)

            assert job_queue.queue_size == 10
            assert job_queue.is_active()

            pool_sema = BoundedSemaphore(value=10)
            start_all(job_queue, pool_sema)

            wait_until(func=lambda: self.assertFalse(job_queue.is_active()))

            for job in job_queue.job_list:
                job.wait_for()

            assert job_queue.fetch_next_waiting() is None

            for job in job_queue.job_list:
                assert job.status == JobStatusType.JOB_QUEUE_FAILED

            assert True
Example #22
    def test_timeout_jobs(self):
        with TestAreaContext("job_queue_test_kill") as work_area:
            job_queue = create_queue(never_ending_script,
                                     max_submit=1,
                                     max_runtime=5)

            assert job_queue.queue_size == 10
            assert job_queue.is_active()

            pool_sema = BoundedSemaphore(value=10)
            start_all(job_queue, pool_sema)

            # make sure never ending jobs are running
            wait_until(lambda: self.assertTrue(job_queue.is_active()))

            wait_until(lambda: self.assertFalse(job_queue.is_active()))

            for job in job_queue.job_list:
                assert job.status == JobStatusType.JOB_QUEUE_IS_KILLED

            for job in job_queue.job_list:
                job.wait_for()
Example #23
    def test_failed_master(self):
        """
        Test that a replica can still serve packages even if the master is down.

        https://bitbucket.org/hpk42/devpi/issues/353/non-available-mirrors-can-abort-index
        """
        users = {'user': {'password': NATIVE_PASSWORD}}
        indices = OrderedDict([('user/baseindex', {}),
                               ('user/index', {
                                   'bases': 'root/pypi,user/baseindex'
                               })])
        master_context = TestServer(users,
                                    indices,
                                    config={
                                        'role': 'master',
                                        'port': 2414,
                                        'request-timeout': 30
                                    })
        with master_context as master:
            # Upload packages to baseindex
            master.use('user', 'baseindex')
            master.login('user', NATIVE_PASSWORD)
            master.upload(DIST_DIR, directory=True)

            with TestServer(config={
                    'master-url': master.server_url,
                    'port': 2413,
                    'request-timeout': 30
            },
                            fail_on_output=[]) as replica:
                replica.use('user', 'index')

                # Request package on a replica
                wait_until(lambda: download(PACKAGE_NAME, replica.url) is True)

                # Terminate the master. Downloading the package should still succeed
                master_context.__exit__(None, None, None)
                wait_until(lambda: download(PACKAGE_NAME, replica.url) is True)
Example #24
    def test_timeout_jobs(self):
        with TestAreaContext("job_queue_test_kill") as work_area:
            job_numbers = set()

            def callback(arg):
                nonlocal job_numbers
                job_numbers.add(arg[0]["job_number"])

            job_queue = create_queue(
                never_ending_script,
                max_submit=1,
                max_runtime=5,
                callback_timeout=callback,
            )

            assert job_queue.queue_size == 10
            assert job_queue.is_active()

            pool_sema = BoundedSemaphore(value=10)
            start_all(job_queue, pool_sema)

            # make sure never ending jobs are running
            wait_until(lambda: self.assertTrue(job_queue.is_active()))

            wait_until(lambda: self.assertFalse(job_queue.is_active()))

            job_queue._transition()

            for q_index, job in enumerate(job_queue.job_list):
                assert job.status == JobStatusType.JOB_QUEUE_IS_KILLED
                iens = job_queue._qindex_to_iens[q_index]
                assert job_queue.snapshot()[iens] == str(
                    JobStatusType.JOB_QUEUE_IS_KILLED)

            assert job_numbers == set(range(10))

            for job in job_queue.job_list:
                job.wait_for()
Example #25
    def test_unknown_action(self):
        registry = Registry()
        test_context = TestContext()
        unknown_event = get_event(CONTAINER, "unknown", uuid.uuid4(), {})
        event_iterable = MockEventProvider([unknown_event])
        manager = EventManager(event_iterable,
                               test_context.get_event_handlers(),
                               DEFAULT_TEST_EVENT_TIMEOUT_SECS)
        manager.set_registry(registry)
        manager.start_processing_events()

        wait_until(lambda: test_context.get_create_event_handler().
                   get_ignored_event_count() == 1)
        self.assertEqual(0, manager.get_queue_depth())

        manager.stop_processing_events()

        manager.report_metrics({})
        self.assertTrue(gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
        self.assertTrue(
            gauge_value_equals(registry, EVENT_SUCCEEDED_KEY,
                               len(test_context.get_event_handlers())))
        self.assertTrue(gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
        self.assertTrue(gauge_value_equals(registry, EVENT_PROCESSED_KEY, 1))
Example #26
    def test_late_replication(self):
        """
        Test that the replicas are properly catching up with changes, even
        if they were not online when the change happened.

        This sometimes used to result in tracebacks.
        """
        users = {NATIVE_USER: {'password': NATIVE_PASSWORD}}
        indices = {NATIVE_USER + '/index': {}}

        with TestServer(users,
                        indices,
                        config={
                            'role': 'master',
                            'port': 2414,
                            'request-timeout': 30,
                            'replica-max-retries': 2
                        }) as master:
            master.use(NATIVE_USER, 'index')
            master.login(NATIVE_USER, NATIVE_PASSWORD)
            master.upload(DIST_DIR, directory=True)

            with TestServer(
                    config={
                        'master-url': master.server_url,
                        'port': 2413,
                        'request-timeout': 30,
                        'replica-max-retries': 2
                    }) as replica1:
                replica1.use(NATIVE_USER, 'index')

                wait_until(
                    lambda: download(PACKAGE_NAME, replica1.url) is True)
                master.remove(PACKAGE_NAME)
                wait_until(
                    lambda: download(PACKAGE_NAME, replica1.url) is False)

                with TestServer(
                        config={
                            'master-url': master.server_url,
                            'port': 2412,
                            'request-timeout': 30,
                            'replica-max-retries': 2
                        }) as replica2:
                    replica2.use(NATIVE_USER, 'index')

                    wait_until(
                        lambda: download(PACKAGE_NAME, replica2.url) is False)
Example #27
    def test_simulation_context(self):
        config_file = self.createTestPath("local/batch_sim/sleepy_time.ert")
        with ErtTestContext("res/sim/simulation_context",
                            config_file) as test_context:
            ert = test_context.getErt()

            size = 4
            even_mask = BoolVector(initial_size=size)
            odd_mask = BoolVector(initial_size=size)

            for iens_2 in range(size // 2):
                even_mask[2 * iens_2] = True
                even_mask[2 * iens_2 + 1] = False

                odd_mask[2 * iens_2] = False
                odd_mask[2 * iens_2 + 1] = True

            fs_manager = ert.getEnkfFsManager()
            even_half = fs_manager.getFileSystem("even_half")
            odd_half = fs_manager.getFileSystem("odd_half")

            # i represents geo_id
            case_data = [(i, {}) for i in range(size)]
            even_ctx = SimulationContext(ert, even_half, even_mask, 0,
                                         case_data)
            odd_ctx = SimulationContext(ert, odd_half, odd_mask, 0, case_data)

            for iens in range(size):
                # do we have the proper geo_id in run_args?
                if iens % 2 == 0:
                    self.assertFalse(even_ctx.isRealizationFinished(iens))
                    self.assertEqual(even_ctx.get_run_args(iens).geo_id, iens)
                else:
                    self.assertFalse(odd_ctx.isRealizationFinished(iens))
                    self.assertEqual(odd_ctx.get_run_args(iens).geo_id, iens)

            def any_is_running():
                return even_ctx.isRunning() or odd_ctx.isRunning()

            wait_until(func=(lambda: self.assertFalse(any_is_running())),
                       timeout=90)

            self.assertEqual(even_ctx.getNumFailed(), 0)
            self.assertEqual(even_ctx.getNumRunning(), 0)
            self.assertEqual(even_ctx.getNumSuccess(), size / 2)

            self.assertEqual(odd_ctx.getNumFailed(), 0)
            self.assertEqual(odd_ctx.getNumRunning(), 0)
            self.assertEqual(odd_ctx.getNumSuccess(), size / 2)

            even_state_map = even_half.getStateMap()
            odd_state_map = odd_half.getStateMap()

            for iens in range(size):
                if iens % 2 == 0:
                    self.assertTrue(even_ctx.didRealizationSucceed(iens))
                    self.assertFalse(even_ctx.didRealizationFail(iens))
                    self.assertTrue(even_ctx.isRealizationFinished(iens))

                    self.assertEqual(even_state_map[iens],
                                     RealizationStateEnum.STATE_HAS_DATA)
                else:
                    self.assertTrue(odd_ctx.didRealizationSucceed(iens))
                    self.assertFalse(odd_ctx.didRealizationFail(iens))
                    self.assertTrue(odd_ctx.isRealizationFinished(iens))

                    self.assertEqual(odd_state_map[iens],
                                     RealizationStateEnum.STATE_HAS_DATA)
Example #28
    def test_empty_metrics(self):

        test_context = TestContext()
        event_manager = EventManager(MockEventProvider([]), [],
                                     get_mock_file_manager(), 0.01)

        registry = Registry()
        reporter = InternalMetricsReporter(test_context.get_workload_manager(),
                                           event_manager)
        reporter.set_registry(registry)
        reporter.report_metrics({})

        wait_until(lambda: self.__gauge_value_equals(registry, RUNNING, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, ADDED_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(registry, REMOVED_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, SUCCEEDED_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(registry, FAILED_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, WORKLOAD_COUNT_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(
            registry, PACKAGE_VIOLATIONS_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(registry,
                                                     CORE_VIOLATIONS_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(registry,
                                                     EVENT_SUCCEEDED_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(registry,
                                                     EVENT_PROCESSED_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(
            registry, FALLBACK_ALLOCATOR_COUNT, 0))
        wait_until(lambda: self.__gauge_value_equals(
            registry, IP_ALLOCATOR_TIMEBOUND_COUNT, 0))

        event_manager.stop_processing_events()
Example #29
    def test_edge_case_ip_allocator_metrics(self):
        # This is a specific scenario that gives the solver trouble: we
        # should hit the time-bound limit and report it.

        cpu = get_cpu(2, 16, 2)
        test_context = TestContext(cpu=cpu)
        allocator = test_context.get_workload_manager().get_allocator()
        allocator.set_solver_max_runtime_secs(0.01)
        events = []
        cnt_evts = 0

        for i in range(15):
            events.append(get_container_create_event(2, name=str(i),
                                                     id=str(i)))
        cnt_evts += 15

        events.append(get_container_create_event(1, name="15", id="15"))
        cnt_evts += 1

        for i in range(9):
            events.append(
                get_container_create_event(2,
                                           name=str(i + cnt_evts),
                                           id=str(i + cnt_evts)))

        events.append(get_container_die_event(name="15", id="15"))

        event_count = len(events)
        event_manager = EventManager(MockEventProvider(events),
                                     test_context.get_event_handlers(),
                                     get_mock_file_manager(), 5.0)

        wait_until(lambda: event_count == event_manager.get_processed_count(),
                   timeout=20)

        log.info("Event manager has processed {} events.".format(
            event_manager.get_processed_count()))

        workload_manager = test_context.get_workload_manager()
        registry = Registry()
        reporter = InternalMetricsReporter(workload_manager, event_manager)
        reporter.set_registry(registry)
        reporter.report_metrics({})

        wait_until(lambda: self.__gauge_value_equals(registry, RUNNING, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, ADDED_KEY, 25))
        wait_until(lambda: self.__gauge_value_equals(registry, REMOVED_KEY, 1))
        wait_until(
            lambda: self.__gauge_value_equals(registry, SUCCEEDED_KEY, 26))
        wait_until(lambda: self.__gauge_value_equals(registry, FAILED_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(registry,
                                                     WORKLOAD_COUNT_KEY, 24))
        wait_until(lambda: self.__gauge_value_equals(
            registry, PACKAGE_VIOLATIONS_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(
            registry, EVENT_SUCCEEDED_KEY, 3 * 26))
        wait_until(
            lambda: self.__gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(registry,
                                                     EVENT_PROCESSED_KEY, 26))
        wait_until(lambda: self.__gauge_value_reached(
            registry, IP_ALLOCATOR_TIMEBOUND_COUNT, 1))
        wait_until(lambda: self.__gauge_value_reached(
            registry, ALLOCATOR_CALL_DURATION, 0.1))

        event_manager.stop_processing_events()
Example #30
    def test_crash_ip_allocator_metrics(self):

        cpu = get_cpu(2, 16, 2)
        test_context = TestContext(cpu=cpu)

        # now override the cpu seen by the allocator to crash it
        test_context.get_workload_manager().get_allocator().set_cpu(
            get_cpu(2, 2, 2))

        events = [get_container_create_event(10, name="foo", id="bar")]
        event_count = len(events)
        event_manager = EventManager(MockEventProvider(events),
                                     test_context.get_event_handlers(),
                                     get_mock_file_manager(), 5.0)

        wait_until(lambda: event_count == event_manager.get_processed_count())

        log.info("Event manager has processed {} events.".format(
            event_manager.get_processed_count()))

        workload_manager = test_context.get_workload_manager()
        registry = Registry()
        reporter = InternalMetricsReporter(workload_manager, event_manager)
        reporter.set_registry(registry)
        reporter.report_metrics({})

        wait_until(lambda: self.__gauge_value_equals(registry, RUNNING, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, ADDED_KEY, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, REMOVED_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, SUCCEEDED_KEY, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, FAILED_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, WORKLOAD_COUNT_KEY, 1))
        wait_until(lambda: self.__gauge_value_equals(
            registry, FALLBACK_ALLOCATOR_COUNT, 1))

        event_manager.stop_processing_events()
Example #31
    def test_terminate_jobs(self):

        # Executes itself recursively and sleeps for 100 seconds
        with open("dummy_executable", "w") as f:
            f.write("""#!/usr/bin/env python
import sys, os, time
counter = eval(sys.argv[1])
if counter > 0:
    os.fork()
    os.execv(sys.argv[0],[sys.argv[0], str(counter - 1) ])
else:
    time.sleep(100)""")

        executable = os.path.realpath("dummy_executable")
        os.chmod("dummy_executable",
                 stat.S_IRWXU | stat.S_IRWXO | stat.S_IRWXG)

        self.job_list = {
            "umask":
            "0002",
            "DATA_ROOT":
            "",
            "global_environment": {},
            "global_update_path": {},
            "jobList": [{
                "name": "dummy_executable",
                "executable": executable,
                "target_file": None,
                "error_file": None,
                "start_file": None,
                "stdout": "dummy.stdout",
                "stderr": "dummy.stderr",
                "stdin": None,
                "argList": ["3"],
                "environment": None,
                "exec_env": None,
                "license_path": None,
                "max_running_minutes": None,
                "max_running": None,
                "min_arg": 1,
                "arg_types": [],
                "max_arg": None,
            }],
            "run_id":
            "",
            "ert_pid":
            "",
        }

        with open("jobs.json", "w") as f:
            f.write(json.dumps(self.job_list))

        # macOS doesn't provide /usr/bin/setsid, so we roll our own
        with open("setsid", "w") as f:
            f.write(
                dedent("""\
                #!/usr/bin/env python
                import os
                import sys
                os.setsid()
                os.execvp(sys.argv[1], sys.argv[1:])
                """))
        os.chmod("setsid", 0o755)

        job_dispatch_script = importlib.util.find_spec(
            "job_runner.job_dispatch").origin
        job_dispatch_process = Popen([
            os.getcwd() + "/setsid",
            sys.executable,
            job_dispatch_script,
            os.getcwd(),
        ])

        p = psutil.Process(job_dispatch_process.pid)

        # Three levels of processes should spawn 8 children in total
        wait_until(
            lambda: self.assertEqual(len(p.children(recursive=True)), 8))

        p.terminate()

        wait_until(
            lambda: self.assertEqual(len(p.children(recursive=True)), 0))

        os.wait()  # allow os to clean up zombie processes
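Why the test expects exactly 8 children: the dummy executable forks once per recursion level and then both parent and child re-exec with the counter decremented, so the number of live processes doubles at every level. Starting it with an argument of 3 therefore leaves 2**3 sleeping descendants under the job_dispatch process. A trivial check of that arithmetic:

def descendant_count(levels):
    # Each level doubles the number of live processes (fork, then exec in
    # both parent and child), so an argList of ["3"] yields 2 ** 3 == 8.
    return 2 ** levels

assert descendant_count(3) == 8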
Example #32
    def test_add_metrics(self):

        test_context = TestContext()
        workload_name = str(uuid.uuid4())
        events = [
            get_container_create_event(DEFAULT_CPU_COUNT, STATIC,
                                       workload_name, workload_name)
        ]
        event_count = len(events)
        event_manager = EventManager(MockEventProvider(events),
                                     test_context.get_event_handlers(),
                                     get_mock_file_manager(), 5.0)
        wait_until(lambda: event_count == event_manager.get_processed_count())

        log.info("Event manager has processed {} events.".format(
            event_manager.get_processed_count()))

        workload_manager = test_context.get_workload_manager()
        registry = Registry()
        reporter = InternalMetricsReporter(workload_manager, event_manager)
        reporter.set_registry(registry)
        reporter.report_metrics({})

        wait_until(lambda: self.__gauge_value_equals(registry, RUNNING, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, ADDED_KEY, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, REMOVED_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, SUCCEEDED_KEY, 1))
        wait_until(lambda: self.__gauge_value_equals(registry, FAILED_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, QUEUE_DEPTH_KEY, 0))
        wait_until(
            lambda: self.__gauge_value_equals(registry, WORKLOAD_COUNT_KEY, 1))
        wait_until(lambda: self.__gauge_value_equals(
            registry, PACKAGE_VIOLATIONS_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(registry,
                                                     CORE_VIOLATIONS_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(registry,
                                                     EVENT_SUCCEEDED_KEY, 3))
        wait_until(
            lambda: self.__gauge_value_equals(registry, EVENT_FAILED_KEY, 0))
        wait_until(lambda: self.__gauge_value_equals(registry,
                                                     EVENT_PROCESSED_KEY, 1))
        wait_until(lambda: self.__gauge_value_equals(
            registry, FALLBACK_ALLOCATOR_COUNT, 0))
        wait_until(lambda: self.__gauge_value_equals(
            registry, IP_ALLOCATOR_TIMEBOUND_COUNT, 0))

        event_manager.stop_processing_events()
Example #33
    def test_terminate_jobs(self):

        # Executes itself recursively and sleeps for 100 seconds
        with open("dummy_executable", "w") as f:
            f.write("""#!/usr/bin/env python
import sys, os, time
counter = eval(sys.argv[1])
if counter > 0:
    os.fork()
    os.execv(sys.argv[0],[sys.argv[0], str(counter - 1) ])
else:
    time.sleep(100)""")

        executable = os.path.realpath("dummy_executable")
        os.chmod("dummy_executable",
                 stat.S_IRWXU | stat.S_IRWXO | stat.S_IRWXG)

        self.job_list = {
            "umask":
            "0002",
            "DATA_ROOT":
            "",
            "global_environment": {},
            "global_update_path": {},
            "jobList": [{
                "name": "dummy_executable",
                "executable": executable,
                "target_file": None,
                "error_file": None,
                "start_file": None,
                "stdout": "dummy.stdout",
                "stderr": "dummy.stderr",
                "stdin": None,
                "argList": ["3"],
                "environment": None,
                "exec_env": None,
                "license_path": None,
                "max_running_minutes": None,
                "max_running": None,
                "min_arg": 1,
                "arg_types": [],
                "max_arg": None,
            }],
            "run_id":
            "",
            "ert_pid":
            "",
        }

        job_dispatch_script = os.path.realpath(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "../../../bin/job_dispatch.py",
            ))

        with open("jobs.json", "w") as f:
            f.write(json.dumps(self.job_list))

            # Required to execute job_dispatch in a separate process group:
            # os.setsid moves the current process out of the current group
            with open("job_dispatch_executer", "w") as f:
                f.write("#!/usr/bin/env python\n"
                        "import os, sys\n"
                        "os.setsid()\n"
                        "os.execv(sys.argv[1], sys.argv[1:])\n"
                        "\n")
            os.chmod("job_dispatch_executer",
                     stat.S_IRWXU | stat.S_IRWXO | stat.S_IRWXG)

        current_dir = os.path.realpath(os.curdir)
        job_dispatch_process = Popen([
            os.path.realpath("job_dispatch_executer"),
            job_dispatch_script,
            current_dir,
        ])

        p = psutil.Process(job_dispatch_process.pid)

        # Three levels of processes should spawn 8 children in total
        wait_until(
            lambda: self.assertEqual(len(p.children(recursive=True)), 8))

        p.terminate()

        wait_until(
            lambda: self.assertEqual(len(p.children(recursive=True)), 0))

        os.wait()  # allow os to clean up zombie processes
Example #34
    def test_simulation_context(self):
        config_file = self.createTestPath(
            "local/snake_oil_no_data/snake_oil.ert")
        with ErtTestContext("ert/server/rpc/simulation_context",
                            config_file) as test_context:
            ert = test_context.getErt()

            size = 4
            mask1 = BoolVector(initial_size=size)
            mask2 = BoolVector(initial_size=size)

            for iens_2 in range(size // 2):
                mask1[2 * iens_2] = True
                mask1[2 * iens_2 + 1] = False

                mask2[2 * iens_2] = False
                mask2[2 * iens_2 + 1] = True

            fs_manager = ert.getEnkfFsManager()
            first_half = fs_manager.getFileSystem("first_half")
            other_half = fs_manager.getFileSystem("other_half")

            # i represents geo_id
            case_data = [(i, {}) for i in range(size)]
            simulation_context1 = SimulationContext(ert, first_half, mask1, 0,
                                                    case_data)
            simulation_context2 = SimulationContext(ert, other_half, mask2, 0,
                                                    case_data)

            for iens in range(size):
                if iens % 2 == 0:
                    self.assertFalse(
                        simulation_context1.isRealizationFinished(iens))
                    # do we have the proper geo_id in run_args?
                    self.assertEqual(
                        simulation_context1.get_run_args(iens).geo_id, iens)
                else:
                    self.assertFalse(
                        simulation_context2.isRealizationFinished(iens))
                    self.assertEqual(
                        simulation_context2.get_run_args(iens).geo_id, iens)

            wait_until(
                func=lambda: self.assertFalse(
                    simulation_context1.isRunning()
                    or simulation_context2.isRunning()),
                timeout=60)

            self.assertEqual(simulation_context1.getNumFailed(), 0)
            self.assertEqual(simulation_context1.getNumRunning(), 0)
            self.assertEqual(simulation_context1.getNumSuccess(), size / 2)

            self.assertEqual(simulation_context2.getNumFailed(), 0)
            self.assertEqual(simulation_context2.getNumRunning(), 0)
            self.assertEqual(simulation_context2.getNumSuccess(), size / 2)

            first_half_state_map = first_half.getStateMap()
            other_half_state_map = other_half.getStateMap()

            for iens in range(size):
                if iens % 2 == 0:
                    self.assertTrue(
                        simulation_context1.didRealizationSucceed(iens))
                    self.assertFalse(
                        simulation_context1.didRealizationFail(iens))
                    self.assertTrue(
                        simulation_context1.isRealizationFinished(iens))

                    self.assertEqual(first_half_state_map[iens],
                                     RealizationStateEnum.STATE_HAS_DATA)
                else:
                    self.assertTrue(
                        simulation_context2.didRealizationSucceed(iens))
                    self.assertFalse(
                        simulation_context2.didRealizationFail(iens))
                    self.assertTrue(
                        simulation_context2.isRealizationFinished(iens))

                    self.assertEqual(other_half_state_map[iens],
                                     RealizationStateEnum.STATE_HAS_DATA)