Esempio n. 1
0
    def test_fix_lost_trials_race_condition(self, monkeypatch, caplog):
        """Check that fixing a trial already failed-over elsewhere is harmless.

        The stored trial is 'interrupted', but the storage class is patched so
        that it reports the trial as a lost 'reserved' one, imitating a
        concurrent process that fixed it first. After ``fix_lost_trials`` the
        last log record must be a DEBUG record with message 'failed', and the
        stored trial must still be 'interrupted'.
        """
        trial_doc = copy.deepcopy(base_trial)
        stale_heartbeat = datetime.datetime.utcnow() - datetime.timedelta(seconds=360)
        trial_doc.update(status='interrupted', heartbeat=stale_heartbeat)

        with OrionState(trials=[trial_doc]) as cfg:
            exp = Experiment('supernaekei')
            exp._id = cfg.trials[0]['experiment']

            # Starting point: one interrupted trial, nothing reported as lost.
            interrupted_before = exp.fetch_trials_by_status('interrupted')
            assert len(interrupted_before) == 1

            lost_before = exp._storage.fetch_lost_trials(exp)
            assert len(lost_before) == 0

            def report_trial_as_lost(self, query):
                """Return the stored trial re-labelled as 'reserved'."""
                outdated_view = Trial(**trial_doc)
                outdated_view.status = 'reserved'
                return [outdated_view]

            # Make the storage hand back a trial flagged as reserved (and lost)
            # although the stored document is interrupted, as if another
            # process had already failed it over.
            with monkeypatch.context() as patcher:
                patcher.setattr(
                    exp._storage.__class__,
                    'fetch_lost_trials',
                    report_trial_as_lost,
                )

                lost_while_patched = exp._storage.fetch_lost_trials(exp)
                assert len(lost_while_patched) == 1

                with caplog.at_level(logging.DEBUG):
                    exp.fix_lost_trials()

            last_record = caplog.records[-1]
            assert last_record.levelname == 'DEBUG'
            assert last_record.msg == 'failed'

            # Stored statuses are unchanged by the failed fix attempt.
            interrupted_after = exp.fetch_trials_by_status('interrupted')
            assert len(interrupted_after) == 1
            reserved_after = exp.fetch_trials_by_status('reserved')
            assert len(reserved_after) == 0
Esempio n. 2
0
    def test_fix_lost_trials(self):
        """Check that a reserved trial with an old heartbeat stops being reserved.

        The trial's heartbeat is set 360 seconds in the past; after
        ``fix_lost_trials`` no trial may remain in the 'reserved' status.
        """
        heartbeat_age = datetime.timedelta(seconds=360)
        stale_trial = copy.deepcopy(base_trial)
        stale_trial['status'] = 'reserved'
        stale_trial['heartbeat'] = datetime.datetime.utcnow() - heartbeat_age

        with OrionState(trials=[stale_trial]) as cfg:
            exp = Experiment('supernaekei')
            exp._id = cfg.trials[0]['experiment']

            reserved_before = exp.fetch_trials_by_status('reserved')
            assert len(reserved_before) == 1

            exp.fix_lost_trials()

            reserved_after = exp.fetch_trials_by_status('reserved')
            assert len(reserved_after) == 0
Esempio n. 3
0
    def test_fix_lost_trials(self):
        """Check that a reserved trial with an old heartbeat stops being reserved.

        The trial's heartbeat is set ten minutes in the past; after
        ``fix_lost_trials`` no trial may remain in the "reserved" status.
        """
        ten_minutes = datetime.timedelta(seconds=60 * 10)
        stale_trial = copy.deepcopy(base_trial)
        stale_trial["status"] = "reserved"
        stale_trial["heartbeat"] = datetime.datetime.utcnow() - ten_minutes

        with OrionState(trials=[stale_trial]) as cfg:
            exp = Experiment("supernaekei", mode="x")
            exp._id = cfg.trials[0]["experiment"]

            reserved_before = exp.fetch_trials_by_status("reserved")
            assert len(reserved_before) == 1

            exp.fix_lost_trials()

            reserved_after = exp.fetch_trials_by_status("reserved")
            assert len(reserved_after) == 0
Esempio n. 4
0
    def test_fix_lost_trials_configurable_hb(self):
        """Test that the configured worker heartbeat drives lost-trial detection.

        A reserved trial with a heartbeat 180 seconds old must stay reserved
        while ``orion.core.config.worker.heartbeat`` is 360, and must stop
        being reserved once the setting is lowered to 180.

        The heartbeat setting is process-wide, so the value found before the
        test touches it is put back in a ``finally`` clause; otherwise the
        last value written here would leak into every test that runs
        afterwards.
        """
        trial = copy.deepcopy(base_trial)
        trial['status'] = 'reserved'
        trial['heartbeat'] = datetime.datetime.utcnow() - datetime.timedelta(
            seconds=180)
        with OrionState(trials=[trial]) as cfg:
            exp = Experiment('supernaekei')
            exp._id = cfg.trials[0]['experiment']

            assert len(exp.fetch_trials_by_status('reserved')) == 1

            # Remember the current setting so the global config can be restored
            # whether the assertions below pass or fail.
            previous_heartbeat = orion.core.config.worker.heartbeat
            try:
                # Heartbeat setting larger than the trial's heartbeat age:
                # the trial must not be treated as lost.
                orion.core.config.worker.heartbeat = 360

                exp.fix_lost_trials()

                assert len(exp.fetch_trials_by_status('reserved')) == 1

                # Heartbeat setting lowered to the trial's heartbeat age:
                # the trial must now be taken out of 'reserved'.
                orion.core.config.worker.heartbeat = 180

                exp.fix_lost_trials()

                assert len(exp.fetch_trials_by_status('reserved')) == 0
            finally:
                orion.core.config.worker.heartbeat = previous_heartbeat
Esempio n. 5
0
    def test_fix_lost_trials_configurable_hb(self):
        """Test that the configured worker heartbeat drives lost-trial detection.

        A reserved trial with a heartbeat two minutes old must stay reserved
        while ``orion.core.config.worker.heartbeat`` is ``60 * 2``, and must
        stop being reserved once the setting is lowered to a tenth of that.

        The heartbeat setting is process-wide, so the value found before the
        test touches it is put back in a ``finally`` clause; otherwise the
        last value written here would leak into every test that runs
        afterwards.
        """
        trial = copy.deepcopy(base_trial)
        trial["status"] = "reserved"
        trial["heartbeat"] = datetime.datetime.utcnow() - datetime.timedelta(
            seconds=60 * 2)
        with OrionState(trials=[trial]) as cfg:
            exp = Experiment("supernaekei", mode="x")
            exp._id = cfg.trials[0]["experiment"]

            assert len(exp.fetch_trials_by_status("reserved")) == 1

            # Remember the current setting so the global config can be restored
            # whether the assertions below pass or fail.
            previous_heartbeat = orion.core.config.worker.heartbeat
            try:
                # With this setting the two-minute-old heartbeat must not be
                # treated as lost.
                orion.core.config.worker.heartbeat = 60 * 2

                exp.fix_lost_trials()

                assert len(exp.fetch_trials_by_status("reserved")) == 1

                # A ten times smaller setting must make the same trial count
                # as lost.
                orion.core.config.worker.heartbeat = 60 * 2 / 10.0

                exp.fix_lost_trials()

                assert len(exp.fetch_trials_by_status("reserved")) == 0
            finally:
                orion.core.config.worker.heartbeat = previous_heartbeat
Esempio n. 6
0
    def test_fix_only_lost_trials(self):
        """Check that only the trial with the old heartbeat is interrupted.

        Two reserved trials are stored: one whose heartbeat is ten minutes
        old and one whose heartbeat is current. After ``fix_lost_trials`` the
        former must be "interrupted" and the latter still "reserved"; trials
        are told apart by their params.
        """
        lost_trial, running_trial = generate_trials(["reserved", "reserved"])

        now = datetime.datetime.utcnow()
        lost_trial["heartbeat"] = now - datetime.timedelta(seconds=60 * 10)
        running_trial["heartbeat"] = datetime.datetime.utcnow()

        with OrionState(trials=[lost_trial, running_trial]) as cfg:
            exp = Experiment("supernaekei", mode="x")
            exp._id = cfg.trials[0]["experiment"]

            reserved_before = exp.fetch_trials_by_status("reserved")
            assert len(reserved_before) == 2

            exp.fix_lost_trials()

            # The trial with the fresh heartbeat keeps its reservation.
            reserved_trials = exp.fetch_trials_by_status("reserved")
            assert len(reserved_trials) == 1
            still_reserved = reserved_trials[0].to_dict()
            assert still_reserved["params"] == running_trial["params"]

            # The trial with the old heartbeat has been interrupted.
            failedover_trials = exp.fetch_trials_by_status("interrupted")
            assert len(failedover_trials) == 1
            failed_over = failedover_trials[0].to_dict()
            assert failed_over["params"] == lost_trial["params"]
Esempio n. 7
0
    def test_fix_only_lost_trials(self):
        """Check that only the trial with the old heartbeat is interrupted.

        Two reserved trials are stored: one whose heartbeat is 360 seconds
        old and one whose heartbeat is current. After ``fix_lost_trials`` the
        former must be 'interrupted' and the latter still 'reserved'; trials
        are told apart by their params.
        """
        lost_trial, running_trial = generate_trials(['reserved', 'reserved'])

        heartbeat_age = datetime.timedelta(seconds=360)
        lost_trial['heartbeat'] = datetime.datetime.utcnow() - heartbeat_age
        running_trial['heartbeat'] = datetime.datetime.utcnow()

        with OrionState(trials=[lost_trial, running_trial]) as cfg:
            exp = Experiment('supernaekei')
            exp._id = cfg.trials[0]['experiment']

            reserved_before = exp.fetch_trials_by_status('reserved')
            assert len(reserved_before) == 2

            exp.fix_lost_trials()

            # The trial with the fresh heartbeat keeps its reservation.
            reserved_trials = exp.fetch_trials_by_status('reserved')
            assert len(reserved_trials) == 1
            still_reserved = reserved_trials[0].to_dict()
            assert still_reserved['params'] == running_trial['params']

            # The trial with the old heartbeat has been interrupted.
            failedover_trials = exp.fetch_trials_by_status('interrupted')
            assert len(failedover_trials) == 1
            failed_over = failedover_trials[0].to_dict()
            assert failed_over['params'] == lost_trial['params']