def test_preempt_workers_ask_master(dummy: bool, auto_ack: bool) -> None:
    """
    Check WorkersAskMaster preemption on every rank of a two-process execution.

    Args:
        dummy: when true, a ``core.DummyPreemptContext`` is tested and only the initial
            ``should_preempt()`` poll is checked; otherwise the context comes from
            ``make_test_preempt_context`` and the signal/ack flow is checked as well.
        auto_ack: forwarded to ``should_preempt()``; when true exactly one post on the mock
            session is expected after the signal, otherwise none.
    """
    with parallel.Execution(2) as pex:

        # Steal the automatically-created pex.distributed contexts, then test chief/worker serially
        # so we know they're not using distributed comms.
        @pex.run
        def distributed_contexts() -> core.DistributedContext:
            return pex.distributed

        # Test steps are identical for chief and worker.
        for dist in distributed_contexts:
            if not dummy:
                state, context = make_test_preempt_context(
                    dist, core.PreemptMode.WorkersAskMaster)
            else:
                context = core.DummyPreemptContext(
                    dist, core.PreemptMode.WorkersAskMaster)
            with context:
                assert context.should_preempt() is False
                if not dummy:
                    # No ack preemption calls yet.
                    state.mock_session.post.assert_not_called()
                    # Send the preemption signal.
                    state.preempt()
                    wait_on_watcher(context)
                    # First poll after the signal; this is the one that may acknowledge.
                    assert context.should_preempt(auto_ack=auto_ack) is True
                    # Call again, to make sure we only ack once.
                    assert context.should_preempt(auto_ack=auto_ack) is True
                    if auto_ack:
                        state.mock_session.post.assert_called_once()
                    else:
                        state.mock_session.post.assert_not_called()
Esempio n. 2
0
def _dummy_init(
    *,
    distributed: Optional[core.DistributedContext] = None,
    # TODO(DET-6153): allow a Union[StorageManager, str] here.
    storage_manager: Optional[storage.StorageManager] = None,
    preempt_mode: core.PreemptMode = core.PreemptMode.WorkersAskChief,
) -> Context:
    """
    Build a core.Context suitable for running off-cluster.  This is normally called by init()
    when it is detected that there is no ClusterInfo available, but can be invoked directly for
    e.g. local test mode.

    Args:
        distributed: distributed context to use; a ``core.DummyDistributedContext`` is created
            when none is given.
        storage_manager: where checkpoints are stored; when ``None``, a
            ``storage.SharedFSStorageManager`` rooted at the per-user "determined" data
            directory (from ``appdirs.user_data_dir``) is used.
        preempt_mode: mode passed to the ``core.DummyPreemptContext``.

    Returns:
        A ``Context`` assembled from the dummy sub-contexts.
    """
    distributed = distributed or core.DummyDistributedContext()
    preempt = core.DummyPreemptContext(distributed, preempt_mode)

    if storage_manager is None:
        base_path = appdirs.user_data_dir("determined")
        # The f-prefix is required so the actual directory is logged, not "{base_path}".
        logger.info(f"no storage_manager provided; storing checkpoints in {base_path}")
        storage_manager = storage.SharedFSStorageManager(base_path)
    checkpoint = core.DummyCheckpointContext(distributed, storage_manager)

    train = core.DummyTrainContext()
    searcher = core.DummySearcherContext(distributed)

    _install_stacktrace_on_sigusr1()

    return Context(
        distributed=distributed,
        checkpoint=checkpoint,
        preempt=preempt,
        train=train,
        searcher=searcher,
    )
def test_preempt_chief_only(dummy: bool, auto_ack: bool) -> None:
    """
    Exercise ChiefOnly preemption: the chief may poll, the worker is expected to be refused.

    With ``dummy`` set, ``core.DummyPreemptContext`` is used and the signal/acknowledgement
    checks are skipped.  ``auto_ack`` is forwarded to ``should_preempt()`` and decides whether
    a post on the mock session is expected.
    """
    with parallel.Execution(2) as pex:

        # Collect every rank's pex.distributed up front, then run the chief and worker checks
        # one after the other so we know they're not using distributed comms.
        @pex.run
        def distributed_contexts() -> core.DistributedContext:
            return pex.distributed

        # Chief side.
        if dummy:
            context = core.DummyPreemptContext(
                distributed_contexts[0], core.PreemptMode.ChiefOnly
            )
        else:
            state, context = make_test_preempt_context(
                distributed_contexts[0], core.PreemptMode.ChiefOnly
            )
        with context:
            assert context.should_preempt() is False
            if not dummy:
                # Nothing may have been acknowledged before the signal is sent.
                state.mock_session.post.assert_not_called()
                # Send the preemption signal, then wait on the watcher.
                state.preempt()
                wait_on_watcher(context)
                # Poll twice: the second poll must not trigger a second ack.
                for _ in range(2):
                    assert context.should_preempt(auto_ack=auto_ack) is True
                if not auto_ack:
                    state.mock_session.post.assert_not_called()
                else:
                    state.mock_session.post.assert_called_once()

        # Worker side.
        if dummy:
            context = core.DummyPreemptContext(
                distributed_contexts[1], core.PreemptMode.ChiefOnly
            )
        else:
            state, context = make_test_preempt_context(
                distributed_contexts[1], core.PreemptMode.ChiefOnly
            )
        non_chief_error = "should_preempt.*called from non-chief"
        with context, pytest.raises(RuntimeError, match=non_chief_error):
            context.should_preempt()
Esempio n. 4
0
 def __init__(
     self,
     checkpoint: core.CheckpointContext,
     distributed: Optional[core.DistributedContext] = None,
     preempt: Optional[core.PreemptContext] = None,
     train: Optional[core.TrainContext] = None,
     searcher: Optional[core.SearcherContext] = None,
 ) -> None:
     """
     Store the given sub-contexts, substituting a dummy for each one that is missing.

     Only ``checkpoint`` is required.  Any other argument that is falsy (e.g. ``None``) is
     replaced by the matching ``core.Dummy*Context``; the dummy preempt and searcher contexts
     are built on top of ``self.distributed``.
     """
     self.checkpoint = checkpoint

     if not distributed:
         distributed = core.DummyDistributedContext()
     self.distributed = distributed

     if not preempt:
         preempt = core.DummyPreemptContext(self.distributed)
     self.preempt = preempt

     if not train:
         train = core.DummyTrainContext()
     self.train = train

     if not searcher:
         searcher = core.DummySearcherContext(self.distributed)
     self.searcher = searcher
        def do_test() -> None:
            """Check that should_preempt() raises until the preempt context has been entered."""
            if dummy:
                context = core.DummyPreemptContext(
                    pex.distributed, core.PreemptMode.WorkersAskChief
                )
            else:
                _, context = make_test_preempt_context(
                    pex.distributed, core.PreemptMode.WorkersAskChief
                )

            # Polling before entering the context is expected to raise.
            too_early = "cannot call.*should_preempt.*before.*start"
            with pytest.raises(RuntimeError, match=too_early):
                context.should_preempt()

            # Once inside the context, polling succeeds and reports no preemption.
            with context:
                assert context.should_preempt() is False
        def do_test() -> None:
            """Run the WorkersAskChief scenario on this rank; rank 0 takes the chief branch."""
            if dummy:
                context = core.DummyPreemptContext(
                    pex.distributed, core.PreemptMode.WorkersAskChief
                )
            else:
                state, context = make_test_preempt_context(
                    pex.distributed, core.PreemptMode.WorkersAskChief
                )

            with context:
                if pex.rank == 0:
                    # Nothing to preempt yet.
                    assert context.should_preempt() is False
                    # Make sure the worker is receiving broadcasts.
                    pex.distributed.broadcast(False)
                    if not dummy:
                        # Nothing may have been acknowledged before the signal is sent.
                        state.mock_session.post.assert_not_called()
                        # Send the preemption signal, then wait on the watcher.
                        state.preempt()
                        wait_on_watcher(context)
                        # Poll twice: the second poll must not trigger a second ack.
                        for _ in range(2):
                            assert context.should_preempt(auto_ack=auto_ack) is True
                        if not auto_ack:
                            state.mock_session.post.assert_not_called()
                        else:
                            state.mock_session.post.assert_called_once()
                else:
                    # Intercept the broadcast from the chief to make sure it's happening.
                    received = pex.distributed.broadcast(None)
                    assert received is False, received
                    # Try receiving from the chief.
                    assert context.should_preempt() is False
                    if not dummy:
                        # The chief should send a True now.
                        assert context.should_preempt() is True
                        # Only the chief acknowledges the preemption signal.
                        state.mock_session.post.assert_not_called()