Esempio n. 1
0
def test_launch_slurm_cluster_orc(fileutils, wlmutils):
    """Launch a clustered 3-node orchestrator under Slurm.

    Skipped unless the test launcher is Slurm. After a blocking start the
    orchestrator must not report a failed status, and once stopped every
    reported status must be cancelled.
    """
    # TODO detect number of nodes in allocation and skip if not sufficient
    test_launcher = wlmutils.get_test_launcher()
    if test_launcher != "slurm":
        pytest.skip("Test only runs on systems with Slurm as WLM")

    name = "test-launch-slurm-cluster-orc"
    exp = Experiment(name, launcher=test_launcher)
    run_dir = fileutils.make_test_dir(name)

    # batch=False to launch on the existing allocation
    database = SlurmOrchestrator(6780, db_nodes=3, batch=False)
    database.set_path(run_dir)

    exp.start(database, block=True)
    launch_failed = constants.STATUS_FAILED in exp.get_status(database)

    # always stop the orchestrator before asserting on the launch result
    # so that a failure does not leave an orphan process behind
    exp.stop(database)
    assert not launch_failed

    stopped = exp.get_status(database)
    assert all(state == constants.STATUS_CANCELLED for state in stopped)
Esempio n. 2
0
def test_launch_pbs_orc(fileutils, wlmutils):
    """Launch a single-node orchestrator under PBSPro.

    Skipped unless the test launcher is PBS. The orchestrator must not
    report a failed status after a blocking start, and every status must
    be cancelled after it is stopped.
    """
    test_launcher = wlmutils.get_test_launcher()
    if test_launcher != "pbs":
        pytest.skip("Test only runs on systems with PBSPro as WLM")

    name = "test-launch-pbs-orc"
    exp = Experiment(name, launcher=test_launcher)
    run_dir = fileutils.make_test_dir(name)

    # batch=False to launch on the existing allocation
    database = PBSOrchestrator(6780, batch=False)
    database.set_path(run_dir)

    exp.start(database, block=True)
    launch_failed = constants.STATUS_FAILED in exp.get_status(database)

    # always stop the orchestrator before asserting on the launch result
    # so that a failure does not leave an orphan process behind
    exp.stop(database)
    assert not launch_failed

    stopped = exp.get_status(database)
    assert all(state == constants.STATUS_CANCELLED for state in stopped)
Esempio n. 3
0
def test_launch_pbs_cluster_orc(fileutils, wlmutils):
    """Launch a clustered 3-node orchestrator under PBSPro.

    Requirements on the surrounding PBS allocation:

    - it must have been obtained with `-l place=scatter`
    - it must contain enough nodes for a 3 node deployment

    The test fails if either requirement is not met. It is skipped
    unless the test launcher is PBS.
    """
    test_launcher = wlmutils.get_test_launcher()
    if test_launcher != "pbs":
        pytest.skip("Test only runs on systems with PBSPro as WLM")

    name = "test-launch-pbs-cluster-orc"
    exp = Experiment(name, launcher=test_launcher)
    run_dir = fileutils.make_test_dir(name)

    # batch=False to launch on the existing allocation
    database = PBSOrchestrator(
        6780,
        db_nodes=3,
        batch=False,
        inter_op_threads=4,
    )
    database.set_path(run_dir)

    exp.start(database, block=True)
    launch_failed = constants.STATUS_FAILED in exp.get_status(database)

    # always stop the orchestrator before asserting on the launch result
    # so that a failure does not leave an orphan process behind
    exp.stop(database)
    assert not launch_failed

    stopped = exp.get_status(database)
    assert all(state == constants.STATUS_CANCELLED for state in stopped)
Esempio n. 4
0
def test_bad_run_command_args(fileutils, wlmutils):
    """Starting a model with invalid run-command arguments must raise.

    Verifies that immediate launch failures are caught: ``exp.start`` is
    expected to raise ``SmartSimError``. Only runs when the launcher is
    Slurm.
    """
    test_launcher = wlmutils.get_test_launcher()
    if test_launcher != "slurm":
        pytest.skip(f"Only fails with slurm. Launcher is {test_launcher}")

    name = "test-bad-run-command-args"
    exp = Experiment(name, launcher=test_launcher)
    run_dir = fileutils.make_test_dir(name)

    bad_script = fileutils.get_test_conf_path("bad.py")
    exe_args = f"{bad_script} --time=5"

    # the extra keyword is turned into an argument for the run command
    # of the specific WLM of the system
    bad_settings = wlmutils.get_run_settings(
        "python", exe_args, badarg="bad-arg"
    )

    bad_model = exp.create_model(
        "bad-model", path=run_dir, run_settings=bad_settings
    )

    with pytest.raises(SmartSimError):
        exp.start(bad_model)
Esempio n. 5
0
def test_ensemble(fileutils):
    """Generate an ensemble with a Generator and check its directories.

    The ensemble is expected to contain nine members, each with its own
    ``test/test_<i>`` directory under the test directory.
    """
    exp = Experiment("gen-test", launcher="local")
    out_dir = fileutils.get_test_dir("gen_ensemble_test")
    generator = Generator(out_dir)

    param_space = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]}
    ens = exp.create_ensemble("test", params=param_space, run_settings=rs)

    template = fileutils.get_test_conf_path("in.atm")
    ens.attach_generator_files(to_configure=template)
    generator.generate_experiment(ens)

    assert len(ens) == 9
    assert osp.isdir(osp.join(out_dir, "test"))
    for member in range(9):
        member_dir = osp.join(out_dir, f"test/test_{member}")
        assert osp.isdir(member_dir)
Esempio n. 6
0
def test_ensemble_overwrite_error(fileutils):
    """Generating the same ensemble twice must raise FileExistsError.

    The second ``generate_experiment`` call is made without requesting an
    overwrite, so the already existing output is expected to be rejected.
    """
    exp = Experiment("gen-test-overwrite-error", launcher="local")
    out_dir = fileutils.get_test_dir("test_gen_overwrite_error")
    generator = Generator(out_dir)

    param_space = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]}
    ens = exp.create_ensemble("test", params=param_space, run_settings=rs)

    # first generation
    template = fileutils.get_test_conf_path("in.atm")
    ens.attach_generator_files(to_configure=[template])
    generator.generate_experiment(ens)

    # second generation, without overwrite
    template = fileutils.get_test_conf_path("in.atm")
    ens.attach_generator_files(to_configure=[template])
    with pytest.raises(FileExistsError):
        generator.generate_experiment(ens)
Esempio n. 7
0
def test_local_orchestrator(fileutils):
    """Test launching orchestrator locally

    The test directory is stored in the module-level ``first_dir`` and the
    orchestrator is not stopped by this test; only the monitoring flags
    are switched off at the end.
    NOTE(review): presumably a later test reconnects to this orchestrator
    through ``first_dir`` - confirm.
    """
    global first_dir
    exp_name = "test-orc-launch-local"
    exp = Experiment(exp_name, launcher="local")
    test_dir = fileutils.make_test_dir(exp_name)
    first_dir = test_dir

    orc = Orchestrator(port=6780)
    orc.set_path(test_dir)

    exp.start(orc)
    statuses = exp.get_status(orc)
    # Asserting on a list only checks that it is non-empty, so the failed
    # status comparison has to go through all(). The explicit non-empty
    # check is kept so an empty status list still fails the test.
    assert statuses
    assert all(stat != constants.STATUS_FAILED for stat in statuses)

    # simulate user shutting down main thread
    exp._control._jobs.actively_monitoring = False
    exp._control._launcher.task_manager.actively_monitoring = False
Esempio n. 8
0
def test_dir_files(fileutils):
    """Generate models whose attached files are directories.

    The copied directory contains subdirectories and files; every one of
    the nine generated model directories must contain ``test_dir_1`` and
    ``test.py``.
    """
    out_dir = fileutils.make_test_dir("gen_dir_test")
    exp = Experiment("gen-test", out_dir, launcher="local")

    param_space = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]}
    ens = exp.create_ensemble(
        "dir_test", params=param_space, run_settings=rs
    )
    source_dir = fileutils.get_test_dir_path("test_dir")
    ens.attach_generator_files(to_copy=source_dir)

    exp.generate(ens)

    assert osp.isdir(osp.join(out_dir, "dir_test/"))
    for member in range(9):
        member_dir = osp.join(out_dir, f"dir_test/dir_test_{member}")
        assert osp.isdir(member_dir)
        assert osp.isdir(osp.join(member_dir, "test_dir_1"))
        assert osp.isfile(osp.join(member_dir, "test.py"))
def test_launch_slurm_cluster_orc(fileutils):
    """Launch a clustered 3-node orchestrator under Slurm with batch=True.

    The orchestrator must not report a failed status after a blocking
    start, and every status must be cancelled after it is stopped.
    """
    name = "test-launch-slurm-cluster-orc-batch"
    exp = Experiment(name, launcher="slurm")
    run_dir = fileutils.make_test_dir(name)

    # batch=True here
    # NOTE(review): presumably this launches the orchestrator as its own
    # batch job instead of using an existing allocation - confirm
    database = SlurmOrchestrator(6780, db_nodes=3, batch=True)
    database.set_path(run_dir)

    exp.start(database, block=True)
    launch_failed = constants.STATUS_FAILED in exp.get_status(database)

    # always stop the orchestrator before asserting on the launch result
    # so that a failure does not leave an orphan process behind
    exp.stop(database)
    assert not launch_failed

    stopped = exp.get_status(database)
    assert all(state == constants.STATUS_CANCELLED for state in stopped)
Esempio n. 10
0
def test_full_exp(fileutils):
    """Generate a model, an ensemble and an orchestrator in one call.

    Checks that the ensemble member directories, the ``database``
    directory and the model directory (with its copied script) exist.
    """
    out_dir = fileutils.make_test_dir("gen_full_test")
    exp = Experiment("gen-test", out_dir, launcher="local")

    single_model = exp.create_model("model", run_settings=rs)
    sleep_script = fileutils.get_test_conf_path("sleep.py")
    single_model.attach_generator_files(to_copy=sleep_script)

    database = Orchestrator(6780)
    param_space = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]}
    ens = exp.create_ensemble(
        "test_ens", params=param_space, run_settings=rs
    )

    template = fileutils.get_test_conf_path("in.atm")
    ens.attach_generator_files(to_configure=template)
    exp.generate(database, ens, single_model)

    # ensemble directory and one directory per member
    assert osp.isdir(osp.join(out_dir, "test_ens/"))
    for member in range(9):
        member_dir = osp.join(out_dir, f"test_ens/test_ens_{member}")
        assert osp.isdir(member_dir)

    # orchestrator directory
    assert osp.isdir(osp.join(out_dir, "database"))

    # model directory and the script copied into it
    assert osp.isdir(osp.join(out_dir, "model"))
    assert osp.isfile(osp.join(out_dir, "model/sleep.py"))
Esempio n. 11
0
def test_stop_entity(fileutils, wlmutils):
    """Stop a single model and verify it is recorded as cancelled."""
    name = "test-launch-stop-model"
    exp = Experiment(name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = wlmutils.get_run_settings(
        "python", f"{sleep_script} --time=10"
    )
    sleeper = exp.create_model("m1", path=run_dir, run_settings=run_settings)

    # non-blocking start, then stop the model five seconds later
    exp.start(sleeper, block=False)
    time.sleep(5)
    exp.stop(sleeper)

    finished_jobs = exp._control._jobs.completed
    assert sleeper.name in finished_jobs
    assert exp.get_status(sleeper)[0] == constants.STATUS_CANCELLED
Esempio n. 12
0
def test_restart(fileutils, wlmutils):
    """Start the same two models twice; both runs must complete."""
    name = "test-restart"
    exp = Experiment(name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = wlmutils.get_run_settings(
        "python", f"{sleep_script} --time=5"
    )
    first = exp.create_model("m1", path=run_dir, run_settings=run_settings)
    second = exp.create_model("m2", path=run_dir, run_settings=run_settings)

    # the second pass launches models that have already run once
    for _ in range(2):
        exp.start(first, second, block=True)
        results = exp.get_status(first, second)
        assert all(state == constants.STATUS_COMPLETED for state in results)
Esempio n. 13
0
def test_orchestrator_relaunch(fileutils):
    """Test error when users try to launch second orchestrator

    Starting ``orc_1`` while ``orc`` is already running in the same
    experiment is expected to raise ``SmartSimError``.
    """
    exp_name = "test-orc-error-on-relaunch"
    exp = Experiment(exp_name, launcher="local")
    test_dir = fileutils.make_test_dir(exp_name)

    orc = Orchestrator(port=6780)
    orc.set_path(test_dir)
    orc_1 = Orchestrator(port=6790)
    orc_1.set_path(test_dir)

    exp.start(orc)
    try:
        with pytest.raises(SmartSimError):
            exp.start(orc_1)
    finally:
        # stop the first orchestrator even when the expected error was not
        # raised, so a failing test does not leave an orphan process
        exp.stop(orc)
Esempio n. 14
0
def test_stop_entity_list(fileutils, wlmutils):
    """Stop a two-replica ensemble and verify every member is cancelled."""
    name = "test-launch-stop-ensemble"
    exp = Experiment(name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = wlmutils.get_run_settings(
        "python", f"{sleep_script} --time=10"
    )
    ens = exp.create_ensemble("e1", run_settings=run_settings, replicas=2)
    ens.set_path(run_dir)

    # non-blocking start, then stop the ensemble five seconds later
    exp.start(ens, block=False)
    time.sleep(5)
    exp.stop(ens)

    results = exp.get_status(ens)
    assert all(state == constants.STATUS_CANCELLED for state in results)

    finished_jobs = exp._control._jobs.completed
    assert all(member.name in finished_jobs for member in ens)
Esempio n. 15
0
def test_reconnect_local_orc():
    """Reconnect a new experiment to the orchestrator of a first one.

    Reads the ``smartsim_db.dat`` checkpoint from the module-level
    ``first_dir``. The reloaded orchestrator must not report a failed
    status; it is stopped in every case.
    """
    global first_dir
    # a fresh experiment that did not launch the orchestrator itself
    name = "test-orc-local-reconnect-2nd"
    exp_2 = Experiment(name, launcher="local")

    checkpoint_file = osp.join(first_dir, "smartsim_db.dat")
    reloaded = exp_2.reconnect_orchestrator(checkpoint_file)

    # give the statuses time to update once
    time.sleep(5)

    current = exp_2.get_status(reloaded)
    any_failed = any(state == constants.STATUS_FAILED for state in current)

    # stop first, then report the failure, so no process is left running
    exp_2.stop(reloaded)
    assert not any_failed
Esempio n. 16
0
def test_summary(fileutils, wlmutils):
    """Basic check of the summary dataframe.

    Runs one model from ``sleep.py`` and one from ``bad.py``, asserts their
    final statuses and then compares the first two summary rows with them.
    """
    name = "test-launch-summary"
    exp = Experiment(name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    bad_script = fileutils.get_test_conf_path("bad.py")
    sleep_settings = wlmutils.get_run_settings(
        "python", f"{sleep_script} --time=3"
    )
    bad_settings = wlmutils.get_run_settings(
        "python", f"{bad_script} --time=6"
    )

    sleep_model = exp.create_model(
        "sleep", path=run_dir, run_settings=sleep_settings
    )
    bad_model = exp.create_model(
        "bad", path=run_dir, run_settings=bad_settings
    )

    # start and poll
    exp.start(sleep_model, bad_model)
    assert exp.get_status(bad_model)[0] == constants.STATUS_FAILED
    assert exp.get_status(sleep_model)[0] == constants.STATUS_COMPLETED

    summary_df = exp.summary()
    print(summary_df)

    # row 0 is expected to describe the sleep model
    sleep_row = summary_df.loc[0]
    assert sleep_model.name == sleep_row["Name"]
    assert sleep_model.type == sleep_row["Entity-Type"]
    assert int(sleep_row["RunID"]) == 0
    assert int(sleep_row["Returncode"]) == 0

    # row 1 is expected to describe the bad model (non-zero return code)
    bad_row = summary_df.loc[1]
    assert bad_model.name == bad_row["Name"]
    assert bad_model.type == bad_row["Entity-Type"]
    assert int(bad_row["RunID"]) == 0
    assert int(bad_row["Returncode"]) != 0
Esempio n. 17
0
def test_model_failure(fileutils):
    """A locally launched model running ``bad.py`` must end up failed."""
    name = "test-model-failure"
    exp = Experiment(name, launcher="local")
    run_dir = fileutils.make_test_dir(name)

    bad_script = fileutils.get_test_conf_path("bad.py")
    run_settings = RunSettings("python", f"{bad_script} --time=3")

    failing = exp.create_model("m1", path=run_dir, run_settings=run_settings)

    exp.start(failing, block=True)
    results = exp.get_status(failing)
    assert all(state == constants.STATUS_FAILED for state in results)
Esempio n. 18
0
def test_ensemble(fileutils, wlmutils):
    """Launch a two-replica ensemble and wait for it to complete."""
    name = "test-ensemble-launch"
    exp = Experiment(name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = wlmutils.get_run_settings(
        "python", f"{sleep_script} --time=5"
    )
    ens = exp.create_ensemble("e1", run_settings=run_settings, replicas=2)
    ens.set_path(run_dir)

    exp.start(ens, block=True)
    results = exp.get_status(ens)
    assert all(state == constants.STATUS_COMPLETED for state in results)
Esempio n. 19
0
def test_failed_status(fileutils, wlmutils):
    """A failure deep into model execution must be reported as failed.

    The model is started without blocking and polled every two seconds
    until the experiment reports it as finished.
    """
    name = "test-report-failure"
    exp = Experiment(name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(name)

    bad_script = fileutils.get_test_conf_path("bad.py")
    run_settings = wlmutils.get_run_settings(
        "python", f"{bad_script} --time=7"
    )

    failing = exp.create_model(
        "bad-model", path=run_dir, run_settings=run_settings
    )

    exp.start(failing, block=False)
    # poll until the experiment considers the model finished
    while not exp.finished(failing):
        time.sleep(2)

    final = exp.get_status(failing)
    assert final[0] == constants.STATUS_FAILED
Esempio n. 20
0
def test_models(fileutils):
    """Launch two local models with a summary; both must complete."""
    name = "test-models-local-launch"
    exp = Experiment(name, launcher="local")
    run_dir = fileutils.make_test_dir(name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = RunSettings("python", f"{sleep_script} --time=3")

    first = exp.create_model("m1", path=run_dir, run_settings=run_settings)
    second = exp.create_model("m2", path=run_dir, run_settings=run_settings)

    exp.start(first, second, block=True, summary=True)
    results = exp.get_status(first, second)
    assert all(state == constants.STATUS_COMPLETED for state in results)
def test_batch_ensemble_replicas(fileutils, wlmutils):
    """Launch a batch ensemble created from replicas.

    The ensemble gets both Sbatch batch settings and run settings, and
    every member must complete.
    """
    name = "test-slurm-batch-ensemble-replicas"
    exp = Experiment(name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = wlmutils.get_run_settings(
        "python", f"{sleep_script} --time=5"
    )

    sbatch = SbatchSettings(nodes=2, time="00:01:00")
    ens = exp.create_ensemble(
        "batch-ens-replicas",
        batch_settings=sbatch,
        run_settings=run_settings,
        replicas=2,
    )
    ens.set_path(run_dir)

    exp.start(ens, block=True)
    results = exp.get_status(ens)
    assert all(state == constants.STATUS_COMPLETED for state in results)
def test_batch_ensemble(fileutils, wlmutils):
    """Launch a batch ensemble that is assembled by hand.

    Two models are created separately and added to an ensemble that only
    carries Sbatch batch settings; every member must complete.
    """
    name = "test-slurm-batch-ensemble"
    exp = Experiment(name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = wlmutils.get_run_settings(
        "python", f"{sleep_script} --time=5"
    )
    first = exp.create_model("m1", path=run_dir, run_settings=run_settings)
    second = exp.create_model("m2", path=run_dir, run_settings=run_settings)

    sbatch = SbatchSettings(nodes=2, time="00:01:00")
    ens = exp.create_ensemble("batch-ens", batch_settings=sbatch)
    for member in (first, second):
        ens.add_model(member)
    ens.set_path(run_dir)

    exp.start(ens, block=True)
    results = exp.get_status(ens)
    assert all(state == constants.STATUS_COMPLETED for state in results)
Esempio n. 23
0
def test_launch_pbs_mpmd():
    """Launch an aprun MPMD workload through PBS.

    The test obtains its allocation as a batch workload. Aprun MPMD
    workloads write to one shared output file for all processes and share
    their MPI_COMM_WORLDs.

    hw_mpi.c in test_configs has to be compiled before this test is run.
    #TODO write a script for this.
    """
    exp = Experiment("pbs-test", launcher="pbs")

    # two identical aprun programs, combined into one MPMD launch
    aprun_args = {"pes": 1, "pes-per-node": 1}
    primary = AprunSettings("./hellow", run_args=aprun_args)
    secondary = AprunSettings("./hellow", run_args=aprun_args)
    primary.make_mpmd(secondary)
    hello = exp.create_model("hello_world", run_settings=primary)

    batch = QsubBatchSettings(nodes=2, ppn=1, time="1:00:00")
    ens = exp.create_ensemble("ensemble", batch_settings=batch)
    ens.add_model(hello)

    exp.start(ens)
Esempio n. 24
0
def test_consumer(fileutils):
    """Run three processes, each one of the first two processes
    puts a tensor on the DB; the third process accesses the
    tensors put by the two producers.
    Finally, the tensor is used to run a model by each producer
    and the consumer accesses the two results.

    The orchestrator is stopped in a ``finally`` clause so that a failed
    launch or assertion does not leave a database running on REDIS_PORT.
    """
    test_dir = fileutils.make_test_dir("smartredis_ensemble_consumer_test")
    exp = Experiment("smartredis_ensemble_consumer",
                     exp_path=test_dir,
                     launcher="local")

    # create and start a database
    orc = Orchestrator(port=REDIS_PORT)
    exp.generate(orc)
    exp.start(orc, block=False)

    try:
        rs_prod = RunSettings("python", "producer.py")
        rs_consumer = RunSettings("python", "consumer.py")
        params = {"mult": [1, -10]}
        ensemble = Ensemble(name="producer",
                            params=params,
                            run_settings=rs_prod,
                            perm_strat="step")

        consumer = Model("consumer",
                         params={},
                         path=ensemble.path,
                         run_settings=rs_consumer)
        ensemble.add_model(consumer)

        ensemble.register_incoming_entity(ensemble[0])
        ensemble.register_incoming_entity(ensemble[1])

        config = fileutils.get_test_conf_path("smartredis")
        ensemble.attach_generator_files(to_copy=[config])

        exp.generate(ensemble)

        # start the models
        exp.start(ensemble, summary=False)

        # get and confirm statuses
        statuses = exp.get_status(ensemble)
        assert all(stat == constants.STATUS_COMPLETED for stat in statuses)
    finally:
        # stop the orchestrator, also when anything above failed
        exp.stop(orc)

    print(exp.summary())
Esempio n. 25
0
def test_exchange(fileutils):
    """Run two processes, each process puts a tensor on
    the DB, then accesses the other process's tensor.
    Finally, the tensor is used to run a model.

    The orchestrator is stopped in a ``finally`` clause so that a failed
    launch or assertion does not leave a database running on REDIS_PORT.
    """

    test_dir = fileutils.make_test_dir("smartredis_ensemble_exchange_test")
    exp = Experiment("smartredis_ensemble_exchange",
                     exp_path=test_dir,
                     launcher="local")

    # create and start a database
    orc = Orchestrator(port=REDIS_PORT)
    exp.generate(orc)
    exp.start(orc, block=False)

    try:
        rs = RunSettings("python", "producer.py --exchange")
        params = {"mult": [1, -10]}
        ensemble = Ensemble(
            name="producer",
            params=params,
            run_settings=rs,
            perm_strat="step",
        )

        ensemble.register_incoming_entity(ensemble[0])
        ensemble.register_incoming_entity(ensemble[1])

        config = fileutils.get_test_conf_path("smartredis")
        ensemble.attach_generator_files(to_copy=[config])

        exp.generate(ensemble)

        # start the models
        exp.start(ensemble, summary=False)

        # get and confirm statuses
        statuses = exp.get_status(ensemble)
        assert all(stat == constants.STATUS_COMPLETED for stat in statuses)
    finally:
        # stop the orchestrator, also when anything above failed
        exp.stop(orc)

    print(exp.summary())
Esempio n. 26
0
def mom6_colocated_driver(
    walltime="02:00:00",
    ensemble_size=1,
    nodes_per_member=15,
    tasks_per_node=17,
    mom6_exe_path=(
        "/lus/cls01029/shao/dev/gfdl/MOM6-examples/build/gnu/"
        "ice_ocean_SIS2/repro/MOM6"
    ),
    ensemble_node_features="P100",
    mask_table="mask_table.33.16x18",
    domain_layout="16,18",
    eke_model_name="ncar_ml_eke.gpu.pt",
    eke_backend="GPU",
    orchestrator_port=6780,
    orchestrator_interface="ipogif0",
    colocated_stride=18,
    orchestrator_cpus=4,
    limit_orchestrator_cpus=False,
):
    """Drive a MOM6 OM4_025 run with a colocated orchestrator.

    Builds the MOM6 ensemble, configures it, attaches a colocated
    orchestrator used for online machine-learning inference, then
    generates and starts the experiment.

    :param walltime: allocation length for the run, "hh:mm:ss"
    :type walltime: str, optional
    :param ensemble_size: number of ensemble members
    :type ensemble_size: int, optional
    :param nodes_per_member: nodes allocated to each ensemble member
    :type nodes_per_member: int, optional
    :param tasks_per_node: MPI ranks to run per node
    :type tasks_per_node: int, optional
    :param mom6_exe_path: full path to the compiled MOM6 executable
    :type mom6_exe_path: str, optional
    :param ensemble_node_features: (Slurm-only) constraints/features for
                                   the node
    :type ensemble_node_features: str, optional
    :param mask_table: file used for the specified layout to eliminate
                       land domains
    :type mask_table: str, optional
    :param domain_layout: the particular domain decomposition
    :type domain_layout: str, optional
    :param eke_model_name: file containing the saved machine-learning model
    :type eke_model_name: str, optional
    :param eke_backend: (CPU or GPU), whether the ML-EKE model runs on
                        CPU or GPU
    :type eke_backend: str, optional
    :param orchestrator_port: port the database listens on
    :type orchestrator_port: int, optional
    :param orchestrator_interface: network interface bound to the
                                   orchestrator
    :type orchestrator_interface: str, optional
    :param colocated_stride: forwarded unchanged to
                             ``configure_mom_ensemble`` as
                             ``colocated_stride``
    :type colocated_stride: int, optional
    :param orchestrator_cpus: number of cores the orchestrator can use to
                              handle requests
    :type orchestrator_cpus: int, optional
    :param limit_orchestrator_cpus: limit the number of CPUs the
                                    orchestrator can use to handle requests
    :type limit_orchestrator_cpus: bool, optional
    """
    exp = Experiment("AI-EKE-MOM6", launcher="auto")

    members = create_mom_ensemble(
        exp,
        walltime,
        ensemble_size,
        nodes_per_member,
        tasks_per_node,
        mom6_exe_path,
        ensemble_node_features,
    )

    # NOTE(review): the two positional booleans presumably select the
    # colocated (True) and clustered (False) configuration - confirm
    # against configure_mom_ensemble
    configure_mom_ensemble(
        members,
        True,
        False,
        mask_table,
        domain_layout,
        eke_model_name,
        eke_backend,
        colocated_stride=colocated_stride,
    )

    add_colocated_orchestrator(
        members,
        orchestrator_port,
        orchestrator_interface,
        orchestrator_cpus,
        limit_orchestrator_cpus,
    )

    exp.generate(members, overwrite=True)
    exp.start(members, summary=True)
    exp.stop()
Esempio n. 27
0
from copy import deepcopy

import pytest

from smartsim import Experiment
from smartsim.database import Orchestrator
from smartsim.error import SmartSimError
from smartsim.settings import RunSettings
from smartsim.utils.entityutils import separate_entities

# ---- create entities for testing --------
# Module-level objects built once at import time and shared by the tests
# defined below.

# run settings used by every entity: "python" with the argument "sleep.py"
rs = RunSettings("python", "sleep.py")

exp = Experiment("util-test", launcher="local")
model = exp.create_model("model_1", run_settings=rs)
# NOTE(review): model_2 is created with the same name ("model_1") as model -
# confirm whether the duplicate name is intentional
model_2 = exp.create_model("model_1", run_settings=rs)
ensemble = exp.create_ensemble("ensemble", run_settings=rs, replicas=1)
orc = Orchestrator()
# deep copy of orc, giving a second, separate orchestrator object
orc_1 = deepcopy(orc)


def test_separate():
    """separate_entities must split a mixed list into its three groups.

    The first returned entity must be the model, the first returned
    entity list must be the ensemble, and the third value must be the
    orchestrator.
    """
    mixed = [model, ensemble, orc]
    entities, entity_lists, database = separate_entities(mixed)

    assert entities[0] == model
    assert entity_lists[0] == ensemble
    assert database == orc


def test_two_orc():
    with pytest.raises(SmartSimError):
Esempio n. 28
0
def test_model_prefix():
    """A model created with enable_key_prefixing=True has the flag set."""
    exp = Experiment("test")
    model = exp.create_model("model",
                             RunSettings("python"),
                             enable_key_prefixing=True)
    # identity check instead of ``== True``: the flag must be the boolean
    # True, not merely a value that compares equal to it
    assert model._key_prefixing_enabled is True
Esempio n. 29
0
def mom6_clustered_driver(
    walltime="02:00:00",
    ensemble_size=1,
    nodes_per_member=25,
    tasks_per_node=45,
    mom6_exe_path=(
        "/lus/cls01029/shao/dev/gfdl/MOM6-examples/build/gnu/"
        "ice_ocean_SIS2/repro/MOM6"
    ),
    ensemble_node_features="[CL48|SK48|SK56]",
    mask_table="mask_table.315.32x45",
    domain_layout="32,45",
    eke_model_name="ncar_ml_eke.gpu.pt",
    eke_backend="GPU",
    orchestrator_port=6780,
    orchestrator_interface="ipogif0",
    orchestrator_nodes=3,
    orchestrator_node_features="P100",
    configure_only=False,
):
    """Drive a MOM6 OM4_025 run backed by a cluster of databases.

    Builds and configures the MOM6 ensemble, creates a distributed
    orchestrator used for machine-learning inference and generates the
    experiment. Unless ``configure_only`` is set, the experiment is then
    started and the orchestrator stopped afterwards.

    :param walltime: allocation length for the run, "hh:mm:ss"
    :type walltime: str, optional
    :param ensemble_size: number of ensemble members
    :type ensemble_size: int, optional
    :param nodes_per_member: nodes allocated to each ensemble member
    :type nodes_per_member: int, optional
    :param tasks_per_node: MPI ranks to run per node
    :type tasks_per_node: int, optional
    :param mom6_exe_path: full path to the compiled MOM6 executable
    :type mom6_exe_path: str, optional
    :param ensemble_node_features: (Slurm-only) constraints/features for
                                   the node
    :type ensemble_node_features: str, optional
    :param mask_table: file used for the specified layout to eliminate
                       land domains
    :type mask_table: str, optional
    :param domain_layout: the particular domain decomposition
    :type domain_layout: str, optional
    :param eke_model_name: file containing the saved machine-learning model
    :type eke_model_name: str, optional
    :param eke_backend: (CPU or GPU), whether the ML-EKE model runs on
                        CPU or GPU
    :type eke_backend: str, optional
    :param orchestrator_port: port the database listens on
    :type orchestrator_port: int, optional
    :param orchestrator_interface: network interface bound to the database
    :type orchestrator_interface: str, optional
    :param orchestrator_nodes: number of orchestrator nodes to use
    :type orchestrator_nodes: int, optional
    :param orchestrator_node_features: (Slurm-only) node features requested
                                       for the orchestrator nodes
    :type orchestrator_node_features: str, optional
    :param configure_only: if True, only configure and generate the
                           experiment, then return without starting it
    :type configure_only: bool, optional
    :return: ``(experiment, mom_ensemble, orchestrator)`` when
             ``configure_only`` is True, otherwise None
    """
    exp = Experiment("AI-EKE-MOM6", launcher="auto")

    members = create_mom_ensemble(
        exp,
        walltime,
        ensemble_size,
        nodes_per_member,
        tasks_per_node,
        mom6_exe_path,
        ensemble_node_features,
    )

    # NOTE(review): the third argument is presumably the "clustered" flag;
    # it is only True for three or more orchestrator nodes - confirm
    # against configure_mom_ensemble
    three_or_more_nodes = orchestrator_nodes >= 3
    configure_mom_ensemble(
        members,
        False,
        three_or_more_nodes,
        mask_table,
        domain_layout,
        eke_model_name,
        eke_backend,
    )

    database = create_distributed_orchestrator(
        exp,
        orchestrator_port,
        orchestrator_interface,
        orchestrator_nodes,
        orchestrator_node_features,
        walltime,
    )

    exp.generate(members, database, overwrite=True)
    if configure_only:
        return exp, members, database

    exp.start(members, database, summary=True)
    exp.stop(database)
Esempio n. 30
0
def test_bad_ensemble_init_no_rs_bs():
    """Creating an ensemble with neither run nor batch settings must fail."""
    experiment = Experiment("test")
    # only a name is given, so SmartSimError is expected
    with pytest.raises(SmartSimError):
        experiment.create_ensemble("name")