Example #1
def router():
    # We need at least 5 workers so resources won't be oversubscribed.
    ray.init(num_cpus=5)

    # The following two blocks are equivalent
    #
    # handle = DeadlineAwareRouter.remote("DefaultTestRouter")
    # ray.experimental.register_actor("DefaultTestRouter", handle)
    # handle.start.remote()
    #
    # handle = start_router(DeadlineAwareRouter, "DefaultRouter")
    handle = start_router(DeadlineAwareRouter, "DefaultRouter")

    handle.register_actor.remote(
        "VAdder", VectorizedAdder,
        init_kwargs={"scaler_increment": 1})  # init args
    handle.register_actor.remote(
        "SAdder", ScalerAdder, init_kwargs={"scaler_increment": 2})
    handle.register_actor.remote(
        "SleepFirst", SleepOnFirst, init_kwargs={"sleep_time": 1})
    handle.register_actor.remote(
        "SleepCounter", SleepCounter, max_batch_size=1)

    yield handle

    ray.shutdown()
Example #2
def ray_start_object_store_memory():
    # Start the Ray processes.
    store_size = 10**6
    ray.init(num_cpus=1, object_store_memory=store_size)
    yield None
    # The code after the yield will run as teardown code.
    ray.shutdown()
Example #3
def start_connected_cluster():
    # Start the Ray processes.
    cluster = _start_new_cluster()
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Example #4
def ray_start_reconstruction(request):
    num_nodes = request.param

    plasma_store_memory = int(0.5 * 10**9)

    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 1,
            "object_store_memory": plasma_store_memory // num_nodes,
            "redis_max_memory": 10**7,
            "_internal_config": json.dumps({
                "initial_reconstruction_timeout_milliseconds": 200
            })
        })
    for i in range(num_nodes - 1):
        cluster.add_node(
            num_cpus=1,
            object_store_memory=plasma_store_memory // num_nodes,
            _internal_config=json.dumps({
                "initial_reconstruction_timeout_milliseconds": 200
            }))
    ray.init(redis_address=cluster.redis_address)

    yield plasma_store_memory, num_nodes, cluster

    # Clean up the Ray cluster.
    ray.shutdown()
    cluster.shutdown()
Example #5
def init():
    ray.init(num_cpus=4)
    async_api.init()
    asyncio.get_event_loop().set_debug(False)
    yield
    async_api.shutdown()
    ray.shutdown()
Example #6
def ray_start_driver_put_errors():
    plasma_store_memory = 10**9
    # Start the Ray processes.
    ray.init(num_cpus=1, object_store_memory=plasma_store_memory)
    yield plasma_store_memory
    # The code after the yield will run as teardown code.
    ray.shutdown()
Example #7
def ray_start_empty_cluster():
    cluster = Cluster()
    yield cluster

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Example #8
def ray_start_cluster():
    num_nodes = 5
    cluster = create_cluster(num_nodes)
    yield cluster, num_nodes

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Example #9
def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(redis_address="{redis_address}")

kwargs = dict(
    run="PG",
    env="CartPole-v1",
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1)

tune.run_experiments(
    dict(experiment=kwargs),
    raise_on_failed_trial=False)
""".format(
        redis_address=cluster.redis_address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)
    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    metadata_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(100):
        if TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner.restore(metadata_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath
            }
        },
        resume=True)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    cluster.shutdown()
Example #10
def test_temp_plasma_store_socket():
    ray.init(plasma_store_socket_name="/tmp/i_am_a_temp_socket")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket"), "Specified socket path not found."
    ray.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket")
    except Exception:
        pass
Example #11
def ray_start():
    # Start ray instance
    ray.init(num_cpus=1)

    # Run test using this fixture
    yield None

    # Shutdown ray instance
    ray.shutdown()
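A minimal usage sketch, assuming the snippet above is decorated with @pytest.fixture (the decorator is not shown in the listing); the test name and task are hypothetical:

import ray


def test_simple_task(ray_start):
    # The ray_start fixture handles ray.init() before the test and
    # ray.shutdown() afterwards.
    @ray.remote
    def add(a, b):
        return a + b

    assert ray.get(add.remote(1, 2)) == 3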
Example #12
 def tearDown(self):
     print("Tearing down....")
     try:
         self.runner._server.shutdown()
         self.runner = None
     except Exception as e:
         print(e)
     ray.shutdown()
     _register_all()
Example #13
def ray_start_regular():
    for module in [
            ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
    ]:
        reload(module)
    # Start the Ray processes.
    ray.init(num_cpus=2)
    yield None
    # The code after the yield will run as teardown code.
    ray.shutdown()
Example #14
def main(config, experiments, num_cpus, num_gpus, redis_address):
  print("config =", config.name)
  print("experiments =", experiments)
  print("num_gpus =", num_gpus)
  print("num_cpus =", num_cpus)
  print("redis_address =", redis_address)

  # Use configuration file location as the project location.
  projectDir = os.path.dirname(config.name)
  projectDir = os.path.abspath(projectDir)
  print("projectDir =", projectDir)

  # Load and parse experiment configurations
  configs = parse_config(config, experiments, globals=globals())

  # Pre-download dataset
  data_dir = os.path.join(projectDir, "data")
  datasets.CIFAR10(data_dir, download=True, train=True)

  # Initialize ray cluster
  if redis_address is not None:
    ray.init(redis_address=redis_address, include_webui=True)
    num_cpus = 1
  else:
    ray.init(num_cpus=num_cpus, num_gpus=num_gpus, local_mode=num_cpus == 1)

  # Run all experiments in parallel
  results = []
  for exp in configs:
    config = configs[exp]
    config["name"] = exp

    # Make sure local directories are relative to the project location
    path = config.get("path", None)
    if path and not os.path.isabs(path):
      config["path"] = os.path.join(projectDir, path)

    data_dir = config.get("data_dir", "data")
    if not os.path.isabs(data_dir):
      config["data_dir"] = os.path.join(projectDir, data_dir)

    # When running multiple hyperparameter searches on different experiments,
    # ray.tune will run one experiment at a time. We use "ray.remote" to run
    # each tune experiment in parallel as a "remote" function and wait until
    # all experiments complete.
    results.append(run_experiment.remote(config, MobileNetTune,
                                         num_cpus=1,
                                         num_gpus=num_gpus / num_cpus))

  # Wait for all experiments to complete
  ray.get(results)

  ray.shutdown()
Example #15
def ray_gdb_start():
    # Setup environment and start ray
    _environ = os.environ.copy()
    for process_name in ["RAYLET", "PLASMA_STORE"]:
        os.environ["RAY_{}_GDB".format(process_name)] = "1"
        os.environ["RAY_{}_TMUX".format(process_name)] = "1"

    yield None

    # Restore original environment and stop ray
    os.environ.clear()
    os.environ.update(_environ)
    ray.shutdown()
Example #16
def ray_start_workers_separate_multinode(request):
    num_nodes = request.param[0]
    num_initial_workers = request.param[1]
    # Start the Ray processes.
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_initial_workers)
    ray.init(redis_address=cluster.redis_address)

    yield num_nodes, num_initial_workers
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Example #17
def ray_start_head_local():
    # Start the Ray processes on this machine.
    run_and_get_output([
        "ray", "start", "--head", "--node-ip-address=localhost",
        "--redis-port=6379"
    ])

    yield None

    # Disconnect from the Ray cluster.
    ray.shutdown()
    # Kill the Ray cluster.
    subprocess.Popen(["ray", "stop"]).wait()
Example #18
def cluster_start():
    # Start the Ray processes.
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 1,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 10
            })
        })
    yield cluster
    ray.shutdown()
    cluster.shutdown()
Example #19
def test_driver_lives_sequential():
    ray.worker.init()
    all_processes = ray.services.all_processes
    processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] +
                 all_processes[ray.services.PROCESS_TYPE_RAYLET])

    # Kill all the components sequentially.
    for process in processes:
        process.terminate()
        time.sleep(0.1)
        process.kill()
        process.wait()

    ray.shutdown()
Example #20
def start_connected_longer_cluster():
    """Creates a cluster with a longer timeout."""
    g = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 1,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 20
            })
        })
    yield g
    # The code after the yield will run as teardown code.
    ray.shutdown()
    g.shutdown()
Example #21
def ray_start_head():
    out = run_and_get_output(["ray", "start", "--head", "--num-cpus=2"])
    # Get the redis address from the output.
    redis_substring_prefix = "redis_address=\""
    redis_address_location = (
        out.find(redis_substring_prefix) + len(redis_substring_prefix))
    redis_address = out[redis_address_location:]
    redis_address = redis_address.split("\"")[0]

    yield redis_address

    # Disconnect from the Ray cluster.
    ray.shutdown()
    # Kill the Ray cluster.
    subprocess.Popen(["ray", "stop"]).wait()
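A hedged sketch of a test consuming the yielded address, assuming the snippet is a pytest fixture and keeping the Ray 0.x-era redis_address argument used above; the test name and task are hypothetical:

import ray


def test_connect_to_head(ray_start_head):
    # ray_start_head yields the redis address parsed from the "ray start" output.
    ray.init(redis_address=ray_start_head)

    @ray.remote
    def ping():
        return "pong"

    assert ray.get(ping.remote()) == "pong"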
Example #22
def ray_start_two_nodes():
    for module in [
            ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
    ]:
        reload(module)
    # Start the Ray processes.
    cluster = ray.test.cluster_utils.Cluster()
    for _ in range(2):
        cluster.add_node(num_cpus=10)
    ray.init(redis_address=cluster.redis_address)
    yield None

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Example #23
def ray_start_two_nodes():
    # Start the Ray processes.
    cluster = ray.test.cluster_utils.Cluster()
    for _ in range(2):
        cluster.add_node(
            num_cpus=0,
            _internal_config=json.dumps({
                "num_heartbeats_timeout": 40
            }))
    ray.init(redis_address=cluster.redis_address)

    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Example #24
def start_connected_cluster():
    # Start the Ray processes.
    g = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 1,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 10
            })
        })
    yield g
    # The code after the yield will run as teardown code.
    ray.shutdown()
    g.shutdown()
Example #25
def ray_start_sharded(request):
    num_redis_shards = request.param

    if os.environ.get("RAY_USE_NEW_GCS") == "on":
        num_redis_shards = 1
        # For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports
        # 1-node chain for that shard only.

    # Start the Ray processes.
    ray.init(
        num_cpus=10, num_redis_shards=num_redis_shards, redis_max_memory=10**7)

    yield None

    # The code after the yield will run as teardown code.
    ray.shutdown()
Example #26
def noise(config, experiments, num_cpus, num_gpus, redis_address):
  print("config =", config.name)
  print("num_gpus =", num_gpus)
  print("num_cpus =", num_cpus)
  print("redis_address =", redis_address)

  # Use configuration file location as the project location.
  project_dir = os.path.dirname(config.name)
  project_dir = os.path.abspath(project_dir)
  print("projectDir =", project_dir)

  # Load and parse experiment configurations
  configs = parse_config(config, experiments, globals=globals())

  # Initialize ray cluster
  if redis_address is not None:
    ray.init(redis_address=redis_address, include_webui=True)
  else:
    ray.init(num_cpus=num_cpus, num_gpus=num_gpus, local_mode=num_cpus == 1)

  # FIXME: Update remote function resource usage
  num_gpus = float(num_gpus / num_cpus)
  run_noise_test._num_gpus = num_gpus
  run_noise_test.num_cpus = 1

  # Run experiments
  results = []
  for exp in configs:
    config = configs[exp]
    config["name"] = exp

    # Make sure local directories are relative to the project location
    path = config.get("path", None)
    if path and not os.path.isabs(path):
      config["path"] = os.path.join(project_dir, path)

    data_dir = config.get("data_dir", "data")
    if not os.path.isabs(data_dir):
      config["data_dir"] = os.path.join(project_dir, data_dir)

    # Run each experiment in parallel
    results.append(run_noise_test.remote(config))

  # Wait until all experiments complete
  ray.get(results)
  ray.shutdown()
Example #27
    def Driver(success):
        success.value = True
        # Start driver.
        ray.init(redis_address=redis_address)
        summary_start = StateSummary()
        if (0, 1) != summary_start[:2]:
            success.value = False

        max_attempts_before_failing = 100

        # Two new objects.
        ray.get(ray.put(1111))
        ray.get(ray.put(1111))
        attempts = 0
        while (2, 1, summary_start[2]) != StateSummary():
            time.sleep(0.1)
            attempts += 1
            if attempts == max_attempts_before_failing:
                success.value = False
                break

        @ray.remote
        def f():
            ray.put(1111)  # Yet another object.
            return 1111  # A returned object as well.

        # 1 new function.
        attempts = 0
        while (2, 1, summary_start[2] + 1) != StateSummary():
            time.sleep(0.1)
            attempts += 1
            if attempts == max_attempts_before_failing:
                success.value = False
                break

        ray.get(f.remote())
        attempts = 0
        while (4, 2, summary_start[2] + 1) != StateSummary():
            time.sleep(0.1)
            attempts += 1
            if attempts == max_attempts_before_failing:
                success.value = False
                break

        ray.shutdown()
Example #28
def start_connected_emptyhead_cluster():
    """Starts head with no resources."""

    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 0,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 10
            })
        })
    # Pytest doesn't play nicely with imports
    _register_all()
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Example #29
def ray_start_cluster():
    node_args = {
        "num_cpus": 8,
        "_internal_config": json.dumps({
            "initial_reconstruction_timeout_milliseconds": 1000,
            "num_heartbeats_timeout": 10
        })
    }
    # Start with 4 worker nodes and 8 cores each.
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    workers = []
    for _ in range(4):
        workers.append(cluster.add_node(**node_args))
    cluster.wait_for_nodes()
    yield cluster
    ray.shutdown()
    cluster.shutdown()
Example #30
def ray_start_combination(request):
    num_nodes = request.param[0]
    num_workers_per_scheduler = request.param[1]
    # Start the Ray processes.
    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 10,
            "redis_max_memory": 10**7
        })
    for i in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
    ray.init(redis_address=cluster.redis_address)

    yield num_nodes, num_workers_per_scheduler, cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()
Example #31
    def build_data(self):
        if self.data_ready:
            print("Data is already built~")
            return
        
        persons = ["P%d" % i for i in range(9)]
        gestures = os.listdir(os.path.join(self.path, persons[0]))
        gestures.sort()
        all_paths = [[os.path.join(self.path, person, gesture) for gesture in gestures] for person in persons]
        bin_paths = [[] for i in range(9)]
        joints = [[] for i in range(9)]

        print('loading file list ......')
        with tqdm(total=len(persons) * len(gestures)) as pbar:
            for i, paths in enumerate(all_paths):
                for path in paths:
                    _joints = np.loadtxt(os.path.join(path, 'joint.txt'), skiprows=1)
                    with open(os.path.join(path, 'joint.txt')) as f:
                        samples = int(f.readline())
                    _joints = _joints.reshape((samples, 21, 3))
                    _joints[:, :, 1] = - _joints[:, :, 1]
                    _joints[:, :, 2] = - _joints[:, :, 2]
                    joints[i].append(_joints.reshape((samples, 63)))
                    for j in range(samples):
                        bin_paths[i].append(os.path.join(path, "%06d_depth.bin" % j))
                    pbar.update(1)
                joints[i] = np.concatenate(joints[i], axis=0)

        print('saving test.txt ......')
        for i in range(9):
            super().write_data_txt(os.path.join(self.path, "test_{}.txt".format(i)), list(bin_paths[i]), list(joints[i]))

        print('checking data ......')
        datatexts = []
        traintxts = []
        for i in range(9):
            dataname = os.path.join(self.path, "test_{}.txt".format(i))
            with open(dataname, 'r') as f:
                datatexts.append(f.readlines())
        
        ray.init()
        for i in range(9):
            reporter = Reporter.remote(len(datatexts[i]))
            chunk = len(datatexts[i]) // (os.cpu_count() - 1) + 1
            traintxt = []
            processing = [check_texts.remote(self, datatexts[i][j * chunk : (j + 1) * chunk], reporter) for j in range(os.cpu_count() - 1)]
            for r in ray.get(processing):
                traintxt += r

            traintxts.append(traintxt)
            print('For person {}, {} / {} data can use to train'.format(i, len(traintxt), len(datatexts[i])))
        ray.shutdown()
        
        for i in range(9):
            train_to_write = []
            for j in range(9):
                if i == j:
                    val_to_write = traintxts[j]
                else:
                    train_to_write += traintxts[j]

            with open(os.path.join(self.path, "train_{}.txt".format(i)), 'w') as f:
                f.writelines(train_to_write)

            with open(os.path.join(self.path, "val_{}.txt".format(i)), 'w') as f:
                f.writelines(val_to_write)
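The snippet relies on a Reporter actor and a check_texts remote function defined elsewhere in the project; a minimal sketch of what the progress-reporting actor might look like (an assumption, not the project's actual code):

import ray


@ray.remote
class Reporter:
    """Counts how many samples the check_texts workers have validated so far."""

    def __init__(self, total):
        self.total = total
        self.done = 0

    def update(self, n=1):
        self.done += n
        print("checked {} / {}".format(self.done, self.total))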
Example #32
#!/usr/bin/env python
# Large-scale training of PPO agent using Ray Tune
# Chapter 8, TensorFlow 2 Reinforcement Learning Cookbook | Praveen Palanisamy

import ray
import sys
from ray import tune
from ray.rllib.models import ModelCatalog

if "." not in sys.path:
    sys.path.insert(0, ".")
from custom_model import CustomModel

# Register custom-model in ModelCatalog
ModelCatalog.register_custom_model("CustomCNN", CustomModel)

ray.init()
experiment_analysis = tune.run(
    "PPO",
    config={
        "env": "procgen:procgen-coinrun-v0",
        "num_gpus": 0,
        "num_workers": 2,
        "model": {"custom_model": "CustomCNN"},
        "framework": "tf2",
        "log_level": "INFO",
    },
    local_dir="ray_results",  # store experiment results in this dir
)
ray.shutdown()
Example #33
def test_instance():
    """
    example of SimDeepBoosting
    """
    PATH_DATA = '{0}/../examples/data/'.format(split(abspath(__file__))[0])

    #Input file
    TRAINING_TSV = {'RNA': 'rna_dummy.tsv', 'METH': 'meth_dummy.tsv'}
    SURVIVAL_TSV = 'survival_dummy.tsv'

    PROJECT_NAME = 'TestProject'
    EPOCHS = 10
    SEED = 3
    nb_it = 5
    nb_threads = 2

    # Optional metadata FILE
    OPTIONAL_METADATA = "metadata_dummy.tsv"

    # Import cluster scheduler
    import ray
    ray.init(num_cpus=3)
    # More options can be used (e.g. remote clusters, AWS, memory, etc.)
    # ray can be used locally to maximize the use of CPUs on the local machine
    # See ray API: https://ray.readthedocs.io/en/latest/index.html

    boosting = SimDeepBoosting(
        nb_threads=nb_threads,
        nb_it=nb_it,
        split_n_fold=3,
        survival_tsv=SURVIVAL_TSV,
        training_tsv=TRAINING_TSV,
        path_data=PATH_DATA,
        project_name=PROJECT_NAME,
        path_results=PATH_DATA,
        metadata_tsv=OPTIONAL_METADATA,  # optional
        metadata_usage='all',
        epochs=EPOCHS,
        distribute=True,  # Option to use ray cluster scheduler
        seed=SEED)

    boosting.fit()
    boosting.save_models_classes()
    boosting.save_cv_models_classes()

    boosting.predict_labels_on_full_dataset()

    boosting.compute_clusters_consistency_for_full_labels()
    boosting.evalutate_cluster_performance()
    boosting.collect_cindex_for_test_fold()
    boosting.collect_cindex_for_full_dataset()

    boosting.compute_feature_scores_per_cluster()
    boosting.collect_number_of_features_per_omic()

    boosting.write_feature_score_per_cluster()

    boosting.load_new_test_dataset(
        {
            'RNA': 'rna_dummy.tsv'
        },  # OMIC file of the test set. It doesn't have to be the same as for training
        'dummy',  # Name of the test set to be used
        'survival_dummy.tsv',  # Survival file of the test set
    )

    boosting.predict_labels_on_test_dataset()
    boosting.save_test_models_classes()

    boosting.compute_c_indexes_for_test_dataset()
    boosting.compute_clusters_consistency_for_test_labels()

    # Experimental method to plot the test dataset amongst the class kernel densities
    boosting.plot_supervised_kernel_for_test_sets()
    boosting.plot_supervised_predicted_labels_for_test_sets()

    boosting.load_new_test_dataset(
        {'METH': 'meth_dummy.tsv'},  # OMIC file of the second test set.
        'dummy_METH',  # Name of the second test set
        'survival_dummy.tsv',  # Survival file of the test set (optional)
    )

    boosting.predict_labels_on_test_dataset()
    boosting.compute_c_indexes_for_test_dataset()
    boosting.compute_clusters_consistency_for_test_labels()

    # Experimental method to plot the test dataset amongst the class kernel densities
    boosting.plot_supervised_kernel_for_test_sets()
    boosting.plot_supervised_predicted_labels_for_test_sets()

    # Close clusters and free memory
    ray.shutdown()
Example #34
def experiment():
    ucbstate = None

    if IS_UCB:
        ucbstate = ucb_state(n_params=N_PARAMS)

    os.makedirs(SAVE_DIR, exist_ok=True)

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric=METRIC_NAME,
        mode="max",
        ucb=ucbstate,
        perturbation_interval=PERTUBATION_INTERVAL,
        hyperparam_mutations={
            "dropout": lambda: np.random.uniform(0, 1),
            "lr": lambda: np.random.uniform(0.001, 0.003),
            "batch_size": lambda: random.choice([64, 128, 256, 512])
        })

    # Restart Ray defensively in case the ray connection is lost.
    ray.shutdown()
    # ray.init(log_to_driver=False, local_mode=True)
    ray.init(log_to_driver=False)

    # register_trainable('train_cifar10', Cifar10Model)

    analysis = tune.run(
        Cifar10Model,
        name=EXPERIMENT_NAME,
        scheduler=scheduler,
        # reuse_actors=True,
        checkpoint_freq=20,
        verbose=1,
        stop={
            "training_iteration": TRAINING_ITERATION,
        },
        num_samples=NUM_WORKERS,
        resources_per_trial={"gpu": 1},
        # PBT starts by training many neural networks in parallel with random hyperparameters.
        config={
            "epochs": 1,
            "batch_size": 64,
            # "lr": grid_search([10**-4, 10**-5]),
            "lr": 1e-4,  # 1e-4,
            "decay": sample_from(lambda spec: spec.config.lr / 100.0),
            # "dropout": grid_search([0.25, 0.5]),
            "dropout": 0.5,
            # "num_cpus":32,
            "num_gpus": 1,
        })

    # Plot by wall-clock time
    dfs = analysis.fetch_trial_dataframes()

    ## Save pickle
    with open(f"{SAVE_DIR}/{EXPERIMENT_NAME}_trials.pickle", "wb") as fw:
        pickle.dump(dfs, fw)

    # This plots everything on the same plot
    ax = None
    for d in dfs.values():
        ax = d.plot("training_iteration", METRIC_NAME, ax=ax, legend=False)

    if METRIC_NAME == 'mean_accuracy':
        a = np.asarray([
            list(dfs.values())[i].mean_accuracy.max()
            for i in range(NUM_WORKERS)
        ])
    elif METRIC_NAME == 'episode_reward_mean':
        a = np.asarray([
            list(dfs.values())[i].episode_reward_mean.max()
            for i in range(NUM_WORKERS)
        ])

    topk = heapq.nlargest(3, range(len(a)), a.__getitem__)
    total = 0
    for i in topk:
        total += a[i]
    avg_top_k = total / 3

    plt.xlabel("epoch")
    plt.ylabel("Test Accuracy")
    # plt.show()
    plt.savefig(f'{SAVE_DIR}/{EXPERIMENT_NAME}_accuracy.png')

    if IS_UCB:
        # bar chart
        fig, axs = plt.subplots(1, 2, figsize=(9, 3))
        axs[0].bar(range(len(ucbstate.num_of_selections) - 1),
                   ucbstate.num_of_selections[1:])
        axs[1].bar(range(len(ucbstate.rewards) - 1), ucbstate.rewards[1:])

        print(ucbstate.rewards)
        print(ucbstate.num_of_selections)

        ## Save pickle
        with open(f"{SAVE_DIR}/{EXPERIMENT_NAME}_bandit.pickle", "wb") as fw:
            pickle.dump(ucbstate, fw)

        plt.savefig(f'{SAVE_DIR}/{EXPERIMENT_NAME}_bandit_final.png')
        # plt.show()

    return avg_top_k
Example #35
def main():
    parser = argparse.ArgumentParser(description='GSSP for nlp')
    parser.add_argument('--model', default='resnet', help='model name')
    parser.add_argument('--world-size', default=16, type=int,
                        help='node size in simulation')
    parser.add_argument('--lr', default=0.01, type=float)
    parser.add_argument('--batch-size', default=32, type=int)
    parser.add_argument('--epochs', default=100, type=int, help="train epoch")
    parser.add_argument('--data-dir', default='./data',
                        help='the data directory location')
    parser.add_argument('--stdout', default='./stdout/resnet', help='stdout log dir for subprocess')
    parser.add_argument('--momentum', default=0.9, type=float, help='the momentum of iteration time')
    parser.add_argument('--enc-emb', default=256, type=int, help='Encoder embedding size')
    parser.add_argument('--dec-emb', default=256, type=int, help='Decoder embedding size')
    parser.add_argument('--enc-hid', default=512, type=int, help='Encoder hidden layer size')
    parser.add_argument('--dec-hid', default=512, type=int, help='Decoder hidden layer size')
    parser.add_argument('--enc-drop', default=0.5, type=float, help='Encoder dropout probability ')
    parser.add_argument('--dec-drop', default=0.5, type=float, help='Decoder dropout probability ')
    parser.add_argument('--ps-num', default=4, type=int)
    parser.add_argument('--bounded-delay', default=3, type=int)

    args = parser.parse_args()
    sys.stdout = open(f'{args.stdout}/main_stdout.log', 'a+', 1)
    sys.stderr = open(f'{args.stdout}/main_stdout.log', 'a+', 1)

    dirs = [args.data_dir, args.stdout]
    for d in dirs:
        if not os.path.isdir(d):
            os.mkdir(d, mode=0o755)

    ray.shutdown()
    ray.init(num_gpus=20, ignore_reinit_error=True)
    print('==> ray.init..')

    # get model
    train_data, valid_data, test_data, model, clip_value, PAD_IDX = get_model(args)
    worker_tasks = [Worker.remote(i, args, model, clip_value, PAD_IDX)
                    for i in range(args.world_size)]
    globalps = GlobalPS.remote(model, args, args.ps_num)
    pss = [ParameterServer.remote(args, model, clip_value, PAD_IDX, args.bounded_delay, i) for i in range(args.ps_num)]
    print('==> ps success..')
    for ps in pss:
        ps.init_pss.remote(globalps)


    print('==>worker_tasks..')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.apply(init_weights)

    for worker in worker_tasks:
        worker.compute_gradients.remote(pss)
    i=0
    while i <= 1000:
        i += 1
        time.sleep(40)

    ray.shutdown()
Example #36
def test_job_config_conda_env(conda_envs, shutdown_only):
    for package_version in REQUEST_VERSIONS:
        runtime_env = {"conda": f"package-{package_version}"}
        ray.init(runtime_env=runtime_env)
        assert ray.get(get_requests_version.remote()) == package_version
        ray.shutdown()
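The test assumes a get_requests_version remote function defined elsewhere in the module; a plausible sketch (an assumption) is:

import ray


@ray.remote
def get_requests_version():
    # Import inside the task so the reported version reflects the worker's
    # conda runtime_env rather than the driver's environment.
    import requests
    return requests.__version__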
Example #37
 def tearDown(self):
     ray.shutdown()
     _register_all()  # re-register the evicted objects
Example #38
def shutdown_with_server(server, _exiting_interpreter=False):
    server.stop(1)
    with disable_client_hook():
        ray.shutdown(_exiting_interpreter)
Example #39
def test_cluster_interrupt(start_connected_cluster, tmpdir):
    """Tests run_experiment on cluster shutdown with actual interrupt.

    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)

    # Needs to be in scope for pytest
    class _Mock(tune.Trainable):
        """Finishes on the 4th iteration."""
        def setup(self, config):
            self.state = {"hi": 0}

        def step(self):
            self.state["hi"] += 1
            time.sleep(0.5)
            return {"done": self.state["hi"] >= 4}

        def save_checkpoint(self, path):
            return self.state

        def load_checkpoint(self, state):
            self.state = state

    # Removes indent from class.
    reformatted = "\n".join(line[4:] if len(line) else line
                            for line in inspect.getsource(_Mock).split("\n"))

    script = """
import os
import time
import ray
from ray import tune

os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0"

ray.init(address="{address}")

{fail_class_code}

tune.run(
    {fail_class},
    name="experiment",
    stop=dict(training_iteration=5),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(address=cluster.address,
           checkpoint_dir=dirpath,
           fail_class_code=reformatted,
           fail_class=_Mock.__name__)
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(50):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(resume="LOCAL",
                                 local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration") == 3:
                break
        time.sleep(0.2)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    Experiment.register_if_needed(_Mock)

    # Inspect the internal trialrunner
    runner = TrialRunner(resume="LOCAL",
                         local_checkpoint_dir=local_checkpoint_dir)
    trials = runner.get_trials()
    assert trials[0].last_result["training_iteration"] == 3
    assert trials[0].status == Trial.PENDING

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": _Mock,
                "local_dir": dirpath,
                "checkpoint_freq": 1
            }
        },
        resume=True,
        raise_on_failed_trial=False)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    assert {t.trial_id for t in trials2} == {t.trial_id for t in trials}
    ray.shutdown()
    cluster.shutdown()
Example #40
 def tearDown(self):
     ray.shutdown()
     _register_all()
Example #41
 def tearDownClass(cls):
     ray.shutdown()
Example #42
def shutdown_only():
    yield None
    # The code after the yield will run as teardown code.
    ray.shutdown()
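A brief usage sketch, assuming shutdown_only is a pytest fixture (decorator not shown): the test performs its own ray.init and relies on the fixture only for teardown. The test body is hypothetical:

import ray


def test_init_with_one_cpu(shutdown_only):
    ray.init(num_cpus=1)
    # ray.shutdown() runs in the fixture's teardown even if the assertion fails.
    assert ray.is_initialized()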
Example #43
def ray_shutdown():
    yield
    # The code after the yield will run as teardown code.
    ray.shutdown()
Example #44
def _runSimulations_Parallel(simDefinition,
                             nRuns,
                             outputLists,
                             silent=False,
                             nProcesses=1):
    '''
        Runs a probabilistic simulation several times, then collects and displays average results for common parameters.
        Parallelized using [ray](https://github.com/ray-project/ray)
    '''
    import ray
    runRemoteSimulation = ray.remote(runSimulation)
    runRemoteSimulation.options(num_returns=2)

    landingLocations, apogees, maxSpeeds, flightTimes, maxHorizontalVels, flights = outputLists
    resultsToOutput = simDefinition.getValue("MonteCarlo.output")

    def postProcess(rayObject):
        ''' Gets sim results from worker, appends results to outputLists '''
        # Get sim results
        stagePaths, logPaths = ray.get(rayObject)

        # Save results from the top stage
        flight = stagePaths[0]

        landingLocations.append(flight.getLandingLocation())
        apogees.append(flight.getApogee())
        maxSpeeds.append(flight.getMaxSpeed())
        flightTimes.append(flight.getFlightTime())
        maxHorizontalVels.append(flight.getMaxHorizontalVel())

        if "flightPaths" in resultsToOutput:
            flight = Plotting._keepNTimeSteps(
                flight, 900
            )  # Limit the number of time steps saved to avoid wasting memory
            flights.append(flight)

    # Create an instance of random to generate random seeds for each copy of sim definition sent to workers
    #NOTE: This means Monte Carlo repeatability does not transfer across single-threaded / parallel sims
    try:
        randomSeed = simDefinition.getValue("MonteCarlo.randomSeed")
    except KeyError:
        randomSeed = random.randrange(int(1e7))
    rng = random.Random(randomSeed)

    ### Run simulations ###
    # TODO: Adapt this to work on a cluster
    # Reminder that ray must be initialized separately on a cluster, before running ray.init()
    # https://docs.ray.io/en/latest/cluster/index.html
    ray.init()

    # Start simulations
    runningJobs = []
    for i in range(nRuns):
        # Don't start more sims than there are processes available
        if i >= nProcesses:
            completedJobs, runningJobs = ray.wait(runningJobs)
            for completedJob in completedJobs:
                # Save results
                postProcess(completedJob)

        # Make sure each copy of simDefinition has a different, but repeatable random seed
        newRandomSeed = rng.randrange(int(1e7))
        simDef = deepcopy(simDefinition)
        simDef.rng = random.Random(newRandomSeed)
        simDef.resampleProbabilisticValues()

        # Start sim
        flightPathsFuture = runRemoteSimulation.remote(simDefinition=simDef,
                                                       silent=True)
        runningJobs.append(flightPathsFuture)

    # Wait for remaining sims to complete
    for remainingJob in runningJobs:
        postProcess(remainingJob)

    ray.shutdown()
Example #45
 def tearDown(self):
     ray.shutdown()
Example #46
 def tearDown(self):
     shutil.rmtree(self.logdir)
     ray.shutdown()
     _register_all()
Example #47
def main():
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # initialize logs, dataset, configuration
    PARAM = parse_args()
    init_dir(PARAM["checkpoint_root"])
    dataset_path = Path.cwd() / Path(PARAM["dataset"])

    CONFIG = JokeRec.prep_config(dataset_path,
                                 k_clusters=PARAM["k_clusters"],
                                 debug=PARAM["debug"],
                                 verbose=PARAM["verbose"])

    # measure the baseline performance of a naive agent
    if PARAM["debug"]:
        pdb.set_trace()

    baseline = JokeRec.measure_baseline(
        CONFIG["env_config"],
        n_iter=PARAM["baseline_iter"],
        naive=True,
        verbose=PARAM["verbose"],
    )

    print("BASELINE CUMULATIVE REWARD", round(baseline, 3), "\n")

    # restart Ray, register our environment, and create an agent
    ray.init(ignore_reinit_error=True)

    env_key = "JokeRec-v0"
    register_env(env_key, lambda config_env: JokeRec(config_env))
    AGENT = ppo.PPOTrainer(CONFIG, env=env_key)

    # use RLlib to train a policy using PPO
    df = pd.DataFrame(columns=[
        "min_reward", "avg_reward", "max_reward", "steps", "checkpoint"
    ])
    status = "reward {:6.2f} {:6.2f} {:6.2f}  len {:4.2f}  saved {}"

    for i in range(PARAM["train_iter"]):
        result = AGENT.train()

        checkpoint_file = AGENT.save(PARAM["checkpoint_root"])
        row = [
            result["episode_reward_min"],
            result["episode_reward_mean"],
            result["episode_reward_max"],
            result["episode_len_mean"],
            checkpoint_file,
        ]

        df.loc[len(df)] = row
        print(status.format(*row))

    best_checkpoint = get_best_checkpoint(df)
    print("\n", "BEST CHECKPOINT:", best_checkpoint, "\n")

    # apply the trained policy in a rollout
    AGENT.restore(best_checkpoint)

    if PARAM["rollout_dataset"]:
        CONFIG["env_config"]["dataset"] = Path.cwd() / Path(
            PARAM["rollout_dataset"])

    JokeRec.run_rollout(AGENT,
                        JokeRec(CONFIG["env_config"]),
                        PARAM["rollout_iter"],
                        verbose=PARAM["verbose"])

    # examine the trained policy
    policy = AGENT.get_policy()
    model = policy.model
    print("\n", model.base_model.summary())

    # shutdown gracefully, kthxbai
    ray.shutdown()
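The script depends on a get_best_checkpoint helper that is not shown; a plausible sketch (an assumption) that picks the checkpoint from the row with the highest mean episode reward:

def get_best_checkpoint(df):
    # df carries the columns built above: min_reward, avg_reward, max_reward,
    # steps, checkpoint.
    best_row = df.loc[df["avg_reward"].idxmax()]
    return best_row["checkpoint"]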
Example #48
 def tearDown(self):
     del NODE_PROVIDERS["mock"]
     shutil.rmtree(self.tmpdir)
     ray.shutdown()
Example #49
def start_ray():
    ray.init()
    _register_all()
    yield
    ray.shutdown()
Example #50
def test_placement_group_reschedule_when_node_dead(ray_start_cluster):
    @ray.remote(num_cpus=1)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Make sure both head and worker node are alive.
    nodes = ray.nodes()
    assert len(nodes) == 3
    assert nodes[0]["alive"] and nodes[1]["alive"] and nodes[2]["alive"]

    placement_group = ray.util.placement_group(
        name="name",
        strategy="SPREAD",
        bundles=[{
            "CPU": 2
        }, {
            "CPU": 2
        }, {
            "CPU": 2
        }])
    actor_1 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=0,
        lifetime="detached").remote()
    actor_2 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=1,
        lifetime="detached").remote()
    actor_3 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=2,
        lifetime="detached").remote()
    ray.get(actor_1.value.remote())
    ray.get(actor_2.value.remote())
    ray.get(actor_3.value.remote())

    cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
    cluster.wait_for_nodes()

    actor_4 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=0,
        lifetime="detached").remote()
    actor_5 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=1,
        lifetime="detached").remote()
    actor_6 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=2,
        lifetime="detached").remote()
    ray.get(actor_4.value.remote())
    ray.get(actor_5.value.remote())
    ray.get(actor_6.value.remote())
    ray.shutdown()
Example #51
def run_learning_tests_from_yaml(
    yaml_files: List[str],
    *,
    max_num_repeats: int = 2,
    smoke_test: bool = False,
) -> Dict[str, Any]:
    """Runs the given experiments in yaml_files and returns results dict.

    Args:
        yaml_files (List[str]): List of yaml file names.
        max_num_repeats (int): How many times should we repeat a failed
            experiment?
        smoke_test (bool): Whether this is just a smoke-test. If True,
            set time_total_s to 5min and don't early out due to rewards
            or timesteps reached.
    """
    print("Will run the following yaml files:")
    for yaml_file in yaml_files:
        print("->", yaml_file)

    # All trials we'll ever run in this test script.
    all_trials = []
    # The experiments (by name) we'll run up to `max_num_repeats` times.
    experiments = {}
    # The results per experiment.
    checks = {}
    # Metrics per experiment.
    stats = {}

    start_time = time.monotonic()

    def should_check_eval(experiment):
        # If we have evaluation workers, use their rewards.
        # This is useful for offline learning tests, where
        # we evaluate against an actual environment.
        return experiment["config"].get("evaluation_interval", None) is not None

    # Loop through all collected files and gather experiments.
    # Augment all by `torch` framework.
    for yaml_file in yaml_files:
        with open(yaml_file) as f:
            tf_experiments = yaml.safe_load(f)

        # Add torch version of all experiments to the list.
        for k, e in tf_experiments.items():
            # If framework explicitly given, only test for that framework.
            # Some algos do not have both versions available.
            if "frameworks" in e:
                frameworks = e["frameworks"]
            else:
                # By default we don't run tf2, because tf2's multi-gpu support
                # isn't complete yet.
                frameworks = ["tf", "torch"]
            # Pop frameworks key to not confuse Tune.
            e.pop("frameworks", None)

            e["stop"] = e["stop"] if "stop" in e else {}
            e["pass_criteria"] = e["pass_criteria"] if "pass_criteria" in e else {}

            # For smoke-tests, we just run for n min.
            if smoke_test:
                # 0sec for each(!) experiment/trial.
                # This is such that if there are many experiments/trials
                # in a test (e.g. rllib_learning_test), each one can at least
                # create its trainer and run a first iteration.
                e["stop"]["time_total_s"] = 0
            else:
                check_eval = should_check_eval(e)
                episode_reward_key = (
                    "episode_reward_mean"
                    if not check_eval
                    else "evaluation/episode_reward_mean"
                )
                # We also stop early, once we reach the desired reward.
                min_reward = e.get("pass_criteria", {}).get(episode_reward_key)
                if min_reward is not None:
                    e["stop"][episode_reward_key] = min_reward

            # Generate `checks` dict for all experiments
            # (tf, tf2 and/or torch).
            for framework in frameworks:
                k_ = k + "-" + framework
                ec = copy.deepcopy(e)
                ec["config"]["framework"] = framework
                if framework == "tf2":
                    ec["config"]["eager_tracing"] = True

                checks[k_] = {
                    "min_reward": ec["pass_criteria"].get("episode_reward_mean", 0.0),
                    "min_throughput": ec["pass_criteria"].get("timesteps_total", 0.0)
                    / (ec["stop"].get("time_total_s", 1.0) or 1.0),
                    "time_total_s": ec["stop"].get("time_total_s"),
                    "failures": 0,
                    "passed": False,
                }
                # This key would break tune.
                ec.pop("pass_criteria", None)

                # One experiment to run.
                experiments[k_] = ec

    # Print out the actual config.
    print("== Test config ==")
    print(yaml.dump(experiments))

    # Keep track of those experiments we still have to run.
    # If an experiment passes, we'll remove it from this dict.
    experiments_to_run = experiments.copy()

    try:
        ray.init(address="auto")
    except ConnectionError:
        ray.init()

    for i in range(max_num_repeats):
        # We are done.
        if len(experiments_to_run) == 0:
            print("All experiments finished.")
            break

        print(f"Starting learning test iteration {i}...")

        # Run remaining experiments.
        trials = run_experiments(
            experiments_to_run,
            resume=False,
            verbose=2,
            progress_reporter=CLIReporter(
                metric_columns={
                    "training_iteration": "iter",
                    "time_total_s": "time_total_s",
                    "timesteps_total": "ts",
                    "episodes_this_iter": "train_episodes",
                    "episode_reward_mean": "reward_mean",
                    "evaluation/episode_reward_mean": "eval_reward_mean",
                },
                sort_by_metric=True,
                max_report_frequency=30,
            ),
        )

        all_trials.extend(trials)

        # Check each experiment for whether it passed.
        # Criteria is to a) reach reward AND b) to have reached the throughput
        # defined by `timesteps_total` / `time_total_s`.
        for experiment in experiments_to_run.copy():
            print(f"Analyzing experiment {experiment} ...")
            # Collect all trials within this experiment (some experiments may
            # have num_samples or grid_searches defined).
            trials_for_experiment = []
            for t in trials:
                trial_exp = re.sub(".+/([^/]+)$", "\\1", t.local_dir)
                if trial_exp == experiment:
                    trials_for_experiment.append(t)
            print(f" ... Trials: {trials_for_experiment}.")

            check_eval = should_check_eval(experiments[experiment])

            # Error: Increase failure count and repeat.
            if any(t.status == "ERROR" for t in trials_for_experiment):
                print(" ... ERROR.")
                checks[experiment]["failures"] += 1
            # Smoke-tests always succeed.
            elif smoke_test:
                print(" ... SMOKE TEST (mark ok).")
                checks[experiment]["passed"] = True
                del experiments_to_run[experiment]
            # Experiment finished: Check reward achieved and timesteps done
            # (throughput).
            else:
                if check_eval:
                    episode_reward_mean = np.mean(
                        [
                            t.last_result["evaluation"]["episode_reward_mean"]
                            for t in trials_for_experiment
                        ]
                    )
                else:
                    episode_reward_mean = np.mean(
                        [
                            t.last_result["episode_reward_mean"]
                            for t in trials_for_experiment
                        ]
                    )
                desired_reward = checks[experiment]["min_reward"]

                timesteps_total = np.mean(
                    [t.last_result["timesteps_total"] for t in trials_for_experiment]
                )
                total_time_s = np.mean(
                    [t.last_result["time_total_s"] for t in trials_for_experiment]
                )

                # TODO(jungong) : track trainer and env throughput separately.
                throughput = timesteps_total / (total_time_s or 1.0)
                # TODO(jungong) : enable throughput check again after
                #   TD3_HalfCheetahBulletEnv is fixed and verified.
                # desired_throughput = checks[experiment]["min_throughput"]
                desired_throughput = None

                # Record performance.
                stats[experiment] = {
                    "episode_reward_mean": float(episode_reward_mean),
                    "throughput": (
                        float(throughput) if throughput is not None else 0.0
                    ),
                }

                print(
                    f" ... Desired reward={desired_reward}; "
                    f"desired throughput={desired_throughput}"
                )

                # We failed to reach desired reward or the desired throughput.
                if (desired_reward and episode_reward_mean < desired_reward) or (
                    desired_throughput and throughput < desired_throughput
                ):
                    print(
                        " ... Not successful: Actual "
                        f"reward={episode_reward_mean}; "
                        f"actual throughput={throughput}"
                    )
                    checks[experiment]["failures"] += 1
                # We succeeded!
                else:
                    print(" ... Successful: (mark ok).")
                    checks[experiment]["passed"] = True
                    del experiments_to_run[experiment]

    ray.shutdown()

    time_taken = time.monotonic() - start_time

    # Create results dict and write it to disk.
    result = {
        "time_taken": float(time_taken),
        "trial_states": dict(Counter([trial.status for trial in all_trials])),
        "last_update": float(time.time()),
        "stats": stats,
        "passed": [k for k, exp in checks.items() if exp["passed"]],
        "failures": {
            k: exp["failures"] for k, exp in checks.items() if exp["failures"] > 0
        },
    }

    return result
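A hedged usage sketch (the YAML file name and output path are hypothetical):

if __name__ == "__main__":
    import json

    results = run_learning_tests_from_yaml(
        ["cartpole-ppo.yaml"], smoke_test=True)
    with open("/tmp/learning_test_results.json", "w") as f:
        json.dump(results, f, indent=2)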
Example #52
    def build_data(self):
        if self.data_ready:
            print("Data is already built~")
            return

        dataset = self.dataset

        if not os.path.exists(os.path.join(self.path, "test.txt")):
            self.dataset = 'test'
            print("building test.txt ...")
            test_set = []
            with open(os.path.join(self.path, "Testing", "test_seq_1.txt"), "r") as f:
                lines = f.readlines()
            test_line = [line.strip() for line in lines if not line == "\n"]
            for line in test_line:
                words = line.split()
                path = words[0]
                words = words[1:]
                name = os.path.join(self.path, "Testing", "Depth", path)
                words = [name] + words
                test_set.append(" ".join(words))

            with open(os.path.join(self.path, "Testing", "test_seq_2.txt"), "r") as f:
                lines = f.readlines()
            test_line = [line.strip() for line in lines if not line == "\n"]
            for line in test_line:
                words = line.split()
                path = words[0]
                words = words[1:]
                name = os.path.join(self.path, "Testing", "Depth", path)
                words = [name] + words
                test_set.append(" ".join(words))

            print("saving test.txt ...")
            with open(os.path.join(self.path, "test.txt"), 'w') as f:
                f.write("\n".join(test_set))

            with open(os.path.join(self.path, "val.txt"), 'w') as f:
                f.write("\n".join(test_set))

        if not os.path.exists(os.path.join(self.path, "train.txt")):
            self.dataset = 'train'
            print("building train.txt ...")

            datatexts = []
            with open(os.path.join(self.path, "Training", "labels.txt"), 'r') as f:
                train_line = f.readlines()

            for line in train_line:
                words = line.split()
                path = words[0]
                words = words[1:]
                if len(path.split('/')) > 2:
                    # this is the augmented data
                    continue
                name = os.path.join(self.path, "Training", "Depth", path)
                words = [name] + words
                datatexts.append(" ".join(words))

            print('checking data ......')
            ray.init()
            reporter = Reporter.remote(len(datatexts))
            chunk = len(datatexts) // (os.cpu_count() - 1) + 1
            traintxt = []
            processing = [check_texts.remote(self, datatexts[i * chunk : (i + 1) * chunk], reporter) for i in range(os.cpu_count() - 1)]
            for r in ray.get(processing):
                traintxt += r
            ray.shutdown()
            
            print('{} / {} data can use to train'.format(len(traintxt), len(datatexts)))

            with open(os.path.join(self.path, "train.txt"), 'w') as f:
                f.write("\n".join(traintxt))

        self.dataset = dataset
Example #53
File: train.py Project: rlan/ray
def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "checkpoint_at_end": args.checkpoint_at_end,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    # Ray UI.
    if args.no_ray_ui:
        deprecation_warning(old="--no-ray-ui", new="--ray-ui", error=False)
        args.ray_ui = False

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        input_ = exp.get("config", {}).get("input")

        if input_ and input_ != "sampler":
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent

            def patch_path(path):
                if isinstance(path, list):
                    return [patch_path(i) for i in path]
                elif isinstance(path, dict):
                    return {
                        patch_path(k): patch_path(v)
                        for k, v in path.items()
                    }
                elif isinstance(path, str):
                    if os.path.exists(path):
                        return path
                    else:
                        abs_path = str(rllib_dir.absolute().joinpath(path))
                        return abs_path if os.path.exists(abs_path) else path
                else:
                    return path

            exp["config"]["input"] = patch_path(input_)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

        if args.torch:
            deprecation_warning("--torch", "--framework=torch")
            exp["config"]["framework"] = "torch"
        elif args.eager:
            deprecation_warning("--eager", "--framework=[tf2|tfe]")
            exp["config"]["framework"] = "tfe"
        elif args.framework is not None:
            exp["config"]["framework"] = args.framework

        if args.trace:
            if exp["config"]["framework"] not in ["tf2", "tfe"]:
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 3  # Print details on trial result
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3  # Print details on trial result

    if args.ray_num_nodes:
        # Import this only here so that train.py also works with
        # older versions (and user doesn't use `--ray-num-nodes`).
        from ray.cluster_utils import Cluster
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(num_cpus=args.ray_num_cpus or 1,
                             num_gpus=args.ray_num_gpus or 0,
                             object_store_memory=args.ray_object_store_memory)
        ray.init(address=cluster.address)
    else:
        ray.init(include_dashboard=args.ray_ui,
                 address=args.ray_address,
                 object_store_memory=args.ray_object_store_memory,
                 num_cpus=args.ray_num_cpus,
                 num_gpus=args.ray_num_gpus,
                 local_mode=args.local_mode)

    if IS_NOTEBOOK:
        progress_reporter = JupyterNotebookReporter(
            overwrite=verbose >= 3, print_intermediate_tables=verbose >= 1)
    else:
        progress_reporter = CLIReporter(print_intermediate_tables=verbose >= 1)

    run_experiments(experiments,
                    scheduler=create_scheduler(args.scheduler,
                                               **args.scheduler_config),
                    resume=args.resume,
                    queue_trials=args.queue_trials,
                    verbose=verbose,
                    progress_reporter=progress_reporter,
                    concurrent=True)

    ray.shutdown()
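
For context, run() takes parsed arguments plus the parser itself, so in the original train.py it is presumably driven from a small entrypoint. A hedged sketch of that wiring, assuming a create_parser() helper exists alongside run() (it is not shown in this excerpt):

if __name__ == "__main__":
    parser = create_parser()  # assumed helper that builds the argparse parser
    args = parser.parse_args()
    run(args, parser)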
Ejemplo n.º 54
0
    def test_creates_new_edges_instance_inverse(self):
        """Tests the creates_new_edges method when applied to a kg with instance-based construction with inverse
        relations."""

        self.kg_instance2.reverse_relation_processor()
        # make sure that kg is empty
        self.kg_instance2.graph = Graph().parse(
            self.dir_loc + '/ontologies/so_with_imports.owl')
        # initialize metadata class
        meta = Metadata(self.kg_instance2.kg_version,
                        self.kg_instance2.write_location,
                        self.kg_instance2.full_kg, self.kg_instance2.node_data,
                        self.kg_instance2.node_dict)
        if self.kg_instance2.node_data:
            meta.metadata_processor()
            meta.extract_metadata(self.kg_instance2.graph)
        # create graph subsets
        self.kg_instance2.graph, annotation_triples = splits_knowledge_graph(
            self.kg_instance2.graph)
        full_kg_owl = '_'.join(
            self.kg_instance2.full_kg.split('_')[0:-1]) + '_OWL.owl'
        annot = full_kg_owl[:-4] + '_AnnotationsOnly.nt'
        full = full_kg_owl[:-4] + '.nt'
        appends_to_existing_file(annotation_triples,
                                 self.kg_instance2.write_location + annot, ' ')
        clean_graph = updates_pkt_namespace_identifiers(
            self.kg_instance2.graph, self.kg_instance2.construct_approach)

        # test method
        shutil.copy(self.kg_instance2.write_location + annot,
                    self.kg_instance2.write_location + full)
        appends_to_existing_file(set(self.kg_instance2.graph),
                                 self.kg_instance2.write_location + full, ' ')
        # check that edges were added to the graph
        args = {
            'construction': self.kg_instance2.construct_approach,
            'edge_dict': self.kg_instance2.edge_dict,
            'kg_owl': full_kg_owl,
            'rel_dict': self.kg_instance2.relations_dict,
            'metadata': meta.creates_node_metadata,
            'inverse_dict': self.kg_instance2.inverse_relations_dict,
            'node_data': self.kg_instance2.node_data,
            'ont_cls': self.kg_instance2.ont_classes,
            'obj_props': self.kg_instance2.obj_properties,
            'write_loc': self.kg_instance2.write_location
        }
        edges = [x for x in self.kg_instance2.edge_dict.keys()]
        ray.init(local_mode=True, ignore_reinit_error=True)
        actors = [
            ray.remote(self.kg_instance2.EdgeConstructor).remote(args)
            for _ in range(self.kg_instance2.cpus)
        ]
        for i in range(0, len(edges)):
            actors[i % self.kg_instance2.cpus].creates_new_edges.remote(
                edges[i])
        res = ray.get([x.graph_getter.remote() for x in actors])
        g1 = [self.kg_instance2.graph] + [x[0] for x in res]
        g2 = [clean_graph] + [x[1] for x in res]
        error_dicts = dict(
            ChainMap(*ray.get([x.error_dict_getter.remote() for x in actors])))
        del actors
        ray.shutdown()

        # check that edges were added to the graph
        graph1 = set(x for y in [set(x) for x in g1] for x in y)
        graph2 = set(x for y in [set(x) for x in g2] for x in y)
        self.assertEqual(len(graph1), 9707)
        self.assertEqual(len(graph2), 9687)
        self.assertIsInstance(error_dicts, Dict)
        # check graph files were saved
        f_name = full_kg_owl[:-4] + '_AnnotationsOnly.nt'
        self.assertTrue(
            os.path.exists(self.kg_instance2.write_location + f_name))
        f_name = full_kg_owl[:-4] + '.nt'
        self.assertTrue(
            os.path.exists(self.kg_instance2.write_location + f_name))

        return None
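
The core Ray pattern in the test above is wrapping an ordinary class as an actor at call time with ray.remote(cls), round-robin dispatching work over the resulting actor pool, and collecting the per-actor results with ray.get. A stripped-down sketch of that pattern follows; the class and method names are illustrative, not taken from the project.

import ray


class Worker:
    def __init__(self, tag):
        self.tag = tag
        self.processed = []

    def process(self, item):
        # Accumulate work inside the actor; results are pulled out later.
        self.processed.append((self.tag, item))

    def results_getter(self):
        return self.processed


ray.init(local_mode=True, ignore_reinit_error=True)
num_actors = 4
actors = [ray.remote(Worker).remote("w{}".format(i)) for i in range(num_actors)]
for i, item in enumerate(range(20)):
    actors[i % num_actors].process.remote(item)
# Gather one result list per actor and merge them.
merged = [entry
          for chunk in ray.get([a.results_getter.remote() for a in actors])
          for entry in chunk]
del actors
ray.shutdown()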
Ejemplo n.º 55
0
def test_instance():
    """
    example of SimDeepBoosting
    """
    PATH_DATA = '{0}/../examples/data/'.format(split(abspath(__file__))[0])

    #Input file
    TRAINING_TSV = {'RNA': 'rna_dummy.tsv', 'METH': 'meth_dummy.tsv'}
    SURVIVAL_TSV = 'survival_dummy.tsv'

    PROJECT_NAME = 'TestProjectTuning'
    nb_threads = 2  # Number of processes to be used to fit individual survival models

    ### Below are examples of parameters that can be parsed in the hyperparameter tuning

    ################ AUTOENCODER PARAMETERS ################
    # LEVEL_DIMS_IN = [250]
    # LEVEL_DIMS_OUT = [250]
    # LOSS = 'binary_crossentropy'
    # OPTIMIZER = 'adam'
    # ACT_REG = 0
    # W_REG = 0
    # DROPOUT = 0.5
    # DATA_SPLIT = 0
    # ACTIVATION = 'tanh'
    #########################################################

    ################ ADDITIONAL PARAMETERS ##################
    # stack_multi_omic=STACK_MULTI_OMIC,
    # level_dims_in=LEVEL_DIMS_IN,
    # level_dims_out=LEVEL_DIMS_OUT,
    # loss=LOSS,
    # optimizer=OPTIMIZER,
    # act_reg=ACT_REG,
    # w_reg=W_REG,
    # dropout=DROPOUT,
    # data_split=DATA_SPLIT,
    # activation=ACTIVATION,
    # path_to_save_model=PATH_TO_SAVE_MODEL,
    # pvalue_threshold=PVALUE_THRESHOLD,
    # nb_selected_features=NB_SELECTED_FEATURES,
    # pvalue_threshold = 0.01
    # nb_selected_features = 10
    # stack_multi_omic = False
    # use_autoencoders = True
    # feature_surv_analysis = True
    #########################################################

    # ray.init(num_cpus=3)

    # AgglomerativeClustering is an external class that can be used as
    # a clustering algorithm since it has a fit_predict method
    from sklearn.cluster import AgglomerativeClustering

    args_to_optimize = {
        'seed': [100, 200, 300, 400],
        # 'nb_clusters': [2, 5],
        'cluster_method': ['mixture', 'coxPH', AgglomerativeClustering],
        'normalization': ['default', 'alternative']
        # 'use_autoencoders': (True, False),
        # 'class_selection': ('mean', 'max'),
    }

    # Different normalisations can be tested
    from sklearn.preprocessing import RobustScaler
    # An external normalisation class can be used
    # it requires the class to have fit and fit_transform method

    normalization = {
        'default': {
            'NB_FEATURES_TO_KEEP': 100,
            'TRAIN_RANK_NORM': True,
            'TRAIN_CORR_REDUCTION': True,
            'TRAIN_CORR_RANK_NORM': True,
        },
        'alternative': {
            'CUSTOM': RobustScaler,
        }
    }
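
    # A hedged illustration of the "external normalisation class" mentioned above:
    # any object exposing fit and fit_transform can be plugged in as 'CUSTOM',
    # e.g. (hypothetical, not part of the project):
    #
    # class IdentityScaler:
    #     def fit(self, X, y=None):
    #         return self
    #
    #     def fit_transform(self, X, y=None):
    #         return X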

    tuning = SimDeepTuning(
        args_to_optimize=args_to_optimize,
        nb_threads=nb_threads,
        survival_tsv=SURVIVAL_TSV,
        training_tsv=TRAINING_TSV,
        path_data=PATH_DATA,
        project_name=PROJECT_NAME,
        path_results=PATH_DATA,
        normalization=normalization,
    )

    # Possible metrics for evaluating training set: {
    #              "cluster_consistency",
    #              "full_pvalue",
    #              "sum_log_pval",
    #              "test_fold_cindex",
    #              "test_fold_pval",
    #              "mix_score",
    #     }

    ray.init()
    tuning.fit(
        metric='log_test_fold_pvalue',
        num_samples=8,
        max_concurrent=4,
        distribute_deepprog=True,
        # iterations is useful to take into account the DL parameter-fitting variations
        iterations=1)

    table = tuning.get_results_table()
    tuning.save_results_table()

    ray.shutdown()
Ejemplo n.º 56
0
def ray_8_cpus():
    ray.init(num_cpus=8)
    yield
    # The code after the yield will run as teardown code.
    ray.shutdown()
Ejemplo n.º 57
0
def handler(sig, frame):
    if ray.is_initialized():
        ray.shutdown()
        print('ray has been shut down by SIGINT')
    sys.exit(0)
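
The handler above only takes effect once it is installed for SIGINT; that registration is not part of the snippet, but it would typically be a single call like this:

import signal

signal.signal(signal.SIGINT, handler)  # install the Ctrl-C handler defined above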
Ejemplo n.º 58
0
    def build_data(self):
        if self.data_ready:
            print("Data is Already build~")
            return

        dataset = self.dataset

        if not os.path.exists(os.path.join(self.path, "train.txt")):
            self.dataset = 'train'
            print("building train.txt ...")
            mat = loadmat(os.path.join(self.path, "train", "joint_data.mat"))
            datatexts = []
            for j in range(1):
                uvds = mat['joint_uvd'][j]
                for i in range(uvds.shape[0]):
                    uvd = uvds[i]
                    png = "depth_%d_%07d.png" % (j+1, i+1)
                    filename = os.path.join(self.path, "train", png)
                    # center = np.mean(uvd, axis=0, keepdims=True)
                    uvd = uvd[self.index]
                    # uvd = np.concatenate([uvd, center], axis=0)
                    uvd = uvd.reshape((-1,))
                    words = [str(uvd[j]) for j in range(uvd.shape[0])]
                    words = [filename] + words
                    datatexts.append(" ".join(words))

            print('checking data ......')

            ray.init()
            reporter = Reporter.remote(len(datatexts))
            chunk = len(datatexts) // (os.cpu_count() - 1) + 1
            traintxt = []
            processing = [check_texts.remote(self, datatexts[i * chunk : (i + 1) * chunk], reporter) for i in range(os.cpu_count() - 1)]
            for r in ray.get(processing):
                traintxt += r
            ray.shutdown()
            
            print('{} / {} data can be used to train'.format(len(traintxt), len(datatexts)))

            with open(os.path.join(self.path, "train.txt"), 'w') as f:
                f.write("\n".join(traintxt))

        if not os.path.exists(os.path.join(self.path, "test.txt")):
            self.dataset = 'test'
            test_set = []
            mat = loadmat(os.path.join(self.path, "test", "joint_data.mat"))
            uvds = mat['joint_uvd'][0]
            for i in range(uvds.shape[0]):
                uvd = uvds[i]
                png = "depth_1_%07d.png" % (i+1)
                filename = os.path.join(self.path, "test", png)
                # center = np.mean(uvd, axis=0, keepdims=True)
                uvd = uvd[self.index]
                # uvd = np.concatenate([uvd, center], axis=0)
                uvd = uvd.reshape((-1,))
                words = [str(uvd[j]) for j in range(uvd.shape[0])]
                words = [filename] + words
                test_set.append(" ".join(words))

            with open(os.path.join(self.path, "test.txt"), 'w') as f:
                f.write("\n".join(test_set))

            print('checking data ......')

            ray.init()
            reporter = Reporter.remote(len(test_set))
            chunk = len(test_set) // (os.cpu_count() - 1) + 1
            valtxt = []
            processing = [check_texts.remote(self, test_set[i * chunk : (i + 1) * chunk], reporter) for i in range(os.cpu_count() - 1)]
            for r in ray.get(processing):
                valtxt += r
            ray.shutdown()
            
            print('{} / {} data can be used as validation'.format(len(valtxt), len(test_set)))

            with open(os.path.join(self.path, "val.txt"), 'w') as f:
                f.write("\n".join(valtxt))

        self.dataset = dataset
Ejemplo n.º 59
0
def shutdown_only_with_initialization_check():
    yield None
    # The code after the yield will run as teardown code.
    ray.shutdown()
    assert not ray.is_initialized()
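
As with the other pytest fixtures in this collection, a test consumes this fixture simply by naming it as an argument; a hedged usage sketch (the test name and body are illustrative):

def test_init_then_clean_shutdown(shutdown_only_with_initialization_check):
    ray.init(num_cpus=1)
    assert ray.is_initialized()
    # After the test returns, the fixture's teardown calls ray.shutdown()
    # and asserts that Ray is no longer initialized.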
Ejemplo n.º 60
0
    def tearDown(self):
        self.provider = None
        del _NODE_PROVIDERS["mock"]
        _clear_provider_cache()
        shutil.rmtree(self.tmpdir)
        ray.shutdown()