def router():
    # We need at least 5 workers so resources won't be oversubscribed.
    ray.init(num_cpus=5)

    # The following two blobs are equivalent
    #
    # handle = DeadlineAwareRouter.remote("DefaultTestRouter")
    # ray.experimental.register_actor("DefaultTestRouter", handle)
    # handle.start.remote()
    #
    # handle = start_router(DeadlineAwareRouter, "DefaultRouter")

    handle = start_router(DeadlineAwareRouter, "DefaultRouter")

    handle.register_actor.remote(
        "VAdder", VectorizedAdder,
        init_kwargs={"scaler_increment": 1})  # init args
    handle.register_actor.remote(
        "SAdder", ScalerAdder, init_kwargs={"scaler_increment": 2})
    handle.register_actor.remote(
        "SleepFirst", SleepOnFirst, init_kwargs={"sleep_time": 1})
    handle.register_actor.remote(
        "SleepCounter", SleepCounter, max_batch_size=1)

    yield handle
    ray.shutdown()

def ray_start_object_store_memory():
    # Start the Ray processes.
    store_size = 10**6
    ray.init(num_cpus=1, object_store_memory=store_size)
    yield None
    # The code after the yield will run as teardown code.
    ray.shutdown()

def start_connected_cluster():
    # Start the Ray processes.
    cluster = _start_new_cluster()
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def ray_start_reconstruction(request):
    num_nodes = request.param

    plasma_store_memory = int(0.5 * 10**9)

    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 1,
            "object_store_memory": plasma_store_memory // num_nodes,
            "redis_max_memory": 10**7,
            "_internal_config": json.dumps({
                "initial_reconstruction_timeout_milliseconds": 200
            })
        })
    for i in range(num_nodes - 1):
        cluster.add_node(
            num_cpus=1,
            object_store_memory=plasma_store_memory // num_nodes,
            _internal_config=json.dumps({
                "initial_reconstruction_timeout_milliseconds": 200
            }))
    ray.init(redis_address=cluster.redis_address)

    yield plasma_store_memory, num_nodes, cluster

    # Clean up the Ray cluster.
    ray.shutdown()
    cluster.shutdown()

def init():
    ray.init(num_cpus=4)
    async_api.init()
    asyncio.get_event_loop().set_debug(False)
    yield
    async_api.shutdown()
    ray.shutdown()

def ray_start_driver_put_errors():
    plasma_store_memory = 10**9
    # Start the Ray processes.
    ray.init(num_cpus=1, object_store_memory=plasma_store_memory)
    yield plasma_store_memory
    # The code after the yield will run as teardown code.
    ray.shutdown()

def ray_start_empty_cluster():
    cluster = Cluster()
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def ray_start_cluster():
    num_nodes = 5
    cluster = create_cluster(num_nodes)
    yield cluster, num_nodes
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def test_cluster_rllib_restore(start_connected_cluster, tmpdir):
    cluster = start_connected_cluster
    dirpath = str(tmpdir)
    script = """
import time
import ray
from ray import tune

ray.init(redis_address="{redis_address}")

kwargs = dict(
    run="PG",
    env="CartPole-v1",
    stop=dict(training_iteration=10),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1)

tune.run_experiments(
    dict(experiment=kwargs),
    raise_on_failed_trial=False)
""".format(
        redis_address=cluster.redis_address, checkpoint_dir=dirpath)
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    metadata_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(100):
        if TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner.restore(metadata_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration"):
                break
        time.sleep(0.3)

    if not TrialRunner.checkpoint_exists(metadata_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    cluster.wait_for_nodes()

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": "PG",
                "checkpoint_freq": 1,
                "local_dir": dirpath
            }
        },
        resume=True)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    cluster.shutdown()

def test_temp_plasma_store_socket():
    ray.init(plasma_store_socket_name="/tmp/i_am_a_temp_socket")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket"), "Specified socket path not found."
    ray.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket")
    except Exception:
        pass

def ray_start():
    # Start ray instance
    ray.init(num_cpus=1)
    # Run test using this fixture
    yield None
    # Shutdown ray instance
    ray.shutdown()

def tearDown(self):
    print("Tearing down....")
    try:
        self.runner._server.shutdown()
        self.runner = None
    except Exception as e:
        print(e)
    ray.shutdown()
    _register_all()

def ray_start_regular():
    for module in [
            ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
    ]:
        reload(module)
    # Start the Ray processes.
    ray.init(num_cpus=2)
    yield None
    # The code after the yield will run as teardown code.
    ray.shutdown()

def main(config, experiments, num_cpus, num_gpus, redis_address):
    print("config =", config.name)
    print("experiments =", experiments)
    print("num_gpus =", num_gpus)
    print("num_cpus =", num_cpus)
    print("redis_address =", redis_address)

    # Use configuration file location as the project location.
    projectDir = os.path.dirname(config.name)
    projectDir = os.path.abspath(projectDir)
    print("projectDir =", projectDir)

    # Load and parse experiment configurations
    configs = parse_config(config, experiments, globals=globals())

    # Pre-download dataset
    data_dir = os.path.join(projectDir, "data")
    datasets.CIFAR10(data_dir, download=True, train=True)

    # Initialize ray cluster
    if redis_address is not None:
        ray.init(redis_address=redis_address, include_webui=True)
        num_cpus = 1
    else:
        ray.init(num_cpus=num_cpus, num_gpus=num_gpus,
                 local_mode=num_cpus == 1)

    # Run all experiments in parallel
    results = []
    for exp in configs:
        config = configs[exp]
        config["name"] = exp

        # Make sure local directories are relative to the project location
        path = config.get("path", None)
        if path and not os.path.isabs(path):
            config["path"] = os.path.join(projectDir, path)

        data_dir = config.get("data_dir", "data")
        if not os.path.isabs(data_dir):
            config["data_dir"] = os.path.join(projectDir, data_dir)

        # When running multiple hyperparameter searches on different
        # experiments, ray.tune will run one experiment at a time. We use
        # "ray.remote" to run each tune experiment in parallel as a "remote"
        # function and wait until all experiments complete.
        results.append(
            run_experiment.remote(config, MobileNetTune,
                                  num_cpus=1,
                                  num_gpus=num_gpus / num_cpus))

    # Wait for all experiments to complete
    ray.get(results)

    ray.shutdown()

def ray_gdb_start():
    # Setup environment and start ray
    _environ = os.environ.copy()
    for process_name in ["RAYLET", "PLASMA_STORE"]:
        os.environ["RAY_{}_GDB".format(process_name)] = "1"
        os.environ["RAY_{}_TMUX".format(process_name)] = "1"

    yield None

    # Restore original environment and stop ray
    os.environ.clear()
    os.environ.update(_environ)
    ray.shutdown()

def ray_start_workers_separate_multinode(request):
    num_nodes = request.param[0]
    num_initial_workers = request.param[1]
    # Start the Ray processes.
    cluster = Cluster()
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_initial_workers)
    ray.init(redis_address=cluster.redis_address)

    yield num_nodes, num_initial_workers
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def ray_start_head_local():
    # Start the Ray processes on this machine.
    run_and_get_output([
        "ray", "start", "--head", "--node-ip-address=localhost",
        "--redis-port=6379"
    ])

    yield None

    # Disconnect from the Ray cluster.
    ray.shutdown()
    # Kill the Ray cluster.
    subprocess.Popen(["ray", "stop"]).wait()

def cluster_start():
    # Start the Ray processes.
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 1,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 10
            })
        })
    yield cluster
    ray.shutdown()
    cluster.shutdown()

def test_driver_lives_sequential():
    ray.worker.init()
    all_processes = ray.services.all_processes
    processes = (all_processes[ray.services.PROCESS_TYPE_PLASMA_STORE] +
                 all_processes[ray.services.PROCESS_TYPE_RAYLET])

    # Kill all the components sequentially.
    for process in processes:
        process.terminate()
        time.sleep(0.1)
        process.kill()
        process.wait()

    ray.shutdown()

def start_connected_longer_cluster():
    """Creates a cluster with a longer timeout."""
    g = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 1,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 20
            })
        })
    yield g
    # The code after the yield will run as teardown code.
    ray.shutdown()
    g.shutdown()

def ray_start_head():
    out = run_and_get_output(["ray", "start", "--head", "--num-cpus=2"])
    # Get the redis address from the output.
    redis_substring_prefix = "redis_address=\""
    redis_address_location = (
        out.find(redis_substring_prefix) + len(redis_substring_prefix))
    redis_address = out[redis_address_location:]
    redis_address = redis_address.split("\"")[0]

    yield redis_address

    # Disconnect from the Ray cluster.
    ray.shutdown()
    # Kill the Ray cluster.
    subprocess.Popen(["ray", "stop"]).wait()

def ray_start_two_nodes():
    for module in [
            ra.core, ra.random, ra.linalg, da.core, da.random, da.linalg
    ]:
        reload(module)
    # Start the Ray processes.
    cluster = ray.test.cluster_utils.Cluster()
    for _ in range(2):
        cluster.add_node(num_cpus=10)
    ray.init(redis_address=cluster.redis_address)

    yield None

    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def ray_start_two_nodes():
    # Start the Ray processes.
    cluster = ray.test.cluster_utils.Cluster()
    for _ in range(2):
        cluster.add_node(
            num_cpus=0,
            _internal_config=json.dumps({
                "num_heartbeats_timeout": 40
            }))
    ray.init(redis_address=cluster.redis_address)

    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def start_connected_cluster():
    # Start the Ray processes.
    g = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 1,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 10
            })
        })
    yield g
    # The code after the yield will run as teardown code.
    ray.shutdown()
    g.shutdown()

def ray_start_sharded(request):
    num_redis_shards = request.param
    if os.environ.get("RAY_USE_NEW_GCS") == "on":
        num_redis_shards = 1
        # For now, RAY_USE_NEW_GCS supports 1 shard, and credis supports
        # 1-node chain for that shard only.

    # Start the Ray processes.
    ray.init(
        num_cpus=10,
        num_redis_shards=num_redis_shards,
        redis_max_memory=10**7)

    yield None

    # The code after the yield will run as teardown code.
    ray.shutdown()

def noise(config, experiments, num_cpus, num_gpus, redis_address):
    print("config =", config.name)
    print("num_gpus =", num_gpus)
    print("num_cpus =", num_cpus)
    print("redis_address =", redis_address)

    # Use configuration file location as the project location.
    project_dir = os.path.dirname(config.name)
    project_dir = os.path.abspath(project_dir)
    print("projectDir =", project_dir)

    # Load and parse experiment configurations
    configs = parse_config(config, experiments, globals=globals())

    # Initialize ray cluster
    if redis_address is not None:
        ray.init(redis_address=redis_address, include_webui=True)
    else:
        ray.init(num_cpus=num_cpus, num_gpus=num_gpus,
                 local_mode=num_cpus == 1)

    # FIXME: Update remote function resource usage
    num_gpus = float(num_gpus / num_cpus)
    run_noise_test._num_gpus = num_gpus
    run_noise_test.num_cpus = 1

    # Run experiments
    results = []
    for exp in configs:
        config = configs[exp]
        config["name"] = exp

        # Make sure local directories are relative to the project location
        path = config.get("path", None)
        if path and not os.path.isabs(path):
            config["path"] = os.path.join(project_dir, path)

        data_dir = config.get("data_dir", "data")
        if not os.path.isabs(data_dir):
            config["data_dir"] = os.path.join(project_dir, data_dir)

        # Run each experiment in parallel
        results.append(run_noise_test.remote(config))

    # Wait until all experiments complete
    ray.get(results)

    ray.shutdown()

def Driver(success):
    success.value = True

    # Start driver.
    ray.init(redis_address=redis_address)
    summary_start = StateSummary()
    if (0, 1) != summary_start[:2]:
        success.value = False

    max_attempts_before_failing = 100

    # Two new objects.
    ray.get(ray.put(1111))
    ray.get(ray.put(1111))
    attempts = 0
    while (2, 1, summary_start[2]) != StateSummary():
        time.sleep(0.1)
        attempts += 1
        if attempts == max_attempts_before_failing:
            success.value = False
            break

    @ray.remote
    def f():
        ray.put(1111)  # Yet another object.
        return 1111  # A returned object as well.

    # 1 new function.
    attempts = 0
    while (2, 1, summary_start[2] + 1) != StateSummary():
        time.sleep(0.1)
        attempts += 1
        if attempts == max_attempts_before_failing:
            success.value = False
            break

    ray.get(f.remote())
    attempts = 0
    while (4, 2, summary_start[2] + 1) != StateSummary():
        time.sleep(0.1)
        attempts += 1
        if attempts == max_attempts_before_failing:
            success.value = False
            break

    ray.shutdown()

def start_connected_emptyhead_cluster():
    """Starts head with no resources."""
    cluster = Cluster(
        initialize_head=True,
        connect=True,
        head_node_args={
            "num_cpus": 0,
            "_internal_config": json.dumps({
                "num_heartbeats_timeout": 10
            })
        })
    # Pytest doesn't play nicely with imports
    _register_all()
    yield cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def ray_start_cluster():
    node_args = {
        "num_cpus": 8,
        "_internal_config": json.dumps({
            "initial_reconstruction_timeout_milliseconds": 1000,
            "num_heartbeats_timeout": 10
        })
    }
    # Start with 4 worker nodes and 8 cores each.
    cluster = Cluster(
        initialize_head=True, connect=True, head_node_args=node_args)
    workers = []
    for _ in range(4):
        workers.append(cluster.add_node(**node_args))
    cluster.wait_for_nodes()
    yield cluster
    ray.shutdown()
    cluster.shutdown()

def ray_start_combination(request):
    num_nodes = request.param[0]
    num_workers_per_scheduler = request.param[1]
    # Start the Ray processes.
    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 10,
            "redis_max_memory": 10**7
        })
    for i in range(num_nodes - 1):
        cluster.add_node(num_cpus=10)
    ray.init(redis_address=cluster.redis_address)

    yield num_nodes, num_workers_per_scheduler, cluster
    # The code after the yield will run as teardown code.
    ray.shutdown()
    cluster.shutdown()

def build_data(self):
    if self.data_ready:
        print("Data is already built.")
        return
    persons = ["P%d" % i for i in range(9)]
    gestures = os.listdir(os.path.join(self.path, persons[0]))
    gestures.sort()
    all_paths = [[os.path.join(self.path, person, gesture)
                  for gesture in gestures] for person in persons]
    bin_paths = [[] for i in range(9)]
    joints = [[] for i in range(9)]

    print('loading file list ......')
    # Use total= so the progress bar knows how many items to expect.
    with tqdm(total=len(persons) * len(gestures)) as pbar:
        for i, paths in enumerate(all_paths):
            for path in paths:
                _joints = np.loadtxt(os.path.join(path, 'joint.txt'),
                                     skiprows=1)
                with open(os.path.join(path, 'joint.txt')) as f:
                    samples = int(f.readline())
                _joints = _joints.reshape((samples, 21, 3))
                _joints[:, :, 1] = -_joints[:, :, 1]
                _joints[:, :, 2] = -_joints[:, :, 2]
                joints[i].append(_joints.reshape((samples, 63)))
                for j in range(samples):
                    bin_paths[i].append(
                        os.path.join(path, "%06d_depth.bin" % j))
                pbar.update(1)
            joints[i] = np.concatenate(joints[i], axis=0)

    print('saving test.txt ......')
    for i in range(9):
        super().write_data_txt(
            os.path.join(self.path, "test_{}.txt".format(i)),
            list(bin_paths[i]), list(joints[i]))

    print('checking data ......')
    datatexts = []
    traintxts = []
    for i in range(9):
        dataname = os.path.join(self.path, "test_{}.txt".format(i))
        with open(dataname, 'r') as f:
            datatexts.append(f.readlines())

    ray.init()
    for i in range(9):
        reporter = Reporter.remote(len(datatexts[i]))
        chunk = len(datatexts[i]) // (os.cpu_count() - 1) + 1
        traintxt = []
        processing = [
            check_texts.remote(self, datatexts[i][j * chunk:(j + 1) * chunk],
                               reporter)
            for j in range(os.cpu_count() - 1)
        ]
        for r in ray.get(processing):
            traintxt += r
        traintxts.append(traintxt)
        print('For person {}, {} / {} data can be used for training'.format(
            i, len(traintxt), len(datatexts[i])))
    ray.shutdown()

    for i in range(9):
        train_to_write = []
        for j in range(9):
            if i == j:
                val_to_write = traintxts[j]
            else:
                train_to_write += traintxts[j]
        with open(os.path.join(self.path, "train_{}.txt".format(i)), 'w') as f:
            f.writelines(train_to_write)
        with open(os.path.join(self.path, "val_{}.txt".format(i)), 'w') as f:
            f.writelines(val_to_write)

#!/usr/bin/env python
# Large-scale training of a PPO agent using Ray Tune
# Chapter 8, TensorFlow 2 Reinforcement Learning Cookbook | Praveen Palanisamy
import ray
import sys
from ray import tune
from ray.rllib.models import ModelCatalog

if "." not in sys.path:
    sys.path.insert(0, ".")
from custom_model import CustomModel

# Register the custom model in the ModelCatalog
ModelCatalog.register_custom_model("CustomCNN", CustomModel)

ray.init()
experiment_analysis = tune.run(
    "PPO",
    config={
        "env": "procgen:procgen-coinrun-v0",
        "num_gpus": 0,
        "num_workers": 2,
        "model": {"custom_model": "CustomCNN"},
        "framework": "tf2",
        "log_level": "INFO",
    },
    local_dir="ray_results",  # store experiment results in this dir
)
ray.shutdown()

def test_instance():
    """
    example of SimDeepBoosting
    """
    PATH_DATA = '{0}/../examples/data/'.format(split(abspath(__file__))[0])

    # Input files
    TRAINING_TSV = {'RNA': 'rna_dummy.tsv', 'METH': 'meth_dummy.tsv'}
    SURVIVAL_TSV = 'survival_dummy.tsv'

    PROJECT_NAME = 'TestProject'
    EPOCHS = 10
    SEED = 3
    nb_it = 5
    nb_threads = 2

    # Optional metadata file
    OPTIONAL_METADATA = "metadata_dummy.tsv"

    # Import the cluster scheduler
    import ray
    ray.init(num_cpus=3)
    # More options can be used (e.g. remote clusters, AWS, memory, ...etc...)
    # ray can be used locally to maximize the use of CPUs on the local machine
    # See the ray API: https://ray.readthedocs.io/en/latest/index.html

    boosting = SimDeepBoosting(
        nb_threads=nb_threads,
        nb_it=nb_it,
        split_n_fold=3,
        survival_tsv=SURVIVAL_TSV,
        training_tsv=TRAINING_TSV,
        path_data=PATH_DATA,
        project_name=PROJECT_NAME,
        path_results=PATH_DATA,
        metadata_tsv=OPTIONAL_METADATA,  # optional
        metadata_usage='all',
        epochs=EPOCHS,
        distribute=True,  # Option to use the ray cluster scheduler
        seed=SEED)

    boosting.fit()
    boosting.save_models_classes()
    boosting.save_cv_models_classes()

    boosting.predict_labels_on_full_dataset()
    boosting.compute_clusters_consistency_for_full_labels()
    boosting.evalutate_cluster_performance()
    boosting.collect_cindex_for_test_fold()
    boosting.collect_cindex_for_full_dataset()

    boosting.compute_feature_scores_per_cluster()
    boosting.collect_number_of_features_per_omic()
    boosting.write_feature_score_per_cluster()

    boosting.load_new_test_dataset(
        {'RNA': 'rna_dummy.tsv'},  # OMIC file of the test set.
                                   # It doesn't have to be the same as for training.
        'dummy',  # Name of the test set to be used
        'survival_dummy.tsv',  # Survival file of the test set
    )

    boosting.predict_labels_on_test_dataset()
    boosting.save_test_models_classes()
    boosting.compute_c_indexes_for_test_dataset()
    boosting.compute_clusters_consistency_for_test_labels()

    # Experimental method to plot the test dataset amongst the class kernel densities
    boosting.plot_supervised_kernel_for_test_sets()
    boosting.plot_supervised_predicted_labels_for_test_sets()

    boosting.load_new_test_dataset(
        {'METH': 'meth_dummy.tsv'},  # OMIC file of the second test set.
        'dummy_METH',  # Name of the second test set
        'survival_dummy.tsv',  # Survival file of the test set (optional)
    )

    boosting.predict_labels_on_test_dataset()
    boosting.compute_c_indexes_for_test_dataset()
    boosting.compute_clusters_consistency_for_test_labels()

    # Experimental method to plot the test dataset amongst the class kernel densities
    boosting.plot_supervised_kernel_for_test_sets()
    boosting.plot_supervised_predicted_labels_for_test_sets()

    # Close clusters and free memory
    ray.shutdown()

def experiment():
    ucbstate = None
    if IS_UCB:
        ucbstate = ucb_state(n_params=N_PARAMS)
    os.makedirs(SAVE_DIR, exist_ok=True)

    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric=METRIC_NAME,
        mode="max",
        ucb=ucbstate,
        perturbation_interval=PERTUBATION_INTERVAL,
        hyperparam_mutations={
            "dropout": lambda: np.random.uniform(0, 1),
            "lr": lambda: np.random.uniform(0.001, 0.003),
            "batch_size": lambda: random.choice([64, 128, 256, 512])
        })

    ray.shutdown()  # Restart Ray defensively in case the ray connection is lost.
    # ray.init(log_to_driver=False, local_mode=True)
    ray.init(log_to_driver=False)
    # register_trainable('train_cifar10', Cifar10Model)

    analysis = tune.run(
        Cifar10Model,
        name=EXPERIMENT_NAME,
        scheduler=scheduler,
        # reuse_actors=True,
        checkpoint_freq=20,
        verbose=1,
        stop={
            "training_iteration": TRAINING_ITERATION,
        },
        num_samples=NUM_WORKERS,
        resources_per_trial={"gpu": 1},
        # PBT starts by training many neural networks in parallel with
        # random hyperparameters.
        config={
            "epochs": 1,
            "batch_size": 64,
            # "lr": grid_search([10**-4, 10**-5]),
            "lr": 1e-4,
            "decay": sample_from(lambda spec: spec.config.lr / 100.0),
            # "dropout": grid_search([0.25, 0.5]),
            "dropout": 0.5,
            # "num_cpus": 32,
            "num_gpus": 1,
        })

    # Plot by wall-clock time
    dfs = analysis.fetch_trial_dataframes()

    ## Save pickle
    with open(f"{SAVE_DIR}/{EXPERIMENT_NAME}_trials.pickle", "wb") as fw:
        pickle.dump(dfs, fw)

    # This plots everything on the same plot
    ax = None
    for d in dfs.values():
        ax = d.plot("training_iteration", METRIC_NAME, ax=ax, legend=False)

    if METRIC_NAME == 'mean_accuracy':
        a = np.asarray([
            list(dfs.values())[i].mean_accuracy.max()
            for i in range(NUM_WORKERS)
        ])
    elif METRIC_NAME == 'episode_reward_mean':
        a = np.asarray([
            list(dfs.values())[i].episode_reward_mean.max()
            for i in range(NUM_WORKERS)
        ])

    topk = heapq.nlargest(3, range(len(a)), a.__getitem__)
    total = 0
    for i in topk:
        total += a[i]
    avg_top_k = total / 3

    plt.xlabel("epoch")
    plt.ylabel("Test Accuracy")
    # plt.show()
    plt.savefig(f'{SAVE_DIR}/{EXPERIMENT_NAME}_accuracy.png')

    if IS_UCB:
        # bar chart
        fig, axs = plt.subplots(1, 2, figsize=(9, 3))
        axs[0].bar(range(len(ucbstate.num_of_selections) - 1),
                   ucbstate.num_of_selections[1:])
        axs[1].bar(range(len(ucbstate.rewards) - 1), ucbstate.rewards[1:])
        print(ucbstate.rewards)
        print(ucbstate.num_of_selections)

        ## Save pickle
        with open(f"{SAVE_DIR}/{EXPERIMENT_NAME}_bandit.pickle", "wb") as fw:
            pickle.dump(ucbstate, fw)
        plt.savefig(f'{SAVE_DIR}/{EXPERIMENT_NAME}_bandit_final.png')
        # plt.show()

    return avg_top_k

def main():
    parser = argparse.ArgumentParser(description='GSSP for nlp')
    parser.add_argument('--model', default='resnet', help='model name')
    parser.add_argument('--world-size', default=16, type=int,
                        help='node size in simulation')
    parser.add_argument('--lr', default=0.01, type=float)
    parser.add_argument('--batch-size', default=32, type=int)
    parser.add_argument('--epochs', default=100, type=int, help="train epoch")
    parser.add_argument('--data-dir', default='./data',
                        help='the data directory location')
    parser.add_argument('--stdout', default='./stdout/resnet',
                        help='stdout log dir for subprocess')
    parser.add_argument('--momentum', default=0.9, type=float,
                        help='the momentum of iteration time')
    parser.add_argument('--enc-emb', default=256, type=int,
                        help='Encoder embedding size')
    parser.add_argument('--dec-emb', default=256, type=int,
                        help='Decoder embedding size')
    parser.add_argument('--enc-hid', default=512, type=int,
                        help='Encoder hidden layer size')
    parser.add_argument('--dec-hid', default=512, type=int,
                        help='Decoder hidden layer size')
    parser.add_argument('--enc-drop', default=0.5, type=float,
                        help='Encoder dropout probability')
    parser.add_argument('--dec-drop', default=0.5, type=float,
                        help='Decoder dropout probability')
    parser.add_argument('--ps-num', default=4, type=int)
    parser.add_argument('--bounded-delay', default=3, type=int)
    args = parser.parse_args()

    sys.stdout = open(f'{args.stdout}/main_stdout.log', 'a+', 1)
    sys.stderr = open(f'{args.stdout}/main_stdout.log', 'a+', 1)

    dirs = [args.data_dir, args.stdout]
    for d in dirs:
        if not os.path.isdir(d):
            os.mkdir(d, mode=0o755)

    ray.shutdown()
    ray.init(num_gpus=20, ignore_reinit_error=True)
    print('==> ray.init..')

    # get model
    train_data, valid_data, test_data, model, clip_value, PAD_IDX = \
        get_model(args)

    worker_tasks = [
        Worker.remote(i, args, model, clip_value, PAD_IDX)
        for i in range(args.world_size)
    ]
    globalps = GlobalPS.remote(model, args, args.ps_num)
    pss = [
        ParameterServer.remote(args, model, clip_value, PAD_IDX,
                               args.bounded_delay, i)
        for i in range(args.ps_num)
    ]
    print('==> ps success..')
    # Initialize every parameter server (the original hard-coded four calls;
    # looping covers any --ps-num value without changing the default behavior).
    for ps in pss:
        ps.init_pss.remote(globalps)
    print('==> worker_tasks..')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.apply(init_weights)

    for worker in worker_tasks:
        worker.compute_gradients.remote(pss)

    i = 0
    while i <= 1000:
        i += 1
        time.sleep(40)

    ray.shutdown()

def test_job_config_conda_env(conda_envs, shutdown_only):
    for package_version in REQUEST_VERSIONS:
        runtime_env = {"conda": f"package-{package_version}"}
        ray.init(runtime_env=runtime_env)
        assert ray.get(get_requests_version.remote()) == package_version
        ray.shutdown()

def tearDown(self):
    ray.shutdown()
    _register_all()  # re-register the evicted objects

def shutdown_with_server(server, _exiting_interpreter=False):
    server.stop(1)
    with disable_client_hook():
        ray.shutdown(_exiting_interpreter)

def test_cluster_interrupt(start_connected_cluster, tmpdir):
    """Tests run_experiment on cluster shutdown with actual interrupt.

    This is an end-to-end test.
    """
    cluster = start_connected_cluster
    dirpath = str(tmpdir)

    # Needs to be in scope for pytest
    class _Mock(tune.Trainable):
        """Finishes on the 4th iteration."""

        def setup(self, config):
            self.state = {"hi": 0}

        def step(self):
            self.state["hi"] += 1
            time.sleep(0.5)
            return {"done": self.state["hi"] >= 4}

        def save_checkpoint(self, path):
            return self.state

        def load_checkpoint(self, state):
            self.state = state

    # Removes indent from class.
    reformatted = "\n".join(line[4:] if len(line) else line
                            for line in inspect.getsource(_Mock).split("\n"))

    script = """
import os
import time
import ray
from ray import tune

os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "0"

ray.init(address="{address}")

{fail_class_code}

tune.run(
    {fail_class},
    name="experiment",
    stop=dict(training_iteration=5),
    local_dir="{checkpoint_dir}",
    checkpoint_freq=1,
    max_failures=1,
    raise_on_failed_trial=False)
""".format(
        address=cluster.address,
        checkpoint_dir=dirpath,
        fail_class_code=reformatted,
        fail_class=_Mock.__name__)
    run_string_as_driver_nonblocking(script)

    # Wait until the right checkpoint is saved.
    # The trainable returns every 0.5 seconds, so this should not miss
    # the checkpoint.
    local_checkpoint_dir = os.path.join(dirpath, "experiment")
    for i in range(50):
        if TrialRunner.checkpoint_exists(local_checkpoint_dir):
            # Inspect the internal trialrunner
            runner = TrialRunner(
                resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
            trials = runner.get_trials()
            last_res = trials[0].last_result
            if last_res and last_res.get("training_iteration") == 3:
                break
        time.sleep(0.2)

    if not TrialRunner.checkpoint_exists(local_checkpoint_dir):
        raise RuntimeError("Checkpoint file didn't appear.")

    ray.shutdown()
    cluster.shutdown()
    cluster = _start_new_cluster()
    Experiment.register_if_needed(_Mock)

    # Inspect the internal trialrunner
    runner = TrialRunner(
        resume="LOCAL", local_checkpoint_dir=local_checkpoint_dir)
    trials = runner.get_trials()
    assert trials[0].last_result["training_iteration"] == 3
    assert trials[0].status == Trial.PENDING

    # Restore properly from checkpoint
    trials2 = tune.run_experiments(
        {
            "experiment": {
                "run": _Mock,
                "local_dir": dirpath,
                "checkpoint_freq": 1
            }
        },
        resume=True,
        raise_on_failed_trial=False)
    assert all(t.status == Trial.TERMINATED for t in trials2)
    assert {t.trial_id for t in trials2} == {t.trial_id for t in trials}
    ray.shutdown()
    cluster.shutdown()

def tearDown(self):
    ray.shutdown()
    _register_all()

def tearDownClass(cls):
    ray.shutdown()

def shutdown_only():
    yield None
    # The code after the yield will run as teardown code.
    ray.shutdown()

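# A minimal usage sketch for the teardown-only fixtures above (assumption:
# they are registered with @pytest.fixture, e.g. in a conftest.py). The test
# below is hypothetical; the point is that the test starts Ray itself and
# relies on the fixture only for the final ray.shutdown().
def test_uses_shutdown_only(shutdown_only):
    ray.init(num_cpus=1)
    assert ray.is_initialized()
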
def ray_shutdown():
    yield
    # The code after the yield will run as teardown code.
    ray.shutdown()

def _runSimulations_Parallel(simDefinition, nRuns, outputLists, silent=False, nProcesses=1):
    '''
        Runs a probabilistic simulation several times, collects and displays
        average results for common parameters.
        Parallelized using [ray](https://github.com/ray-project/ray)
    '''
    import ray

    runRemoteSimulation = ray.remote(runSimulation)
    runRemoteSimulation.options(num_returns=2)

    landingLocations, apogees, maxSpeeds, flightTimes, maxHorizontalVels, flights = outputLists
    resultsToOutput = simDefinition.getValue("MonteCarlo.output")

    def postProcess(rayObject):
        ''' Gets sim results from worker, appends results to outputLists '''
        # Get sim results
        stagePaths, logPaths = ray.get(rayObject)

        # Save results from the top stage
        flight = stagePaths[0]
        landingLocations.append(flight.getLandingLocation())
        apogees.append(flight.getApogee())
        maxSpeeds.append(flight.getMaxSpeed())
        flightTimes.append(flight.getFlightTime())
        maxHorizontalVels.append(flight.getMaxHorizontalVel())

        if "flightPaths" in resultsToOutput:
            # Limit the number of time steps saved to avoid wasting memory
            flight = Plotting._keepNTimeSteps(flight, 900)
            flights.append(flight)

    # Create an instance of random to generate random seeds for each copy of
    # the sim definition sent to workers.
    # NOTE: This means Monte Carlo repeatability does not transfer across
    # single-threaded / parallel sims.
    try:
        randomSeed = simDefinition.getValue("MonteCarlo.randomSeed")
    except KeyError:
        randomSeed = random.randrange(1e7)
    rng = random.Random(randomSeed)

    ### Run simulations ###
    # TODO: Adapt this to work on a cluster
    # Reminder that ray must be initialized separately on a cluster,
    # before running ray.init()
    # https://docs.ray.io/en/latest/cluster/index.html
    ray.init()

    # Start simulations
    runningJobs = []
    for i in range(nRuns):
        # Don't start more sims than there are processes available
        if i >= nProcesses:
            completedJobs, runningJobs = ray.wait(runningJobs)
            for completedJob in completedJobs:
                # Save results
                postProcess(completedJob)

        # Make sure each copy of simDefinition has a different, but
        # repeatable random seed
        newRandomSeed = rng.randrange(1e7)
        simDef = deepcopy(simDefinition)
        simDef.rng = random.Random(newRandomSeed)
        simDef.resampleProbabilisticValues()

        # Start sim
        flightPathsFuture = runRemoteSimulation.remote(
            simDefinition=simDef, silent=True)
        runningJobs.append(flightPathsFuture)

    # Wait for remaining sims to complete
    for remainingJob in runningJobs:
        postProcess(remainingJob)

    ray.shutdown()

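# A minimal sketch of the cluster case mentioned in the TODO above, assuming
# the head node was already started out-of-band (e.g. with `ray start --head`,
# as described in the linked Ray cluster docs). Only the ray.init() call
# changes; job submission and teardown stay the same as in the local case.
import ray

ray.init(address="auto")  # attach to the running cluster instead of starting a local instance
# ... submit runRemoteSimulation.remote(...) jobs as above ...
ray.shutdown()
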
def tearDown(self):
    ray.shutdown()

def tearDown(self):
    shutil.rmtree(self.logdir)
    ray.shutdown()
    _register_all()

def main():
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # initialize logs, dataset, configuration
    PARAM = parse_args()
    init_dir(PARAM["checkpoint_root"])
    dataset_path = Path.cwd() / Path(PARAM["dataset"])

    CONFIG = JokeRec.prep_config(
        dataset_path,
        k_clusters=PARAM["k_clusters"],
        debug=PARAM["debug"],
        verbose=PARAM["verbose"])

    # measure the baseline performance of a naive agent
    if PARAM["debug"]:
        pdb.set_trace()

    baseline = JokeRec.measure_baseline(
        CONFIG["env_config"],
        n_iter=PARAM["baseline_iter"],
        naive=True,
        verbose=PARAM["verbose"],
    )
    print("BASELINE CUMULATIVE REWARD", round(baseline, 3), "\n")

    # restart Ray, register our environment, and create an agent
    ray.init(ignore_reinit_error=True)

    env_key = "JokeRec-v0"
    register_env(env_key, lambda config_env: JokeRec(config_env))
    AGENT = ppo.PPOTrainer(CONFIG, env=env_key)

    # use RLlib to train a policy using PPO
    df = pd.DataFrame(columns=[
        "min_reward", "avg_reward", "max_reward", "steps", "checkpoint"
    ])
    status = "reward {:6.2f} {:6.2f} {:6.2f} len {:4.2f} saved {}"

    for i in range(PARAM["train_iter"]):
        result = AGENT.train()
        checkpoint_file = AGENT.save(PARAM["checkpoint_root"])

        row = [
            result["episode_reward_min"],
            result["episode_reward_mean"],
            result["episode_reward_max"],
            result["episode_len_mean"],
            checkpoint_file,
        ]
        df.loc[len(df)] = row
        print(status.format(*row))

    best_checkpoint = get_best_checkpoint(df)
    print("\n", "BEST CHECKPOINT:", best_checkpoint, "\n")

    # apply the trained policy in a rollout
    AGENT.restore(best_checkpoint)

    if PARAM["rollout_dataset"]:
        CONFIG["env_config"]["dataset"] = Path.cwd() / Path(
            PARAM["rollout_dataset"])

    JokeRec.run_rollout(
        AGENT,
        JokeRec(CONFIG["env_config"]),
        PARAM["rollout_iter"],
        verbose=PARAM["verbose"])

    # examine the trained policy
    policy = AGENT.get_policy()
    model = policy.model
    print("\n", model.base_model.summary())

    # shutdown gracefully, kthxbai
    ray.shutdown()

def tearDown(self):
    del NODE_PROVIDERS["mock"]
    shutil.rmtree(self.tmpdir)
    ray.shutdown()

def start_ray():
    ray.init()
    _register_all()
    yield
    ray.shutdown()

def test_placement_group_reschedule_when_node_dead(ray_start_cluster):
    @ray.remote(num_cpus=1)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    # Make sure both head and worker node are alive.
    nodes = ray.nodes()
    assert len(nodes) == 3
    assert nodes[0]["alive"] and nodes[1]["alive"] and nodes[2]["alive"]

    placement_group = ray.util.placement_group(
        name="name",
        strategy="SPREAD",
        bundles=[{
            "CPU": 2
        }, {
            "CPU": 2
        }, {
            "CPU": 2
        }])
    actor_1 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=0,
        lifetime="detached").remote()
    actor_2 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=1,
        lifetime="detached").remote()
    actor_3 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=2,
        lifetime="detached").remote()

    ray.get(actor_1.value.remote())
    ray.get(actor_2.value.remote())
    ray.get(actor_3.value.remote())

    cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
    cluster.wait_for_nodes()

    actor_4 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=0,
        lifetime="detached").remote()
    actor_5 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=1,
        lifetime="detached").remote()
    actor_6 = Actor.options(
        placement_group=placement_group,
        placement_group_bundle_index=2,
        lifetime="detached").remote()

    ray.get(actor_4.value.remote())
    ray.get(actor_5.value.remote())
    ray.get(actor_6.value.remote())
    ray.shutdown()

def run_learning_tests_from_yaml(
    yaml_files: List[str],
    *,
    max_num_repeats: int = 2,
    smoke_test: bool = False,
) -> Dict[str, Any]:
    """Runs the given experiments in yaml_files and returns results dict.

    Args:
        yaml_files (List[str]): List of yaml file names.
        max_num_repeats (int): How many times should we repeat a failed
            experiment?
        smoke_test (bool): Whether this is just a smoke-test. If True, set
            time_total_s to 5min and don't early out due to rewards or
            timesteps reached.
    """
    print("Will run the following yaml files:")
    for yaml_file in yaml_files:
        print("->", yaml_file)

    # All trials we'll ever run in this test script.
    all_trials = []
    # The experiments (by name) we'll run up to `max_num_repeats` times.
    experiments = {}
    # The results per experiment.
    checks = {}
    # Metrics per experiment.
    stats = {}

    start_time = time.monotonic()

    def should_check_eval(experiment):
        # If we have evaluation workers, use their rewards.
        # This is useful for offline learning tests, where
        # we evaluate against an actual environment.
        return experiment["config"].get("evaluation_interval", None) is not None

    # Loop through all collected files and gather experiments.
    # Augment all by `torch` framework.
    for yaml_file in yaml_files:
        tf_experiments = yaml.safe_load(open(yaml_file).read())

        # Add torch version of all experiments to the list.
        for k, e in tf_experiments.items():
            # If framework explicitly given, only test for that framework.
            # Some algos do not have both versions available.
            if "frameworks" in e:
                frameworks = e["frameworks"]
            else:
                # By default we don't run tf2, because tf2's multi-gpu support
                # isn't complete yet.
                frameworks = ["tf", "torch"]
            # Pop frameworks key to not confuse Tune.
            e.pop("frameworks", None)

            e["stop"] = e["stop"] if "stop" in e else {}
            e["pass_criteria"] = e["pass_criteria"] if "pass_criteria" in e else {}

            # For smoke-tests, we just run for n min.
            if smoke_test:
                # 0sec for each(!) experiment/trial.
                # This is such that if there are many experiments/trials
                # in a test (e.g. rllib_learning_test), each one can at least
                # create its trainer and run a first iteration.
                e["stop"]["time_total_s"] = 0
            else:
                check_eval = should_check_eval(e)
                episode_reward_key = (
                    "episode_reward_mean"
                    if not check_eval
                    else "evaluation/episode_reward_mean"
                )
                # We also stop early, once we reach the desired reward.
                min_reward = e.get("pass_criteria", {}).get(episode_reward_key)
                if min_reward is not None:
                    e["stop"][episode_reward_key] = min_reward

            # Generate `checks` dict for all experiments
            # (tf, tf2 and/or torch).
            for framework in frameworks:
                k_ = k + "-" + framework
                ec = copy.deepcopy(e)
                ec["config"]["framework"] = framework
                if framework == "tf2":
                    ec["config"]["eager_tracing"] = True

                checks[k_] = {
                    "min_reward": ec["pass_criteria"].get(
                        "episode_reward_mean", 0.0),
                    "min_throughput": ec["pass_criteria"].get(
                        "timesteps_total", 0.0)
                    / (ec["stop"].get("time_total_s", 1.0) or 1.0),
                    "time_total_s": ec["stop"].get("time_total_s"),
                    "failures": 0,
                    "passed": False,
                }

                # This key would break tune.
                ec.pop("pass_criteria", None)

                # One experiment to run.
                experiments[k_] = ec

    # Print out the actual config.
    print("== Test config ==")
    print(yaml.dump(experiments))

    # Keep track of those experiments we still have to run.
    # If an experiment passes, we'll remove it from this dict.
    experiments_to_run = experiments.copy()

    try:
        ray.init(address="auto")
    except ConnectionError:
        ray.init()

    for i in range(max_num_repeats):
        # We are done.
        if len(experiments_to_run) == 0:
            print("All experiments finished.")
            break

        print(f"Starting learning test iteration {i}...")

        # Run remaining experiments.
        trials = run_experiments(
            experiments_to_run,
            resume=False,
            verbose=2,
            progress_reporter=CLIReporter(
                metric_columns={
                    "training_iteration": "iter",
                    "time_total_s": "time_total_s",
                    "timesteps_total": "ts",
                    "episodes_this_iter": "train_episodes",
                    "episode_reward_mean": "reward_mean",
                    "evaluation/episode_reward_mean": "eval_reward_mean",
                },
                sort_by_metric=True,
                max_report_frequency=30,
            ),
        )
        all_trials.extend(trials)

        # Check each experiment for whether it passed.
        # Criteria is to a) reach reward AND b) to have reached the throughput
        # defined by `timesteps_total` / `time_total_s`.
        for experiment in experiments_to_run.copy():
            print(f"Analyzing experiment {experiment} ...")
            # Collect all trials within this experiment (some experiments may
            # have num_samples or grid_searches defined).
            trials_for_experiment = []
            for t in trials:
                trial_exp = re.sub(".+/([^/]+)$", "\\1", t.local_dir)
                if trial_exp == experiment:
                    trials_for_experiment.append(t)
            print(f" ... Trials: {trials_for_experiment}.")

            check_eval = should_check_eval(experiments[experiment])

            # Error: Increase failure count and repeat.
            if any(t.status == "ERROR" for t in trials_for_experiment):
                print(" ... ERROR.")
                checks[experiment]["failures"] += 1
            # Smoke-tests always succeed.
            elif smoke_test:
                print(" ... SMOKE TEST (mark ok).")
                checks[experiment]["passed"] = True
                del experiments_to_run[experiment]
            # Experiment finished: Check reward achieved and timesteps done
            # (throughput).
            else:
                if check_eval:
                    episode_reward_mean = np.mean([
                        t.last_result["evaluation"]["episode_reward_mean"]
                        for t in trials_for_experiment
                    ])
                else:
                    episode_reward_mean = np.mean([
                        t.last_result["episode_reward_mean"]
                        for t in trials_for_experiment
                    ])
                desired_reward = checks[experiment]["min_reward"]

                timesteps_total = np.mean([
                    t.last_result["timesteps_total"]
                    for t in trials_for_experiment
                ])
                total_time_s = np.mean([
                    t.last_result["time_total_s"]
                    for t in trials_for_experiment
                ])

                # TODO(jungong) : track trainer and env throughput separately.
                throughput = timesteps_total / (total_time_s or 1.0)
                # TODO(jungong) : enable throughput check again after
                #   TD3_HalfCheetahBulletEnv is fixed and verified.
                # desired_throughput = checks[experiment]["min_throughput"]
                desired_throughput = None

                # Record performance.
                stats[experiment] = {
                    "episode_reward_mean": float(episode_reward_mean),
                    "throughput": (
                        float(throughput) if throughput is not None else 0.0
                    ),
                }

                print(
                    f" ... Desired reward={desired_reward}; "
                    f"desired throughput={desired_throughput}"
                )

                # We failed to reach desired reward or the desired throughput.
                if (desired_reward and episode_reward_mean < desired_reward) or (
                    desired_throughput and throughput < desired_throughput
                ):
                    print(
                        " ... Not successful: Actual "
                        f"reward={episode_reward_mean}; "
                        f"actual throughput={throughput}"
                    )
                    checks[experiment]["failures"] += 1
                # We succeeded!
                else:
                    print(" ... Successful: (mark ok).")
                    checks[experiment]["passed"] = True
                    del experiments_to_run[experiment]

    ray.shutdown()

    time_taken = time.monotonic() - start_time

    # Create results dict and write it to disk.
    result = {
        "time_taken": float(time_taken),
        "trial_states": dict(Counter([trial.status for trial in all_trials])),
        "last_update": float(time.time()),
        "stats": stats,
        "passed": [k for k, exp in checks.items() if exp["passed"]],
        "failures": {
            k: exp["failures"]
            for k, exp in checks.items() if exp["failures"] > 0
        },
    }

    return result

def build_data(self):
    if self.data_ready:
        print("Data is already built.")
        return
    dataset = self.dataset

    if not os.path.exists(os.path.join(self.path, "test.txt")):
        self.dataset = 'test'
        print("building test.txt ...")
        test_set = []
        with open(os.path.join(self.path, "Testing", "test_seq_1.txt"), "r") as f:
            lines = f.readlines()
        test_line = [line.strip() for line in lines if not line == "\n"]
        for line in test_line:
            words = line.split()
            path = words[0]
            words = words[1:]
            name = os.path.join(self.path, "Testing", "Depth", path)
            words = [name] + words
            test_set.append(" ".join(words))
        with open(os.path.join(self.path, "Testing", "test_seq_2.txt"), "r") as f:
            lines = f.readlines()
        test_line = [line.strip() for line in lines if not line == "\n"]
        for line in test_line:
            words = line.split()
            path = words[0]
            words = words[1:]
            name = os.path.join(self.path, "Testing", "Depth", path)
            words = [name] + words
            test_set.append(" ".join(words))
        print("saving test.txt ...")
        with open(os.path.join(self.path, "test.txt"), 'w') as f:
            f.write("\n".join(test_set))
        with open(os.path.join(self.path, "val.txt"), 'w') as f:
            f.write("\n".join(test_set))

    if not os.path.exists(os.path.join(self.path, "train.txt")):
        self.dataset = 'train'
        print("building train.txt ...")
        datatexts = []
        with open(os.path.join(self.path, "Training", "labels.txt"), 'r') as f:
            train_line = f.readlines()
        for line in train_line:
            words = line.split()
            path = words[0]
            words = words[1:]
            if len(path.split('/')) > 2:
                # this is the augmented data
                continue
            name = os.path.join(self.path, "Training", "Depth", path)
            words = [name] + words
            datatexts.append(" ".join(words))

        print('checking data ......')
        ray.init()
        reporter = Reporter.remote(len(datatexts))
        chunk = len(datatexts) // (os.cpu_count() - 1) + 1
        traintxt = []
        processing = [
            check_texts.remote(self, datatexts[i * chunk:(i + 1) * chunk],
                               reporter)
            for i in range(os.cpu_count() - 1)
        ]
        for r in ray.get(processing):
            traintxt += r
        ray.shutdown()
        print('{} / {} data can be used for training'.format(
            len(traintxt), len(datatexts)))
        with open(os.path.join(self.path, "train.txt"), 'w') as f:
            f.write("\n".join(traintxt))

    self.dataset = dataset

def run(args, parser):
    if args.config_file:
        with open(args.config_file) as f:
            experiments = yaml.safe_load(f)
    else:
        # Note: keep this in sync with tune/config_parser.py
        experiments = {
            args.experiment_name: {  # i.e. log to ~/ray_results/default
                "run": args.run,
                "checkpoint_freq": args.checkpoint_freq,
                "checkpoint_at_end": args.checkpoint_at_end,
                "keep_checkpoints_num": args.keep_checkpoints_num,
                "checkpoint_score_attr": args.checkpoint_score_attr,
                "local_dir": args.local_dir,
                "resources_per_trial": (
                    args.resources_per_trial and
                    resources_to_json(args.resources_per_trial)),
                "stop": args.stop,
                "config": dict(args.config, env=args.env),
                "restore": args.restore,
                "num_samples": args.num_samples,
                "upload_dir": args.upload_dir,
            }
        }

    # Ray UI.
    if args.no_ray_ui:
        deprecation_warning(old="--no-ray-ui", new="--ray-ui", error=False)
        args.ray_ui = False

    verbose = 1
    for exp in experiments.values():
        # Bazel makes it hard to find files specified in `args` (and `data`).
        # Look for them here.
        # NOTE: Some of our yaml files don't have a `config` section.
        input_ = exp.get("config", {}).get("input")

        if input_ and input_ != "sampler":
            # This script runs in the ray/rllib dir.
            rllib_dir = Path(__file__).parent

            def patch_path(path):
                if isinstance(path, list):
                    return [patch_path(i) for i in path]
                elif isinstance(path, dict):
                    return {
                        patch_path(k): patch_path(v)
                        for k, v in path.items()
                    }
                elif isinstance(path, str):
                    if os.path.exists(path):
                        return path
                    else:
                        abs_path = str(rllib_dir.absolute().joinpath(path))
                        return abs_path if os.path.exists(abs_path) else path
                else:
                    return path

            exp["config"]["input"] = patch_path(input_)

        if not exp.get("run"):
            parser.error("the following arguments are required: --run")
        if not exp.get("env") and not exp.get("config", {}).get("env"):
            parser.error("the following arguments are required: --env")

        if args.torch:
            deprecation_warning("--torch", "--framework=torch")
            exp["config"]["framework"] = "torch"
        elif args.eager:
            deprecation_warning("--eager", "--framework=[tf2|tfe]")
            exp["config"]["framework"] = "tfe"
        elif args.framework is not None:
            exp["config"]["framework"] = args.framework

        if args.trace:
            if exp["config"]["framework"] not in ["tf2", "tfe"]:
                raise ValueError("Must enable --eager to enable tracing.")
            exp["config"]["eager_tracing"] = True

        if args.v:
            exp["config"]["log_level"] = "INFO"
            verbose = 3  # Print details on trial result
        if args.vv:
            exp["config"]["log_level"] = "DEBUG"
            verbose = 3  # Print details on trial result

    if args.ray_num_nodes:
        # Import this only here so that train.py also works with
        # older versions (and user doesn't use `--ray-num-nodes`).
        from ray.cluster_utils import Cluster
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory)
        ray.init(address=cluster.address)
    else:
        ray.init(
            include_dashboard=args.ray_ui,
            address=args.ray_address,
            object_store_memory=args.ray_object_store_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
            local_mode=args.local_mode)

    if IS_NOTEBOOK:
        progress_reporter = JupyterNotebookReporter(
            overwrite=verbose >= 3, print_intermediate_tables=verbose >= 1)
    else:
        progress_reporter = CLIReporter(
            print_intermediate_tables=verbose >= 1)

    run_experiments(
        experiments,
        scheduler=create_scheduler(args.scheduler, **args.scheduler_config),
        resume=args.resume,
        queue_trials=args.queue_trials,
        verbose=verbose,
        progress_reporter=progress_reporter,
        concurrent=True)

    ray.shutdown()

def test_creates_new_edges_instance_inverse(self):
    """Tests the creates_new_edges method when applied to a kg with
    instance-based construction with inverse relations."""

    self.kg_instance2.reverse_relation_processor()

    # make sure that kg is empty
    self.kg_instance2.graph = Graph().parse(
        self.dir_loc + '/ontologies/so_with_imports.owl')

    # initialize metadata class
    meta = Metadata(self.kg_instance2.kg_version,
                    self.kg_instance2.write_location,
                    self.kg_instance2.full_kg,
                    self.kg_instance2.node_data,
                    self.kg_instance2.node_dict)
    if self.kg_instance2.node_data:
        meta.metadata_processor()
        meta.extract_metadata(self.kg_instance2.graph)

    # create graph subsets
    self.kg_instance2.graph, annotation_triples = splits_knowledge_graph(
        self.kg_instance2.graph)
    full_kg_owl = '_'.join(
        self.kg_instance2.full_kg.split('_')[0:-1]) + '_OWL.owl'
    annot = full_kg_owl[:-4] + '_AnnotationsOnly.nt'
    full = full_kg_owl[:-4] + '.nt'
    appends_to_existing_file(annotation_triples,
                             self.kg_instance2.write_location + annot, ' ')
    clean_graph = updates_pkt_namespace_identifiers(
        self.kg_instance2.graph, self.kg_instance2.construct_approach)

    # test method
    shutil.copy(self.kg_instance2.write_location + annot,
                self.kg_instance2.write_location + full)
    appends_to_existing_file(set(self.kg_instance2.graph),
                             self.kg_instance2.write_location + full, ' ')

    # check that edges were added to the graph
    args = {
        'construction': self.kg_instance2.construct_approach,
        'edge_dict': self.kg_instance2.edge_dict,
        'kg_owl': full_kg_owl,
        'rel_dict': self.kg_instance2.relations_dict,
        'metadata': meta.creates_node_metadata,
        'inverse_dict': self.kg_instance2.inverse_relations_dict,
        'node_data': self.kg_instance2.node_data,
        'ont_cls': self.kg_instance2.ont_classes,
        'obj_props': self.kg_instance2.obj_properties,
        'write_loc': self.kg_instance2.write_location
    }
    edges = [x for x in self.kg_instance2.edge_dict.keys()]

    ray.init(local_mode=True, ignore_reinit_error=True)
    actors = [
        ray.remote(self.kg_instance2.EdgeConstructor).remote(args)
        for _ in range(self.kg_instance2.cpus)
    ]
    for i in range(0, len(edges)):
        actors[i % self.kg_instance2.cpus].creates_new_edges.remote(edges[i])
    res = ray.get([x.graph_getter.remote() for x in actors])
    g1 = [self.kg_instance2.graph] + [x[0] for x in res]
    g2 = [clean_graph] + [x[1] for x in res]
    error_dicts = dict(
        ChainMap(*ray.get([x.error_dict_getter.remote() for x in actors])))
    del actors
    ray.shutdown()

    # check that edges were added to the graph
    graph1 = set(x for y in [set(x) for x in g1] for x in y)
    graph2 = set(x for y in [set(x) for x in g2] for x in y)
    self.assertEqual(len(graph1), 9707)
    self.assertEqual(len(graph2), 9687)
    self.assertIsInstance(error_dicts, Dict)

    # check graph files were saved
    f_name = full_kg_owl[:-4] + '_AnnotationsOnly.nt'
    self.assertTrue(
        os.path.exists(self.kg_instance2.write_location + f_name))
    f_name = full_kg_owl[:-4] + '.nt'
    self.assertTrue(
        os.path.exists(self.kg_instance2.write_location + f_name))

    return None

def test_instance():
    """
    example of SimDeepBoosting
    """
    PATH_DATA = '{0}/../examples/data/'.format(split(abspath(__file__))[0])

    # Input files
    TRAINING_TSV = {'RNA': 'rna_dummy.tsv', 'METH': 'meth_dummy.tsv'}
    SURVIVAL_TSV = 'survival_dummy.tsv'

    PROJECT_NAME = 'TestProjectTuning'

    # Number of processes to be used to fit individual survival models
    nb_threads = 2

    # Below are examples of parameters that can be passed to the hyperparameter tuning

    ################ AUTOENCODER PARAMETERS ################
    # LEVEL_DIMS_IN = [250]
    # LEVEL_DIMS_OUT = [250]
    # LOSS = 'binary_crossentropy'
    # OPTIMIZER = 'adam'
    # ACT_REG = 0
    # W_REG = 0
    # DROPOUT = 0.5
    # DATA_SPLIT = 0
    # ACTIVATION = 'tanh'
    #########################################################

    ################ ADDITIONAL PARAMETERS ##################
    # stack_multi_omic=STACK_MULTI_OMIC,
    # level_dims_in=LEVEL_DIMS_IN,
    # level_dims_out=LEVEL_DIMS_OUT,
    # loss=LOSS,
    # optimizer=OPTIMIZER,
    # act_reg=ACT_REG,
    # w_reg=W_REG,
    # dropout=DROPOUT,
    # data_split=DATA_SPLIT,
    # activation=ACTIVATION,
    # path_to_save_model=PATH_TO_SAVE_MODEL,
    # pvalue_threshold=PVALUE_THRESHOLD,
    # nb_selected_features=NB_SELECTED_FEATURES,
    # pvalue_threshold = 0.01
    # nb_selected_features = 10
    # stack_multi_omic = False
    # use_autoencoders = True
    # feature_surv_analysis = True
    #########################################################

    # ray.init(num_cpus=3)

    # AgglomerativeClustering is an external class that can be used as
    # a clustering algorithm since it has a fit_predict method
    from sklearn.cluster.hierarchical import AgglomerativeClustering

    args_to_optimize = {
        'seed': [100, 200, 300, 400],
        # 'nb_clusters': [2, 5],
        'cluster_method': ['mixture', 'coxPH', AgglomerativeClustering],
        'normalization': ['default', 'alternative']
        # 'use_autoencoders': (True, False),
        # 'class_selection': ('mean', 'max'),
    }

    # Different normalisations can be tested
    from sklearn.preprocessing import RobustScaler
    # An external normalisation class can be used;
    # it requires the class to have fit and fit_transform methods
    normalization = {
        'default': {
            'NB_FEATURES_TO_KEEP': 100,
            'TRAIN_RANK_NORM': True,
            'TRAIN_CORR_REDUCTION': True,
            'TRAIN_CORR_RANK_NORM': True,
        },
        'alternative': {
            'CUSTOM': RobustScaler,
        }
    }

    tuning = SimDeepTuning(
        args_to_optimize=args_to_optimize,
        nb_threads=nb_threads,
        survival_tsv=SURVIVAL_TSV,
        training_tsv=TRAINING_TSV,
        path_data=PATH_DATA,
        project_name=PROJECT_NAME,
        path_results=PATH_DATA,
        normalization=normalization,
    )

    # Possible metrics for evaluating the training set: {
    #     "cluster_consistency",
    #     "full_pvalue",
    #     "sum_log_pval",
    #     "test_fold_cindex",
    #     "test_fold_pval",
    #     "mix_score",
    # }

    ray.init()

    tuning.fit(
        metric='log_test_fold_pvalue',
        num_samples=8,
        max_concurrent=4,
        distribute_deepprog=True,
        # iterations is useful to take into account the DL parameter
        # fitting variations
        iterations=1)

    table = tuning.get_results_table()
    tuning.save_results_table()

    ray.shutdown()

def ray_8_cpus():
    ray.init(num_cpus=8)
    yield
    # The code after the yield will run as teardown code.
    ray.shutdown()

def handler(sig, frame):
    if ray.is_initialized():
        ray.shutdown()
        print('ray has been shutdown by sigint')
    sys.exit(0)

def build_data(self):
    if self.data_ready:
        print("Data is already built.")
        return
    dataset = self.dataset

    if not os.path.exists(os.path.join(self.path, "train.txt")):
        self.dataset = 'train'
        print("building train.txt ...")
        mat = loadmat(os.path.join(self.path, "train", "joint_data.mat"))
        datatexts = []
        for j in range(1):
            uvds = mat['joint_uvd'][j]
            for i in range(uvds.shape[0]):
                uvd = uvds[i]
                png = "depth_%d_%07d.png" % (j + 1, i + 1)
                filename = os.path.join(self.path, "train", png)
                # center = np.mean(uvd, axis=0, keepdims=True)
                uvd = uvd[self.index]
                # uvd = np.concatenate([uvd, center], axis=0)
                uvd = uvd.reshape((-1,))
                words = [str(uvd[j]) for j in range(uvd.shape[0])]
                words = [filename] + words
                datatexts.append(" ".join(words))

        print('checking data ......')
        ray.init()
        reporter = Reporter.remote(len(datatexts))
        chunk = len(datatexts) // (os.cpu_count() - 1) + 1
        traintxt = []
        processing = [
            check_texts.remote(self, datatexts[i * chunk:(i + 1) * chunk],
                               reporter)
            for i in range(os.cpu_count() - 1)
        ]
        for r in ray.get(processing):
            traintxt += r
        ray.shutdown()
        print('{} / {} data can be used for training'.format(
            len(traintxt), len(datatexts)))
        with open(os.path.join(self.path, "train.txt"), 'w') as f:
            f.write("\n".join(traintxt))

    if not os.path.exists(os.path.join(self.path, "test.txt")):
        self.dataset = 'test'
        test_set = []
        mat = loadmat(os.path.join(self.path, "test", "joint_data.mat"))
        uvds = mat['joint_uvd'][0]
        for i in range(uvds.shape[0]):
            uvd = uvds[i]
            png = "depth_1_%07d.png" % (i + 1)
            filename = os.path.join(self.path, "test", png)
            # center = np.mean(uvd, axis=0, keepdims=True)
            uvd = uvd[self.index]
            # uvd = np.concatenate([uvd, center], axis=0)
            uvd = uvd.reshape((-1,))
            words = [str(uvd[j]) for j in range(uvd.shape[0])]
            words = [filename] + words
            test_set.append(" ".join(words))
        with open(os.path.join(self.path, "test.txt"), 'w') as f:
            f.write("\n".join(test_set))

        print('checking data ......')
        ray.init()
        reporter = Reporter.remote(len(test_set))
        chunk = len(test_set) // (os.cpu_count() - 1) + 1
        valtxt = []
        processing = [
            check_texts.remote(self, test_set[i * chunk:(i + 1) * chunk],
                               reporter)
            for i in range(os.cpu_count() - 1)
        ]
        for r in ray.get(processing):
            valtxt += r
        ray.shutdown()
        print('{} / {} data can be used as validation'.format(
            len(valtxt), len(test_set)))
        with open(os.path.join(self.path, "val.txt"), 'w') as f:
            f.write("\n".join(valtxt))

    self.dataset = dataset

def shutdown_only_with_initialization_check():
    yield None
    # The code after the yield will run as teardown code.
    ray.shutdown()
    assert not ray.is_initialized()

def tearDown(self):
    self.provider = None
    del _NODE_PROVIDERS["mock"]
    _clear_provider_cache()
    shutil.rmtree(self.tmpdir)
    ray.shutdown()