def main(): os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "100" # Tweak ray.init(address="auto") num_samples = 10000 results_per_second = 1 trial_length_s = 1 max_runtime = 800 timed_tune_run(name="bookkeeping overhead", num_samples=num_samples, results_per_second=results_per_second, trial_length_s=trial_length_s, max_runtime=max_runtime)
def main(): os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1" # Tweak ray.init(address="auto") num_samples = 96 results_per_second = 500 trial_length_s = 100 max_runtime = 120 timed_tune_run(name="result throughput single node", num_samples=num_samples, results_per_second=results_per_second, trial_length_s=trial_length_s, max_runtime=max_runtime)


def main():
    ray.init(address="auto")

    num_samples = 200
    results_per_second = 0.01
    trial_length_s = 300
    max_runtime = 1000

    timed_tune_run(
        name="result network overhead",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        resources_per_trial={"cpu": 2},  # One per node
        sync_config=tune.SyncConfig(sync_to_driver=True))


def main():
    ray.init(address="auto")

    num_samples = 16
    results_per_second = 1 / 60
    trial_length_s = 86400
    max_runtime = 90000

    timed_tune_run(
        name="long running large checkpoints",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=900,  # Once every 15 minutes
        checkpoint_size_b=int(3.75 * 1000**3),  # 3.75 GB per checkpoint
        keep_checkpoints_num=2,  # 2 * 16 * 3.75 GB = 120 GB kept in total
        resources_per_trial={"cpu": 1},
        sync_config=tune.SyncConfig(sync_to_driver=True))
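

# The long-running workload above exercises large checkpoints. As an
# illustration only, a trial body compatible with a `timed_tune_run`-style
# helper could write `checkpoint_size_b` bytes of dummy checkpoint data every
# `checkpoint_freq_s` seconds, roughly like the hypothetical function below
# (using the function-API `tune.checkpoint_dir` from the same Ray versions as
# the `sync_to_driver` option used here; names and defaults are assumptions).
import os
import time

from ray import tune


def checkpointing_trainable(config):
    checkpoint_freq_s = config.get("checkpoint_freq_s", 900)
    checkpoint_size_b = config.get("checkpoint_size_b", int(3.75 * 1000**3))
    trial_length_s = config.get("trial_length_s", 86400)

    trial_start = time.time()
    last_checkpoint = trial_start
    step = 0
    while time.time() - trial_start < trial_length_s:
        time.sleep(60)  # results_per_second = 1 / 60
        step += 1

        if time.time() - last_checkpoint >= checkpoint_freq_s:
            with tune.checkpoint_dir(step=step) as checkpoint_dir:
                # Write the dummy checkpoint in 100 MB chunks to avoid
                # holding the whole payload in memory.
                chunk = b"\0" * (100 * 1000**2)
                remaining = checkpoint_size_b
                with open(os.path.join(checkpoint_dir, "data.ckpt"), "wb") as f:
                    while remaining > 0:
                        n = min(len(chunk), remaining)
                        f.write(chunk[:n])
                        remaining -= n
            last_checkpoint = time.time()

        tune.report(iterations=step)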
def main(): os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1" # Tweak ray.init(address="auto") num_samples = 1000 results_per_second = 0.5 trial_length_s = 100 max_runtime = 120 if is_ray_cluster(): # Add constant overhead for SSH connection max_runtime = 120 timed_tune_run(name="result throughput cluster", num_samples=num_samples, results_per_second=results_per_second, trial_length_s=trial_length_s, max_runtime=max_runtime, sync_config=tune.SyncConfig(sync_to_driver=False)) # Tweak!


def main():
    ray.init(address="auto")

    num_samples = 16
    results_per_second = 10 / 60
    trial_length_s = 300
    max_runtime = 500

    timed_tune_run(
        name="durable trainable",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000**2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=tune.SyncConfig(
            sync_to_driver=False,
            upload_dir="s3://ray-tune-scalability-test/durable/",
        ))
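

# The durable-trainable workload is the only one that uploads checkpoints
# directly to cloud storage instead of syncing them back to the driver.
# Purely as an illustration of how the arguments above could be forwarded to
# Tune (the shared helper is assumed to do something equivalent internally),
# a direct tune.run() call with the same retention and sync settings would
# look roughly like this; `some_trainable` is a hypothetical stand-in for the
# actual benchmark trainable.
from ray import tune


def some_trainable(config):  # hypothetical stand-in
    tune.report(iterations=1)


tune.run(
    some_trainable,
    name="durable trainable",
    num_samples=16,
    keep_checkpoints_num=2,  # retain at most two checkpoints per trial
    resources_per_trial={"cpu": 2},
    sync_config=tune.SyncConfig(
        sync_to_driver=False,  # do not copy checkpoints back to the driver
        upload_dir="s3://ray-tune-scalability-test/durable/",  # upload to S3
    ),
)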