def main(bucket):
    """Run the "durable trainable" Tune benchmark with uploads to S3.

    Before Ray is initialised, AWS credentials are exported into
    ``os.environ``: from ``../aws_secrets.txt`` (relative to this file)
    when that file exists, otherwise from a boto3 ``Session()``.

    Args:
        bucket: S3 bucket name; the sync upload directory is
            ``s3://<bucket>/durable/``.
    """
    secrets_file = os.path.join(os.path.dirname(__file__), "..", "aws_secrets.txt")
    if os.path.isfile(secrets_file):
        print(f"Loading AWS secrets from file {secrets_file}")

        from configparser import ConfigParser

        config = ConfigParser()
        config.read(secrets_file)

        # Export every option of every section as an upper-cased env var.
        for section in config.values():
            for option, value in section.items():
                os.environ[str(option).upper()] = str(value)
    else:
        print("No AWS secrets file found. Loading from boto.")
        from boto3 import Session

        # get_credentials() yields None when boto cannot resolve any
        # credentials. Skip the export in that case so the warning below
        # is printed instead of crashing with an AttributeError.
        credentials = Session().get_credentials()
        if credentials is not None:
            frozen = credentials.get_frozen_credentials()
            exported = {
                "AWS_ACCESS_KEY_ID": frozen.access_key,
                "AWS_SECRET_ACCESS_KEY": frozen.secret_key,
                "AWS_SESSION_TOKEN": frozen.token,
            }
            for var, value in exported.items():
                # os.environ only accepts str values; the session token is
                # None for non-temporary credentials, so leave it unset.
                if value is not None:
                    os.environ[var] = value

    required_vars = (
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_SESSION_TOKEN",
    )
    if all(os.getenv(name, "") for name in required_vars):
        print("AWS secrets found in env.")
    else:
        print("Warning: No AWS secrets found in env!")

    ray.init(address="auto")

    num_samples = 16
    results_per_second = 10 / 60
    trial_length_s = 300

    max_runtime = 500

    timed_tune_run(
        name="durable trainable",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000 ** 2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=tune.SyncConfig(
            upload_dir=f"s3://{bucket}/durable/",
        ),
    )
def main():
    """Run the "result throughput single node" Tune benchmark.

    Sets ``TUNE_DISABLE_AUTO_CALLBACK_LOGGERS`` to ``"1"``, initialises Ray
    with ``address="auto"`` and starts a timed run of 96 samples that each
    report 50 results per second for 100 seconds, with a 120 second
    runtime limit.
    """
    # Tweak: presumably turns off Tune's automatic logger callbacks.
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"

    ray.init(address="auto")

    timed_tune_run(
        name="result throughput single node",
        num_samples=96,
        results_per_second=50,
        trial_length_s=100,
        max_runtime=120,
    )
def main():
    """Run the "bookkeeping overhead" Tune benchmark.

    Sets ``TUNE_GLOBAL_CHECKPOINT_S`` to ``"100"``, initialises Ray with
    ``address="auto"`` and starts a timed run of 10000 one-second samples
    reporting one result per second, with an 800 second runtime limit.
    """
    # Tweak: presumably the interval of Tune's global (experiment)
    # checkpointing, in seconds.
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "100"

    ray.init(address="auto")

    timed_tune_run(
        name="bookkeeping overhead",
        num_samples=10000,
        results_per_second=1,
        trial_length_s=1,
        max_runtime=800,
    )
def main(smoke_test: bool = False):
    """Run the "result network overhead" Tune benchmark.

    Args:
        smoke_test: When true, 20 samples are run instead of 100.
    """
    ray.init(address="auto")

    if smoke_test:
        sample_count = 20
    else:
        sample_count = 100

    run_options = {
        "name": "result network overhead",
        "num_samples": sample_count,
        "results_per_second": 0.01,
        "trial_length_s": 300,
        "max_runtime": 1000,
        "resources_per_trial": {"cpu": 2},  # One per node
        "sync_config": tune.SyncConfig(sync_to_driver=True),
    }
    timed_tune_run(**run_options)
def main():
    """Run the "result throughput cluster" Tune benchmark.

    Sets ``TUNE_DISABLE_AUTO_CALLBACK_LOGGERS`` to ``"1"``, initialises Ray
    with ``address="auto"`` and starts a timed run of 1000 samples that
    each report 0.5 results per second for 100 seconds, with syncing
    configured as ``syncer=None``.
    """
    # Tweak: presumably turns off Tune's automatic logger callbacks.
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"

    ray.init(address="auto")

    runtime_limit_s = 120
    if is_ray_cluster():
        # Meant to add constant overhead for the SSH connection.
        # NOTE(review): this assigns the same value as the default above,
        # so no overhead is actually added -- confirm the intended limit.
        runtime_limit_s = 120

    timed_tune_run(
        name="result throughput cluster",
        num_samples=1000,
        results_per_second=0.5,
        trial_length_s=100,
        max_runtime=runtime_limit_s,
        sync_config=tune.SyncConfig(syncer=None),  # Tweak!
    )
def main(bucket):
    """Run the "durable trainable" Tune benchmark with uploads to S3.

    If ``../aws_secrets.txt`` (relative to this file) exists, every option
    it contains is exported as an upper-cased environment variable before
    Ray is initialised.

    Args:
        bucket: S3 bucket name; the sync upload directory is
            ``s3://<bucket>/durable/``.
    """
    here = os.path.dirname(__file__)
    secrets_file = os.path.join(here, "..", "aws_secrets.txt")

    if not os.path.isfile(secrets_file):
        print("No AWS secrets file found.")
    else:
        print(f"Loading AWS secrets from file {secrets_file}")

        from configparser import ConfigParser

        parser = ConfigParser()
        parser.read(secrets_file)

        # Walk all sections; each option becomes NAME=value in the env.
        for section in parser.values():
            for option, value in section.items():
                os.environ[str(option).upper()] = str(value)

    ray.init(address="auto")

    sync_config = tune.SyncConfig(
        sync_to_driver=False,
        upload_dir=f"s3://{bucket}/durable/",
    )
    timed_tune_run(
        name="durable trainable",
        num_samples=16,
        results_per_second=10 / 60,
        trial_length_s=300,
        max_runtime=500,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000**2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=sync_config,
    )
def main(smoke_test: bool = False):
    """Run the "long running large checkpoints" Tune benchmark.

    Args:
        smoke_test: Selects the trial length and runtime limit:
            86400 s / 90000 s when true, 3600 s / 4200 s otherwise.
    """
    ray.init(address="auto")

    # NOTE(review): the smoke-test values are the larger ones here, which
    # looks inverted for a smoke test -- confirm this is intended.
    if smoke_test:
        trial_length_s, max_runtime = 86400, 90000
    else:
        trial_length_s, max_runtime = 3600, 4200

    progress = ProgressCallback()

    # NOTE(review): an earlier comment gave "2 * 16 * 4 = 128 GB" as the
    # checkpoint footprint; that does not match the 0.75 * 1000**3 byte
    # checkpoint size configured below -- confirm the expected disk usage.
    timed_tune_run(
        name="long running large checkpoints",
        num_samples=16,
        results_per_second=1 / 60,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=900,  # Once every 15 minutes
        checkpoint_size_b=int(0.75 * 1000**3),
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 1},
        sync_config=tune.SyncConfig(syncer="auto"),
        callbacks=[progress],
    )