def main():
    """Run the "bookkeeping overhead" timed Tune workload.

    Sets ``TUNE_GLOBAL_CHECKPOINT_S`` to ``"100"`` in the process
    environment, calls ``ray.init(address="auto")``, and then hands the
    workload settings below to ``timed_tune_run``.
    """
    # Tweak: exported before ray.init() is called, same order as before.
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "100"

    ray.init(address="auto")

    # Many samples, each with a short trial length and a low result rate.
    workload = {
        "num_samples": 10000,
        "results_per_second": 1,
        "trial_length_s": 1,
        "max_runtime": 800,
    }

    timed_tune_run(name="bookkeeping overhead", **workload)
def main():
    """Run the "result throughput single node" timed Tune workload.

    Sets ``TUNE_DISABLE_AUTO_CALLBACK_LOGGERS`` to ``"1"`` in the process
    environment, calls ``ray.init(address="auto")``, and then hands the
    workload settings below to ``timed_tune_run``.
    """
    # Tweak: exported before ray.init() is called, same order as before.
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"

    ray.init(address="auto")

    # Few samples, each reporting at a high result rate.
    workload = {
        "num_samples": 96,
        "results_per_second": 500,
        "trial_length_s": 100,
        "max_runtime": 120,
    }

    timed_tune_run(name="result throughput single node", **workload)
# Example #3
def main():
    """Run the "result network overhead" timed Tune workload.

    Calls ``ray.init(address="auto")`` and then starts ``timed_tune_run``
    with 2 CPUs requested per trial and a ``tune.SyncConfig`` that has
    ``sync_to_driver=True``.
    """
    ray.init(address="auto")

    workload = {
        "num_samples": 200,
        "results_per_second": 0.01,
        "trial_length_s": 300,
        "max_runtime": 1000,
    }

    # One per node
    trial_resources = {"cpu": 2}
    driver_sync = tune.SyncConfig(sync_to_driver=True)

    timed_tune_run(
        name="result network overhead",
        resources_per_trial=trial_resources,
        sync_config=driver_sync,
        **workload,
    )
# Example #4
def main():
    """Run the "long running large checkpoints" timed Tune workload.

    Calls ``ray.init(address="auto")`` and then starts ``timed_tune_run``
    with the workload and checkpoint settings below, 1 CPU requested per
    trial, and a ``tune.SyncConfig`` that has ``sync_to_driver=True``.
    """
    ray.init(address="auto")

    workload = {
        "num_samples": 16,
        "results_per_second": 1 / 60,
        "trial_length_s": 86400,
        "max_runtime": 90000,
    }

    checkpointing = {
        # Once every 15 minutes
        "checkpoint_freq_s": 900,
        "checkpoint_size_b": int(3.75 * 1000 ** 3),
        # 2 * 16 * 4 = 128 GB
        "keep_checkpoints_num": 2,
    }

    driver_sync = tune.SyncConfig(sync_to_driver=True)

    timed_tune_run(
        name="long running large checkpoints",
        resources_per_trial={"cpu": 1},
        sync_config=driver_sync,
        **workload,
        **checkpointing,
    )
def main():
    """Run the "result throughput cluster" timed Tune workload.

    Sets ``TUNE_DISABLE_AUTO_CALLBACK_LOGGERS`` to ``"1"`` in the process
    environment, calls ``ray.init(address="auto")``, and then starts
    ``timed_tune_run`` with a ``tune.SyncConfig`` that has
    ``sync_to_driver=False``.
    """
    # Tweak: exported before ray.init() is called, same order as before.
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"

    ray.init(address="auto")

    runtime_limit = 120
    if is_ray_cluster():
        # Add constant overhead for SSH connection
        # NOTE(review): the value assigned here equals the default above,
        # so no overhead is actually added -- confirm the intended limit.
        runtime_limit = 120

    # Tweak!
    no_driver_sync = tune.SyncConfig(sync_to_driver=False)

    timed_tune_run(
        name="result throughput cluster",
        num_samples=1000,
        results_per_second=0.5,
        trial_length_s=100,
        max_runtime=runtime_limit,
        sync_config=no_driver_sync,
    )
# Example #6
def main():
    """Run the "durable trainable" timed Tune workload.

    Calls ``ray.init(address="auto")`` and then starts ``timed_tune_run``
    with the workload and checkpoint settings below, 2 CPUs requested per
    trial, and a ``tune.SyncConfig`` that has ``sync_to_driver=False`` and
    an S3 ``upload_dir``.
    """
    ray.init(address="auto")

    workload = {
        "num_samples": 16,
        "results_per_second": 10 / 60,
        "trial_length_s": 300,
        "max_runtime": 500,
    }

    checkpointing = {
        # Once every 10 seconds
        "checkpoint_freq_s": 10,
        # 10 MB
        "checkpoint_size_b": int(10 * 1000 ** 2),
        "keep_checkpoints_num": 2,
    }

    upload_sync = tune.SyncConfig(
        sync_to_driver=False,
        upload_dir="s3://ray-tune-scalability-test/durable/",
    )

    timed_tune_run(
        name="durable trainable",
        resources_per_trial={"cpu": 2},
        sync_config=upload_sync,
        **workload,
        **checkpointing,
    )