def main(bucket):
    """Run the "durable trainable" release test against an S3 bucket.

    AWS credentials are sourced from an ``aws_secrets.txt`` file one
    directory above this module if present; otherwise they are pulled from
    the ambient boto3 session. Either way the credentials are exported into
    ``os.environ`` so that worker processes can pick them up.

    Args:
        bucket: Name of the S3 bucket used for durable checkpoint uploads.
    """
    secrets_file = os.path.join(os.path.dirname(__file__), "..", "aws_secrets.txt")
    if os.path.isfile(secrets_file):
        print(f"Loading AWS secrets from file {secrets_file}")

        from configparser import ConfigParser

        config = ConfigParser()
        config.read(secrets_file)

        # Export every option of every section as an upper-cased env var.
        for section in config.values():
            for key, value in section.items():
                os.environ[str(key).upper()] = str(value)
    else:
        print("No AWS secrets file found. Loading from boto.")
        from boto3 import Session

        session = Session()
        credentials = session.get_credentials()
        current_credentials = credentials.get_frozen_credentials()

        os.environ["AWS_ACCESS_KEY_ID"] = current_credentials.access_key
        os.environ["AWS_SECRET_ACCESS_KEY"] = current_credentials.secret_key
        # Long-lived (non-STS) credentials carry no session token, in which
        # case ``token`` is None and assigning it to os.environ would raise
        # TypeError — so only export it when present.
        if current_credentials.token is not None:
            os.environ["AWS_SESSION_TOKEN"] = current_credentials.token

    if all(
        os.getenv(k, "")
        for k in [
            "AWS_ACCESS_KEY_ID",
            "AWS_SECRET_ACCESS_KEY",
            "AWS_SESSION_TOKEN",
        ]
    ):
        print("AWS secrets found in env.")
    else:
        print("Warning: No AWS secrets found in env!")

    ray.init(address="auto")

    num_samples = 16
    results_per_second = 10 / 60
    trial_length_s = 300

    max_runtime = 500

    timed_tune_run(
        name="durable trainable",
        num_samples=num_samples,
        results_per_second=results_per_second,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000 ** 2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=tune.SyncConfig(
            upload_dir=f"s3://{bucket}/durable/",
        ),
    )
def main():
    """Run the "result throughput single node" release test."""
    # Disable the default result loggers so the test isolates raw result
    # processing throughput.
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"  # Tweak

    ray.init(address="auto")

    timed_tune_run(
        name="result throughput single node",
        num_samples=96,
        results_per_second=50,
        trial_length_s=100,
        max_runtime=120,
    )
# Example #3
def main():
    """Run the "bookkeeping overhead" release test (many tiny trials)."""
    # Only snapshot the global experiment state every 100 seconds.
    os.environ["TUNE_GLOBAL_CHECKPOINT_S"] = "100"  # Tweak

    ray.init(address="auto")

    timed_tune_run(
        name="bookkeeping overhead",
        num_samples=10000,
        results_per_second=1,
        trial_length_s=1,
        max_runtime=800,
    )
# Example #4
def main(smoke_test: bool = False):
    """Run the "result network overhead" release test.

    Args:
        smoke_test: When True, shrink the sample count for a quick run.
    """
    ray.init(address="auto")

    sample_count = 20 if smoke_test else 100

    timed_tune_run(
        name="result network overhead",
        num_samples=sample_count,
        results_per_second=0.01,
        trial_length_s=300,
        max_runtime=1000,
        resources_per_trial={"cpu": 2},  # One per node
        sync_config=tune.SyncConfig(sync_to_driver=True))
def main():
    """Run the "result throughput cluster" release test."""
    # Disable the default result loggers to isolate result throughput.
    os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1"  # Tweak

    ray.init(address="auto")

    num_samples = 1000
    results_per_second = 0.5
    trial_length_s = 100

    max_runtime = 120

    if is_ray_cluster():
        # Add constant overhead for SSH connection
        # NOTE(review): this assigns the same value (120) that max_runtime
        # already holds, so the branch is currently a no-op — confirm the
        # intended cluster-mode budget.
        max_runtime = 120

    timed_tune_run(name="result throughput cluster",
                   num_samples=num_samples,
                   results_per_second=results_per_second,
                   trial_length_s=trial_length_s,
                   max_runtime=max_runtime,
                   sync_config=tune.SyncConfig(syncer=None))  # Tweak!
# Example #6
def main(bucket):
    """Run the "durable trainable" release test, uploading checkpoints to S3.

    If an ``aws_secrets.txt`` file exists one directory above this module,
    its entries are exported into the environment as upper-cased variables.

    Args:
        bucket: S3 bucket name that receives the durable checkpoints.
    """
    secrets_file = os.path.join(os.path.dirname(__file__), "..",
                                "aws_secrets.txt")
    if os.path.isfile(secrets_file):
        print(f"Loading AWS secrets from file {secrets_file}")

        from configparser import ConfigParser
        parser = ConfigParser()
        parser.read(secrets_file)

        # Promote each option of every section to an environment variable.
        for section in parser.values():
            for option, value in section.items():
                os.environ[str(option).upper()] = str(value)
    else:
        print("No AWS secrets file found.")

    ray.init(address="auto")

    timed_tune_run(
        name="durable trainable",
        num_samples=16,
        results_per_second=10 / 60,
        trial_length_s=300,
        max_runtime=500,
        checkpoint_freq_s=10,  # Once every 10 seconds
        checkpoint_size_b=int(10 * 1000**2),  # 10 MB
        keep_checkpoints_num=2,
        resources_per_trial={"cpu": 2},
        sync_config=tune.SyncConfig(
            sync_to_driver=False,
            upload_dir=f"s3://{bucket}/durable/",
        ))
def main(smoke_test: bool = False):
    """Run the "long running large checkpoints" release test.

    Args:
        smoke_test: When True, use the extended trial length and runtime
            budget (values preserved from the original configuration).
    """
    ray.init(address="auto")

    # Trial length and runtime budget depend on the smoke-test flag.
    if smoke_test:
        trial_length_s, max_runtime = 86400, 90000
    else:
        trial_length_s, max_runtime = 3600, 4200

    timed_tune_run(
        name="long running large checkpoints",
        num_samples=16,
        results_per_second=1 / 60,
        trial_length_s=trial_length_s,
        max_runtime=max_runtime,
        checkpoint_freq_s=900,  # Once every 15 minutes
        checkpoint_size_b=int(0.75 * 1000**3),
        keep_checkpoints_num=2,  # 2 * 16 * 4 = 128 GB
        resources_per_trial={"cpu": 1},
        sync_config=tune.SyncConfig(syncer="auto"),
        callbacks=[ProgressCallback()])