def submit_tf_benchmark(c, node_count=int(env_values["CLUSTER_MAX_NODES"])):
    """Launch the TensorFlow benchmark job with synthetic data on the remote cluster.

    Args:
        c: Context object handed in by the caller; not used by this function
            (presumably the task-runner context -- confirm against the caller).
        node_count (int, optional): The number of nodes to use in cluster.
            Defaults to env_values['CLUSTER_MAX_NODES'], which is read once,
            when this function is defined.

    Note:
        Runs ResNet 50 model with batch size of 256 and mixed precision
        (``--use_fp16`` is passed with an empty value).
    """
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("tf_benchmark")

    source_dir = os.path.join(_BASE_PATH, "src")
    benchmark_args = {
        "--model": "resnet50",
        "--batch_size": 256,
        "--variable_update": "horovod",
        "--use_fp16": "",
    }
    deps_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted = experiment.submit(
        source_dir,
        "tf_cnn_benchmarks.py",
        benchmark_args,
        node_count=node_count,
        dependencies_file=deps_file,
        wait_for_completion=True,
    )
    # Echo whatever the submit call handed back.
    print(submitted)
def submit_images_local(c):
    """Template command -- not implemented; modify it before use.

    The call sketched below submits a job that executes locally on a GPU.
    It also maps a volume into the docker container running the job; that
    mount point is where the script is told to look for its training and
    validation data. Adjust the remaining arguments as required by your
    training script.

    Args:
        c: Context object handed in by the caller; not used by this function.

    Raises:
        NotImplementedError: Always, for as long as the guard below is kept.
    """
    raise NotImplementedError(
        "You need to modify this call before being able to use it"
    )
    # Everything below is the template to customise; it stays unreachable
    # until the guard above is removed.
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")

    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {
        "--training_data_path": "/data/train",
        "--validation_data_path": "/data/validation",
        "--epochs": "1",
        "--data_type": "images",
        "--data-format": "channels_first",
    }
    # Mount the host data directory at /data inside the container.
    volume_mapping = ["-v", f"{env_values['data']}:/data"]

    submitted = experiment.submit_local(
        source_dir,
        "<YOUR-TRAINING-SCRIPT>",
        script_args,
        dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
        docker_args=volume_mapping,
        wait_for_completion=True,
    )
    print(submitted)
Esempio n. 3
0
def submit_images(c,
                  node_count=int(env_values["CLUSTER_MAX_NODES"]),
                  epochs=1):
    """Launch TensorFlow training on real imagenet data on the remote cluster.

    The data paths are given with a ``{datastore}`` placeholder, exactly as
    written here; it is not expanded inside this function.

    Args:
        c: Context object handed in by the caller; not used by this function.
        node_count (int, optional): The number of nodes to use in cluster.
            Defaults to env_values['CLUSTER_MAX_NODES'], which is read once,
            when this function is defined.
        epochs (int, optional): Number of epochs to run training for. Defaults to 1.
    """
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("real_images_remote")

    source_dir = os.path.join(_BASE_PATH, "src")
    training_args = {
        "--training_data_path": "{datastore}/train",
        "--validation_data_path": "{datastore}/validation",
        "--epochs": epochs,
        "--data_type": "images",
        "--data-format": "channels_first",
    }
    deps_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted = experiment.submit(
        source_dir,
        "resnet_main.py",
        training_args,
        node_count=node_count,
        dependencies_file=deps_file,
        wait_for_completion=True,
    )
    print(submitted)
def submit_images(c):
    """Template command -- not implemented; modify it before use.

    The call sketched below submits a job to execute on a remote cluster
    using GPUs. Notice that a {datastore} parameter is passed in the paths.
    This tells the submit method that we want the location as mapped by the
    datastore to be inserted there. Upon execution the appropriate path will
    be prepended to the training_data_path and validation_data_path.

    Args:
        c: Context object handed in by the caller; not used by this function.

    Raises:
        NotImplementedError: Always, for as long as the guard below is kept.
    """
    raise NotImplementedError(
        "You need to modify this call before being able to use it"
    )
    # Everything below is the template to customise; it stays unreachable
    # until the guard above is removed.
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")

    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {
        "--training_data_path": "{datastore}/train",
        "--validation_data_path": "{datastore}/validation",
        "--epochs": "1",
        "--data_type": "images",
        "--data-format": "channels_first",
    }

    submitted = experiment.submit(
        source_dir,
        "<YOUR-TRAINING-SCRIPT>",
        script_args,
        node_count=4,
        dependencies_file="TensorFlow/environment_gpu.yml",
        wait_for_completion=True,
    )
    print(submitted)
Esempio n. 5
0
def submit_synthetic_local(c, epochs=1):
    """Launch TensorFlow training on synthetic imagenet data for local execution.

    Args:
        c: Context object handed in by the caller; not used by this function.
        epochs (int, optional): Number of epochs to run training for. Defaults to 1.
    """
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("synthetic_images_local")

    source_dir = os.path.join(_BASE_PATH, "src")
    # Only the epoch count is forwarded to the training script.
    training_args = {"--epochs": epochs}
    deps_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted = experiment.submit_local(
        source_dir,
        "resnet_main.py",
        training_args,
        dependencies_file=deps_file,
        wait_for_completion=True,
    )
    print(submitted)
def submit_local(c):
    """Template command -- not implemented; modify it before use.

    The call sketched below submits a job to execute locally on a GPU.

    Args:
        c: Context object handed in by the caller; not used by this function.

    Raises:
        NotImplementedError: Always, for as long as the guard below is kept.
    """
    raise NotImplementedError(
        "You need to modify this call before being able to use it"
    )
    # Everything below is the template to customise; it stays unreachable
    # until the guard above is removed.
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")

    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {"YOUR": "ARGS"}

    submitted = experiment.submit_local(
        source_dir,
        "<YOUR-TRAINING-SCRIPT>",
        script_args,
        dependencies_file="TensorFlow/environment_gpu.yml",
        wait_for_completion=True,
    )
    print(submitted)
Esempio n. 7
0
def submit_benchmark_local(c):
    """Launch the PyTorch synthetic-data benchmark job for local execution.

    Submits ``pytorch_synthetic_benchmark.py`` with the resnet50 model and a
    batch size of 64.

    NOTE(review): the script is a PyTorch benchmark, yet it is submitted
    through ``TFExperimentCLI`` under the experiment name
    "synthetic_images_local" -- confirm that this is intended.

    Args:
        c: Context object handed in by the caller; not used by this function.
    """
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("synthetic_images_local")

    source_dir = os.path.join(_BASE_PATH, "src")
    benchmark_args = {"--model": "resnet50", "--batch-size": 64}
    deps_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted = experiment.submit_local(
        source_dir,
        "pytorch_synthetic_benchmark.py",
        benchmark_args,
        dependencies_file=deps_file,
        wait_for_completion=True,
    )
    print(submitted)
def submit_remote(c, node_count=int(env_values["CLUSTER_MAX_NODES"])):
    """Template command -- not implemented; modify it before use.

    The call sketched below submits a job to execute on a remote cluster
    using GPUs.

    Args:
        c: Context object handed in by the caller; not used by this function.
        node_count (int, optional): Node count forwarded to the submit call.
            Defaults to env_values['CLUSTER_MAX_NODES'], which is read once,
            when this function is defined.

    Raises:
        NotImplementedError: Always, for as long as the guard below is kept.
    """
    raise NotImplementedError(
        "You need to modify this call before being able to use it"
    )
    # Everything below is the template to customise; it stays unreachable
    # until the guard above is removed.
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("<YOUR-EXPERIMENT-NAME>")

    source_dir = os.path.join(_BASE_PATH, "src")
    script_args = {"YOUR": "ARGS"}
    deps_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted = experiment.submit(
        source_dir,
        "<YOUR-TRAINING-SCRIPT>",
        script_args,
        node_count=node_count,
        dependencies_file=deps_file,
        wait_for_completion=True,
    )
    print(submitted)
def submit_synthetic(c, node_count=int(env_values["CLUSTER_MAX_NODES"]), epochs=1):
    """Launch TensorFlow training on synthetic imagenet data on the remote cluster.

    Args:
        c: Context object handed in by the caller; not used by this function.
        node_count (int, optional): The number of nodes to use in cluster.
            Defaults to env_values['CLUSTER_MAX_NODES'], which is read once,
            when this function is defined.
        epochs (int, optional): Number of epochs to run training for. Defaults to 1.
    """
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("synthetic_images_remote")

    source_dir = os.path.join(_BASE_PATH, "src")
    # Only the epoch count is forwarded to the training script.
    training_args = {"--epochs": epochs}

    submitted = experiment.submit(
        source_dir,
        "resnet_main.py",
        training_args,
        node_count=node_count,
        # Passed as a plain relative path, not joined onto _BASE_PATH.
        dependencies_file="TensorFlow_imagenet/environment_gpu.yml",
        wait_for_completion=True,
    )
    print(submitted)
def submit_tf_benchmark_local(c):
    """Launch the TensorFlow benchmark job with synthetic data for local execution.

    Args:
        c: Context object handed in by the caller; not used by this function.

    Note:
        Runs ResNet 50 model with batch size of 256 and mixed precision
        (``--use_fp16`` is passed with an empty value).
    """
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("tf_benchmark")

    source_dir = os.path.join(_BASE_PATH, "src")
    benchmark_args = {
        "--model": "resnet50",
        "--batch_size": 256,
        "--variable_update": "horovod",
        "--use_fp16": "",
    }
    deps_file = os.path.join(_BASE_PATH, "environment_gpu.yml")

    submitted = experiment.submit_local(
        source_dir,
        "tf_cnn_benchmarks.py",
        benchmark_args,
        dependencies_file=deps_file,
        wait_for_completion=True,
    )
    print(submitted)
Esempio n. 11
0
def submit_images_local(c, epochs=1):
    """Launch TensorFlow training on real imagenet data for local execution.

    The directory named by env_values['DATA'] is passed to docker as a
    ``-v`` volume mapped to ``/data``; the training and validation paths
    given to the script point below that mount.

    Args:
        c: Context object handed in by the caller; not used by this function.
        epochs (int, optional): Number of epochs to run training for. Defaults to 1.
    """
    from aml_compute import TFExperimentCLI

    experiment = TFExperimentCLI("real_images_local")

    source_dir = os.path.join(_BASE_PATH, "src")
    training_args = {
        "--training_data_path": "/data/train",
        "--validation_data_path": "/data/validation",
        "--epochs": epochs,
        "--data_type": "images",
        "--data-format": "channels_first",
    }
    deps_file = os.path.join(_BASE_PATH, "environment_gpu.yml")
    volume_mapping = ["-v", f"{env_values['DATA']}:/data"]

    submitted = experiment.submit_local(
        source_dir,
        "resnet_main.py",
        training_args,
        dependencies_file=deps_file,
        docker_args=volume_mapping,
        wait_for_completion=True,
    )
    print(submitted)