Example 1
def create_and_submit_experiment(
        azure_config: AzureConfig,
        source_config: SourceConfig,
        model_config_overrides: str,
        azure_dataset_id: str) -> Run:
    """
    Creates an AzureML experiment in the workspace and submits it for execution.
    :param azure_config: Azure-related configurations to set up a valid workspace
    :param source_config: The information about which code should be submitted, and which arguments should be used.
    :param model_config_overrides: A string that describes which model parameters were overwritten by commandline
     arguments in the present run. This is only used for diagnostic purposes (it is set as a Tag on the run).
    :param azure_dataset_id: The name of the dataset in blob storage to be used for this run.
    :returns: Run object for the submitted AzureML run
    """
    workspace = azure_config.get_workspace()
    experiment_name = create_experiment_name(azure_config)
    exp = Experiment(workspace=workspace, name=azure_util.to_azure_friendly_string(experiment_name))
    script_run_config = create_run_config(azure_config, source_config, azure_dataset_id)

    # submit a training/testing run associated with the experiment
    run: Run = exp.submit(script_run_config)

    # set metadata for the run
    set_run_tags(run, azure_config, model_config_overrides)

    print("\n==============================================================================")
    print(f"Successfully queued new run {run.id} in experiment: {exp.name}")

    if azure_config.run_recovery_id:
        print(f"\nRecovered from: {azure_config.run_recovery_id}")

    recovery_id = azure_util.create_run_recovery_id(run)
    recovery_file = Path(RUN_RECOVERY_FILE)
    if recovery_file.exists():
        recovery_file.unlink()
    recovery_file.write_text(recovery_id)

    print("Experiment URL: {}".format(exp.get_portal_url()))
    print("Run URL: {}".format(run.get_portal_url()))
    print("If this run fails, re-start runner.py and supply these additional arguments: "
          f"--run_recovery_id={recovery_id}")
    print(f"The run recovery ID has been written to this file: {recovery_file}")
    print("==============================================================================")
    if azure_config.tensorboard and azure_config.azureml:
        print("Starting TensorBoard now because you specified --tensorboard")
        monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]), azure_config=azure_config)
    else:
        print(f"To monitor this run locally using TensorBoard, run the script: "
              f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
        print("==============================================================================")
    return run
Example 2
def submit_azureml_run(args: JobArguments):
    """Submit GLUE experiment to azureml."""
    ws = Workspace.from_config()
    print("ws: ", ws)

    # get root of git repo
    prefix = Path(__file__).parent
    source_directory = str(prefix.joinpath("src"))

    target = ws.compute_targets[args.target_name]

    env = get_azureml_environment()

    distributed_job_config = get_distributed_job_config(args)

    cmd = f"""ds_report && python finetune_glue.py
    --output_dir outputs
    --model_checkpoint {args.model_checkpoint}
    --task {args.task}
    --num_train_epochs {args.num_train_epochs}
    --per_device_train_batch_size {args.per_device_train_batch_size}
    --per_device_eval_batch_size {args.per_device_eval_batch_size}
    --disable_tqdm 1
    --local_rank $OMPI_COMM_WORLD_LOCAL_RANK
    --deepspeed ds_config.json
    """.split()

    config = ScriptRunConfig(
        source_directory=source_directory,
        command=cmd,
        environment=env,
        compute_target=target,
        distributed_job_config=distributed_job_config,
    )

    run = Experiment(ws, "deepspeed-transformers-example").submit(config)
    print(run.get_portal_url())  # link to ml.azure.com

    run.set_tags(asdict(args))
Example 3
        hyperdrive.loguniform(
            convert_base(1e-6),
            convert_base(5e-2)),  # NB. loguniform on [exp(min), exp(max)]
        "--weight_decay":
        hyperdrive.uniform(5e-3, 15e-2),
        "--per_device_train_batch_size":
        hyperdrive.choice([16, 32]),
    }

    hyperparameter_sampling = RandomParameterSampling(search_space)

    policy = TruncationSelectionPolicy(truncation_percentage=50,
                                       evaluation_interval=2,
                                       delay_evaluation=0)

    hyperdrive_config = HyperDriveConfig(
        run_config=config,
        hyperparameter_sampling=hyperparameter_sampling,
        policy=policy,
        primary_metric_name="eval_matthews_correlation",
        primary_metric_goal=hyperdrive.PrimaryMetricGoal.MAXIMIZE,
        max_total_runs=20,
        max_concurrent_runs=8,
    )

    run = Experiment(
        ws,
        "transformers-glue-finetuning-hyperdrive").submit(hyperdrive_config)
    print(run.get_portal_url())
    run.wait_for_completion(show_output=True)
Example 4
    )

    #### SET PROPER INTERPRETER
    estimator._estimator_config.environment.python.interpreter_path = '/opt/conda/envs/rapids/bin/python'
    print_message("STARTING EXPERIMENT")
    
    experiment = Experiment(
        workspace
        , args.experiment_name
    ).submit(estimator)
    
    print()
    print_message("WAITING FOR THE HEADNODE")
    print_message("NOTE: THIS MAY TAKE SEVERAL MINUTES", filler='!')
    print_message(f"TRACK PROGRESS HERE --->>> ", filler='%')
    print_message(experiment.get_portal_url(), filler='%')

    print()
    print_message("SPINNING UP THE DASK CLUSTER")
    
    rep = 0
    done = False
    prev_status = ""
    spinning_thread = threading.Thread(target=spinner)
    spinning_thread.start()
    start_time = time.time()
    timeout_sec = args.timeout_minutes * 60

    while not "headnode" in experiment.get_metrics():
        rep += 1
        time.sleep(5)
Example 5
                              "--datastore": workspace.get_default_datastore(),
                              "--n_gpus_per_node": str(n_gpus_per_node),
                              "--jupyter_token": str(args.jupyter_token)
                          },
                          distributed_training=Mpi(process_count_per_node=1),
                          node_count=int(args.node_count),
                          use_gpu=True,
                          conda_dependencies_file='rapids-0.10.yml')

    print("Starting experiment run ...")

    experiment = Experiment(workspace, args.experiment_name).submit(estimator)

    print(" ... waiting for headnode ...")
    print(" ... this may take several minutes ...")
    print("(For updated results, see: ", experiment.get_portal_url(), ")")
    rep = 0
    done = False
    prev_status = ""
    spinning_thread = threading.Thread(target=spinner)
    spinning_thread.start()
    start_time = time.time()
    timeout_sec = args.timeout_minutes * 60
    while not "headnode" in experiment.get_metrics():
        rep += 1
        time.sleep(5)
        status = experiment.get_status()
        if status != prev_status:
            print("Status now: ", status)
            prev_status = status
Example 6
def create_and_submit_experiment(azure_config: AzureConfig,
                                 script_run_config: ScriptRunConfig,
                                 commandline_args: str) -> Run:
    """
    Creates an AzureML experiment in the workspace and submits it for execution.
    :param azure_config: Azure-related configurations to set up a valid workspace.
    :param script_run_config: The configuration for the script that should be run inside of AzureML.
    :param commandline_args: A string with all commandline arguments that were provided to the runner. These are only
    used to set a tag on the submitted AzureML run.
    :returns: Run object for the submitted AzureML run
    """
    workspace = azure_config.get_workspace()
    experiment_name = create_experiment_name(azure_config)
    exp = Experiment(workspace=workspace,
                     name=azure_util.to_azure_friendly_string(experiment_name))

    # submit a training/testing run associated with the experiment
    run: Run = exp.submit(script_run_config)

    if is_offline_run_context(run):
        # This codepath will only be executed in unit tests, when exp.submit is mocked.
        return run

    # Set metadata for the run.
    set_run_tags(run, azure_config, commandline_args=commandline_args)

    print(
        "\n=============================================================================="
    )
    print(f"Successfully queued new run {run.id} in experiment: {exp.name}")

    if azure_config.run_recovery_id:
        print(f"\nRecovered from: {azure_config.run_recovery_id}")

    recovery_id = azure_util.create_run_recovery_id(run)
    recovery_file = Path(RUN_RECOVERY_FILE)
    if recovery_file.exists():
        recovery_file.unlink()
    recovery_file.write_text(recovery_id)

    print("Experiment URL: {}".format(exp.get_portal_url()))
    print("Run URL: {}".format(run.get_portal_url()))
    print(
        "If this run fails, re-start runner.py and supply these additional arguments: "
        f"--run_recovery_id={recovery_id}")
    print(
        f"The run recovery ID has been written to this file: {recovery_file}")
    print(
        "=============================================================================="
    )
    if azure_config.tensorboard and azure_config.azureml:
        print("Starting TensorBoard now because you specified --tensorboard")
        monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]),
                azure_config=azure_config)
    else:
        print(
            f"To monitor this run locally using TensorBoard, run the script: "
            f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
        print(
            "=============================================================================="
        )
    return run
Example 7
# start the MLflow experiment
with mlflow.start_run():
    
    print("Starting experiment:", experiment.name)
    
    # Load data
    data = pd.read_csv('data/diabetes.csv')

    # Count the rows and log the result
    row_count = len(data)
    print('observations:', row_count)
    mlflow.log_metric('observations', row_count)
    
# Get a link to the experiment in Azure ML studio        
experiment_url = experiment.get_portal_url()
print('See details at', experiment_url)



import os, shutil

# Create a folder for the experiment files
folder_name = 'mlflow-experiment-files'
experiment_folder = './' + folder_name
os.makedirs(folder_name, exist_ok=True)

# Copy the data file into the experiment folder
shutil.copy('data/diabetes.csv', os.path.join(folder_name, "diabetes.csv"))

Example 8
# MAGIC We can:
# MAGIC 1. Use the Azure Portal to compare runs
# MAGIC 1. Use Python to compare runs

# COMMAND ----------

# MAGIC %md
# MAGIC #### 1. Azure Portal
# MAGIC The `Experiment` object has a `get_portal_url()` method that returns the experiment's URL in the Azure portal.
# MAGIC 
# MAGIC We can use Databricks' `displayHTML` function to render a hyperlink.

# COMMAND ----------

# To find the best performing model, we have several options - we can retrieve the metrics from within Python, or we can review them in the Azure portal
displayHTML('<a href="{url}" target="_blank">{url}</a>'.format(url=experiment.get_portal_url()))

# COMMAND ----------

# MAGIC %md
# MAGIC #### 2. Python to Compare Runs
# MAGIC 
# MAGIC Each `Run` object has a `get_metrics()` method that will retrieve our stored metrics. We can leverage the `get_runs()` method of the `Experiment` object to retrieve the run objects.
# MAGIC 
# MAGIC We will then render a table to compare model performance.

# COMMAND ----------

# Download RMSE and R2 from AML Service
import pandas as pd
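
# What follows is a minimal sketch of the comparison table described in the markdown
# cell above, assuming each run logged metrics named "rmse" and "r2" (hypothetical
# metric names; substitute whatever keys your training script actually logs).
rows = []
for run in experiment.get_runs():
    metrics = run.get_metrics()
    rows.append({
        "run_id": run.id,
        "rmse": metrics.get("rmse"),
        "r2": metrics.get("r2"),
    })

# Collect the per-run metrics into a DataFrame for side-by-side comparison.
runs_df = pd.DataFrame(rows).set_index("run_id")
display(runs_df)  # Databricks' display() renders a table; use print(runs_df) elsewhere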