"compute_target_to_use_for_training"].strip().lower() compute_target_name = settings["compute_target"]["training"][ compute_target_to_use]["name"] workspace_config_settings = settings["workspace"]["config"] # Get workspace print("Loading Workspace") cli_auth = AzureCliAuthentication() ws = Workspace.from_config(path=workspace_config_settings["path"], auth=cli_auth, _file_name=workspace_config_settings["file_name"]) print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n') # Attach Experiment print("Loading Experiment") exp = Experiment(workspace=ws, name=experiment_settings["name"]) print(exp.name, exp.workspace.name, sep="\n") # Load compute target print("Loading Compute Target") compute_target = ComputeTarget(workspace=ws, name=compute_target_name) # Create image registry configuration if experiment_settings["docker"]["custom_image"]: container_registry = ContainerRegistry() container_registry.address = experiment_settings["docker"][ "custom_image_registry_details"]["address"] container_registry.username = experiment_settings["docker"][ "custom_image_registry_details"]["username"] container_registry.password = experiment_settings["docker"][ "custom_image_registry_details"]["password"]
automl_config = AutoMLConfig(
    task='regression',
    debug_log='automl_errors.log',
    primary_metric='r2_score',
    iteration_timeout_minutes=10,
    iterations=4,
    max_concurrent_iterations=4,  # change it based on number of worker nodes
    verbosity=logging.INFO,
    spark_context=sc,
    enable_cache=True,
    path=project_folder,
    preprocess=True,
    X=train_X,
    y=train_Y,
    X_valid=valid_X,
    y_valid=valid_Y)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Start the AutoML Tasks
# MAGIC Finally, we'll instantiate an Experiment and submit the `automl_config`. This will run for the specified number of iterations.

# COMMAND ----------

# Create AML Experiment - use the name from the ./99-Shared-Functions-and-Settings notebook
experiment = Experiment(ws, automl_experiment_name)

# Submit AutoML Run
run = experiment.submit(automl_config)

run.wait_for_completion(show_output=True)
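# COMMAND ----------

# A minimal follow-up sketch (not part of the original notebook): once the
# AutoML run finishes, the best child run and its fitted model can be
# retrieved with AutoMLRun.get_output(). The test data variable is an
# assumption for illustration.

best_run, fitted_model = run.get_output()
print(best_run.id)
print(best_run.get_metrics().get('r2_score'))

# The fitted model is a scikit-learn-style pipeline and can score new data:
# predictions = fitted_model.predict(test_X)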
# COMMAND ----------

# MAGIC %md
# MAGIC #### Create Azure Machine Learning Experiment
# MAGIC
# MAGIC The Experiment object will contain 1-to-many 'Runs'. Each of these Runs can capture and track logged data, images, results, and/or trained models.
# MAGIC
# MAGIC A Run object can come from a submitted `*.py` script or from an interactive notebook session.
# MAGIC
# MAGIC We'll also create an evaluator that will calculate performance metrics of the models.

# COMMAND ----------

# Use the experiment name from the ./99-Shared-Functions-and-Settings notebook
experiment = Experiment(ws, pyspark_experiment_name)

# Create evaluator object to assess model performance
evaluator = RegressionEvaluator(labelCol='duration_minutes')

# COMMAND ----------

# MAGIC %md
# MAGIC ### Linear Regression Model
# MAGIC The first model that we'll try is a Linear Regression model.

# COMMAND ----------

with experiment.start_logging() as run:
    print("==============================================")
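# COMMAND ----------

# A minimal sketch (not from the original notebook) of how metrics logged
# inside the start_logging() context end up on the Run; the metric names and
# values here are assumptions for illustration.

with experiment.start_logging() as demo_run:
    demo_run.log("model_family", "LinearRegression")
    demo_run.log("rmse", 4.2)           # scalar metrics appear in run history
    demo_run.log_list("folds", [1, 2])  # lists are charted automatically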
def analyse_with_gordo():
    ws = Workspace.from_config()  # Azure ML

    # Get an experiment object from Azure Machine Learning
    experiment_name = "dummy_test"
    experiment = Experiment(workspace=ws, name=experiment_name)  # Azure ML
    mlflow.set_experiment(experiment_name)  # MLFlow

    resamples_for_model = ["1T", "1H"]
    aggregation_methods = ["max", "mean"]
    batch_sizes = [1, 10, 100]
    epochs = [1, 10]
    number_of_permutations = len(
        list(
            itertools.product(
                aggregation_methods, resamples_for_model, batch_sizes, epochs
            )
        )
    )

    resampled_original_data = read_and_resample("2nd_test.hdf", "1S")
    if PLOTTING:
        plotnum = 0
        f, axarr = plt.subplots(number_of_permutations + 1, sharex=True)
        axarr[plotnum].plot(
            resampled_original_data, linewidth=1, label="sensor_data_1S_mean"
        )
        axarr[plotnum].legend(loc="upper left")
        plotnum += 1

    for aggregation_method, interval, batch_size, epoch in itertools.product(
        aggregation_methods, resamples_for_model, batch_sizes, epochs
    ):
        run = experiment.start_logging()
        with mlflow.start_run():
            mlflow.log_param("interval", interval)  # MLFlow
            mlflow.log_param("aggregation_method", aggregation_method)  # MLFlow
            mlflow.log_param("batch_size", batch_size)  # MLFlow
            mlflow.log_param("epochs", epoch)  # MLFlow
            run.log("interval", interval)  # Azure ML
            run.log("aggregation_method", aggregation_method)  # Azure ML
            run.log("batch_size", batch_size)  # Azure ML
            run.log("epochs", epoch)  # Azure ML
            print(
                f"Build model for data resampled with {interval} resolution, "
                f"method {aggregation_method}, batch size {batch_size} and "
                f"number of epochs {epoch}"
            )
            resampled = read_and_resample(
                "2nd_test.hdf", interval, aggregation_method=aggregation_method
            )
            anomalies, avg_train_anomaly, predicted_data, train_until_index = build_model(
                resampled, epoch, batch_size
            )
            r2_train, expl_train, r2_test, expl_test = calc_scores(
                resampled, predicted_data, train_until_index
            )
            run.log("r2_train", r2_train)  # Azure ML
            run.log("explained_variance_train", expl_train)  # Azure ML
            run.log("r2_test", r2_test)  # Azure ML
            run.log("explained_variance_test", expl_test)  # Azure ML
            mlflow.log_metric("r2_train", r2_train)  # MLFlow
            mlflow.log_metric("explained_variance_train", expl_train)  # MLFlow
            mlflow.log_metric("r2_test", r2_test)  # MLFlow
            mlflow.log_metric("explained_variance_test", expl_test)  # MLFlow

            anomalies = anomalies.rolling(
                resamples_for_model[-1]
            ).mean()  # Use the last of the experiment resamples as the anomaly resample
            if PLOTTING:
                axarr[plotnum].plot(
                    anomalies, label=interval + "-" + aggregation_method + "-model"
                )
                axarr[plotnum].axhline(avg_train_anomaly, color="r")
                axarr[plotnum].legend(loc="upper left")
                plotnum += 1
        run.complete()  # Azure ML
    if PLOTTING:
        plt.show()
args = parser.parse_args()
output_folder_path = args.outputs_folder
expname = args.experiment_name

os.makedirs(f'{output_folder_path}/{expname}', exist_ok=True)
submissions_download_folder = f'{output_folder_path}/{expname}/submissions'
tblogs_download_folder = f'{output_folder_path}/{expname}/tb_logs'
os.makedirs(submissions_download_folder, exist_ok=True)
if args.collect_tensorboard_logs:
    os.makedirs(tblogs_download_folder, exist_ok=True)

# check workspace
ws = Workspace.from_config('aml_config/config.json')
print(f'Using Azure ML Workspace {ws.name} in location {ws.location}')

experiment = Experiment(ws, expname)

for run in experiment.get_runs():
    for file in run.get_file_names():
        if file.endswith('submission.csv'):
            print(f'Downloading {file}')
            run.download_file(file, submissions_download_folder)
        if 'tfevents' in file and args.collect_tensorboard_logs:
            _, _, folder, _ = file.split('/')
            folder_path = f'{tblogs_download_folder}/{folder}'
            os.makedirs(folder_path, exist_ok=True)
            print(f'Downloading {file}')
            run.download_file(file, folder_path)
# imports and workspace handle assumed by this snippet (the excerpt starts mid-file)
from pathlib import Path

import git
from azureml.core import Environment, Experiment, ScriptRunConfig, Workspace

ws = Workspace.from_config()

# get root of git repo
prefix = Path(git.Repo(".", search_parent_directories=True).working_tree_dir)

# training script
script_dir = prefix.joinpath("code", "train", "xgboost", "iris")
script_name = "train.py"

# environment file
environment_file = prefix.joinpath("environments", "xgboost.txt")

# azure ml settings
environment_name = "xgboost-iris-example"
experiment_name = "xgboost-iris-example"
compute_target = "cpu-cluster"

# create environment
env = Environment.from_pip_requirements(environment_name, environment_file)

# create job config
src = ScriptRunConfig(
    source_directory=script_dir,
    script=script_name,
    environment=env,
    compute_target=compute_target,
)

# submit job
run = Experiment(ws, experiment_name).submit(src)
print(run)
run.wait_for_completion(show_output=True)
def main():
    logging.warning("Loading environment variables...")
    e = Env()
    e.load_environment_variables(env_file_path='local.env',
                                 fallback_to_os=True)

    # Get Azure machine learning workspace
    logging.warning(
        "Getting reference to existing Azure Machine Learning workspace...")
    auth = InteractiveLoginAuthentication(tenant_id=e.tenant_id)
    ws = get_workspace(e.workspace_name, auth, e.subscription_id,
                       e.resource_group)

    # Get compute target. It has to be a GPU compute, as such a unit is
    # requested by the 'Feature Extraction - Inference' step
    compute_target = get_compute_target(ws,
                                        compute_name=e.gpu_compute_name,
                                        vm_size=e.gpu_vm_size)

    # Create run configuration
    run_config = create_run_configuration(ws)

    # -------
    # Step 1
    # -------

    # Define input 'prepared datasets'
    input_prepared_datasets = []
    experiment_configuration = ExperimentConfigurationWrapper()
    experiment_configuration.load(
        os.path.join(cfg.StepsStructure.SNAPSHOT_ROOT_DIR,
                     cfg.StepsStructure.get_experiments_config_filepath()))  # noqa: E501
    for data_config in experiment_configuration.json['OBJECT_DETECTION'][
            'inference']['data']:
        dataset_name = data_config['input']['dataset_name']
        dataset = ws.datasets.get(dataset_name)
        input_prepared_datasets.extend([dataset.as_named_input(dataset_name)])

    # Create pipeline datastore objects to create links between steps,
    # so they are executed in a sequence, not in parallel
    pipeline_datastore = ws.get_default_datastore()

    object_detection_inference_output = PipelineData(
        name="centers", datastore=pipeline_datastore, is_directory=True)

    step_object_detection_inference = PythonScriptStep(
        name="Object Detection - Inference",
        source_directory=cfg.StepsStructure.SNAPSHOT_ROOT_DIR,
        script_name=cfg.StepsStructure.ObjectDetection.INFERENCE_STEP_SCRIPT_PATH,
        arguments=[
            '--subscription_id', e.subscription_id,
            '--resource_group', e.resource_group,
            '--workspace_name', e.workspace_name,
            '--experiments_config_filepath',
            cfg.StepsStructure.get_experiments_config_filepath(),
            '--model_name', cfg.MLModelNames.OBJECT_DETECTION_MODEL,
            '--model_version',
            cfg.MLModelNames.OBJECT_DETECTION_MODEL_BEST_VERSION,
            '--output_folder', object_detection_inference_output,
            '--should_register_dataset', True
        ],
        inputs=input_prepared_datasets,
        outputs=[object_detection_inference_output],
        compute_target=compute_target,
        runconfig=run_config,
        allow_reuse=True)

    # -------
    # Step 2
    # -------

    # input should contain 'prepared datasets' and centers
    object_extraction_input = object_detection_inference_output.as_input(
        'centers')
    object_extraction_inputs = [object_extraction_input]

    object_extraction_output = PipelineData(name="cropped_objects",
                                            datastore=pipeline_datastore,
                                            is_directory=True)

    step_object_extraction = PythonScriptStep(
        name="Object Extraction",
        source_directory=cfg.StepsStructure.SNAPSHOT_ROOT_DIR,
        script_name=cfg.StepsStructure.ObjectExtraction.STEP_SCRIPT_PATH,
        arguments=[
            "--subscription_id", e.subscription_id,
            "--resource_group", e.resource_group,
            "--workspace_name", e.workspace_name,
            "--experiments_config_filepath",
            cfg.StepsStructure.get_experiments_config_filepath(),
            "--output_folder", object_extraction_output,
            "--should_register_dataset", True,
            # This flag might be handy when we really want to recreate a
            # cropped objects dataset (e.g. changed implementation of the
            # NucleiExtractor, although there are no changes in the input
            # datasets).
            "--force_dataset_recreation", True
        ],
        inputs=object_extraction_inputs,
        outputs=[object_extraction_output],
        compute_target=compute_target,
        runconfig=run_config,
        allow_reuse=True,
    )

    # -------
    # Step 3a
    # -------

    step_object_images_upload = PythonScriptStep(
        name="Cropped Object Images Upload to Blob Storage",
        source_directory=cfg.StepsStructure.SNAPSHOT_ROOT_DIR,
        script_name=cfg.StepsStructure.ObjectImagesUpload.STEP_SCRIPT_PATH,
        arguments=[
            # '--subscription_id', e.subscription_id,
            # '--resource_group', e.resource_group,
            # '--workspace_name', e.workspace_name,
            '--experiments_config_filepath',
            cfg.StepsStructure.get_experiments_config_filepath(),
            # '--model_name', cfg.MLModelNames.FEATURE_EXTRACTION_MODEL,
            # '--model_version', cfg.MLModelNames.FEATURE_EXTRACTION_MODEL_BEST_VERSION,
            # '--output_folder', feature_extraction_inference_output,
            # '--should_register_dataset', True
        ],
        inputs=[object_extraction_output.as_input('cropped_objects')],
        outputs=[],
        compute_target=compute_target,
        runconfig=run_config,
        allow_reuse=True)

    # -------
    # Step 3b
    # -------

    feature_extraction_inference_input = object_extraction_output.as_input(
        'cropped_objects')
    feature_extraction_inference_inputs = [feature_extraction_inference_input]

    feature_extraction_inference_output = PipelineData(
        name="latent_dims", datastore=pipeline_datastore, is_directory=True)

    step_feature_extraction_inference = PythonScriptStep(
        name="Feature Extraction - Inference",
        source_directory=cfg.StepsStructure.SNAPSHOT_ROOT_DIR,
        script_name=cfg.StepsStructure.FeatureExtraction.INFERENCE_STEP_SCRIPT_PATH,
        arguments=[
            '--subscription_id', e.subscription_id,
            '--resource_group', e.resource_group,
            '--workspace_name', e.workspace_name,
            '--experiments_config_filepath',
            cfg.StepsStructure.get_experiments_config_filepath(),
            '--model_name', cfg.MLModelNames.FEATURE_EXTRACTION_MODEL,
            '--model_version',
            cfg.MLModelNames.FEATURE_EXTRACTION_MODEL_BEST_VERSION,
            '--output_folder', feature_extraction_inference_output,
            '--should_register_dataset', True
        ],
        inputs=feature_extraction_inference_inputs,
        outputs=[feature_extraction_inference_output],
        compute_target=compute_target,
        runconfig=run_config,
        allow_reuse=True)

    # -------
    # Pipeline composition
    # -------

    pipeline_steps = [
        step_object_detection_inference, step_object_extraction,
        step_object_images_upload, step_feature_extraction_inference
    ]
    pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

    # Create and submit an experiment
    logging.warning("Submitting experiment...")
    experiment = Experiment(ws, cfg.ExperimentNames.INFERENCE_REMOTE)
    experiment.submit(pipeline,
                      regenerate_outputs=False)  # Allow data reuse for this run
    logging.warning('Experiment submitted!')
parser.add_argument("--learning-rate", type=float, default=0.001) parser.add_argument("--gamma", type=float, default=0.1) parser.add_argument("--momentum", type=float, default=0.9) parser.add_argument("--step-size", type=int, default=7) args = parser.parse_args() workspace = Workspace( subscription_id=args.subscription_id, resource_group=args.resource_group, workspace_name=args.workspace_name, ) compute_target, compute_target_created = get_compute_target( workspace, "lowpriority") dataset = Dataset.get_by_name(workspace=workspace, name=args.dataset_name) data_directory = dataset.as_mount() experiment = Experiment(workspace, name=args.experiment_name) script_params = { "--action": "final_layer", "--epochs": args.epochs, "--learning-rate": args.learning_rate, "--gamma": args.gamma, "--momentum": args.momentum, "--step-size": args.step_size, "--environment": "azure", "--model-dir": "./outputs", "--data-dir": data_directory, } estimator = PyTorch( source_directory="hymenoptera", script_params=script_params, compute_target=compute_target,
def main():
    """
    Run the experiment for training
    """
    interactive_auth = InteractiveLoginAuthentication(
        tenant_id=os.getenv("TENANT_ID"))
    work_space = Workspace.from_config(auth=interactive_auth)

    # Set up the dataset for training
    datastore = work_space.get_default_datastore()
    dataset = Dataset.File.from_files(path=(datastore, "datasets/mnist"))

    # Set up the experiment for training
    experiment = Experiment(workspace=work_space, name="keras-lenet-train")
    # azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 2000000000

    config = ScriptRunConfig(
        source_directory=".",
        script="train_keras.py",
        compute_target="cpu-cluster",
        arguments=[
            "--data_folder",
            dataset.as_named_input("input").as_mount(),
            "--log_folder",
            "./logs",
        ],
    )

    # Set up the TensorFlow/Keras environment
    environment = Environment("keras-environment")
    # environment = Environment.from_conda_specification(
    #     name='keras-environment',
    #     file_path='keras-environment.yml'
    # )
    environment.python.conda_dependencies = CondaDependencies.create(
        python_version="3.7.7",
        pip_packages=["azureml-defaults", "numpy", "tensorflow==2.3.1"])
    config.run_config.environment = environment

    # Run the experiment for training
    run = experiment.submit(config)
    aml_url = run.get_portal_url()
    print(
        "Submitted to an Azure Machine Learning compute cluster. Click on the link below"
    )
    print("")
    print(aml_url)

    tboard = Tensorboard([run])
    # If successful, start() returns a string with the URI of the instance.
    tboard.start(start_browser=True)
    run.wait_for_completion(show_output=True)
    # After your job completes, be sure to stop() the streaming; otherwise it will continue to run.
    print("Press enter to stop")
    input()
    tboard.stop()

    # Register Model
    metrics = run.get_metrics()
    run.register_model(
        model_name="keras_mnist",
        tags={
            "data": "mnist",
            "model": "classification"
        },
        model_path="outputs/keras_lenet.h5",
        model_framework=Model.Framework.TENSORFLOW,
        model_framework_version="2.3.1",
        properties={
            "train_loss": metrics["train_loss"][-1],
            "train_accuracy": metrics["train_accuracy"][-1],
            "val_loss": metrics["val_loss"][-1],
            "val_accuracy": metrics["val_accuracy"][-1],
        },
    )
from azureml.core.conda_dependencies import CondaDependencies

ws = Workspace.from_config()
fra_eng_ds = ws.datasets['fra-eng-translation']
environment = Environment.get(ws, "sentiment-env")

estimator = TensorFlow(
    source_directory="translator",
    entry_script="experiment.py",
    framework_version="2.1",
    environment_definition=environment,
    compute_target="local",
    # script_params={'--data-size': 3000},
    inputs=[fra_eng_ds.as_named_input('in_data')])

experiment = Experiment(workspace=ws, name="translator-fr-en")
run = experiment.submit(config=estimator)
run.wait_for_completion(show_output=True)

run.register_model(model_name='translator-fr-en',
                   model_path='outputs/',
                   description='A translation model from English to French',
                   tags={
                       'source_language': 'eng',
                       'target_language': 'fr'
                   },
                   model_framework=Model.Framework.TENSORFLOW,
                   model_framework_version='2.2.0',
                   properties={'BLEU Score': run.get_metrics()['bleu_score']})
) print("evaluateStep created") evaluateStep.run_after(trainStep) steps = [evaluateStep] pipeline = Pipeline(workspace=ws, steps=steps) print ("Pipeline is built") pipeline.validate() print("Simple validation complete") run = Run.get_context() experiment_name = run.experiment.name pipeline_run = Experiment(ws, experiment_name).submit(pipeline) print("Pipeline is submitted for execution") pipeline_run.wait_for_completion(show_output=True, timeout_seconds=43200) print("Downloading evaluation results...") # access the evaluate_output data = pipeline_run.find_step_run('evaluate')[0].get_output_data('evaluate_output') # download the predictions to local path data.download('.', show_progress=True) import json # load the eval info json with open(os.path.join('./', data.path_on_datastore, 'eval_info.json')) as f: eval_info = json.load(f) print("Printing evaluation results...")
            best_loss = loss
            best_run_id = run
    except Exception as e:
        print("WARNING: Could not get val_loss for run_id", run)
        pass

print("best run", best_run_id, best_loss)

# start an Azure ML run
run = Run.get_context()
run_details = run.get_details()
experiment_name = run_details['runDefinition']['environment']['name'].split(
)[1]

exp = Experiment(ws, name=experiment_name)
best_run = Run(exp, best_run_id)

# register the model
if best_run_id:
    tags = {}
    tags['run_id'] = best_run_id
    tags['val_loss'] = metrics[best_run_id]['val_loss'][-1]
    model = best_run.register_model(model_name=experiment_name,
                                    model_path='outputs',
                                    tags=tags)
else:
    raise Exception(
        "Couldn't find a model to register. Probably because no run completed."
    )
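# A minimal sketch (assumed, not from the original script) of how the
# `metrics` dict and the best-run selection above could be populated:
# iterate the experiment's runs and keep the one with the lowest final
# val_loss. Assumes val_loss was logged repeatedly, so it comes back as a list.

metrics = {}
best_loss, best_run_id = float("inf"), None
for r in exp.get_runs():
    run_metrics = r.get_metrics()
    if "val_loss" in run_metrics:
        metrics[r.id] = run_metrics
        loss = run_metrics["val_loss"][-1]
        if loss < best_loss:
            best_loss, best_run_id = loss, r.id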
    runconfig=aml_run_config,
    allow_reuse=True
)

# Build pipeline
pipeline_steps = [
    dataprep_step,
    train_step
]
pipeline = Pipeline(workspace=workspace, steps=pipeline_steps)

# Run pipeline
run = Experiment(workspace=workspace,
                 name='gensim_lda-pipeline').submit(pipeline)
run.wait_for_completion(show_output=True)

# Get training step
run_train_step = [s for s in run.get_steps() if s.name == 'train.py'][0]
print(run_train_step.get_metrics())
print(run_train_step.get_file_names())

# Register model
model = run_train_step.register_model(model_name='gensim_lda',
                                      model_path='outputs')
print(model.name, model.id, model.version, sep='\t')
# Get batch size and epochs
batch_size = args.batch_size
epochs = args.epochs

# Get the current run.
run = Run.get_context()

# Offline run. Download the sample dataset and run locally. Still push results to Azure.
if run.id.startswith("OfflineRun"):
    print("Running in offline mode...")

    # Access workspace.
    print("Accessing workspace...")
    workspace = Workspace.from_config()
    experiment = Experiment(workspace, "training-junkyard")
    run = experiment.start_logging(outputs=None, snapshot_directory=".")

    # Get dataset.
    print("Accessing dataset...")
    if not os.path.exists("dataset"):
        dataset_name = "anon-depthmap-npy"
        dataset = workspace.datasets[dataset_name]
        dataset.download(target_path='dataset', overwrite=False)
    dataset_path = "dataset"

# Online run. Use dataset provided by training notebook.
else:
    print("Running in online mode...")
    experiment = run.experiment
    workspace = experiment.workspace
# 06-run-pytorch-data.py
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core import Environment
from azureml.core import ScriptRunConfig
from azureml.core import Dataset
import os

if __name__ == "__main__":
    ws = Workspace.from_config()
    datastore = ws.get_default_datastore()
    dataset = Dataset.File.from_files(path=(datastore, 'datasets/locations'))

    experiment = Experiment(workspace=ws, name='mic-999')

    config = ScriptRunConfig(
        source_directory='./src',
        script='model.py',
        compute_target='cpu-cluster',
        arguments=['--data_path',
                   dataset.as_named_input('input').as_mount()],
    )

    # set up pytorch environment
    env = Environment.from_conda_specification(
        name='monografia-env',
        file_path='./.azureml/multiclass-image-classification.yml')
    config.run_config.environment = env

    run = experiment.submit(config)
    aml_url = run.get_portal_url()
def main():
    run = Run.get_context()
    if run.id.startswith('OfflineRun'):
        from dotenv import load_dotenv
        # For local development, set values in this section
        load_dotenv()
        workspace_name = os.environ.get("WORKSPACE_NAME")
        experiment_name = os.environ.get("EXPERIMENT_NAME")
        resource_group = os.environ.get("RESOURCE_GROUP")
        subscription_id = os.environ.get("SUBSCRIPTION_ID")
        build_id = os.environ.get('BUILD_BUILDID')
        # run_id useful to query previous runs
        run_id = "bd184a18-2ac8-4951-8e78-e290bef3b012"
        aml_workspace = Workspace.get(name=workspace_name,
                                      subscription_id=subscription_id,
                                      resource_group=resource_group)
        ws = aml_workspace
        exp = Experiment(ws, experiment_name)
    else:
        ws = run.experiment.workspace
        exp = run.experiment
        run_id = 'amlcompute'

    parser = argparse.ArgumentParser("register")
    parser.add_argument(
        "--build_id",
        type=str,
        help="The Build ID of the build triggering this pipeline run",
    )
    parser.add_argument(
        "--run_id",
        type=str,
        help="Training run ID",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sklearn_regression_model.pkl",
    )
    args = parser.parse_args()

    if args.build_id is not None:
        build_id = args.build_id
    if args.run_id is not None:
        run_id = args.run_id
    if run_id == 'amlcompute':
        run_id = run.parent.id
    model_name = args.model_name

    if build_id is None:
        register_aml_model(model_name, exp, run_id)
    else:
        run.tag("BuildId", value=build_id)
        builduri_base = os.environ.get("BUILDURI_BASE")
        if builduri_base is not None:
            build_uri = builduri_base + build_id
            run.tag("BuildUri", value=build_uri)
            register_aml_model(model_name, exp, run_id, build_id, build_uri)
        else:
            register_aml_model(model_name, exp, run_id, build_id)
# PREPARE LOGGING
logger = logging.getLogger()
logger.setLevel("INFO")
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)

# GET WS, EXP, ENV and COMPUTE TARGET
ws = Workspace.from_config()
experiment = Experiment(ws,
                        "MaxFreezerTemperatureExceededPipeline",
                        _create_in_cloud=True)
compute_target = ComputeTarget(ws, "freezertrain")
run_config = RunConfiguration()
freezer_environment = ws.environments["sktime_freezer_environment"]
run_config.environment = freezer_environment
logger.info("Environment complete")

# PIPELINE PARAMS
output_df_long = PipelineData("output_df_long",
                              datastore=ws.get_default_datastore())
output_df_nested = PipelineData("output_df_nested",
                                datastore=ws.get_default_datastore())
time_series_length_param = PipelineParameter(name="time_series_length",
                                             default_value=10)
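# A minimal sketch (assumed; the step definitions are not part of this
# excerpt) of how a PipelineParameter is typically consumed: pass it in a
# step's arguments, then override the default per submission. The script
# name below is hypothetical.

prep_step = PythonScriptStep(
    name="prepare_long_df",
    script_name="prepare.py",  # hypothetical script name
    arguments=["--time_series_length", time_series_length_param],
    outputs=[output_df_long],
    compute_target=compute_target,
    runconfig=run_config,
)

# At submission time, the default of 10 can be overridden:
# experiment.submit(pipeline, pipeline_parameters={"time_series_length": 25})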
def main():
    run = Run.get_context()
    if run.id.startswith('OfflineRun'):
        from dotenv import load_dotenv
        load_dotenv()
        sources_dir = os.environ.get("SOURCES_DIR_TRAIN")
        if sources_dir is None:
            sources_dir = 'MLOps'
        workspace_name = os.environ.get("WORKSPACE_NAME")
        experiment_name = os.environ.get("EXPERIMENT_NAME")
        resource_group = os.environ.get("RESOURCE_GROUP")
        subscription_id = os.environ.get("SUBSCRIPTION_ID")
        build_id = os.environ.get('BUILD_BUILDID')
        aml_workspace = Workspace.get(name=workspace_name,
                                      subscription_id=subscription_id,
                                      resource_group=resource_group)
        ws = aml_workspace
        exp = Experiment(ws, experiment_name)
    else:
        exp = run.experiment

    e = Env()
    parser = argparse.ArgumentParser("register")
    parser.add_argument(
        "--build_id",
        type=str,
        help="The Build ID of the build triggering this pipeline run",
    )
    parser.add_argument("--output_model_version_file",
                        type=str,
                        default="model_version.txt",
                        help="Name of a file to write model version to")
    args = parser.parse_args()
    if args.build_id is not None:
        build_id = args.build_id
    model_name = e.model_name

    try:
        tag_name = 'BuildId'
        model = get_model(model_name=model_name,
                          tag_name=tag_name,
                          tag_value=build_id,
                          aml_workspace=exp.workspace)
        if model is not None:
            print("Model was registered for this build.")
        if model is None:
            print("Model was not registered for this run.")
            sys.exit(1)
    except Exception as ex:  # renamed from `e` to avoid shadowing the Env() instance
        print(ex)
        print("Model was not registered for this run.")
        sys.exit(1)

    # Save the Model Version for other AzDO jobs after script is complete
    if args.output_model_version_file is not None:
        with open(args.output_model_version_file, "w") as out_file:
            out_file.write(str(model.version))
def main():
    # Loading azure credentials
    print("::debug::Loading azure credentials")
    azure_credentials = os.environ.get("INPUT_AZURE_CREDENTIALS",
                                       default="{}")
    try:
        azure_credentials = json.loads(azure_credentials)
    except JSONDecodeError:
        print(
            "::error::Please paste output of `az ad sp create-for-rbac --name <your-sp-name> --role contributor --scopes /subscriptions/<your-subscriptionId>/resourceGroups/<your-rg> --sdk-auth` as value of secret variable: AZURE_CREDENTIALS"
        )
        raise AMLConfigurationException(
            "Incorrect or poorly formed output from azure credentials saved in AZURE_CREDENTIALS secret. See setup in https://github.com/Azure/aml-workspace/blob/master/README.md"
        )

    # Checking provided parameters
    print("::debug::Checking provided parameters")
    validate_json(data=azure_credentials,
                  schema=azure_credentials_schema,
                  input_name="AZURE_CREDENTIALS")

    # Mask values
    print("::debug::Masking parameters")
    mask_parameter(parameter=azure_credentials.get("tenantId", ""))
    mask_parameter(parameter=azure_credentials.get("clientId", ""))
    mask_parameter(parameter=azure_credentials.get("clientSecret", ""))
    mask_parameter(parameter=azure_credentials.get("subscriptionId", ""))

    # Loading parameters file
    print("::debug::Loading parameters file")
    parameters_file = os.environ.get("INPUT_PARAMETERS_FILE",
                                     default="run.json")
    parameters_file_path = os.path.join(".cloud", ".azure", parameters_file)
    try:
        with open(parameters_file_path) as f:
            parameters = json.load(f)
    except FileNotFoundError:
        print(
            f"::debug::Could not find parameter file in {parameters_file_path}. Please provide a parameter file in your repository if you do not want to use default settings (e.g. .cloud/.azure/run.json)."
        )
        parameters = {}

    # Checking provided parameters
    print("::debug::Checking provided parameters")
    validate_json(data=parameters,
                  schema=parameters_schema,
                  input_name="PARAMETERS_FILE")

    # Define target cloud
    if azure_credentials.get(
            "resourceManagerEndpointUrl",
            "").startswith("https://management.usgovcloudapi.net"):
        cloud = "AzureUSGovernment"
    elif azure_credentials.get(
            "resourceManagerEndpointUrl",
            "").startswith("https://management.chinacloudapi.cn"):
        cloud = "AzureChinaCloud"
    else:
        cloud = "AzureCloud"

    # Loading Workspace
    print("::debug::Loading AML Workspace")
    sp_auth = ServicePrincipalAuthentication(
        tenant_id=azure_credentials.get("tenantId", ""),
        service_principal_id=azure_credentials.get("clientId", ""),
        service_principal_password=azure_credentials.get("clientSecret", ""),
        cloud=cloud)
    config_file_path = os.environ.get("GITHUB_WORKSPACE",
                                      default=".cloud/.azure")
    config_file_name = "aml_arm_config.json"
    try:
        ws = Workspace.from_config(path=config_file_path,
                                   _file_name=config_file_name,
                                   auth=sp_auth)
    except AuthenticationException as exception:
        print(
            f"::error::Could not retrieve user token. Please paste output of `az ad sp create-for-rbac --name <your-sp-name> --role contributor --scopes /subscriptions/<your-subscriptionId>/resourceGroups/<your-rg> --sdk-auth` as value of secret variable: AZURE_CREDENTIALS: {exception}"
        )
        raise AuthenticationException
    except AuthenticationError as exception:
        print(f"::error::Microsoft REST Authentication Error: {exception}")
        raise AuthenticationError
    except AdalError as exception:
        print(
            f"::error::Active Directory Authentication Library Error: {exception}"
        )
        raise AdalError
    except ProjectSystemException as exception:
        print(f"::error::Workspace authorization failed: {exception}")
        raise ProjectSystemException

    # Create experiment
    print("::debug::Creating experiment")
    try:
        # Default experiment name
        repository_name = os.environ.get("GITHUB_REPOSITORY").split("/")[-1]
        branch_name = os.environ.get("GITHUB_REF").split("/")[-1]
        default_experiment_name = f"{repository_name}-{branch_name}"
        experiment = Experiment(
            workspace=ws,
            name=parameters.get("experiment_name",
                                default_experiment_name)[:36])
    except TypeError as exception:
        experiment_name = parameters.get("experiment", None)
        print(
            f"::error::Could not create an experiment with the specified name {experiment_name}: {exception}"
        )
        raise AMLExperimentConfigurationException(
            f"Could not create an experiment with the specified name {experiment_name}: {exception}"
        )
    except UserErrorException as exception:
        experiment_name = parameters.get("experiment", None)
        print(
            f"::error::Could not create an experiment with the specified name {experiment_name}: {exception}"
        )
        raise AMLExperimentConfigurationException(
            f"Could not create an experiment with the specified name {experiment_name}: {exception}"
        )

    # Loading run config
    print("::debug::Loading run config")
    run_config = None
    if run_config is None:
        # Loading run config from runconfig yaml file
        print("::debug::Loading run config from runconfig yaml file")
        run_config = load_runconfig_yaml(runconfig_yaml_file=parameters.get(
            "runconfig_yaml_file", "code/train/run_config.yml"))
    if run_config is None:
        # Loading run config from pipeline yaml file
        print("::debug::Loading run config from pipeline yaml file")
        run_config = load_pipeline_yaml(workspace=ws,
                                        pipeline_yaml_file=parameters.get(
                                            "pipeline_yaml_file",
                                            "code/train/pipeline.yml"))
    if run_config is None:
        # Loading run config from python runconfig file
        print("::debug::Loading run config from python runconfig file")
        run_config = load_runconfig_python(
            workspace=ws,
            runconfig_python_file=parameters.get("runconfig_python_file",
                                                 "code/train/run_config.py"),
            runconfig_python_function_name=parameters.get(
                "runconfig_python_function_name", "main"))
    if run_config is None:
        # Loading values for errors
        pipeline_yaml_file = parameters.get("pipeline_yaml_file",
                                            "code/train/pipeline.yml")
        runconfig_yaml_file = parameters.get("runconfig_yaml_file",
                                             "code/train/run_config.yml")
        runconfig_python_file = parameters.get("runconfig_python_file",
                                               "code/train/run_config.py")
        runconfig_python_function_name = parameters.get(
            "runconfig_python_function_name", "main")

        print(
            f"::error::Error when loading runconfig yaml definition in your repository (Path: /{runconfig_yaml_file})."
        )
        print(
            f"::error::Error when loading pipeline yaml definition in your repository (Path: /{pipeline_yaml_file})."
        )
        print(
            f"::error::Error when loading python script or function in your repository which defines the experiment config (Script path: '/{runconfig_python_file}', Function: '{runconfig_python_function_name}()')."
        )
        print(
            "::error::You have to provide either a yaml definition for your run, a yaml definition of your pipeline or a python script, which returns a runconfig (Pipeline, ScriptRunConfig, AutoMlConfig, Estimator, etc.). Please read the documentation for more details."
        )
        raise AMLExperimentConfigurationException(
            "You have to provide a yaml definition for your run, a yaml definition of your pipeline or a python script, which returns a runconfig. Please read the documentation for more details."
        )

    # Submit run config
    print("::debug::Submitting experiment config")
    try:
        # Defining default tags
        print("::debug::Defining default tags")
        default_tags = {
            "GITHUB_ACTOR": os.environ.get("GITHUB_ACTOR"),
            "GITHUB_REPOSITORY": os.environ.get("GITHUB_REPOSITORY"),
            "GITHUB_SHA": os.environ.get("GITHUB_SHA"),
            "GITHUB_REF": os.environ.get("GITHUB_REF")
        }
        run = experiment.submit(config=run_config,
                                tags=dict(parameters.get("tags", {}),
                                          **default_tags))
    except AzureMLException as exception:
        print(
            f"::error::Could not submit experiment config. Your script passed an object of type {type(run_config)}. The object must be correctly configured and of a type such as estimator, pipeline, etc.: {exception}"
        )
        raise AMLExperimentConfigurationException(
            f"Could not submit experiment config. Your script passed an object of type {type(run_config)}. The object must be correctly configured and of a type such as estimator, pipeline, etc.: {exception}"
        )
    except TypeError as exception:
        print(
            f"::error::Could not submit experiment config. Your script passed an object of type {type(run_config)}. The object must be correctly configured and of a type such as estimator, pipeline, etc.: {exception}"
        )
        raise AMLExperimentConfigurationException(
            f"Could not submit experiment config. Your script passed an object of type {type(run_config)}. The object must be correctly configured and of a type such as estimator, pipeline, etc.: {exception}"
        )

    # Create outputs
    print("::debug::Creating outputs")
    print(f"::set-output name=experiment_name::{run.experiment.name}")
    print(f"::set-output name=run_id::{run.id}")
    print(f"::set-output name=run_url::{run.get_portal_url()}")

    # Waiting for run to complete
    print("::debug::Waiting for run to complete")
    if parameters.get("wait_for_completion", True):
        run.wait_for_completion(show_output=True)

        # Creating additional outputs of finished run
        run_metrics = run.get_metrics(recursive=True)
        print(f"::set-output name=run_metrics::{run_metrics}")
        run_metrics_markdown = convert_to_markdown(run_metrics)
        print(
            f"::set-output name=run_metrics_markdown::{run_metrics_markdown}")

        # Download artifacts if enabled
        if parameters.get("download_artifacts", False):
            # Defining artifacts folder
            print("::debug::Defining artifacts folder")
            root_path = os.environ.get("GITHUB_WORKSPACE", default=None)
            folder_name = f"aml_artifacts_{run.id}"
            artifact_path = os.path.join(root_path, folder_name)

            # Downloading artifacts
            print("::debug::Downloading artifacts")
            run.download_files(
                output_directory=os.path.join(artifact_path, "parent"))
            children = run.get_children(recursive=True)
            for i, child in enumerate(children):
                child.download_files(output_directory=os.path.join(
                    artifact_path, f"child_{i}"))

            # Creating additional outputs
            print(f"::set-output name=artifact_path::{artifact_path}")

    # Publishing pipeline
    print("::debug::Publishing pipeline")
    if type(run) is PipelineRun and parameters.get("publish_pipeline", False):
        # Default pipeline name
        repository_name = os.environ.get("GITHUB_REPOSITORY").split("/")[-1]
        branch_name = os.environ.get("GITHUB_REF").split("/")[-1]
        default_pipeline_name = f"{repository_name}-{branch_name}"

        published_pipeline = run.publish_pipeline(
            name=parameters.get("pipeline_name", default_pipeline_name),
            description="Pipeline registered by GitHub Run Action",
            version=parameters.get("pipeline_version", None),
            continue_on_step_failure=parameters.get(
                "pipeline_continue_on_step_failure", False))

        # Creating additional outputs
        print(
            f"::set-output name=published_pipeline_id::{published_pipeline.id}"
        )
        print(
            f"::set-output name=published_pipeline_status::{published_pipeline.status}"
        )
        print(
            f"::set-output name=published_pipeline_endpoint::{published_pipeline.endpoint}"
        )
    elif parameters.get("publish_pipeline", False):
        print(
            "::error::Could not register pipeline because you did not pass a pipeline to the action"
        )

    print("::debug::Successfully finished Azure Machine Learning Train Action")
run_details["experiment_name"] = os.environ.get("EXPERIMENT_NAME", default=None) # Get workspace print("Loading Workspace") cli_auth = AzureCliAuthentication() config_file_path = os.environ.get("GITHUB_WORKSPACE", default="aml_service") config_file_name = "aml_arm_config.json" ws = Workspace.from_config(path=config_file_path, auth=cli_auth, _file_name=config_file_name) print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n') # Loading Run print("Loading Run") experiment = Experiment(workspace=ws, name=run_details["experiment_name"]) run = Run(experiment=experiment, run_id=run_details["run_id"]) # Only register model, if it performs better than the production model print("Register model only if it performs better.") try: # Loading run of production model print("Loading Run of Production Model to evaluate new model") production_model = Model(workspace=ws, name=deployment_settings["model"]["name"]) production_model_run_id = production_model.tags.get(["run_id"]) production_model_run = Run(experiment=experiment, run_id=production_model_run_id) # Comparing models print("Comparing Metrics of production and newly trained model")
# conda activate py36
import sys

from azureml.core import VERSION
print("python version: ", sys.version)
print("azureml version: ", VERSION)

# enable logging
# https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-enable-logging
from azureml.core import Workspace, Experiment, Run

# Workspace.create() needs at least a name, subscription and resource group;
# for a quick logging test, loading an existing workspace config is simpler.
ws = Workspace.from_config()

exp = Experiment(workspace=ws, name='test_experiment')
run = exp.start_logging()
run.log("test-val", 10)
def get_automl_environment(workspace: Workspace,
                           training_pipeline_run_id: str,
                           training_experiment_name: str):
    from azureml.core import Experiment, Run
    experiment = Experiment(workspace, training_experiment_name)
    run = Run(experiment, training_pipeline_run_id)
    step_run = list(run.get_children())[0]
    return step_run.get_environment()
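# A minimal usage sketch (assumed; the run id and experiment name are
# placeholders): reuse the environment that an AutoML training pipeline
# resolved, e.g. for a downstream inference pipeline.

from azureml.core import Workspace

ws = Workspace.from_config()
env = get_automl_environment(
    workspace=ws,
    training_pipeline_run_id="<training-pipeline-run-id>",
    training_experiment_name="<training-experiment-name>",
)
print(env.name)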
def main():
    run = Run.get_context()
    if run.id.startswith('OfflineRun'):
        from dotenv import load_dotenv
        # For local development, set values in this section
        load_dotenv()
        workspace_name = os.environ.get("WORKSPACE_NAME")
        experiment_name = os.environ.get("EXPERIMENT_NAME")
        resource_group = os.environ.get("RESOURCE_GROUP")
        subscription_id = os.environ.get("SUBSCRIPTION_ID")
        # run_id useful to query previous runs
        run_id = "bd184a18-2ac8-4951-8e78-e290bef3b012"
        aml_workspace = Workspace.get(name=workspace_name,
                                      subscription_id=subscription_id,
                                      resource_group=resource_group)
        ws = aml_workspace
        exp = Experiment(ws, experiment_name)
    else:
        ws = run.experiment.workspace
        exp = run.experiment
        run_id = 'amlcompute'

    parser = argparse.ArgumentParser("register")
    parser.add_argument(
        "--run_id",
        type=str,
        help="Training run ID",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="CMPE258United_model.pkl",
    )
    parser.add_argument("--step_input",
                        type=str,
                        help="input from previous steps")
    args = parser.parse_args()

    if args.run_id is not None:
        run_id = args.run_id
    if run_id == 'amlcompute':
        run_id = run.parent.id
    model_name = args.model_name
    model_path = args.step_input

    print("Getting registration parameters")
    # Load the registration parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        register_args = pars["registration"]
    except KeyError:
        print("Could not load registration values from file")
        register_args = {"tags": []}

    model_tags = {}
    for tag in register_args["tags"]:
        try:
            mtag = run.parent.get_metrics()[tag]
            model_tags[tag] = mtag
        except KeyError:
            print(f"Could not find {tag} metric on parent run.")

    # load the model
    print("Loading model from " + model_path)
    model_file = os.path.join(model_path, model_name)
    model = joblib.load(model_file)

    parent_tags = run.parent.get_tags()
    try:
        build_id = parent_tags["BuildId"]
    except KeyError:
        build_id = None
        print("BuildId tag not found on parent run.")
        print(f"Tags present: {parent_tags}")
    try:
        build_uri = parent_tags["BuildUri"]
    except KeyError:
        build_uri = None
        print("BuildUri tag not found on parent run.")
        print(f"Tags present: {parent_tags}")

    if model is not None:
        dataset_id = parent_tags["dataset_id"]
        if build_id is None:
            register_aml_model(model_file, model_name, model_tags, exp,
                               run_id, dataset_id)
        elif build_uri is None:
            register_aml_model(model_file, model_name, model_tags, exp,
                               run_id, dataset_id, build_id)
        else:
            register_aml_model(model_file, model_name, model_tags, exp,
                               run_id, dataset_id, build_id, build_uri)
    else:
        print("Model not found. Skipping model registration.")
        sys.exit(0)
filepath = "environments/data_validation_subset/RunConfig/runconfig_data_validation.yml" input_name_train = 'newsgroups_raw_subset_train' input_name_test = 'newsgroups_raw_subset_test' dataset_train = Dataset.get_by_name(workspace, name=input_name_train) dataset_test = Dataset.get_by_name(workspace, name=input_name_test) # Load run Config file for data prep run_config = RunConfiguration.load(path=os.path.join( os.path.join( os.path.dirname(os.path.realpath(__file__)), "../..", filepath, )), name="datavalidation") est = ScriptRunConfig( source_directory=os.path.dirname(os.path.realpath(__file__)), run_config=run_config, arguments=[ '--data_folder_train', dataset_train.as_named_input('train').as_mount(), '--data_folder_test', dataset_test.as_named_input('test').as_mount(), '--local', 'no' ], ) # Define the ML experiment experiment = Experiment(workspace, "data-validation") # Submit experiment run, if compute is idle, this may take some time') run = experiment.submit(est)
cli_auth = AzureCliAuthentication()

# Get workspace
ws = Workspace.from_config(auth=cli_auth)

# Parameterize the metrics on which the models should be compared
# Add golden data set on which all the model performance can be evaluated

# Get the latest run_id
with open("aml_config/run_id.json") as f:
    config = json.load(f)

new_model_run_id = config["run_id"]
experiment_name = config["experiment_name"]
exp = Experiment(workspace=ws, name=experiment_name)

try:
    # Get the most recently registered model; we assume that is the model in
    # production. Download this model and compare it with the recently
    # trained model by running tests with the same data set.
    model_list = Model.list(ws)
    production_model = next(
        filter(
            lambda x: x.created_time == max(model.created_time
                                            for model in model_list),
            model_list,
        ))
    production_model_run_id = production_model.tags.get("run_id")
    run_list = exp.get_runs()
    # production_model_run = next(filter(lambda x: x.id == production_model_run_id, run_list))

    # Get the run history for both production model and newly trained model and compare mse
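    # A minimal sketch (assumed; the metric name 'mse' and the follow-up
    # registration step are placeholders) of the comparison the comment
    # above describes:
    production_model_run = Run(exp, run_id=production_model_run_id)
    new_model_run = Run(exp, run_id=new_model_run_id)

    production_mse = production_model_run.get_metrics().get("mse")
    new_mse = new_model_run.get_metrics().get("mse")
    if new_mse is not None and (production_mse is None
                                or new_mse < production_mse):
        print("New model performs better; proceed with registration.")
    else:
        print("New model does not perform better; keep production model.")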
# Step 3: Train Model
train_step, train_outputs = train_step(data_preprocess_outputs['train_dir'],
                                       cpu_compute_target)

# Step 4: Evaluate Model
evaluate_step, evaluate_outputs = evaluate_step(
    train_outputs['model_dir'],
    data_preprocess_outputs['test_dir'],
    cpu_compute_target)

# Step 5: Deploy Model
deploy_step, deploy_outputs = deploy_step(train_outputs['model_dir'],
                                          evaluate_outputs['accuracy_file'],
                                          data_preprocess_outputs['test_dir'],
                                          cpu_compute_target)

# Submit pipeline
print('Submitting pipeline ...')
pipeline_parameters = {'max_depth': 5, 'n_estimators': 500}
pipeline = Pipeline(workspace=ws,
                    steps=[
                        data_ingestion_step, data_preprocess_step, train_step,
                        evaluate_step, deploy_step
                    ])
pipeline_run = Experiment(ws, 'turbofan-pipeline').submit(
    pipeline, pipeline_parameters=pipeline_parameters)
def main():
    e = Env()

    print('********************')
    print(e.source_directory)
    files = os.listdir('./aml_pipeline')
    for f in files:
        print(f)
    print('***************')

    workspace_name = e.workspace_name
    subscription_id = e.subscription_id
    resource_group = e.resource_group

    # Connect to AML Workspace
    print('workspace_name = ' + workspace_name)
    print('subscription_id = ' + subscription_id)
    print('resource_group = ' + resource_group)
    ws = Workspace.get(
        name=workspace_name,
        subscription_id=subscription_id,
        resource_group=resource_group,
    )

    print('Ready to use Azure ML {} to work with {}'.format(
        azureml.core.VERSION, ws.name))

    default_ds = ws.get_default_datastore()

    if 'diabetes dataset' not in ws.datasets:
        default_ds.upload_files(
            files=['diabetes.csv', 'diabetes2.csv'],  # Upload the diabetes csv files in /data
            target_path='diabetes-data/',  # Put it in a folder path in the datastore
            overwrite=True,  # Replace existing files of the same name
            show_progress=True)

        # Create a tabular dataset from the path on the datastore
        # (this may take a short while)
        tab_data_set = Dataset.Tabular.from_delimited_files(
            path=(default_ds, 'diabetes-data/*.csv'))

        # Register the tabular dataset
        try:
            tab_data_set = tab_data_set.register(workspace=ws,
                                                 name='diabetes dataset',
                                                 description='diabetes data',
                                                 tags={'format': 'CSV'},
                                                 create_new_version=True)
            print('Dataset registered.')
        except Exception as ex:
            print(ex)
    else:
        print('Dataset already registered.')

    # Create a folder for the pipeline step files
    experiment_folder = 'diabetes_pipeline'
    os.makedirs(experiment_folder, exist_ok=True)
    print(experiment_folder)

    cluster_name = "mmcomputecluster"

    try:
        # Check for existing compute target
        pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing cluster, use it.')
    except ComputeTargetException:
        # If it doesn't already exist, create it
        try:
            compute_config = AmlCompute.provisioning_configuration(
                vm_size='STANDARD_DS11_V2', max_nodes=2)
            pipeline_cluster = ComputeTarget.create(ws, cluster_name,
                                                    compute_config)
            pipeline_cluster.wait_for_completion(show_output=True)
        except Exception as ex:
            print(ex)

    # Create a Python environment for the experiment
    diabetes_env = Environment("diabetes-pipeline-env")
    diabetes_env.python.user_managed_dependencies = False  # Let Azure ML manage dependencies
    diabetes_env.docker.enabled = True  # Use a docker container

    # Create a set of package dependencies
    diabetes_packages = CondaDependencies.create(
        conda_packages=[
            'scikit-learn', 'ipykernel', 'matplotlib', 'pandas', 'pip'
        ],
        pip_packages=[
            'azureml-defaults', 'azureml-dataprep[pandas]', 'pyarrow'
        ])

    # Add the dependencies to the environment
    diabetes_env.python.conda_dependencies = diabetes_packages

    # Register the environment
    diabetes_env.register(workspace=ws)
    registered_env = Environment.get(ws, 'diabetes-pipeline-env')

    # Create a new runconfig object for the pipeline
    pipeline_run_config = RunConfiguration()

    # Use the compute you created above
    pipeline_run_config.target = pipeline_cluster

    # Assign the environment to the run configuration
    pipeline_run_config.environment = registered_env
    print("Run configuration created.")

    # Get the training dataset
    diabetes_ds = ws.datasets.get("diabetes dataset")

    # Create a PipelineData (temporary Data Reference) for the model folder
    prepped_data_folder = PipelineData("prepped_data_folder",
                                       datastore=ws.get_default_datastore())

    # Step 1, Run the data prep script
    prep_step = PythonScriptStep(name="Prepare Data",
                                 script_name="prep_diabetes.py",
                                 source_directory='./aml_pipeline',
                                 arguments=[
                                     '--input-data',
                                     diabetes_ds.as_named_input('raw_data'),
                                     '--prepped-data', prepped_data_folder
                                 ],
                                 outputs=[prepped_data_folder],
                                 compute_target=pipeline_cluster,
                                 runconfig=pipeline_run_config,
                                 allow_reuse=True)

    # Step 2, run the training script
    train_step = PythonScriptStep(
        name="Train and Register Model",
        source_directory='./aml_pipeline',
        script_name="train_diabetes.py",
        arguments=['--training-folder', prepped_data_folder],
        inputs=[prepped_data_folder],
        compute_target=pipeline_cluster,
        runconfig=pipeline_run_config,
        allow_reuse=True)

    print("Pipeline steps defined")

    pipeline_steps = [prep_step, train_step]
    pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
    print("Pipeline is built.")

    # Create an experiment and run the pipeline
    experiment = Experiment(workspace=ws, name='jlg-exp')
    pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
    print("Pipeline submitted for execution.")

    pipeline_run.wait_for_completion(show_output=True)

    for run in pipeline_run.get_children():
        print(run.name, ':')
        metrics = run.get_metrics()
        for metric_name in metrics:
            print('\t', metric_name, ":", metrics[metric_name])

    for model in Model.list(ws):
        print(model.name, 'version:', model.version)
        for tag_name in model.tags:
            tag = model.tags[tag_name]
            print('\t', tag_name, ':', tag)
        for prop_name in model.properties:
            prop = model.properties[prop_name]
            print('\t', prop_name, ':', prop)
        print('\n')

    # Publish the pipeline from the run
    published_pipeline = pipeline_run.publish_pipeline(
        name="diabetes-training-pipeline",
        description="Trains diabetes model",
        version="1.0")

    rest_endpoint = published_pipeline.endpoint
    print(rest_endpoint)
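    # A minimal sketch (assumed; the experiment name in the payload is a
    # placeholder) of triggering the published pipeline over REST:
    from azureml.core.authentication import InteractiveLoginAuthentication
    import requests

    auth_header = InteractiveLoginAuthentication().get_authentication_header()
    response = requests.post(rest_endpoint,
                             headers=auth_header,
                             json={"ExperimentName": "jlg-exp"})
    print("Submitted run id:", response.json().get("Id"))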
except ComputeTargetException:
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=5,
        idle_seconds_before_scaledown=1800)

    # create the cluster
    gpu_compute_target = ComputeTarget.create(ws, gpu_compute_name,
                                              provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it uses the scale settings for the cluster
    gpu_compute_target.wait_for_completion(show_output=True,
                                           min_node_count=None,
                                           timeout_in_minutes=20)

est = TensorFlow(source_directory=script_folder,
                 compute_target=gpu_compute_target,
                 pip_packages=[
                     'keras==2.0.8', 'theano', 'tensorflow==1.8.0',
                     'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod',
                     'hickle'
                 ],
                 entry_script='train.py',
                 use_gpu=True,
                 node_count=1,
                 script_params={
                     "--remote_execution": None,
                     "--data-folder": config["data_folder"]
                 })

experiment_name = "prednet_train"
exp = Experiment(ws, experiment_name)
run = exp.submit(est)
run.wait_for_completion(show_output=True)
print("done")
if args.local_run:
    # from subprocess import call, run
    import sys
    import subprocess

    env = os.environ.copy()
    if args.process_count == 1:
        env['CUDA_VISIBLE_DEVICES'] = '0'
        cmd_arry = [sys.executable, model_run_scripts[0]] + model_run_args_config
    else:
        cmd_arry = [
            sys.executable, '-m', 'torch.distributed.launch',
            '--nproc_per_node', args.process_count, model_run_scripts[0]
        ] + model_run_args_config
    cmd_arry = [str(s) for s in cmd_arry]
    cmd = ' '.join(cmd_arry)
    subprocess.run(cmd_arry, env=env)
else:
    # Create experiment for model
    model_experiment = Experiment(ws, name=model_experiment_name)
    distr_config = PyTorchConfiguration(process_count=args.process_count,
                                        node_count=args.node_count)

    # create script run config for the model+config
    model_run_config = ScriptRunConfig(source_directory='.',
                                       script=model_run_scripts[0],
                                       arguments=model_run_args_config,
                                       compute_target=gpu_compute_target,
                                       environment=hf_ort_env,
                                       distributed_job_config=distr_config)

    print(f"Submitting run for model: {args.hf_model}, config: {args.run_config}")
    run = model_experiment.submit(model_run_config)

    cuda_version = "10.2" if args.use_cu102 else "11.1"
    run.set_tags({
        'model': args.hf_model,
        'config': args.run_config,
        'bs': model_batchsize,
        'gpus': str(args.process_count),
        'cuda': cuda_version
    })
    print(f"Job submitted to {run.get_portal_url()}")
# Control script for a training run
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core import Environment
from azureml.core import ScriptRunConfig
from azureml.core import Dataset

if __name__ == "__main__":
    ws = Workspace.from_config()
    dataset = Dataset.get_by_name(workspace=ws, name='bananas_dataset')
    experiment = Experiment(workspace=ws, name='bananas-experiment')

    config = ScriptRunConfig(
        source_directory='.',
        script='train.py',
        compute_target='gpu1',
        arguments=[
            '--data-path', dataset.as_named_input('input').as_mount(),
            '--output-path', './outputs',
            '--epochs', 3,
            '--batch-size', 2,
            '--learning-rate', 0.001,
            '--scale', 0.5,
            '--to-bgr'
        ],
    )

    # set up the training environment
    env = Environment.from_conda_specification(
        name='train-env',
        file_path='./train-env.yml')
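    # Assumed continuation (the excerpt ends mid-script): attach the
    # environment to the run config and submit, mirroring the other control
    # scripts in this collection.
    config.run_config.environment = env

    run = experiment.submit(config)
    print(run.get_portal_url())
    run.wait_for_completion(show_output=True)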