def fetch_run_for_experiment(experiment_to_recover: Experiment, run_id_or_number: str) -> Run:
    """
    Look up a run of an experiment, either by its run number or by its run ID.

    :param experiment_to_recover: an experiment
    :param run_id_or_number: a string representing the Run ID or Run Number of one of the runs of the experiment
    :return: the run matching run_id_or_number; raises an exception if not found
    """
    # get_runs() may hand back a one-shot generator. Materialise it once so the
    # runs can be scanned by number AND still be listed in the error message.
    available_runs = list(experiment_to_recover.get_runs())

    try:
        run_number = int(run_id_or_number)
    except ValueError:
        # run_id_or_number does not represent a number, so it can only be a run ID
        run_number = None

    if run_number is not None:
        for run in available_runs:
            if run.number == run_number:
                return run

    # No run carries that number (or the argument was not numeric): treat it as a run ID.
    try:
        return get_run(experiment=experiment_to_recover, run_id=run_id_or_number, rehydrate=True)
    except Exception as ex:
        available_ids = ", ".join([run.id for run in available_runs])
        raise Exception(
            "Run {} not found for experiment: {}. Available runs are: {}".format(
                run_id_or_number, experiment_to_recover.name, available_ids)) from ex
def cancel_running_and_queued_jobs() -> None:
    """
    Cancel the still-active AzureML runs of the experiment that belongs to the current PR branch.

    Credentials, subscription and branch name are read from the environment variables
    APPLICATION_ID, APPLICATION_KEY, SUBSCRIPTION_ID and BRANCH. When BRANCH is not a
    pull-request ref ("refs/pull/..."), nothing is cancelled and the process exits with code 0.
    """
    env = os.environ

    print("Authenticating")
    sp_auth = ServicePrincipalAuthentication(
        tenant_id='72f988bf-86f1-41af-91ab-2d7cd011db47',
        service_principal_id=env["APPLICATION_ID"],
        service_principal_password=env["APPLICATION_KEY"])

    print("Getting AML workspace")
    aml_workspace = Workspace.get(
        name="InnerEye-DeepLearning",
        auth=sp_auth,
        subscription_id=env["SUBSCRIPTION_ID"],
        resource_group="InnerEye-DeepLearning")

    branch = env["BRANCH"]
    print(f"Branch: {branch}")
    is_pr_branch = branch.startswith("refs/pull/")
    if not is_pr_branch:
        print("This branch is not a PR branch, hence not cancelling anything.")
        exit(0)

    # The experiment is named after the branch ref, with "/" replaced by "_".
    experiment_name = "_".join(branch.split("/"))
    print(f"Experiment: {experiment_name}")
    experiment = Experiment(aml_workspace, name=experiment_name)
    print(f"Retrieved experiment {experiment.name}")

    for run in experiment.get_runs(include_children=True, properties={}):
        assert isinstance(run, Run)
        description = f"'{run.status}' run {run.id} ({run.display_name})"
        # Runs in any of these states are left untouched.
        untouched_states = (RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.FINALIZING,
                            RunStatus.CANCELED, RunStatus.CANCEL_REQUESTED)
        if run.status not in untouched_states:
            print(f"Cancelling {description}")
            run.cancel()
            continue
        print(f"Skipping {description}")
def _cancel_if_active(run):
    """Cancel ``run`` when it is still running or queued, printing its id and status first."""
    # Query the status once: every get_status() call asks the service again.
    status = run.get_status()
    if status in ("Running", "Queued"):
        print(run.id, status)
        run.cancel()


def cancel_all_runs(exp_name, run_id=None):
    """
    Cancel runs of the experiment ``exp_name``.

    :param exp_name: name of the experiment in the workspace returned by get_workspace()
    :param run_id: when given, only this run is considered and it is cancelled unless it
        has already reached a terminal state. When omitted, every run of the experiment,
        its children and its grandchildren are cancelled if they are running or queued.
    """
    ws = get_workspace()
    exp = Experiment(ws, exp_name)

    if run_id:
        r = get_run(experiment=exp, run_id=run_id, rehydrate=True)
        status = r.get_status()
        # check the returned run type and status
        print(type(r), status)
        # A run can only be cancelled while it has not reached a terminal state.
        # The status value is 'Completed' (the original compared against 'Complete',
        # which never matches, so finished runs were cancelled as well).
        if status not in ('Completed', 'Failed', 'Canceled'):
            r.cancel()
        return

    # No run id given: walk all runs of the experiment, deepest level first.
    for r in exp.get_runs():
        run = get_run(experiment=exp, run_id=r.id, rehydrate=True)
        for c in run.get_children():
            for gc in c.get_children():
                _cancel_if_active(gc)
            _cancel_if_active(c)
        _cancel_if_active(r)
def toAzure():
    """
    Upload the current experiment outputs to Azure ML as a new run and return the latest run.

    The experiment name is read from the first line of "outputs/_experiment-name_.txt".
    All notebooks (*.ipynb) in the working directory are copied to "outputs/_notebooks/"
    before a run is started and completed on the experiment.

    :return: the first run yielded by experiment.get_runs() after the upload
    """
    import glob
    import shutil

    from azureml.core import Experiment
    from azureml.core import Workspace
    from azureml.core.authentication import InteractiveLoginAuthentication

    with open("outputs/_experiment-name_.txt", "r", encoding="utf-8") as file:
        # readline() keeps the line terminator; it is not part of the experiment name.
        experiment_name = file.readline().strip()

    workspace_args = {
        "name": "sparknlp",
        "subscription_id": "bc5674c1-2f09-4eff-8497-b97f5466158f",
        "resource_group": "datascientists",
    }
    try:
        ws = Workspace.get(**workspace_args)
    except Exception:
        # Default credentials did not work: fall back to an interactive login.
        # (Exception rather than a bare except, so Ctrl-C / SystemExit still propagate.)
        interactive_auth = InteractiveLoginAuthentication(
            tenant_id="55574e46-daf5-45bd-8659-de00e36fb97c", force=True)
        ws = Workspace.get(auth=interactive_auth, **workspace_args)

    experiment = Experiment(workspace=ws, name=experiment_name)

    for nb in glob.glob("*.ipynb"):
        shutil.copy(nb, "outputs/_notebooks/CopyOf_" + nb)

    run = experiment.start_logging()
    print(
        f"Uploading the content of your '{experiment_name}' to Azure Cloud...")
    run.complete()

    # Fetch the run list once; it is needed both for the count and for the return value.
    runs = list(experiment.get_runs())
    print(f"Your {len(runs)}. run was uploaded.")
    print(
        "You can view your logs on Microsoft Azure Machine Learning Studio. "
        "To view the details of your last run, click the link below :")
    return runs[0]
def get_run_and_download_pytest(branch: str, number: int) -> Optional[Path]:
    """
    Find the run with the given number in the experiment of a branch and download its pytest result.

    :param branch: branch name; converted with to_azure_friendly_string to get the experiment name
    :param number: the run number to look for
    :return: whatever download_pytest_result returns for the matching run
    :raises ValueError: if the experiment does not contain exactly one run with that number
    """
    experiment = Experiment(workspace, name=to_azure_friendly_string(branch))

    matching = []
    for candidate in experiment.get_runs():
        if candidate.number == number:
            matching.append(candidate)

    if len(matching) == 1:
        return download_pytest_result(matching[0], output_dir)

    raise ValueError(
        f"Expected to get exactly 1 run in experiment {experiment.name}"
    )
def cancel_runs_in_experiment(ws, experiment):
    """
    Cancel every run of an experiment whose status is 'Running'.

    A failure while handling one run is printed and does not stop the remaining runs
    from being processed.

    :param ws: the workspace that holds the experiment
    :param experiment: the experiment name
    """
    failed_experiment = Experiment(ws, experiment)
    for listed_run in failed_experiment.get_runs():
        try:
            if listed_run.status != 'Running':
                continue
            # Re-create the run object from its id before cancelling it.
            active_run = Run(failed_experiment, listed_run.id)
            print('Canceling run: ', active_run)
            active_run.cancel()
        except Exception as error:
            print('Canceling run failed due to ', error)
def show_git_versions(ctx):
    """
    List all experiment runs and their git version.

    Prints a two-column table (run id, git commit) for every run of the experiment named
    by config["experiment_name"]. Runs that carry no "azureml.git.commit" property are
    listed with an empty git version instead of aborting the whole listing.

    :param ctx: not used by this function
    """
    ws = get_workspace(config)
    exp = Experiment(ws, config["experiment_name"])
    versions = [
        # .get(): a run without the git property must not raise KeyError here
        (run.id, run.get_properties().get("azureml.git.commit", ""))
        for run in exp.get_runs()
    ]
    print(tabulate(versions, headers=["Run ID", "Git Version"]))
def fetch_run_for_experiment(experiment_to_recover: Experiment, run_id: str) -> Run:
    """
    Look up a run of an experiment by its run ID.

    :param experiment_to_recover: an experiment
    :param run_id: a string representing the Run ID of one of the runs of the experiment
    :return: the run matching run_id; raises an exception if not found
    """
    try:
        return get_run(experiment=experiment_to_recover, run_id=run_id, rehydrate=True)
    except Exception as ex:
        # List the IDs that do exist so the caller can spot a typo in run_id.
        available_ids = ", ".join(run.id for run in experiment_to_recover.get_runs())
        # Chain the original failure explicitly so its traceback is kept as the cause.
        raise Exception(
            "Run {} not found for experiment: {}. Available runs are: {}".format(
                run_id, experiment_to_recover.name, available_ids)) from ex
class ExperimentStorage:
    """Access to the output files of the runs of one Azure ML experiment."""

    def __init__(self, workspace: Workspace, experiment_id: str):
        """
        :param workspace: the workspace that holds the experiment
        :param experiment_id: the experiment name; also used as the local sub-folder name
        """
        self.experiment_id = experiment_id
        self.experiment = Experiment(workspace, experiment_id)

    def download_output(self, experiment_run=None):
        """
        Download the "outputs/" files of a run into TRAINED_MODELS_PATH/<experiment_id>.

        :param experiment_run: the run to download from; when None, the first run
            yielded by the experiment's get_runs() is used
        """
        if experiment_run is not None:
            source_run = experiment_run
        else:
            source_run = next(self.experiment.get_runs())

        destination = os.path.join(TRAINED_MODELS_PATH, self.experiment_id)
        logger.info(f"Downloading results in {destination}")
        os.makedirs(destination, exist_ok=True)
        source_run.download_files("outputs/", destination, append_prefix=False)
def fetch_runs(experiment: Experiment, filters: List[str]) -> List[Run]:
    """
    Fetch the runs in an experiment, optionally keeping only those with certain statuses.

    :param experiment: the experiment to fetch runs from
    :param filters: a list of run status to include. Must be subset of
        [Running, Completed, Failed, Canceled]; an empty list, or a list containing any
        other value, disables filtering and all runs are returned.
    :return: the list of runs in the experiment
    """
    all_runs = list(experiment.get_runs())
    if len(filters) == 0:
        return all_runs

    known_statuses = ["Running", "Completed", "Failed", "Canceled"]
    if not set(filters).issubset(known_statuses):
        # Unrecognised status requested: the filter is ignored altogether.
        return all_runs

    selected = []
    for candidate in all_runs:
        if candidate.status in filters:
            selected.append(candidate)
    return selected
def test_registered_model_metric(get_ws_config):
    """
    Check that the run recorded in aml_config/run_id.json logged an 'accuracy' above 0.85.

    Nothing is checked when the recorded run id is empty; when the JSON file does not
    exist a message is printed instead.
    """
    try:
        with open("aml_config/run_id.json") as handle:
            settings = json.load(handle)

        candidate_run_id = settings["run_id"]
        if candidate_run_id == "":
            return

        experiment = Experiment(workspace=ws, name=settings["experiment_name"])

        # The next two lookups are not used by the assertion below; they are kept
        # because they still run (indexing an empty model list would fail here).
        registered_models = Model.list(
            ws, tags={"area": "predictive maintenance"})
        _production_model = registered_models[0]
        _run_list = experiment.get_runs()

        candidate_run = Run(experiment, run_id=candidate_run_id)
        accuracy = candidate_run.get_metrics().get('accuracy')
        assert accuracy > 0.85, "Above 85% accuracy"
    except FileNotFoundError:
        print("No new model registered to test")
def getOperationStatus(self, operationVerb, operationId, userId, subscriptionId):
    """
    Return the status of one operation.

    The operation is looked up as a pipeline run in the experiment named after the
    subscription id, matched on the userId / operationId / operationName /
    subscriptionId tags; the first run yielded is used.

    Returns:
        dict: {'operationId': ..., 'status': ...}
    Raises:
        LunaUserException: with HTTPStatus.NOT_FOUND when no matching run exists.
    """
    exp = Experiment(self._workspace, subscriptionId)
    run_filter = {
        'userId': userId,
        'operationId': operationId,
        'operationName': self.GetOperationNameByVerb(operationVerb),
        'subscriptionId': subscriptionId
    }

    for run in exp.get_runs(type='azureml.PipelineRun', tags=run_filter):
        # Only the first matching run is of interest.
        return {'operationId': operationId, 'status': run.status}

    raise LunaUserException(
        HTTPStatus.NOT_FOUND,
        'Operation "{}" with id {} does not exist.'.format(
            operationVerb, operationId))
def listAllOperations(self, operationVerb, userId, subscriptionId):
    """
    List id and status of every operation of one kind that belongs to a user.

    Operations are the pipeline runs in the experiment named after the subscription id
    whose tags match userId, the operation name for operationVerb, and subscriptionId.

    Returns:
        list: one {'operationId': ..., 'status': ...} dict per matching run
    """
    run_filter = {
        'userId': userId,
        'operationName': self.GetOperationNameByVerb(operationVerb),
        'subscriptionId': subscriptionId
    }
    exp = Experiment(self._workspace, subscriptionId)
    matching_runs = exp.get_runs(type='azureml.PipelineRun', tags=run_filter)
    return [
        {'operationId': run.tags["operationId"], 'status': run.status}
        for run in matching_runs
    ]
def get_run(self, ws_name, run_name):
    """
    Find the Azure ML run whose "xt_run_name" property equals run_name.

    Args:
        ws_name: name passed to self.get_aml_ws() to obtain the workspace
        run_name: run name of the form "exper.runname"; the part before the dot is
            the experiment name

    Returns:
        The first matching run, or None when the experiment has no such run.
    """
    if "." not in run_name:
        # NOTE(review): presumably general_error() raises and does not return - confirm.
        errors.general_error(
            "Azure ML run name must be of the form: exper.runname")

    ws = self.get_aml_ws(ws_name)
    console.diag("after get_aml_ws() call")

    # Exactly one dot is expected; the run part itself is not needed for the lookup.
    exper_name, _run_part = run_name.split(".")
    experiment = Experiment(ws, name=exper_name)

    matches = experiment.get_runs(properties={"xt_run_name": run_name})
    console.diag("after experiment.get_runs() call")

    matches = list(matches)
    console.diag("after list(runs), len={}".format(len(matches)))

    if len(matches):
        return matches[0]
    return None
def listAllOperationOutputs(self, operationNoun, userId, subscriptionId):
    """
    Collect the outputs of all operations of one kind that belong to a user.

    Every pipeline run in the experiment named after the subscription id whose tags
    match userId, the operation name for operationNoun, and subscriptionId is queried
    through getOperationOutput (without downloading files). Runs without output are
    skipped.

    Returns:
        list: for "model"/"endpoint" outputs the output itself; for "json" outputs
        {"operationId": ..., "output": ...}; for "file" outputs
        {"operationId": ..., "outputType": "file"}.
    """
    operationName = self.GetOperationNameByNoun(operationNoun)
    experimentName = subscriptionId
    exp = Experiment(self._workspace, experimentName)
    tags = {
        'userId': userId,
        'operationName': operationName,
        'subscriptionId': subscriptionId
    }
    runs = exp.get_runs(type='azureml.PipelineRun', tags=tags)

    results = []
    for run in runs:
        operationId = run.tags["operationId"]
        outcome = self.getOperationOutput(operationNoun,
                                          operationId,
                                          userId,
                                          subscriptionId,
                                          downloadFiles=False)
        # Tolerate a bare None as well as an (output, outputType) pair for "not found".
        if not outcome:
            continue
        output, outputType = outcome
        if not output:
            continue

        if outputType in ("model", "endpoint"):
            results.append(output)
        elif outputType == "json":
            # The payload is `output`; the original referenced an undefined name here.
            results.append({"operationId": operationId, "output": output})
        elif outputType == "file":
            results.append({"operationId": operationId, "outputType": "file"})
    return results
def build_results_dataframe_from_azml():
    """
    Build a DataFrame with one row per completed run of the shared experiment.

    Each row holds the run's tags (those whose key starts with "_" are dropped), with
    "num_nodes", "iter" and "ims_per_gpu" converted to int, plus the "fps" and "dfps"
    values obtained from get_driver0_fps(run).
    """
    from azureml.core import Workspace, Experiment

    workspace = Workspace.get(sharedconfig.workspace_name)
    experiment = Experiment(workspace, sharedconfig.experiment_name)

    completed_runs = []
    for candidate in experiment.get_runs():
        if candidate.status == "Completed":
            completed_runs.append(candidate)

    rows = []
    for run in tqdm(completed_runs):
        row = {}
        for key, value in run.get_tags().items():
            if key.startswith("_"):
                continue
            row[key] = value

        # These tags are read as text and needed as numbers.
        for numeric_tag in ("num_nodes", "iter", "ims_per_gpu"):
            row[numeric_tag] = int(row[numeric_tag])

        fps, dfps, _ = get_driver0_fps(run)
        row["fps"] = fps
        row["dfps"] = dfps
        rows.append(row)

    return pd.DataFrame(rows)
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

# Register a model from a successful run in a given experiment

from azureml.core.model import Model
from azureml.core import Experiment
from azureml.core import Run
from workspace import get_workspace

workspace = get_workspace()

# The experiment whose latest run provides the model file
experiment = Experiment(workspace=workspace, name='test_experiment_1')

# get_runs() returns a generator that yields the runs in reverse chronological
# order, so the first item taken from it is the most recent run.
runs = experiment.get_runs()
run = next(runs)

# Name, tags, properties and source file of the model to register
registration = dict(
    model_name='model_from_test_experiment_1',
    tags={'tag1': 'v1'},
    properties={'property1': 'p1'},
    model_path='outputs/churn-model-2.pkl',
)
run.register_model(**registration)
args = parser.parse_args()
file_prefix = args.file_prefix
granularity = args.granularity

# Setup AML: the workspace coordinates come from the environment
subscription_id = os.environ['AML_SUBSCRIPTION']
resource_group = os.environ['AML_RESOURCE_GROUP']
workspace_name = os.environ['AML_WORKSPACE']
ws = Workspace(subscription_id, resource_group, workspace_name)

experiment_name = f'forecast_automl_{file_prefix}_{granularity}'

# Register the model from last best run
print(f'registering the latest model for {experiment_name}')
exp = Experiment(workspace=ws, name=experiment_name)
run_generator = exp.get_runs()
run_latest = next(run_generator)  # first run yielded by the experiment

# Only a completed AutoML run can provide a fitted model
if not (run_latest.get_status() == 'Completed' and run_latest.type == 'automl'):
    raise Exception('the last run is not completed or is not automl')

run_id = run_latest.get_details()['runId']
automl_run = AutoMLRun(exp, run_id)
best_run, fitted_model = automl_run.get_output()

# The model name is the experiment name without '-' and '_', lower-cased
model_name = experiment_name.translate(str.maketrans('', '', '-_')).lower()

# Register a model
model = best_run.register_model(model_name=model_name,
                                model_path='outputs/model.pkl')

# Get existing model
#model=Model(ws, model_name)

# Figure out the run's dependencies
and args.local_dir): print( 'Must specify both remote_dir and local_dir to sync files from Experiment' ) sys.exit() # Get the AzureML Workspace the Experiment is running in ws = Workspace.get(name=args.workspace, subscription_id=args.subscription, resource_group=args.resource_group) # Find the Experiment experiment = Experiment(workspace=ws, name=args.experiment) # Find the Run runs = [r for r in experiment.get_runs()] if len(runs) == 0: print("No runs found in Experiment '{}'".format(args.experiment)) sys.exit() run = runs[0] if args.run is not None: try: run = next(r for r in runs if r.id == args.run) except StopIteration: print("Run id '{}' not found in Experiment '{}'".format( args.run, args.experiment)) sys.exit() # Optionally start synchronizing files from Run
def get_run_by_tags(tags):
    """
    Return the first pipeline run of the experiment that matches the given tags.

    The run's status is printed before it is returned. next() is used without a
    default, so StopIteration propagates when no run matches.
    """
    experiment = Experiment(ws, experimentName)
    first_match = next(experiment.get_runs(type='azureml.PipelineRun', tags=tags))
    print(first_match.status)
    return first_match
from azure.common.client_factory import get_client_from_cli_profile
from azure.mgmt.resource import SubscriptionClient
from azureml.core import Experiment
from azureml.core import Workspace
from azureml.core.authentication import AzureCliAuthentication
from azureml.tensorboard import Tensorboard

# Authenticate through the Azure CLI login and use the first subscription it lists.
cli_auth = AzureCliAuthentication()
subscription_client = get_client_from_cli_profile(SubscriptionClient)
subscription_id = next(
    subscription_client.subscriptions.list()).subscription_id

ws = Workspace(
    subscription_id=subscription_id,
    resource_group="ds_envs_RG",
    workspace_name="ds_envs_ws",
    auth=cli_auth,
)

experiment_name = "my_experiment"
run_id = "my_experiment_1603471452_ed6739ca"

# Pick the run with the requested id out of the experiment's runs.
experiment = Experiment(workspace=ws, name=experiment_name)
run = [i for i in experiment.get_runs() if i.id == run_id][0]

tb = Tensorboard([run])
tb.start(start_browser=True)
try:
    input("Press Enter to continue...")
finally:
    # Always stop the TensorBoard instance, also when the prompt is interrupted
    # (Ctrl-C) or stdin is closed (EOF).
    tb.stop()
def getMetrics(ws, experiment_name, tags=None):
    """
    Print the metrics of every run of an experiment.

    Args:
        ws: the workspace that holds the experiment
        experiment_name: the experiment name
        tags: optional tag filter handed to get_runs(); None (the default) means an
            empty filter
    """
    # A dict literal as default would be one object shared by all calls; build a
    # fresh one per call instead.
    tags = {} if tags is None else tags
    experiment = Experiment(workspace=ws, name=experiment_name)
    for run in experiment.get_runs(tags=tags):
        print(run.get_metrics())
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('experiment')
parser.add_argument('--workspace-config', default="azureml_config.json")
args = parser.parse_args()
print(args)


def stop_run(r):
    """
    Stop a run and, recursively, all of its children.

    A run whose status is 'Running' is cancelled when it offers a cancel() method
    and completed otherwise; runs in any other state are only reported.
    """
    status = r.get_status()
    print(f"Stopping {r.type}, {r.id}, {status}")
    if status == 'Running':
        finish = r.cancel if 'cancel' in dir(r) else r.complete
        finish()
    for c in r.get_children():
        stop_run(c)


ws = Workspace.from_config(path=args.workspace_config)
print('=' * 40)
print(ws)

# Walk every run of the experiment named on the command line.
exp = Experiment(ws, args.experiment)
for run in exp.get_runs():
    stop_run(run)
def getOperationOutput(self,
                       operationNoun,
                       operationId,
                       userId,
                       subscriptionId,
                       downloadFiles=True):
    """
    Return the output of one operation together with its output type.

    Args:
        operationNoun: noun that GetOperationNameByNoun maps to an operation name
        operationId: id of the operation (model id, endpoint id or run operation id)
        userId: owner of the operation
        subscriptionId: subscription id; also the name of the experiment to search
        downloadFiles: for "file" outputs, whether to download and zip the run's
            "/outputs" folder (True) or only report that a file output exists (False)

    Returns:
        tuple: always a pair ``(output, outputType)``:
            - ``(dict, "model")`` for a 'train' operation
            - ``(dict, "endpoint")`` for a 'deploy' operation
            - ``(parsed JSON, "json")`` for runs with a JSON output
            - ``(zip file path, "file")`` or ``("file", "file")`` for file outputs
            - ``(None, None)`` when nothing matching is found
        The zip file lives in a fresh temporary directory that is NOT removed by this
        method; the caller is responsible for deleting it.
    """
    operationName = self.GetOperationNameByNoun(operationNoun)

    if operationName == 'train':
        tags = [['userId', userId], ['modelId', operationId],
                ['subscriptionId', subscriptionId]]
        models = Model.list(self._workspace, tags=tags)
        if len(models) == 0:
            # Same "not found" shape as every other path, so callers can always unpack.
            return None, None
        model = models[0]
        result = {
            'id': operationId,
            'description': model.description,
            'created_time': model.created_time
        }
        return result, "model"

    if operationName == 'deploy':
        tags = [['userId', userId], ['endpointId', operationId],
                ['subscriptionId', subscriptionId]]
        endpoints = Webservice.list(self._workspace, tags=tags)
        if len(endpoints) == 0:
            return None, None
        endpoint = endpoints[0]
        primaryKey, secondaryKey = endpoint.get_keys()
        result = {
            'id': operationId,
            'description': endpoint.description,
            'created_time': endpoint.created_time,
            'scoring_uri': endpoint.scoring_uri,
            'primary_key': primaryKey,
            'secondary_key': secondaryKey
        }
        return result, "endpoint"

    # Any other operation: its output is stored with the first child of the pipeline run.
    tags = {
        'userId': userId,
        'operationId': operationId,
        'operationName': operationName,
        'subscriptionId': subscriptionId
    }
    experimentName = subscriptionId
    exp = Experiment(self._workspace, experimentName)
    runs = exp.get_runs(type='azureml.PipelineRun', tags=tags)
    try:
        run = next(runs)
        child_run = next(run.get_children())
    except StopIteration:
        # No matching pipeline run, or the run has no child run.
        return None, None

    outputType = self._utils.GetOutputType(operationName)

    if outputType == 'json':
        with tempfile.TemporaryDirectory() as tmp:
            path = os.path.join(tmp, 'output.json')
            child_run.download_file('/outputs/output.json', path)
            with open(path) as file:
                return json.load(file), "json"

    if outputType == 'file':
        if not downloadFiles:
            return "file", "file"
        # mkdtemp(): the zip must outlive this call. TemporaryDirectory().name would
        # point at a directory that is deleted as soon as the object is finalised.
        tmp = tempfile.mkdtemp()
        path = os.path.join(tmp, "outputs")
        zip_file_path = os.path.join(tmp, "output_{}.zip".format(operationId))
        child_run.download_files("/outputs", path, append_prefix=False)
        # The context manager closes the archive even if zipdir() fails.
        with zipfile.ZipFile(zip_file_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            self.zipdir(path, zipf, "outputs")
        return zip_file_path, "file"

    # Unknown output type.
    return None, None
config = json.load(f) new_model_run_id = config["run_id"] experiment_name = config["experiment_name"] exp = Experiment(workspace=ws, name=experiment_name) try: # Get most recently registered model, we assume that is the model in production. Download this model and compare it with the recently trained model by running test with same data set. model_list = Model.list(ws) production_model = next( filter( lambda x: x.created_time == max(model.created_time for model in model_list), model_list)) production_model_run_id = production_model.tags.get('run_id') run_list = exp.get_runs() # Get the run history for both production model and newly trained model and compare mse production_model_run = Run(exp, run_id=production_model_run_id) new_model_run = Run(exp, run_id=new_model_run_id) production_model_metric = production_model_run.get_metrics().get( 'accuracy') new_model_metric = new_model_run.get_metrics().get('accuracy') print( 'Current Production model accuracy: {}, New trained model accuracy: {}' .format(production_model_metric, new_model_metric)) promote_new_model = False if new_model_metric < production_model_metric: promote_new_model = True print('New trained model performs better, thus it will be registered')
class AzureMLTrainer(trainer.Trainer): is_connected: bool = False __config_file: str = '.azureml/config.json' __workspace: Workspace = None __experiment: Experiment = None __current_experiment_name: str __current_run: Run = None __logger: Logger = None __vm_size_list: list = None def __init__(self, experiment_name: str, aml_workspace: Workspace, aml_run: Run = None): ''' Initializes a new connected Trainer that will persist and log all runs on AzureML workspace Args: experiment_name (str): The name of the experiment that will be seen on AzureML aml_workspace (Workspace): The connected workspace on AzureML ''' self.__workspace = aml_workspace self.__logger = logging.getLogger() if aml_run is not None: self.__current_run = aml_run self.__experiment = aml_run.experiment self.__current_experiment_name = aml_run.experiment.name else: self.__current_experiment_name = experiment_name self.__experiment = Experiment(workspace=self.__workspace, name=experiment_name) @classmethod def CreateFromContext(cls): ''' Creates a Trainer, based on the current Run context. This will only work when used in an Estimator Returns: AzureMLTrainer: an instance of AzureMLTrainer allowing the user to work connected. ''' run = Run.get_context() return cls(run.experiment.name, run.experiment.workspace, run) def new_run(self, description: str = None, copy_folder: bool = True, metrics: dict = None) -> Run: ''' This will begin a new interactive run on the existing AzureML Experiment. When a previous run was still active, it will be completed. 
Args: description (str): An optional description that will be added to the run metadata copy_folder (bool): Indicates if the output folder should be snapshotted and persisted metrics (dict): The metrics that should be logged in the run already Returns: Run: the AzureML Run object that can be used for further access and custom logic ''' if(self.__current_run is not None): self.__current_run.complete() if(copy_folder): self.__current_run = self.__experiment.start_logging() else: self.__current_run = self.__experiment.start_logging(snapshot_directory = None) if(metrics is not None): for k, v in metrics.items(): self.__current_run.log(k, v) if(description is not None): self.__current_run.log('Description', description) return self.__current_run def add_tuning_result(self, run_index: int, train_score: float, test_score: float, sample_count: int, durations:np.array, parameters: dict, estimator): ''' This add results of a cross validation fold to the child run in a Grid Search Args: train_score (float): The given score of the training data test_score (float): The given score of the test data sample_count (int): The number of samples that were part of a fold durations (np.array): The different durations of the Grid Search parameters (dict): The parameter combinations that have been tested in this cross validation fold estimate (model): The actual fitted estimator / model that was trained in this fold ''' _child_run = self.__current_run.child_run('Gridsearch' + str(run_index)) self.__current_run.log_row('Trainscore', score = train_score) self.__current_run.log_row('Testscore', score = test_score) _table = { 'Testing score': test_score, 'Training score': train_score } for k in parameters.keys(): v = parameters[k] if(v is None): v = 'None' _child_run.log(k, v) _table[k] = v self.__current_run.log_row('Results', '', **_table) _child_run.complete() def get_best_model(self, metric_name:str, take_highest:bool = True): ''' Tags and returns the best model of the experiment, based 
on the given metric Args: metric_name (str): The name of the metric, such as accuracy take_highest (bool): In case of accuracy and score, this is typically True. In case you want to get the model based on the lowest error, you can use False Returns: Run: the best run, which will be labeled as best run ''' runs = {} run_metrics = {} for r in tqdm(self.__experiment.get_runs()): metrics = r.get_metrics() if metric_name in metrics.keys(): runs[r.id] = r run_metrics[r.id] = metrics best_run_id = min(run_metrics, key = lambda k: run_metrics[k][metric_name]) best_run = runs[best_run_id] best_run.tag('Best run') return best_run def get_azureml_experiment(self): ''' Gives access to the AzureML experiment object Returns: Experiment: the existing experiment ''' return self.__experiment def complete_run(self, fitted_model, metrics_to_log: dict = None, upload_model: bool = True): ''' Saves all results of the active Run and completes it Args: fitted_model (model): The already fitted model to be tested. 
Sklearn and Keras models have been tested metrics_to_log (dict): The metrics that should be logged with the model to the run upload_model (bool): This will upload the model (pkl file or json) to AzureML run (defaults to True) ''' is_keras = 'keras' in str(type(fitted_model)) if(metrics_to_log is not None): for k, v in metrics_to_log.items(): self._log_metrics(k, v) if upload_model: # Save the model to the outputs directory for capture if(is_keras): model_folder_name = 'outputs/model' fitted_model.save(model_folder_name) files_to_upload = dict() else: model_file_name = 'outputs/model.pkl' joblib.dump(value = fitted_model, filename = model_file_name) self._complete_run() def evaluate_classifier(self, fitted_model, X_test: np.array, y_test: np.array, show_roc: bool = False, save_curves_as_image: bool = False, class_names: np.array = None, finish_existing_run: bool = True, upload_model: bool = True, return_predictions: bool = False) -> np.array: ''' Will predict and evaluate a model against a test set and save all results to the active Run on AzureML Args: fitted_model (model): The already fitted model to be tested. Sklearn and Keras models have been tested X_test (np.array): The test set to calculate the predictions with y_test (np.array): The output test set to evaluate the predictions against show_roc (bool): This will upload the ROC curve to the run in case of a binary classifier save_curves_as_image (bool): This will save the training & loss curves as images class_names (np.array): The class names that will be linked to the Confusion Matrix. 
If not provided, the unique values of the y_test matrix will be used finish_existing_run (bool): Will complete the existing run on AzureML (defaults to True) upload_model (bool): This will upload the model (pkl file) to AzureML run (defaults to True) return_predictions (bool): If true, the y_pred values will be returned Returns: np.array: The predicted (y_pred) values against the model ''' is_keras = 'keras' in str(type(fitted_model)) # Predict X_test with model if(is_keras): if 'predict_classes' in dir(fitted_model): y_pred = fitted_model.predict_classes(X_test) else: y_pred = fitted_model.predict(X_test) y_pred = np.argmax(y_pred, axis=1) self.add_training_plots(fitted_model, save_image=save_curves_as_image) else: y_pred = fitted_model.predict(X_test) if class_names is None: class_names = np.char.mod('%d', sorted(np.unique(y_test))) # Print classification report print(metrics.classification_report(y_test, y_pred)) # Confusion matrix cf = metrics.confusion_matrix(y_test, y_pred) self._log_confmatrix(cf, class_names) # Accuracy accuracy = metrics.accuracy_score(y_test, y_pred) * 100 self._log_metrics('accuracy', accuracy, description='') if(show_roc == True): # Verify that we are having a binary classifier if(len(class_names)!=2): raise AttributeError('Showing a ROC curve is only possible for binary classifier, not for multi class') self.__log_roc_curve(y_test, y_pred) if (finish_existing_run): self.complete_run(fitted_model, upload_model = upload_model) if return_predictions: return y_pred def add_training_plots(self, fitted_model, metrics=None, save_image: bool = False): ''' Add the training plots to the Run history Args: fitted_model (Keras model): the fitted model that contains the training history metrics (list): the metrics that should be tracked to the run. 
If None, all available metrics will be taken ''' history = fitted_model.history if metrics is None: metrics = history.history.keys() for metric in metrics: if(metric in history.history.keys()): self.__current_run.log_table(f'Plot {metric}', {metric: history.history[metric]}) if(save_image and not metric.startswith('val_') and metric in history.history.keys()): plt.plot(history.history[metric]) plt.plot(history.history[f'val_{metric}']) plt.title(f'model {metric}') plt.ylabel(metric) plt.xlabel('epoch') plt.legend(['train', 'test'], loc='upper left') #plt.show() self.__current_run.log_image(f'model {metric}', plot=plt) plt.close() def evaluate_image_classifier(self, fitted_model, X_test: np.array, y_test: np.array, show_roc: bool = False, failed_classifications_to_save: int = 0, image_shape = None, save_curves_as_image: bool = False, class_names: np.array = None, finish_existing_run: bool = True, upload_model: bool = True, return_predictions: bool = False) -> np.array: ''' Will predict and evaluate a model against a test set and save all results to the active Run on AzureML Args: fitted_model (model): The already fitted model to be tested. Sklearn and Keras models have been tested X_test (np.array): The test set to calculate the predictions with y_test (np.array): The output test set to evaluate the predictions against show_roc (bool): This will upload the ROC curve to the run in case of a binary classifier failed_classifications_to_save (int): If greather than 0, this amount of incorrectly classified images will be tracked to the Run image_shape ((int, int, int)): Indicates if images should be reshaped before saving them class_names (np.array): The class names that will be used in the description. 
If not provided, the unique values of the y_test matrix will be used finish_existing_run (bool): Will complete the existing run on AzureML (defaults to True) upload_model (bool): This will upload the model (pkl file) to AzureML run (defaults to True) Returns: np.array: The predicted (y_pred) values against the model ''' from arcus.ml.images import explorer y_pred = self.evaluate_classifier(fitted_model, X_test, y_test, show_roc=show_roc, save_curves_as_image=save_curves_as_image, class_names= class_names, finish_existing_run=False, upload_model=upload_model, return_predictions=True) if failed_classifications_to_save > 0: # Take incorrect classified images and save import random incorrect_predictions = [i for i, item in enumerate(y_pred) if item != y_test[i]] total_images = min(len(incorrect_predictions), failed_classifications_to_save) for i in random.sample(incorrect_predictions, total_images): pred_class = y_pred[i] act_class = y_test[i] if class_names is not None: pred_class = class_names[pred_class] act_class = class_names[act_class] if image_shape is not None: # Reshape image before saving it imgplot = explorer.show_image(X_test[i].reshape(image_shape), silent_mode=True) else: imgplot = explorer.show_image(X_test[i], silent_mode=True) description = f'Predicted {pred_class} - Actual {act_class}' self.__current_run.log_image(description, plot=imgplot) if return_predictions: return y_pred def __stack_images(self, img1: np.array, img2: np.array): ha,wa = img1.shape[:2] hb,wb = img2.shape[:2] max_width = np.max([wa, wb]) total_height = ha+hb new_img = np.zeros(shape=(total_height, max_width, 3)) new_img[:ha,:wa]=img1 new_img[ha:hb+ha,:wb]=img2 return new_img def __concat_images(self, image_list: np.array) -> np.array: output = None for i, img in enumerate(image_list): if i==0: output = img else: output = self.__stack_images(output, img) return output def save_image_outputs(self, X_test: np.array, y_test: np.array, y_pred: np.array, samples_to_save: int = 1) -> 
np.array: ''' Will save image outputs to the run Args: X_test (np.array): The input images for the model y_test (np.array): The actual expected output images of the model y_pred (np.array): The predicted or calculated output images of the model samples_to_save (int): If greather than 0, this amount of input, output and generated image combinations will be tracked to the Run ''' from arcus.ml.images import explorer if samples_to_save > 0: import random total_images = min(len(y_pred), samples_to_save) for i in random.sample(range(len(y_pred)), total_images): newimg = self.__concat_images([X_test[i], y_test[i], y_pred[i]]) imgplot = explorer.show_image(newimg, silent_mode=True) self.__current_run.log_image(f'Image combo sample {i}', plot=imgplot) imgplot.close() def setup_training(self, training_name: str, overwrite: bool = False): ''' Will initialize a new directory (using the given training_name) and add a training script and requirements file to run training Args: training_name (str): The name of a training. This will be used to create a directory. 
Can contain subdirectory overwrite (bool): Defines if the existing training files should be overwritten ''' if not os.path.exists(training_name): os.makedirs(training_name) # Take default training script and copy to the new folder default_training_script_file = os.path.join(str(os.path.dirname(__file__)), 'resources/train.py') default_requirements_file = os.path.join(str(os.path.dirname(__file__)), 'resources/requirements.txt') dest_training_script_file = os.path.join(training_name, 'train.py') dest_requirements_file = os.path.join(training_name, 'requirements.txt') if overwrite or not(os.path.isfile(dest_training_script_file)): shutil.copy2(default_training_script_file, training_name) if overwrite or not(os.path.isfile(dest_requirements_file)): shutil.copy2(default_requirements_file, training_name) def start_training(self, training_name: str, environment_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, use_estimator: bool = False, **kwargs): ''' Will start a new training, taking the training name as the folder of the run Args: training_name (str): The name of a training. This will be used to create a directory. Can contain subdirectory environment_type (str): either the name of an existing environment that will be taken as base, or one of these values (tensorflow, sklearn, pytorch). 
input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name compute_target (str): The compute target (default = 'local') on which the training should be executed gpu_compute (bool): Indicates if GPU compute is required for this script or not script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script show_widget (bool): Will display the live tracking of the submitted Run Returns: Run : the submitted run ''' if use_estimator: print('Scheduling Estimator training') self._start_estimator_training(training_name, environment_type, input_datasets, input_datasets_to_download, compute_target, gpu_compute, script_parameters, show_widget, **kwargs) else: print('Scheduling ScriptRunConfig training') self._start_environment_training(training_name, environment_type, input_datasets, input_datasets_to_download, compute_target, gpu_compute, script_parameters, show_widget, **kwargs) if script_parameters is not None: for arg in script_parameters.keys(): self.__current_run.log(arg.replace('--', ''), script_parameters[arg]) print(self.__current_run.get_portal_url()) if(show_widget): from azureml.widgets import RunDetails RunDetails(self.__current_run).show() return self.__current_run def _start_environment_training(self, training_name: str, environment_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs): ''' Will start a new training using ScriptRunConfig, taking the training name as the folder of the run Args: training_name (str): The name of a training. This will be used to create a directory. 
Can contain subdirectory environment_type (str): either the name of an existing environment that will be taken as base, or one of these values (tensorflow, sklearn, pytorch). input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name compute_target (str): The compute target (default = 'local') on which the training should be executed gpu_compute (bool): Indicates if GPU compute is required for this script or not script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script show_widget (bool): Will display the live tracking of the submitted Run ''' from azureml.train.estimator import Estimator from azureml.core import Environment, ScriptRunConfig from azureml.core.runconfig import RunConfiguration from azureml.core.runconfig import DataReferenceConfiguration from azureml.core.runconfig import CondaDependencies from arcus.azureml.experimenting import train_environment as te # Check if directory exists if not(os.path.exists(training_name) and os.path.isdir(training_name)): raise FileNotFoundError(training_name) # Check compute target if compute_target != 'local': self.__check_compute_target(compute_target, gpu_compute) training_env = te.get_training_environment(self.__workspace, training_name, os.path.join(training_name, 'requirements.txt'), use_gpu=gpu_compute, include_prerelease=True, environment_type=environment_type) runconfig = RunConfiguration() # Add datasets datarefs = dict() scriptargs = list() if script_parameters is not None: for key in script_parameters.keys(): scriptargs.append(key) scriptargs.append(script_parameters[key]) if(input_datasets is not None): for ds in input_datasets: print(f'Adding mounting data reference for dataset {ds}') # scriptargs.append(ds) 
scriptargs.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute = ds)) # datastore, path = self._get_data_reference(self.__workspace.datasets[ds]) # datarefs[ds] = DataReferenceConfiguration(datastore_name=datastore, path_on_datastore = path, path_on_compute = '/' + ds, mode = 'mount', overwrite = False) if(input_datasets_to_download is not None): for ds in input_datasets_to_download: print(f'Adding download data reference for dataset {ds}') # scriptargs.append(ds) scriptargs.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute = ds)) scriptrunconfig = ScriptRunConfig(source_directory='./' + training_name, script="train.py", run_config=runconfig, arguments=scriptargs) scriptrunconfig.run_config.target = compute_target scriptrunconfig.run_config.environment = training_env #scriptrunconfig.run_config.data_references = datarefs # Submit training self.__current_run = self.__experiment.submit(scriptrunconfig) def _get_data_reference(self, dataset: Dataset): import json j = json.loads(str(dataset).replace('FileDataset\n', '')) source = j['source'][0] sections = source.split("'") return sections[1], sections[3] def _start_estimator_training(self, training_name: str, estimator_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs): ''' Will start a new training using an Estimator, taking the training name as the folder of the run Args: training_name (str): The name of a training. This will be used to create a directory. Can contain subdirectory environment_type (str): one of these values (tensorflow, sklearn, pytorch). 
input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name compute_target (str): The compute target (default = 'local') on which the training should be executed gpu_compute (bool): Indicates if GPU compute is required for this script or not script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script show_widget (bool): Will display the live tracking of the submitted Run ''' from azureml.train.estimator import Estimator # Check if directory exists if not(os.path.exists(training_name) and os.path.isdir(training_name)): raise FileNotFoundError(training_name) # Check compute target if compute_target != 'local': self.__check_compute_target(compute_target, gpu_compute) # Add datasets datasets = list() if(input_datasets is not None): for ds in input_datasets: datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute=ds)) if(input_datasets_to_download is not None): for ds in input_datasets_to_download: datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute=ds)) # as mount - as download constructor_parameters = { 'source_directory':training_name, 'script_params':script_parameters, 'inputs':datasets, 'compute_target':compute_target, 'entry_script':'train.py', 'pip_requirements_file':'requirements.txt', 'use_gpu':gpu_compute, 'use_docker':True} print('Creating estimator of type', estimator_type) if(estimator_type is None): # Using default Estimator estimator = Estimator(**constructor_parameters) elif(estimator_type == 'tensorflow'): from azureml.train.dnn import TensorFlow version_par = 'framework_version' if(not version_par in constructor_parameters.keys()): print('Defaulting to version 2.0 for TensorFlow') constructor_parameters[version_par] = 
'2.0' estimator = TensorFlow(**constructor_parameters) elif(estimator_type == 'sklearn'): from azureml.train.sklearn import SKLearn estimator = SKLearn(**constructor_parameters) elif(estimator_type == 'pytorch'): from azureml.train.dnn import PyTorch estimator = PyTorch(**constructor_parameters) # Submit training self.__current_run = self.__experiment.submit(estimator) # protected implementation methods def _log_metrics(self, metric_name: str, metric_value: float, description:str = None): print(metric_name, metric_value) self.__current_run.log(metric_name, metric_value, description=description) def _complete_run(self): ''' Completes the current run ''' self.__current_run.complete() def _log_confmatrix(self, confusion_matrix: np.array, class_names: np.array): data = {} data['schema_type'] = 'confusion_matrix' data['schema_version'] = 'v1' data['data'] = {} data['data']['class_labels'] = class_names.tolist() data['data']['matrix'] = confusion_matrix.tolist() print(confusion_matrix) json_data = json.dumps(data) self.__current_run.log_confusion_matrix('Confusion matrix', json_data, description='') def _save_roc_curve(self, roc_auc: float, roc_plot: plt): self._log_metrics('roc_auc', roc_auc) self.__current_run.log_image('ROC Curve', plot=plt) def __check_compute_target(self, compute_target, use_gpu: bool): __vm_size = '' if isinstance(compute_target, AmlCompute): __vm_size = compute_target.vm_size elif isinstance(compute_target, str): compute = ComputeTarget(workspace=self.__workspace, name=compute_target) __vm_size = compute.vm_size if self.__vm_size_list is None: self.__vm_size_list = AmlCompute.supported_vmsizes(self.__workspace) vm_description = list(filter(lambda vmsize: str.upper(vmsize['name']) == str.upper(__vm_size), self.__vm_size_list))[0] if(use_gpu and vm_description['gpus'] == 0): raise errors.TrainingComputeException(f'gpu_compute was specified, but the target does not have GPUs: {vm_description} ') if(not (use_gpu) and vm_description['vCPUs'] == 0): 
raise errors.TrainingComputeException(f'cpu_compute was specified, but the target does not have CPUs: {vm_description} ') def __log_roc_curve(self, y_pred: np.array, y_test: np.array): '''Will upload the Receiver Operating Characteristic (ROC) Curve for binary classifiers Args: y_pred (np.array): The predicted values of the test set y_test (np.array): The actual outputs of the test set Returns: float: The ROC_AUC value ''' # calculate the fpr and tpr for all thresholds of the classification fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred) roc_auc = metrics.auc(fpr, tpr) plt.cla() plt.title('Receiver Operating Characteristic') plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc) plt.legend(loc = 'lower right') plt.plot([0, 1], [0, 1],'r--') plt.xlim([0, 1]) plt.ylim([0, 1]) plt.ylabel('True Positive Rate') plt.xlabel('False Positive Rate') self._save_roc_curve(roc_auc, plt) plt.show(block=False) plt.close() return roc_auc
# MAGIC
# MAGIC Each `Run` object has a `get_metrics()` method that will retrieve our stored metrics. We can leverage the `get_runs()` method of the `Experiment` object to retrieve the run objects.
# MAGIC
# MAGIC We will then render a table to compare model performance.

# COMMAND ----------

# Download RMSE and R2 from AML Service

import pandas as pd

# Use list comprehension to retrieve records from experiment object.
# The metrics of every run are retrieved only once (instead of once per use),
# and runs that did not log an RMSE are left out.
run_results = pd.DataFrame(
    [
        {"id": run_id, "RMSE": run_metrics.get('RMSE'), 'R2': run_metrics.get('R2')}
        for run_id, run_metrics in (
            (run.id, run.get_metrics()) for run in experiment.get_runs()
        )
        if run_metrics.get('RMSE') is not None
    ],
    # Explicit columns keep the selection below valid when no run qualifies
    columns=['id', 'RMSE', 'R2'],
)

display(run_results[['id', 'RMSE', 'R2']])

# COMMAND ----------

# MAGIC %md
# MAGIC #### 3. Select Run with Model to deploy
# MAGIC
# MAGIC Each time we ran the models, we stored a zip file with the trained model in AML. We can now retrieve the trained model of the particular run that we want to deploy. We'll copy the relevant `id` from above and retrieve the Run object.

# COMMAND ----------

# The id of the run to deploy, copied by hand from the table rendered above
best_run_id = '6d670807-6477-4ea6-a98b-84069c888346'
best_run = Run(experiment, best_run_id)
# Online run. Use dataset provided by training notebook. else: print("Running in online mode...") experiment = run.experiment workspace = experiment.workspace dataset_path = run.input_datasets["dataset"] # Download the model from the provided run. print("Downloading model from run with id {}...".format(args.run_id)) # Locate the run that contains the model. experiment_that_contains_model = Experiment(workspace=workspace, name=args.experiment_name) run_that_contains_model = None for experiment_run in experiment_that_contains_model.get_runs(): if experiment_run.id == args.run_id: run_that_contains_model = experiment_run break if run_that_contains_model is None: print("ERROR! Run not found!") exit(0) # Download the model. print("Downloading the model...") output_directory = "model-" + args.run_id run_that_contains_model.download_files(output_directory=output_directory) # Instantiate the model with its weights. print("Creating the model...") model = GAPNet()
# NOTE(review): `model`, `alpha` and `run` are defined outside this fragment and
# the original indentation was lost. These statements are presumably the body of
# a loop that trains one model per alpha - re-indent to match when splicing.
model.fit(X=X_train, y=y_train)
y_pred = model.predict(X=X_test)

# Root mean squared error on the test set, logged as a metric of the run
rmse = math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
run.log("rmse", rmse)

# Persist the fitted model and attach the file to the run
model_name = "model_alpha_" + str(alpha) + ".pkl"
filename = "outputs/" + model_name

joblib.dump(value=model, filename=filename)
run.upload_file(name=model_name, path_or_stream=filename)
run.complete()

# Find the run of the experiment with the lowest logged rmse
minimum_rmse_runid = None
minimum_rmse = None

for run in experiment.get_runs():
    run_metrics = run.get_metrics()
    # each logged metric becomes a key in this returned dict;
    # runs that never logged an rmse cannot be compared and are skipped
    run_rmse = run_metrics.get("rmse")
    if run_rmse is None:
        continue

    run_details = run.get_details()
    run_id = run_details["runId"]

    if minimum_rmse is None or run_rmse < minimum_rmse:
        minimum_rmse = run_rmse
        minimum_rmse_runid = run_id

if minimum_rmse_runid is None:
    # Concatenating None to the message below would raise a TypeError
    print("No run with an 'rmse' metric was found")
else:
    print("Best run_id: " + minimum_rmse_runid)
arguments=[ "--aoi", args.aoi_file, "--feature-file", args.feature_file, "--model-file", os.path.basename(model_file), ], max_run_duration_seconds=60 * 30, environment=load_azml_env(), ) display_name = f"{args.output_prefix} {args.run_id} {args.model_file}" existing_runs = [ run for run in experiment.get_runs() if run.display_name == display_name ] if len(existing_runs) == 0: print("no runs") run = experiment.submit(config) run.display_name = display_name run.wait_for_completion() else: print("run exists") run = existing_runs[0] output_dir = f"data/predictions/{args.output_prefix}_{args.run_id}" os.makedirs(output_dir, exist_ok=True) local_files = [] for file in run.get_file_names():