def mlflow_run(self, df, r_name="Lab-1:RF Petrol Regression Experiment"): """ This method trains, computes metrics, and logs all metrics, parameters, and artifacts for the current run :param df: pandas dataFrame :param r_name: Name of the experiment as logged by MLflow :return: MLflow Tuple (ExperimentID, runID) """ with mlflow.start_run(run_name=r_name) as run: # get all rows and columns but the last column X = dataset.iloc[:, 0:4].values # get all the last columns, which is what we want to predict y = dataset.iloc[:, 4].values # create train and test data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # Feature Scaling sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) # train and predict self.rf.fit(X_train, y_train) y_pred = self.rf.predict(X_test) # Log model and params using the MLflow sklearn APIs mlflow.sklearn.log_model(self.rf, "random-forest-reg-model") mlflow.log_params(self.params) # compute metrics mae = metrics.mean_absolute_error(y_test, y_pred) mse = metrics.mean_squared_error(y_test, y_pred) rsme = np.sqrt(mse) r2 = metrics.r2_score(y_test, y_pred) # Log metrics mlflow.log_metric("mae", mae) mlflow.log_metric("mse", mse) mlflow.log_metric("rsme", rsme) mlflow.log_metric("r2", r2) # update global class instance variable with values self.rsme.append(rsme) self.r2.append(r2) self.estimators.append(params["n_estimators"]) # plot RSME graph and save as artifacts (fig, ax) = Utils.plot_graphs(rfr.estimators, rfr.rsme, "Random Forest Estimators", "Root Mean Square", "Root Mean Square vs Estimators") # get current run and experiment id runID = run.info.run_uuid experimentID = run.info.experiment_id # create temporary artifact file name and log artifact temp_file_name = Utils.get_temporary_directory_path( "rsme_estimators-", ".png") temp_name = temp_file_name.name try: fig.savefig(temp_name) mlflow.log_artifact(temp_name, "rsme_estimators_plots") finally: temp_file_name.close() # Delete the temp file # plot R2 graph and save as artifacts (fig_2, ax) = Utils.plot_graphs(rfr.estimators, rfr.r2, "Random Forest Estimators", "R2", "R2 vs Estimators") # create temporary artifact file name and log artifact temp_file_name = Utils.get_temporary_directory_path( "r2-estimators-", ".png") temp_name = temp_file_name.name try: fig_2.savefig(temp_name) mlflow.log_artifact(temp_name, "r2_estimators_plots") finally: temp_file_name.close() # Delete the temp file # print some data print("-" * 100) print( "Inside MLflow Run with run_id {} and experiment_id {}".format( runID, experimentID)) print("Estimator trees :", self.params["n_estimators"]) print("Estimator trees depth :", self.params["max_depth"]) print('Mean Absolute Error :', mae) print('Mean Squared Error :', mse) print('Root Mean Squared Error:', rsme) print('R2 :', r2) return (experimentID, runID)
    'batch_size': 128
}, {
    'input_units': 256,
    'input_shape': (4, ),
    'activation': 'relu',
    'optimizer': 'adam',
    'loss': 'mse',
    'epochs': 300,
    'batch_size': 128
}, {
    'input_units': 512,
    'input_shape': (4, ),
    'activation': 'relu',
    'optimizer': 'adam',
    'loss': 'mse',
    'epochs': 500,
    'batch_size': 256
}]

dataset = Utils.load_data("data/petrol_consumption.csv")

# get all the independent feature attributes
X = dataset.iloc[:, 0:4].values
# get the values of the last column, the dependent variable,
# which is what we want to predict: the petrol consumption
y = dataset.iloc[:, 4].values

for params in params_list:
    keras_model = KerasRegModel(params)
    (runID, experimentID) = keras_model.train_model(X, y)
    print("MLflow completed with run_id {} and experiment_id {}".format(runID, experimentID))
def mlflow_run(self, df, r_name="Lab-4:RF Experiment Model"): """ Override the base class mlflow_run for this epxerimental runs This method trains the model, evaluates, computes the metrics, logs all the relevant metrics, artifacts, and models. :param df: pandas dataFrame :param r_name: name of the experiment run :return: MLflow Tuple (ExperimentID, runID) """ with mlflow.start_run(run_name=r_name) as run: X_train, X_test, y_train, y_test = train_test_split( df.drop(["price"], axis=1), df[["price"]].values.ravel(), random_state=42) self.rf.fit(X_train, y_train) predictions = self.rf.predict(X_test) # Log model and parameters mlflow.sklearn.log_model(self.rf, "random-forest-model") # Note we are logging as a dictionary of all params instead of logging each parameter mlflow.log_params(self.params) # Log params #[mlflow.log_param(param, value) for param, value in self.params.items()] # Create metrics mse = metrics.mean_squared_error(y_test, predictions) rmse = np.sqrt(mse) mae = metrics.mean_absolute_error(y_test, predictions) r2 = metrics.r2_score(y_test, predictions) # Log metrics mlflow.log_metric("mse", mse) mlflow.log_metric("mae", mae) mlflow.log_metric("rsme", rmse) mlflow.log_metric("r2", r2) # get experimentalID and runID runID = run.info.run_uuid experimentID = run.info.experiment_id # Create feature importance and save them as artifact # This allows us to remove least important features from the dataset # with each iteration if they don't have any effect on the predictive power of # the prediction. importance = pd.DataFrame( list(zip(df.columns, self.rf.feature_importances_)), columns=["Feature", "Importance"]).sort_values("Importance", ascending=False) # Log importance file as feature artifact temp_file_name = Utils.get_temporary_directory_path( "feature-importance-", ".csv") temp_name = temp_file_name.name try: importance.to_csv(temp_name, index=False) mlflow.log_artifact(temp_name, "feature-importance-files") finally: temp_file_name.close() # Delete the temp file # Create residual plots and image directory # Residuals R = observed value - predicted value (plt, fig, ax) = Utils.plot_residual_graphs( predictions, y_test, "Predicted values for Price ($)", "Residual", "Residual Plot") # Log residuals images temp_file_name = Utils.get_temporary_directory_path( "residuals-", ".png") temp_name = temp_file_name.name try: fig.savefig(temp_name) mlflow.log_artifact(temp_name, "residuals-plots") finally: temp_file_name.close() # Delete the temp file print("-" * 100) print("Inside MLflow {} Run with run_id {} and experiment_id {}". format(r_name, runID, experimentID)) print(" mse: {}".format(mse)) print(" rmse: {}".format(rmse)) print(" mae: {}".format(mae)) print(" R2 : {}".format(r2)) return (experimentID, runID)
        return (experimentID, runID)

#
# TODO in Lab/Homework for Some Experimental runs
#
# 1. Consult RandomForestRegressor documentation
# 2. Change or add parameters, such as depth of the tree, or random_state: 42 etc.
# 3. Change or alter the range of runs and increments of n_estimators.
# 4. Check in the MLflow UI if the metrics are affected.
# challenge-1: create mean square error and r2 artifacts and save them for each run

if __name__ == '__main__':
    # load and print dataset
    dataset = Utils.load_data("data/petrol_consumption.csv")
    Utils.print_pandas_dataset(dataset)

    # iterate over several runs with different parameters, stepping n_estimators
    # up by 50, limited to 300
    max_depth = 0
    for n in range(50, 350, 50):
        max_depth = max_depth + 2
        params = {"n_estimators": n, "max_depth": max_depth, "random_state": 42}
        rfr = RFRModel.new_instance(params)
        (experimentID, runID) = rfr.mlflow_run(dataset)
        print("MLflow Run completed with run_id {} and experiment_id {}".format(runID, experimentID))
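
    # challenge-1, a minimal sketch (one possible approach, not the lab's
    # official solution): after the loop, save the accumulated rsme/r2
    # histories as a CSV artifact attached to the last completed run.
    # MlflowClient.log_artifact is a real MLflow API; pandas as pd and the
    # import below are assumptions about this file's imports.
    from mlflow.tracking import MlflowClient

    history = pd.DataFrame({
        "n_estimators": rfr.estimators,  # one entry appended per run above
        "rsme": rfr.rsme,
        "r2": rfr.r2,
    })
    history.to_csv("rsme_r2_history.csv", index=False)
    MlflowClient().log_artifact(runID, "rsme_r2_history.csv")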
#
# Lab/Homework for Some Experimental runs
#
# 1. Consult RandomForestRegressor documentation
# 2. Change or add parameters, such as depth of the tree, or random_state: 42 etc.
# 3. Change or alter the range of runs and increments of n_estimators
# 4. Check in the MLflow UI if the metrics are affected

if __name__ == '__main__':
    # TODO add more parameter dictionaries to this list to create
    # more experiments with different parameters
    params_list = [{"n_estimators": 200, "max_depth": 6, "random_state": 42}]

    # load the data
    dataset = Utils.load_data("data/airbnb-cleaned-mlflow.csv")

    # run these experiments, each with its own instance of the model and the supplied parameters
    for params in params_list:
        rfr = RFFExperimentModel.new_instance(params)
        experiment = "Experiment with {} trees".format(params['n_estimators'])
        (experimentID, runID) = rfr.mlflow_run(dataset, experiment)
        print("MLflow Run completed with run_id {} and experiment_id {}".format(runID, experimentID))
        print("-" * 100)

    # Use the MLflowClient API to programmatically query any previous run info under an experiment ID
    # consult https://mlflow.org/docs/latest/python_api/mlflow.tracking.html
    client = MlflowClient()
    run_list = client.list_run_infos(experimentID)
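
    # A minimal sketch (assuming the MLflow 1.x client API used above): walk
    # the returned RunInfo objects and fetch each run's logged metrics and
    # params with MlflowClient.get_run, which returns a Run carrying RunData.
    for run_info in run_list:
        run = client.get_run(run_info.run_id)
        print("run_id : {}".format(run_info.run_id))
        print("metrics: {}".format(run.data.metrics))
        print("params : {}".format(run.data.params))
        print("-" * 100)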
def mlflow_run(self, df, r_name="Lab-2:RF Bank Note Classification Experiment"): """ This method trains, computes metrics, and logs all metrics, parameters, and artifacts for the current run :param df: pandas dataFrame :param r_name: Name of the experiment as logged by MLflow :return: MLflow Tuple (ExperimentID, runID) """ with mlflow.start_run(run_name=r_name) as run: # get all rows and columns but the last column, which is our class X = df.iloc[:, 0:4].values # get all observed values in the last columns, which is what we want to predict y = df.iloc[:, 4].values # create train and test data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # Feature Scaling sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) # train and predict self.rf.fit(X_train, y_train) y_pred = self.rf.predict(X_test) # Log model and params using the MLflow sklearn APIs mlflow.sklearn.log_model(self.rf, "random-forest-class-model") mlflow.log_params(self.params) # compute evaluation metrics acc = accuracy_score(y_test, y_pred) precision = precision_score(y_test, y_pred) conf_matrix = confusion_matrix(y_test, y_pred) # get confusion matrix values true_positive = conf_matrix[0][0] true_negative = conf_matrix[1][1] false_positive = conf_matrix[0][1] false_negative = conf_matrix[1][0] # get confusion matrix as a dictionary class_report = classification_report(y_test, y_pred, output_dict=True) recall_0 = class_report['0']['recall'] f1_score_0 = class_report['0']['f1-score'] recall_1 = class_report['1']['recall'] f1_score_1 = class_report['1']['f1-score'] # log metrics mlflow.log_metric("accuracy_score", acc) mlflow.log_metric("precision", precision) mlflow.log_metric("true_positive", true_positive) mlflow.log_metric("true_negative", true_negative) mlflow.log_metric("false_positive", false_positive) mlflow.log_metric("false_negative", false_negative) mlflow.log_metric("recall_0", recall_0) mlflow.log_metric("f1_score_0", f1_score_0) mlflow.log_metric("recall_1", recall_1) mlflow.log_metric("f1_score_1", f1_score_1) # get current run and experiment id runID = run.info.run_uuid experimentID = run.info.experiment_id # create confusion matrix images (plt, fig, ax) = Utils.plot_confusion_matrix( y_test, y_pred, y, title="Bank Note Classification Confusion Matrix") # create temporary artifact file name and log artifact temp_file_name = Utils.get_temporary_directory_path( "confusion_matrix-", ".png") temp_name = temp_file_name.name try: fig.savefig(temp_name) mlflow.log_artifact(temp_name, "confusion_matrix_plots") finally: temp_file_name.close() # Delete the temp file # print some data print("-" * 100) print( "Inside MLflow Run with run_id {} and experiment_id {}".format( runID, experimentID)) print("Estimators trees:", self.params["n_estimators"]) print(conf_matrix) print(classification_report(y_test, y_pred)) print("Accuracy Score:", acc) print("Precision :", precision) return (experimentID, runID)
        return (experimentID, runID)

#
# Lab/Homework for Some Experimental runs
#
# 1. Consult RandomForestClassifier documentation
# 2. Change or add parameters, such as depth of the tree, or random_state: 42 etc.
# 3. Change or alter the range of runs and increments of n_estimators
# 4. Check in the MLflow UI if the metrics are affected
# 5. Log confusion matrix, recall, and F1-score as metrics
# Nice blog: https://joshlawman.com/metrics-classification-report-breakdown-precision-recall-f1/

if __name__ == '__main__':
    # load and print dataset
    dataset = Utils.load_data("data/bill_authentication.csv")
    Utils.print_pandas_dataset(dataset)

    # iterate over several runs with different parameters
    # TODO in the Lab: change these parameters, n_estimators and random_state,
    # with each iteration. Does that change the metrics and accuracy?
    # start with n=10 and step by 10 up to n=110 (range stops before 120)
    for n in range(10, 120, 10):
        params = {"n_estimators": n, "random_state": 42}
        rfr = RFCModel.new_instance(params)
        (experimentID, runID) = rfr.mlflow_run(dataset)
        print("MLflow Run completed with run_id {} and experiment_id {}".format(runID, experimentID))
        print("-" * 100)
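
# A sketch for TODO item 5 (one possible approach, not part of the lab as
# written): inside mlflow_run, right after conf_matrix is computed, the full
# matrix could also be logged as a CSV artifact alongside the scalar cells
# already logged as metrics. The helper name below is hypothetical; it reuses
# this repo's Utils temp-file pattern and assumes pandas is imported as pd.
def log_confusion_matrix_artifact(conf_matrix):
    cm_df = pd.DataFrame(conf_matrix,
                         index=["true_0", "true_1"],
                         columns=["pred_0", "pred_1"])
    temp_file = Utils.get_temporary_directory_path("confusion_matrix-", ".csv")
    try:
        cm_df.to_csv(temp_file.name)
        mlflow.log_artifact(temp_file.name, "confusion_matrix_files")
    finally:
        temp_file.close()  # Delete the temp file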
print("Estimator trees :", self.params["n_estimators"]) print('Mean Absolute Error :', mae) print('Mean Squared Error :', mse) print('Root Mean Squared Error:', rsme) print('R2 :', r2) return (experimentID, runID) # # TODO in Lab/Homework for Some Experimental runs # # 1. Consult RandomForest documentation # 2. Run the baseline model # 3. Check in MLflow UI for parameters, metrics, and artifacts if __name__ == '__main__': # load and print dataset dataset = Utils.load_data("data/airbnb-cleaned-mlflow.csv") Utils.print_pandas_dataset(dataset) # # create a base line model parameters # this is our benchmark model to compare experimental results with # params = {"n_estimators": 100, "max_depth": 3, "random_state": 0} rfr = RFRBaseModel.new_instance(params) (experimentID, runID) = rfr.mlflow_run(dataset) print("MLflow completed with run_id {} and experiment_id {}".format( runID, experimentID)) print("-" * 100)
def mlflow_run(self, df, r_name="Lab-1:RF Petrol Regression Experiment"): """ This method trains, computes metrics, and logs all metrics, parameters, and artifacts for the current run :param df: pandas dataFrame :param r_name: Name of the run as logged by MLflow :return: MLflow Tuple (ExperimentID, runID) """ with mlflow.start_run(run_name=r_name) as run: # get all feature independent attributes X = df.iloc[:, 0:4].values # get all the values of last columns, dependent variables, # which is what we want to predict as our values, the petrol consumption y = df.iloc[:, 4].values # create train and test data X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # Feature Scaling, though for RF is not necessary. # z = (X - u)/ s, where u is the man, s the standard deviation # get the handle to the transformer sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) # train and predict self.rf.fit(X_train, y_train) y_pred = self.rf.predict(X_test) # Log model and params using the MLflow sklearn APIs mlflow.sklearn.log_model(self.model, "random-forest-reg-model") mlflow.log_params(self.params) # compute metrics; r2 is a statistical measure of how well the # data fits the model: higher the value indicates better fit. mae = metrics.mean_absolute_error(y_test, y_pred) mse = metrics.mean_squared_error(y_test, y_pred) rsme = np.sqrt(mse) r2 = metrics.r2_score(y_test, y_pred) # Log metrics mlflow.log_metric("mae", mae) mlflow.log_metric("mse", mse) mlflow.log_metric("rsme", rsme) mlflow.log_metric("r2", r2) # update global class instance variable with values self.rsme.append(rsme) self.estimators.append(self._params["n_estimators"]) # plot graphs and save as artifacts (fig, ax) = Utils.plot_graphs(self.estimators, self.rsme, "Random Forest Estimators", "Root Mean Square", "Root Mean Square vs Estimators") # get current run and experiment id runID = run.info.run_uuid experimentID = run.info.experiment_id # create temporary artifact file name and log artifact temp_file_name = Utils.get_temporary_directory_path( "rsme_estimators-", ".png") temp_name = temp_file_name.name try: fig.savefig(temp_name) mlflow.log_artifact(temp_name, "rsme_estimators_plots") finally: temp_file_name.close() # Delete the temp file # print some data print("-" * 100) print( "Inside MLflow Run with run_id {} and experiment_id {}".format( runID, experimentID)) print("Estimator trees :", self.params["n_estimators"]) print('Mean Absolute Error :', mae) print('Mean Squared Error :', mse) print('Root Mean Squared Error:', rsme) print('R2 :', r2) return (experimentID, runID)