def register_model_to_azureml(ws: Workspace,
                              model_name: str,
                              assets_dir: str,
                              scores={},
                              description: str = ""):
    """Register the model and its assets to AzureML.

    Args:
        ws (Workspace): Workspace of the AzureML.
        model_name (str): Name to register the model under.
        assets_dir (str): Directory containing the model and other assets,
            like scalers, preprocessors, etc.
        scores (dict, optional): Dictionary containing scores of the model,
            such as accuracy and f1-score. Defaults to {}.
        description (str, optional): Description of the registered model.
            Defaults to "".

    Returns:
        (Model): The object containing the registered model on AzureML.
    """
    # register the model to AzureML
    logger.info("Register the model to AzureML")
    model = Model.register(
        model_path=assets_dir,
        model_name=model_name,
        tags={"Model": "XGBClassifier", "Scaler": "None"},
        properties=scores,
        description=description,
        workspace=ws,
        model_framework=Model.Framework.SCIKITLEARN,
        model_framework_version=sklearn.__version__,
    )
    logger.info(f"Model {model_name} registered at version {model.version}")
    return model
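
# Hedged usage sketch, not part of the original pipeline: one way this helper
# could be invoked interactively. Workspace.from_config(), the model name, the
# assets directory and the scores values below are illustrative assumptions only.
def _example_register_model():
    from azureml.core import Workspace

    ws = Workspace.from_config()  # assumes a local config.json for the workspace
    return register_model_to_azureml(
        ws,
        model_name="heart-disease-clf",             # hypothetical model name
        assets_dir="outputs/assets",                # hypothetical assets directory
        scores={"accuracy": 0.87, "recall": 0.84},  # illustrative scores only
        description="Heart Disease Classification Model",
    )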
def transform_infer_data(dataset: pd.DataFrame, aml_model_name: str,
                         preprocessor_name: str) -> pd.DataFrame:
    """Transform the dataset for inference.

    Downloads the registered Model from AzureML and loads the already fitted
    preprocessor to use for the data transformation.

    Args:
        dataset (pd.DataFrame): Dataset to transform.
        aml_model_name (str): The name of the Model registered on AzureML.
        preprocessor_name (str): The name of the preprocessor registered with
            the Model's assets on AzureML.

    Returns:
        (pd.DataFrame): The preprocessor's transformations applied to the dataframe.
    """
    aml_helper = AmlCustomHelper()
    try:
        # download registered model and assets
        logger.info("Download aml model")
        aml_model = Model(aml_helper.ws, aml_model_name)

        logger.info(f"aml_helper.ASSETS_DIR:\t{aml_helper.ASSETS_DIR}")
        aml_model.download(
            target_dir=f"{'/'.join(aml_helper.ASSETS_DIR.split('/')[0:-1])}",
            exist_ok=True,
        )

        # load preprocessor object from model assets
        logger.info("Load aml model's preprocessor")
        preprocessor = joblib.load(
            f"{aml_helper.ASSETS_DIR}/{preprocessor_name}")

        logger.info("Transform the dataset")
        transformed_dataset = preprocessor.transform(dataset,
                                                     is_inference=True)
        return transformed_dataset
    except (WebserviceException, ModelNotFoundException):
        logger.info(
            f"No previous registered model found with the name:\t{aml_model_name}")
        # force to fail the AzureML pipeline step
        raise AzureMLException(exception_message="Stopping the execution.")
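
# AmlCustomHelper is used throughout these steps but defined elsewhere. The
# class below is a hedged sketch of the interface the steps appear to rely on
# (a .ws Workspace handle and an ASSETS_DIR path); the attribute values and the
# workspace-resolution logic are assumptions for illustration only.
class _ExampleAmlCustomHelper:
    ASSETS_DIR = "outputs/assets"  # hypothetical local folder for model assets

    def __init__(self):
        from azureml.core import Workspace
        from azureml.core.run import Run, _OfflineRun

        run = Run.get_context()
        if isinstance(run, _OfflineRun):
            # running locally: fall back to a config.json on disk (assumption)
            self.ws = Workspace.from_config()
        else:
            # running inside a submitted step: reuse the run's workspace
            self.ws = run.experiment.workspace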
def train_model(
    X_train_path: str,
    y_train_path: str,
    output_model_path: str,
):
    """Train the classification model using X input variables and y target variable.

    Persists the trained model.

    Args:
        X_train_path (str): The path of the X dataset used as input for training the model.
        y_train_path (str): The path of the y dataset used as input for training the model.
        output_model_path (str): The path to persist the trained model.
    """
    # load dataset
    logger.info("Load the training datasets")
    X_train = pd.read_parquet(X_train_path)
    logger.info(f"X shape:\t{X_train.shape}")

    y_train = pd.read_parquet(y_train_path)
    logger.info(f"y shape:\t{y_train.shape}")

    # build the classification model
    model_clf = XGBClassifier(
        colsample_bytree=1,
        learning_rate=0.1,
        max_depth=4,
        min_child_weight=1e-05,
        n_estimators=200,
        objective="binary:logistic",
        subsample=0.5,
    )

    # fit the classification model
    model_clf.fit(X_train, y_train)
    logger.info(f"XGBClassifier model:\t{model_clf}")

    # persist the trained model
    logger.info("Persist the model")
    joblib.dump(model_clf, output_model_path)
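
# Hedged sketch, assuming these functions are used as entry points of AzureML
# pipeline steps: a minimal argparse wrapper for the training step. The
# argument names below are assumptions, not taken from the original pipeline
# definition.
def _example_train_step_entry():
    import argparse

    parser = argparse.ArgumentParser(description="Train the classifier")
    parser.add_argument("--x-train-path", required=True)
    parser.add_argument("--y-train-path", required=True)
    parser.add_argument("--output-model-path", required=True)
    args = parser.parse_args()

    train_model(
        X_train_path=args.x_train_path,
        y_train_path=args.y_train_path,
        output_model_path=args.output_model_path,
    )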
def upload_results_to_azureml_datastore(
    predictions: str,
    output_file_path: str,
):
    """Uploads the inference results file to the Azure Datastore.

    Args:
        predictions (str): Path to the predictions file to upload.
        output_file_path (str): Target path on the datastore for the inference data.
    """
    # get datastore according to env
    aml_helper = AmlCustomHelper()
    datastore = aml_helper.ws.get_default_datastore()

    # upload files to AzureBlobDatastore
    logger.info("Upload predictions...")
    datastore.upload_files(
        files=[predictions],
        target_path=output_file_path,
        overwrite=True,
        show_progress=True,
    )
def compare_models(
    model_path: str,
    X_test_path: str,
    y_test_path: str,
):
    """Compares the recall of the current model (new model) with the already
    registered model (old model).

    If the new model's score is not greater than or equal to the old model's,
    an error is raised to stop the execution, thus preventing the new model
    from being registered.

    Args:
        model_path (str): Path to the new model object.
        X_test_path (str): Path to the variables test dataset.
        y_test_path (str): Path to the target test dataset.

    Raises:
        AzureMLException: Error due to the new model having a worse recall
            than the old model.
    """
    aml_helper = AmlCustomHelper()
    try:
        # download old model
        logger.info("Download old model")
        old_model_aml = Model(aml_helper.ws, MODEL_NAME)

        logger.info(f"aml_helper.ASSETS_DIR:\t{aml_helper.ASSETS_DIR}")
        old_model_aml.download(
            target_dir=f"{'/'.join(aml_helper.ASSETS_DIR.split('/')[0:-1])}",
            exist_ok=True,
        )

        # load old model
        logger.info("Load old model")
        old_model = joblib.load(
            f"{aml_helper.ASSETS_DIR}/{model_path.split('/')[-1]}")
        logger.info(f"Old Model:\t{old_model}")

        # load new model
        logger.info("Load new model")
        new_model = joblib.load(model_path)
        logger.info(f"New Model:\t{new_model}")

        # load test dataset
        logger.info("Load the test datasets")
        X_test = pd.read_parquet(X_test_path)
        logger.info(f"X_test shape:\t{X_test.shape}")

        y_test = pd.read_parquet(y_test_path)
        logger.info(f"y_test shape:\t{y_test.shape}")

        # make predictions on the test set
        logger.info("Make predictions - Old Model")
        y_hat_old = old_model.predict(X_test)

        logger.info("Make predictions - New Model")
        y_hat_new = new_model.predict(X_test)

        # Recall
        logger.info("Calculates Recall")
        recall_old = recall_score(y_test, y_hat_old)
        recall_new = recall_score(y_test, y_hat_new)
        logger.info(f"Old model recall:\t{recall_old}")
        logger.info(f"New model recall:\t{recall_new}")

        if recall_old > recall_new:
            # force to fail the AzureML pipeline step
            raise AzureMLException(
                exception_message=(
                    "The new model scored lower than the old model. "
                    "Thus, we will not proceed with the new model registration."
                ))
    except (WebserviceException, ModelNotFoundException):
        logger.info(
            f"No previous registered model found with the name:\t{MODEL_NAME}")
def make_predictions(
    transformed_data_path: str,
    original_data_path: str,
    inference_path: str,
):
    """Loads the already transformed dataset and makes the predictions.

    Args:
        transformed_data_path (str): Path to the transformed dataset, ready to
            be fed to the model for prediction.
        original_data_path (str): Path to the original dataset.
        inference_path (str): Path to persist the predictions.
    """
    # helper
    aml_helper = AmlCustomHelper()

    # load transformed dataset
    logger.info("Load the transformed dataset")
    X = pd.read_parquet(transformed_data_path)
    logger.info(f"X shape:\t{X.shape}")

    # load original dataset
    logger.info("Load the original dataset")
    original_data = pd.read_parquet(original_data_path)
    logger.info(f"original_data shape:\t{original_data.shape}")

    # download registered model
    logger.info("Download AzureML model")
    az_model = Model(aml_helper.ws, MODEL_NAME)

    logger.info(f"aml_helper.ASSETS_DIR:\t{aml_helper.ASSETS_DIR}")
    az_model.download(
        target_dir=f"{'/'.join(aml_helper.ASSETS_DIR.split('/')[0:-1])}",
        exist_ok=True,
    )

    # load model
    logger.info("Load AzureML model")
    model = joblib.load(f"{aml_helper.ASSETS_DIR}/{MODEL_NAME}")
    logger.info(f"Model:\t{model}")

    # batch inference - get predictions
    pred = model.predict(X)
    logger.info(f"type:\t{type(pred)}")
    logger.info(f"pred.shape:\t{pred.shape}")

    # transform the predictions to a DataFrame
    pred = pd.DataFrame(pred, columns=["prediction"])
    logger.info(pred.head())

    # concat the predictions with the original dataset
    df_pred_and_orig = pd.concat([original_data, pred], axis=1)
    logger.info(f"df_pred_and_orig.head():\t{df_pred_and_orig.head()}")

    # persist predictions
    logger.info("Persist results")
    df_pred_and_orig.to_csv(inference_path)
def evaluate_model(
    X_test_path: str,
    y_test_path: str,
    model_path: str,
    model_score_path: str,
):
    """Evaluate the model using metrics such as accuracy and f1-score and
    persist the generated metrics as an object. Also logs these metrics to AzureML.

    Args:
        X_test_path (str): Path to the X input for model evaluation.
        y_test_path (str): Path to the y input for model evaluation.
        model_path (str): Path to the model to be evaluated.
        model_score_path (str): Path to persist the evaluation metrics of the model.
    """
    # load dataset
    logger.info("Load the test datasets")
    X_test = pd.read_parquet(X_test_path)
    logger.info(f"X_test shape:\t{X_test.shape}")

    y_test = pd.read_parquet(y_test_path)
    logger.info(f"y_test shape:\t{y_test.shape}")

    # load the model
    model = joblib.load(model_path)

    # make predictions on the test set
    logger.info("Make predictions")
    y_hat = model.predict(X_test)

    scores = {}

    # model accuracy
    logger.info("Calculates accuracy")
    acc_score = accuracy_score(y_test, y_hat)
    scores["accuracy"] = acc_score

    # F1-Score Macro
    logger.info("Calculates F1-Score Macro")
    f1_score_macro = f1_score(y_test, y_hat, average="macro")
    scores["f1_score_macro"] = f1_score_macro

    # Recall
    logger.info("Calculates Recall")
    recall = recall_score(y_test, y_hat)
    scores["recall"] = recall

    logger.info(f"scores:\t{scores}")

    # Confusion Matrix
    logger.info("Calculates Confusion Matrix")
    cm = confusion_matrix(y_test, y_hat)

    data_prep = DatasetPreprocessor()
    cm_json = {
        "schema_type": "confusion_matrix",
        # "schema_version": "v1",
        "data": {
            "class_labels": data_prep.target_col_description,
            "matrix": [[int(y) for y in x] for x in cm],
        },
    }

    # plot the confusion matrix
    logger.info("Plot Confusion Matrix")
    cm_plot = plot_confusion_matrix(cm, data_prep.target_col_description,
                                    "Heart Disease")

    # Log to AzureML
    run = Run.get_context()
    if not isinstance(run, _OfflineRun):
        parent_run = run.parent
        parent_run.log(name="Accuracy", value=acc_score)
        parent_run.log(name="F1-Score Macro", value=f1_score_macro)
        parent_run.log(name="Recall", value=recall)
        parent_run.log_confusion_matrix(name="Confusion Matrix", value=cm_json)
        parent_run.log_image(name="Confusion Matrix", plot=cm_plot)
    else:
        logger.info("Offline run")
        run.log(name="Accuracy", value=acc_score)
        run.log(name="F1-Score Macro", value=f1_score_macro)
        run.log(name="Recall", value=recall)
        run.log(name="Confusion Matrix", value=cm)

    # persist scores
    logger.info("Persist scores")
    joblib.dump(scores, model_score_path)
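
# plot_confusion_matrix is called above but defined elsewhere. Below is a
# hedged matplotlib sketch of what such a helper could look like; the signature
# mirrors the call above, while the implementation itself is an assumption, not
# the original code.
def _example_plot_confusion_matrix(cm, class_labels, title):
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    im = ax.imshow(cm, cmap="Blues")
    ax.set_xticks(range(len(class_labels)))
    ax.set_yticks(range(len(class_labels)))
    ax.set_xticklabels(class_labels, rotation=45, ha="right")
    ax.set_yticklabels(class_labels)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title(title)
    # annotate each cell with its count
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, int(cm[i, j]), ha="center", va="center")
    fig.colorbar(im, ax=ax)
    fig.tight_layout()
    return fig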
def validate_input_data(input_file_path: str):
    """Validates the expected input used for the training.

    Args:
        input_file_path (str): Path used to read the input data for training.
    """
    # get datastore according to env
    aml_helper = AmlCustomHelper()

    # datastore that points to a storage account
    datastore = aml_helper.ws.get_default_datastore()
    logger.info(f"Datastore:\t{datastore}")

    # load dataset
    logger.info("Loading data...")

    # set column data types
    data_types = {
        "age": DataType.to_long(),
        "sex": DataType.to_string(),
        "cp": DataType.to_string(),
        "trestbps": DataType.to_long(),
        "chol": DataType.to_long(),
        "fbs": DataType.to_string(),
        "restecg": DataType.to_string(),
        "thalach": DataType.to_long(),
        "exang": DataType.to_string(),
        "oldpeak": DataType.to_float(),
        "slope": DataType.to_string(),
        "ca": DataType.to_string(),
        "thal": DataType.to_string(),
    }

    # Create a TabularDataset to represent tabular data in parquet files,
    # reading the file located on the provided datastore
    df_data = Dataset.Tabular.from_parquet_files(
        path=[(datastore, input_file_path)],
        set_column_types=data_types).to_pandas_dataframe()

    logger.info(f"Loaded data shape:\t{df_data.shape}")
    logger.info(f"Loaded data info:\t{df_data.info()}")
    logger.info(f"Loaded data first 5 rows:\t{df_data.head(5)}")

    # Validate the input dataset
    logger.info("Validates the input dataset")
    data_prep = DatasetPreprocessor()
    data_prep.validate_data(df_data, is_inference=True)
def register_new_model(
    model_path: str,
    score_path: str,
    preprocessor_path: str,
    labels_path: str,
):
    """Gets all the model's assets and registers the new model and its assets
    as an AzureML Model.

    Args:
        model_path (str): Path pointing to the model object.
        score_path (str): Path pointing to the score object.
        preprocessor_path (str): Path pointing to the preprocessor object.
        labels_path (str): Path pointing to the class labels object.
    """
    logger.info("Proceeding with registration of new model on AzureML")

    # load current model scores
    logger.info("Load the current model scores")
    scores = joblib.load(score_path)

    # get names
    logger.info("Get names")
    model_name = model_path.split("/")[-1]
    preprocessor_name = preprocessor_path.split("/")[-1].split(".")[0]
    score_name = score_path.split("/")[-1].split(".")[0]
    labels_name = labels_path.split("/")[-1].split(".")[0]
    logger.info("Model_name:\t{}".format(model_name))
    logger.info("Preprocessor_name:\t{}".format(preprocessor_name))
    logger.info("Score_name:\t{}".format(score_name))
    logger.info("Labels_name:\t{}".format(labels_name))

    # copy files to ASSETS_DIR
    aml_helper = AmlCustomHelper()
    if os.path.exists(aml_helper.ASSETS_DIR) and os.path.isdir(
            aml_helper.ASSETS_DIR):
        shutil.rmtree(aml_helper.ASSETS_DIR)
        logger.info(f"Deleted previous folder found: {aml_helper.ASSETS_DIR}")

    logger.info(f"Copy files to: {aml_helper.ASSETS_DIR}")
    os.makedirs(aml_helper.ASSETS_DIR, exist_ok=True)
    shutil.copyfile(model_path,
                    os.path.join(aml_helper.ASSETS_DIR, model_name))
    shutil.copyfile(
        preprocessor_path,
        os.path.join(aml_helper.ASSETS_DIR, preprocessor_name),
    )
    shutil.copyfile(score_path,
                    os.path.join(aml_helper.ASSETS_DIR, score_name))
    shutil.copyfile(labels_path,
                    os.path.join(aml_helper.ASSETS_DIR, labels_name))

    # Register the model on AzureML
    register_model_to_azureml(
        aml_helper.ws,
        MODEL_NAME,
        aml_helper.ASSETS_DIR,
        scores,
        description="Heart Disease Classification Model",
    )
def preprocess_input_data(
    input_file_path: str,
    preprocessor: str,
    labels: str,
    output_data_X_train: str,
    output_data_y_train: str,
    output_data_X_test: str,
    output_data_y_test: str,
):
    """Prepare the data for the model training and model evaluation.

    Fits a preprocessor (for later use during inference) and applies it to the
    dataset to transform it into the expected input for the model.

    Args:
        input_file_path (str): Path to the input file to be loaded.
        preprocessor (str): Path to persist the fitted preprocessor.
        labels (str): Path to persist the class labels.
        output_data_X_train (str): Path to persist X_train.
        output_data_y_train (str): Path to persist y_train.
        output_data_X_test (str): Path to persist X_test.
        output_data_y_test (str): Path to persist y_test.
    """
    # get datastore
    aml_helper = AmlCustomHelper()
    datastore = aml_helper.ws.get_default_datastore()

    # load dataset
    logger.info("Loading data...")

    # set column data types
    data_types = {
        "age": DataType.to_long(),
        "sex": DataType.to_string(),
        "cp": DataType.to_string(),
        "trestbps": DataType.to_long(),
        "chol": DataType.to_long(),
        "fbs": DataType.to_string(),
        "restecg": DataType.to_string(),
        "thalach": DataType.to_long(),
        "exang": DataType.to_string(),
        "oldpeak": DataType.to_float(),
        "slope": DataType.to_string(),
        "ca": DataType.to_string(),
        "thal": DataType.to_string(),
        "target": DataType.to_long(),
    }

    # Create a TabularDataset to represent tabular data in delimited files
    df_data = Dataset.Tabular.from_delimited_files(
        path=[(datastore, input_file_path)],
        set_column_types=data_types).to_pandas_dataframe()

    logger.info(f"Loaded data shape:\t{df_data.shape}")
    logger.info(f"Loaded data info:\t{df_data.info()}")
    logger.info(f"Loaded data first rows:\t{df_data.head(5)}")

    # fit the preprocessor on the input data
    logger.info("Fit the input data to the preprocessor")
    data_prep = DatasetPreprocessor()
    data_prep.fit(df_data)

    # apply the transformations on the input dataset
    logger.info("Apply the transformations on the input dataset")
    logger.info(f"Before transformations: {df_data.shape}")
    output_df = data_prep.transform(df_data, is_inference=False)
    logger.info(f"After transformations: {output_df.shape}")
    logger.info(f"After transformations: {output_df.info()}")

    # split training and target features
    logger.info("Split training and target features")
    X = output_df.drop(columns=[data_prep.target_col])
    y = output_df[[data_prep.target_col]]

    # split train and test dataset
    sss = StratifiedShuffleSplit(n_splits=2, test_size=0.3, random_state=123)
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # reset the shuffled indexes
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    logger.info(f"X_train:\t{X_train.shape}")
    logger.info(f"X_test:\t{X_test.shape}")
    logger.info(f"y_train:\t{y_train.shape}")
    logger.info(f"y_test:\t{y_test.shape}")

    # persist the train outputs
    logger.info("Persist the train outputs")
    X_train.to_parquet(output_data_X_train)
    y_train.to_parquet(output_data_y_train)

    # persist the test outputs
    logger.info("Persist the test outputs")
    X_test.to_parquet(output_data_X_test)
    y_test.to_parquet(output_data_y_test)

    # persist the fitted preprocessor
    logger.info("Persist the fitted preprocessor")
    joblib.dump(data_prep, preprocessor)

    # persist the class labels
    logger.info("Persist class labels")
    label_classes = {
        0: "absence of heart disease",
        1: "presence of heart disease",
    }
    joblib.dump(label_classes, labels)
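
# DatasetPreprocessor is instantiated in several steps but defined elsewhere.
# The skeleton below is a hedged sketch of the interface these steps appear to
# rely on (fit/transform with an is_inference flag, validate_data, target_col
# and target_col_description); the attribute values and method bodies are
# assumptions for illustration only.
class _ExampleDatasetPreprocessor:
    target_col = "target"
    target_col_description = [
        "absence of heart disease",
        "presence of heart disease",
    ]

    def fit(self, df):
        # learn encoders/scalers from the raw training dataframe
        raise NotImplementedError

    def transform(self, df, is_inference=False):
        # apply the fitted transformations; keep the target column only when training
        raise NotImplementedError

    def validate_data(self, df, is_inference=False):
        # check schema, dtypes and value ranges before training or inference
        raise NotImplementedError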
def preprocess_input_data(input_file_path: str, transformed_data_path: str,
                          original_data_path: str):
    """Preprocess the dataset, making it ready for inference.

    The preprocessor is loaded from the model assets and its transformations
    are applied to the dataset so that it is ready to be fed to the model.

    Args:
        input_file_path (str): Path to the input file to be loaded.
        transformed_data_path (str): Path to persist the transformed data.
        original_data_path (str): Path to persist the original data.
    """
    # get datastore
    aml_helper = AmlCustomHelper()
    datastore = aml_helper.ws.get_default_datastore()

    # load dataset
    logger.info("Loading data...")

    # set column data types
    data_types = {
        "age": DataType.to_long(),
        "sex": DataType.to_string(),
        "cp": DataType.to_string(),
        "trestbps": DataType.to_long(),
        "chol": DataType.to_long(),
        "fbs": DataType.to_string(),
        "restecg": DataType.to_string(),
        "thalach": DataType.to_long(),
        "exang": DataType.to_string(),
        "oldpeak": DataType.to_float(),
        "slope": DataType.to_string(),
        "ca": DataType.to_string(),
        "thal": DataType.to_string(),
    }

    # Create a TabularDataset to represent tabular data in parquet files
    df_data = Dataset.Tabular.from_parquet_files(
        path=[(datastore, input_file_path)],
        set_column_types=data_types).to_pandas_dataframe()

    logger.info(f"Loaded data shape:\t{df_data.shape}")
    logger.info(f"Loaded data info:\t{df_data.info()}")
    logger.info(f"Loaded data first rows:\t{df_data.head(5)}")

    # transform dataset
    transformed_data = transform_infer_data(df_data, MODEL_NAME,
                                            PREPROCESSOR_NAME)

    logger.info(f"Preprocessed data shape:\t{transformed_data.shape}")
    logger.info(f"Preprocessed data info:\t{transformed_data.info()}")
    logger.info(f"Preprocessed data first rows:\t{transformed_data.head(5)}")

    # save the transformed dataset to parquet
    logger.info("Persist the transformed dataset")
    transformed_data.to_parquet(transformed_data_path)

    # save the original data
    logger.info("Persist the original dataset")
    df_data.to_parquet(original_data_path)