Example #1
def register_model_to_azureml(ws: Workspace,
                              model_name: str,
                              assets_dir: str,
                              scores: dict = {},
                              description: str = ""):
    """Register the model and its assets to AzureML.

    Args:
        ws (Workspace): The AzureML Workspace.
        model_name (str): Name under which to register the model.
        assets_dir (str): Directory containing the model and other assets, such as scalers and preprocessors.
        scores (dict, optional): Dictionary of model scores, such as accuracy and f1-score. Defaults to {}.
        description (str, optional): Description of the registered model. Defaults to "".

    Returns:
        (Model): The registered Model object on AzureML.
    """
    # register the model to AzureML
    logger.info("Register the model to AzureML")
    model = Model.register(
        model_path=assets_dir,
        model_name=model_name,
        tags={
            "Model": "XGBClassifier",
            "Scaler": "None"
        },
        properties=scores,
        description=description,
        workspace=ws,
        model_framework=Model.Framework.SCIKITLEARN,
        model_framework_version=sklearn.__version__,
    )
    logger.info(f"Model {model_name} registered at version {model.version}")

    return model
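
A minimal usage sketch of the registration helper, assuming a local AzureML config.json and hypothetical model name, assets directory, and scores produced by earlier pipeline steps:

from azureml.core import Workspace

ws = Workspace.from_config()  # assumes a config.json is available locally
example_scores = {"accuracy": 0.87, "recall": 0.84}  # hypothetical metrics
registered = register_model_to_azureml(
    ws,
    model_name="heart-disease-clf",  # hypothetical model name
    assets_dir="outputs/assets",     # hypothetical assets directory
    scores=example_scores,
    description="Heart Disease Classification Model",
)
print(registered.name, registered.version)
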
def transform_infer_data(dataset: pd.DataFrame, aml_model_name: str,
                         preprocessor_name: str) -> pd.DataFrame:
    """Transform the dataset for inference. Downloads the registered Model on AzureML and load the already
    fit preprocessor to use for the data transformation.

    Args:
        dataset (pd.DataFrame): Dataset to transform.
        aml_model_name (str): The name of the Model registered on AzureML.
        preprocessor_name (str): The name of the preprocessor registered with the Model's assets on AzureML.

    Returns:
        (pd.DataFrame): The preprocessor's transformations applied to the dataframe.
    """
    aml_helper = AmlCustomHelper()
    try:
        # download registered model and assets
        logger.info("Download aml model")
        aml_model = Model(aml_helper.ws, aml_model_name)

        logger.info(f"aml_helper.ASSETS_DIR:\t{aml_helper.ASSETS_DIR}")
        aml_model.download(
            target_dir=f"{'/'.join(aml_helper.ASSETS_DIR.split('/')[0:-1])}",
            exist_ok=True,
        )

        # load preprocessor object from model assets
        logger.info("Load aml model's preprocessor")
        preprocessor = joblib.load(
            f"{aml_helper.ASSETS_DIR}/{preprocessor_name}")

        logger.info(f"Transform the dataset")
        transformed_dataset = preprocessor.transform(dataset,
                                                     is_inference=True)

        return transformed_dataset

    except (WebserviceException, ModelNotFoundException):
        logger.info(
            f"No previous registered model found with the name:\t{aml_model_name}")
        # force to fail the AzureML pipeline step
        raise AzureMLException(exception_message="Stopping the execution.")
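
A short call sketch for this helper, assuming MODEL_NAME and PREPROCESSOR_NAME are defined at module level (as they are in the inference preprocessing step further below) and a hypothetical input file:

raw_df = pd.read_parquet("data/new_patients.parquet")  # hypothetical input path
ready_df = transform_infer_data(raw_df, MODEL_NAME, PREPROCESSOR_NAME)
logger.info(f"Transformed shape:\t{ready_df.shape}")
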
Example #3
def train_model(
    X_train_path: str,
    y_train_path: str,
    output_model_path: str,
):
    """Train the classification model using X input variables and y target variable.
    Persist the trained model.

    Args:
        X_train_path (str): The path for X dataset as input for training the model.
        y_train_path (str): The path for y dataset as input for training the model.
        output_model_path (str): The path to persist the trained model.
    """
    # load dataset
    logger.info("Load the training datasets")
    X_train = pd.read_parquet(X_train_path)
    logger.info(f"X shape:\t{X_train.shape}")

    y_train = pd.read_parquet(y_train_path)
    logger.info(f"y shape:\t{y_train.shape}")

    # build the classification model
    model_clf = XGBClassifier(
        colsample_bytree=1,
        learning_rate=0.1,
        max_depth=4,
        min_child_weight=1e-05,
        n_estimators=200,
        objective="binary:logistic",
        subsample=0.5,
    )

    # fit the classification model
    model_clf.fit(X_train, y_train)
    logger.info(f"XGBClassifier model:\t{model_clf}")

    # persist the trained model
    logger.info("Persist the model")
    joblib.dump(model_clf, output_model_path)
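
When this step runs as a stand-alone script inside an AzureML pipeline, it is typically driven by a small CLI wrapper; the sketch below is an assumption about that wrapper, not code from the source:

if __name__ == "__main__":
    import argparse

    # hypothetical argument parsing for running train_model as a pipeline step script
    parser = argparse.ArgumentParser()
    parser.add_argument("--x-train", required=True)
    parser.add_argument("--y-train", required=True)
    parser.add_argument("--output-model", required=True)
    args = parser.parse_args()

    train_model(args.x_train, args.y_train, args.output_model)
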
def upload_results_to_azureml_datastore(
    predictions: str,
    output_file_path: str,
):
    """Uploads the inference result file to the Azure Datastore.

    Args:
        predictions (str): Path to the predictions file to upload.
        output_file_path (str): Target path on the datastore to upload the inference results to.
    """

    # get datastore according to env
    aml_helper = AmlCustomHelper()
    datastore = aml_helper.ws.get_default_datastore()

    # upload files to AzureBlobDatastore
    logger.info("Upload predictions...")
    datastore.upload_files(
        files=[predictions],
        target_path=output_file_path,
        overwrite=True,
        show_progress=True,
    )
def compare_models(
    model_path: str,
    X_test_path: str,
    y_test_path: str,
):
    """Compares the recall of the current model (new model) with the already registered model (old model).
    If the final model's score is NOT greater or equals than the old model's, an Error is raised stopping the execution,
    thus preventing the new model from being registered.

    Args:
        model_path (str): Path to the new model object.
        X_test_path (str): Path to the variables test dataset.
        y_test_path (str): Path to the target test dataset.

    Raises:
        AzureMLException: Raised when the new model has a worse recall than the old model.
    """
    aml_helper = AmlCustomHelper()
    try:
        # download old model
        logger.info("Download old model")
        old_model_aml = Model(aml_helper.ws, MODEL_NAME)

        logger.info(f"aml_helper.ASSETS_DIR:\t{aml_helper.ASSETS_DIR}")
        old_model_aml.download(
            target_dir=f"{'/'.join(aml_helper.ASSETS_DIR.split('/')[0:-1])}",
            exist_ok=True,
        )

        # load old model
        logger.info("Load old model")
        old_model = joblib.load(
            f"{aml_helper.ASSETS_DIR}/{model_path.split('/')[-1]}")
        logger.info(f"Old Model:\t{old_model}")

        # load new model
        logger.info("Load new model")
        new_model = joblib.load(model_path)
        logger.info(f"New Model:\t{new_model}")

        # load test dataset
        logger.info("Load the test datasets")
        X_test = pd.read_parquet(X_test_path)
        logger.info(f"X_test shape:\t{X_test.shape}")

        y_test = pd.read_parquet(y_test_path)
        logger.info(f"y_test shape:\t{y_test.shape}")

        # make predictions on the test set
        logger.info("Make predictions - Old Model")
        y_hat_old = old_model.predict(X_test)

        logger.info("Make predictions - Old Model")
        y_hat_new = new_model.predict(X_test)

        # Recall
        logger.info("Calculates Recall")
        recall_old = recall_score(y_test, y_hat_old)
        recall_new = recall_score(y_test, y_hat_new)
        logger.info(f"Old model recall:\t{recall_old}")
        logger.info(f"New model recall:\t{recall_new}")

        if recall_old > recall_new:
            # force to fail the AzureML pipeline step
            raise AzureMLException(
                exception_message=
                "The new model scored lower than the old model. Thus, we will not proceed with registering the new model."
            )

    except (WebserviceException, ModelNotFoundException):
        logger.info(
            f"No previous registered model found with the name:\t{MODEL_NAME}")
def make_predictions(
    transformed_data_path: str,
    original_data_path: str,
    inference_path: str,
):
    """Loads the datasets already transformed and make the predictions.

    Args:
        transformed_data_path (str): Path to the transformed dataset ready to be
        feed to the model for prediction.
        original_data_path (str): Path to the original dataset.
        inference_path (str): Path to the predictions.
    """
    # helper
    aml_helper = AmlCustomHelper()

    # load dataset
    logger.info("Load the training datasets")
    X = pd.read_parquet(transformed_data_path)
    logger.info(f"X shape:\t{X.shape}")

    # load original dataset
    logger.info("Load the original datasets")
    original_data = pd.read_parquet(original_data_path)
    logger.info(f"original_data shape:\t{original_data.shape}")

    # download registered model
    logger.info("Download Azureml model")
    az_model = Model(aml_helper.ws, MODEL_NAME)

    logger.info(f"aml_helper.ASSETS_DIR:\t{aml_helper.ASSETS_DIR}")
    az_model.download(
        target_dir=f"{'/'.join(aml_helper.ASSETS_DIR.split('/')[0:-1])}",
        exist_ok=True,
    )

    # load model
    logger.info("Load Azureml model")
    model = joblib.load(f"{aml_helper.ASSETS_DIR}/{MODEL_NAME}")
    logger.info(f"Model:\t{model}")

    # batch inference - get predictions
    pred = model.predict(X)

    logger.info(f"type:\t{type(pred)}")
    logger.info(f"pred.shape:\t{pred.shape}")

    # transform the predictions to DataFrame
    pred = pd.DataFrame(pred, columns=["prediction"])
    logger.info(pred.head())

    # concat the predictions with original dataset
    df_pred_and_orig = pd.concat([original_data, pred], axis=1)
    logger.info(f"df_pred_and_orig.head():\t{df_pred_and_orig.head()}")

    # persist predictions
    logger.info("persist results")
    df_pred_and_orig.to_csv(inference_path)
def evaluate_model(
    X_test_path: str,
    y_test_path: str,
    model_path: str,
    model_score_path: str,
):
    """Evaluate the model using metrics such as accuracy and f1-score and persists these metrics generated as an object.
    Also logs these metrics to AzureML.

    Args:
        X_test_path (str): Path to X input for model evaluation
        y_test_path (str): Path to y input for model evaluation
        model_path (str): Path to model to be evaluated
        model_score_path (str): Path to persist the evaluation metrics of the model
    """
    # load dataset
    logger.info("Load the test datasets")

    X_test = pd.read_parquet(X_test_path)
    logger.info(f"X_test shape:\t{X_test.shape}")

    y_test = pd.read_parquet(y_test_path)
    logger.info(f"y_test shape:\t{y_test.shape}")

    # load the model
    model = joblib.load(model_path)

    # make predictions on the test set
    logger.info("Make predictions")
    y_hat = model.predict(X_test)

    scores = {}
    # model accuracy
    logger.info("Calculates accuracy")
    acc_score = accuracy_score(y_test, y_hat)
    scores["accuracy"] = acc_score

    # F1-Score Macro
    logger.info("Calculates F1-Score Macro")
    f1_score_macro = f1_score(y_test, y_hat, average="macro")
    scores["f1_score_macro"] = f1_score_macro

    # Recall
    logger.info("Calculates Recall")
    recall = recall_score(y_test, y_hat)
    scores["recall"] = recall

    logger.info(f"scores:\t{scores}")

    # Confusion Matrix
    logger.info("Calculates Confusion Matrix")
    cm = confusion_matrix(y_test, y_hat)
    data_prep = DatasetPreprocessor()
    cm_json = {
        "schema_type": "confusion_matrix",
        #    "schema_version": "v1",
        "data": {
            "class_labels": data_prep.target_col_description,
            "matrix": [[int(y) for y in x] for x in cm],
        },
    }

    # plot the confusion matrix
    logger.info("Plot Confusion Matrix")
    cm_plot = plot_confusion_matrix(cm, data_prep.target_col_description,
                                    "Heart Disease")

    # Log to AzureML
    run = Run.get_context()
    if not isinstance(run, _OfflineRun):
        parent_run = run.parent
        parent_run.log(name="Accuracy", value=acc_score)
        parent_run.log(name="F1-Score Macro", value=f1_score_macro)
        parent_run.log(name="Recall", value=recall)
        parent_run.log_confusion_matrix(name="Confusion Matrix", value=cm_json)
        parent_run.log_image(name="Confusion Matrix", plot=cm_plot)
    else:
        logger.info("Offline run")
        run.log(name="Accuracy", value=acc_score)
        run.log(name="F1-Score Macro", value=f1_score_macro)
        run.log(name="Recall", value=recall)
        run.log(name="Confusion Matrix", value=cm)

    # persist scores
    logger.info(f"persist scores")
    joblib.dump(scores, model_score_path)
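
An illustrative call with hypothetical paths; the persisted scores object can be loaded back with joblib:

evaluate_model(
    X_test_path="outputs/X_test.parquet",  # hypothetical paths
    y_test_path="outputs/y_test.parquet",
    model_path="outputs/model.pkl",
    model_score_path="outputs/scores.pkl",
)
scores = joblib.load("outputs/scores.pkl")
# e.g. {"accuracy": 0.87, "f1_score_macro": 0.86, "recall": 0.84}  (illustrative values)
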
Example #8
def validate_input_data(input_file_path: str):
    """Validates the expected input used for the training.

    Args:
        input_file_path (str): Path used to read the input data for training.
    """
    # get datastore according to env
    aml_helper = AmlCustomHelper()
    # datastore that points to a storage account
    datastore = aml_helper.ws.get_default_datastore()
    logger.info(f"Datastore:\t{datastore}")

    # load dataset
    logger.info(f"Loading data...")

    # set column data types
    data_types = {
        "age": DataType.to_long(),
        "sex": DataType.to_string(),
        "cp": DataType.to_string(),
        "trestbps": DataType.to_long(),
        "chol": DataType.to_long(),
        "fbs": DataType.to_string(),
        "restecg": DataType.to_string(),
        "thalach": DataType.to_long(),
        "exang": DataType.to_string(),
        "oldpeak": DataType.to_float(),
        "slope": DataType.to_string(),
        "ca": DataType.to_string(),
        "thal": DataType.to_string(),
    }

    # Create a TabularDataset to represent tabular data in delimited files
    # read the file located on the datastore provided
    df_data = Dataset.Tabular.from_parquet_files(
        path=[(datastore, input_file_path)], set_column_types=data_types
    ).to_pandas_dataframe()

    logger.info(f"Loaded data shape:\t{df_data.shape}")
    logger.info(f"Loaded data info:\t{df_data.info()}")
    logger.info(f"Loaded data first 5 rows:\t{df_data.head(5)}")

    # Validates the input dataset
    logger.info("Validates the input dataset")
    data_prep = DatasetPreprocessor()
    data_prep.validate_data(df_data, is_inference=True)
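
For example, with a hypothetical datastore-relative path:

validate_input_data("heart_disease/raw/new_patients.parquet")  # hypothetical path on the default datastore
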
Example #9
def register_new_model(
    model_path: str,
    score_path: str,
    preprocessor_path: str,
    labels_path: str,
):
    """Gets all the model's assets and register the new model and it's assets to as an AzureML Model.

    Args:
        model_path (str): Path pointing to the model object.
        score_path (str): Path pointing to the score object.
        preprocessor_path (str): Path pointing to the preprocessor object.
        labels_path (str): Path pointing to the class labels object.

    Raises:
        AzureMLException: Error due to the final model having a worst f1-score than the old model.
    """
    logger.info("Proceeding with registration of new model on AzureML")

    # load current model scores
    logger.info("load the current model scores")
    scores = joblib.load(score_path)

    # get names
    logger.info("Get names")
    model_name = model_path.split("/")[-1]
    preprocessor_name = preprocessor_path.split("/")[-1].split(".")[0]
    score_name = score_path.split("/")[-1].split(".")[0]
    labels_name = labels_path.split("/")[-1].split(".")[0]

    logger.info("Model_name:\t{}".format(model_name))
    logger.info("Preprocessor_name:\t{}".format(preprocessor_name))
    logger.info("Score_name:\t{}".format(score_name))
    logger.info("Labels_name:\t{}".format(labels_name))

    # copyfiles to ASSETS_DIR
    aml_helper = AmlCustomHelper()
    if os.path.exists(aml_helper.ASSETS_DIR) and os.path.isdir(
            aml_helper.ASSETS_DIR):
        shutil.rmtree(aml_helper.ASSETS_DIR)
        logger.info(f"Deleted previous folder found: {aml_helper.ASSETS_DIR}")

    logger.info(f"Copy files to: {aml_helper.ASSETS_DIR}")
    os.makedirs(aml_helper.ASSETS_DIR, exist_ok=True)
    shutil.copyfile(model_path, os.path.join(aml_helper.ASSETS_DIR,
                                             model_name))
    shutil.copyfile(
        preprocessor_path,
        os.path.join(aml_helper.ASSETS_DIR, preprocessor_name),
    )
    shutil.copyfile(score_path, os.path.join(aml_helper.ASSETS_DIR,
                                             score_name))
    shutil.copyfile(labels_path,
                    os.path.join(aml_helper.ASSETS_DIR, labels_name))

    # Register the model on AzureML
    register_model_to_azureml(
        aml_helper.ws,
        MODEL_NAME,
        aml_helper.ASSETS_DIR,
        scores,
        description="Heart Disease Classification Model",
    )
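
An illustrative call with hypothetical paths produced by the earlier training, evaluation, and preprocessing steps:

register_new_model(
    model_path="outputs/model.pkl",  # hypothetical paths
    score_path="outputs/scores.pkl",
    preprocessor_path="outputs/preprocessor.pkl",
    labels_path="outputs/labels.pkl",
)
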
def preprocess_input_data(
    input_file_path: str,
    preprocessor: str,
    labels: str,
    output_data_X_train: str,
    output_data_y_train: str,
    output_data_X_test: str,
    output_data_y_test: str,
):
    """Prepare the data for the model training and model evaluation. Fit a preprocessor (for later use when inferencing)
    and applies it to the dataset to transform it to the expected input for the model.

    Args:
        input_file_path (str): Path to the input file to be loaded.
        preprocessor (str): Path to persist the fit preprocessor.
        labels (str): Path to persist the class labels.
        output_data_X_train (str): Path to persist X_train.
        output_data_y_train (str): Path to persist y_train.
        output_data_X_test (str): Path to persist X_test.
        output_data_y_test (str): Path to persist y_test.
    """
    # get datastore
    aml_helper = AmlCustomHelper()
    datastore = aml_helper.ws.get_default_datastore()

    # load dataset
    logger.info(f"Loading data..")
    # set column data types
    data_types = {
        "age": DataType.to_long(),
        "sex": DataType.to_string(),
        "cp": DataType.to_string(),
        "trestbps": DataType.to_long(),
        "chol": DataType.to_long(),
        "fbs": DataType.to_string(),
        "restecg": DataType.to_string(),
        "thalach": DataType.to_long(),
        "exang": DataType.to_string(),
        "oldpeak": DataType.to_float(),
        "slope": DataType.to_string(),
        "ca": DataType.to_string(),
        "thal": DataType.to_string(),
        "target": DataType.to_long(),
    }

    # Create a TabularDataset to represent tabular data in delimited files
    df_data = Dataset.Tabular.from_delimited_files(
        path=[(datastore, input_file_path)],
        set_column_types=data_types).to_pandas_dataframe()

    logger.info(f"Loaded data shape:\t{df_data.shape}")
    logger.info(f"Loaded data info:\t{df_data.info()}")
    logger.info(f"Loaded data first rows:\t{df_data.head(5)}")

    logger.info("Fit the input data to the preprocessor")
    data_prep = DatasetPreprocessor()
    data_prep.fit(df_data)

    # apply the transformations on the input dataset
    logger.info("apply the transformations on the input dataset")
    logger.info(f"before transformations: {df_data.shape}")

    output_df = data_prep.transform(df_data, is_inference=False)

    logger.info(f"after transformations: {output_df.shape}")
    logger.info(f"after transformations: {output_df.info()}")

    # split training and target features
    logger.info("split training and target features")
    X = output_df.drop(columns=[data_prep.target_col])
    y = output_df[[data_prep.target_col]]

    # split train and test dataset
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=123)

    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # reset the shuffled indexes
    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    print(f"X_train:\t{X_train.shape}")
    print(f"X_test:\t{X_test.shape}")
    print(f"y_train:\t{y_train.shape}")
    print(f"y_test:\t{y_test.shape}")

    # persist the train outputs
    logger.info("persist the train outputs")
    X_train.to_parquet(output_data_X_train)
    y_train.to_parquet(output_data_y_train)

    # persist the test outputs
    logger.info("persist the test outputs")
    X_test.to_parquet(output_data_X_test)
    y_test.to_parquet(output_data_y_test)

    # persist fit preprocessor
    logger.info("persist fit preprocessor")
    joblib.dump(data_prep, preprocessor)

    # persist the class labels
    logger.info("persist class labels")
    label_classes = {
        0: "absence of heart disease",
        1: "presence of heart disease",
    }
    joblib.dump(label_classes, labels)
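
An illustrative call of the training-side preprocessing step, with hypothetical datastore-relative input and local output paths:

preprocess_input_data(
    input_file_path="heart_disease/raw/heart.csv",  # hypothetical path on the default datastore
    preprocessor="outputs/preprocessor.pkl",
    labels="outputs/labels.pkl",
    output_data_X_train="outputs/X_train.parquet",
    output_data_y_train="outputs/y_train.parquet",
    output_data_X_test="outputs/X_test.parquet",
    output_data_y_test="outputs/y_test.parquet",
)
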
def preprocess_input_data(input_file_path: str, transformed_data_path: str,
                          original_data_path: str):
    """Preprocess the dataset making it ready for inference. The preprocessor is loaded
    from model assets and it's transformation is applied to the dataset so it could be
    ready to input the model.

    Args:
        input_file_path (str): Path to the input file to be loaded.
        transformed_data_path (str): Path to persist transformed data.
        original_data_path (str): Path to persist original data.
    """
    # get datastore
    aml_helper = AmlCustomHelper()
    datastore = aml_helper.ws.get_default_datastore()

    # load dataset
    logger.info(f"Loading data..")
    # set column data types
    data_types = {
        "age": DataType.to_long(),
        "sex": DataType.to_string(),
        "cp": DataType.to_string(),
        "trestbps": DataType.to_long(),
        "chol": DataType.to_long(),
        "fbs": DataType.to_string(),
        "restecg": DataType.to_string(),
        "thalach": DataType.to_long(),
        "exang": DataType.to_string(),
        "oldpeak": DataType.to_float(),
        "slope": DataType.to_string(),
        "ca": DataType.to_string(),
        "thal": DataType.to_string(),
    }

    # Create a TabularDataset to represent tabular data in parquet files
    df_data = Dataset.Tabular.from_parquet_files(
        path=[(datastore, input_file_path)],
        set_column_types=data_types).to_pandas_dataframe()

    logger.info(f"Loaded data shape:\t{df_data.shape}")
    logger.info(f"Loaded data info:\t{df_data.info()}")
    logger.info(f"Loaded data first rows:\t{df_data.head(5)}")

    # transform dataset
    transformed_data = transform_infer_data(df_data, MODEL_NAME,
                                            PREPROCESSOR_NAME)

    logger.info(f"Preprocessed data shape:\t{transformed_data.shape}")
    logger.info(f"Preprocessed data info:\t{transformed_data.info()}")
    logger.info(f"Preprocessed data first rows:\t{transformed_data.head(5)}")

    # save transformed dataset to parquet
    logger.info("Persist the transformed dataset")
    transformed_data.to_parquet(transformed_data_path)

    # save original data
    logger.info("Persist the original dataset")
    df_data.to_parquet(original_data_path)
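
Putting the inference-side steps together, a hypothetical local driver could run them in sequence (in the source these functions run as separate AzureML pipeline steps, and all paths below are assumptions):

validate_input_data("heart_disease/raw/new_patients.parquet")
preprocess_input_data(
    input_file_path="heart_disease/raw/new_patients.parquet",
    transformed_data_path="outputs/transformed.parquet",
    original_data_path="outputs/original.parquet",
)
make_predictions(
    transformed_data_path="outputs/transformed.parquet",
    original_data_path="outputs/original.parquet",
    inference_path="outputs/inference.csv",
)
upload_results_to_azureml_datastore(
    predictions="outputs/inference.csv",
    output_file_path="heart_disease/predictions",
)
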