def model_train(ds_df, run):
    """Train a logistic-regression classifier for the 'Risk' column and explain it.

    The "Sno" column is discarded, 'Risk' is used as the label and every other
    column is a candidate feature.  int64/float columns are standardised,
    object columns are imputed with the constant "missing" and one-hot
    encoded; columns of any other dtype are dropped by the ColumnTransformer.

    Side effects: prints and logs train/test accuracy to ``run`` and uploads a
    global model explanation for ``run``.

    Args:
        ds_df: DataFrame containing the "Sno" and 'Risk' columns plus features.
            It is not modified.
        run: Azure ML run used for metric logging and explanation upload.

    Returns:
        The fitted sklearn Pipeline (preprocessor + LogisticRegression).
    """
    # Drop into a new frame instead of in place so the caller's DataFrame is
    # left untouched and the function can be called again on the same input.
    data = ds_df.drop("Sno", axis=1)
    y_raw = data['Risk']
    X_raw = data.drop('Risk', axis=1)

    categorical_features = X_raw.select_dtypes(include=['object']).columns
    numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
        ('onehotencoder', OneHotEncoder(categories='auto', sparse=False)),
    ])
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    feature_engineering_pipeline = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features),
        ],
        remainder="drop")

    # Encode labels
    le = LabelEncoder()
    encoded_y = le.fit_transform(y_raw)

    # Train/test split (80/20, stratified on the encoded label)
    X_train, X_test, y_train, y_test = train_test_split(
        X_raw, encoded_y, test_size=0.20, stratify=encoded_y, random_state=42)

    # Create sklearn pipeline
    lr_clf = Pipeline(steps=[
        ('preprocessor', feature_engineering_pipeline),
        ('classifier', LogisticRegression(solver="lbfgs")),
    ])

    # Train the model
    lr_clf.fit(X_train, y_train)

    # Capture metrics
    train_acc = lr_clf.score(X_train, y_train)
    test_acc = lr_clf.score(X_test, y_test)
    print("Training accuracy: %.3f" % train_acc)
    print("Test data accuracy: %.3f" % test_acc)

    # Log to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)

    # Explain model.  The explanation packages are imported here, after
    # training and metric logging, exactly where the original imported them.
    from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient
    from interpret.ext.blackbox import TabularExplainer

    client = ExplanationClient.from_run(run)

    # LabelEncoder numbers the labels in sorted order, so the display names
    # must come from le.classes_ to line up with the classifier's outputs; a
    # hard-coded ["Good", "Bad"] list is reversed for good/bad labels.
    class_names = [str(label) for label in le.classes_]

    explainer = TabularExplainer(lr_clf.steps[-1][1],
                                 initialization_examples=X_train,
                                 features=X_raw.columns,
                                 classes=class_names,
                                 transformations=feature_engineering_pipeline)

    # Explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(X_test)

    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))

    client.upload_model_explanation(global_explanation,
                                    comment='global explanation: all features')

    return lr_clf
import os

import numpy as np
from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient
from azureml.core.run import Run
from interpret.ext.blackbox import TabularExplainer
# `datasets` is used below but was not imported in this part of the file.
from sklearn import datasets
from sklearn.externals import joblib
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Script: load the Boston housing data, persist the held-out features for
# later visualization, and create a Ridge regressor.
OUTPUT_DIR = './outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

boston_data = datasets.load_boston()

run = Run.get_context()
client = ExplanationClient.from_run(run)

X_train, X_test, y_train, y_test = train_test_split(boston_data.data,
                                                    boston_data.target,
                                                    test_size=0.2,
                                                    random_state=0)

# Write X_test out as a pickle file for later visualization.
# joblib.dump opens the target path itself; wrapping it in
# open(x_test_pkl, 'wb') only left an empty x_test.pkl in the working
# directory, so that wrapper is removed.
x_test_pkl = 'x_test.pkl'
joblib.dump(value=X_test, filename=os.path.join(OUTPUT_DIR, x_test_pkl))
run.upload_file('x_test_boston_housing.pkl', os.path.join(OUTPUT_DIR, x_test_pkl))

alpha = 0.5
# Use Ridge algorithm to create a regression model
reg = Ridge(alpha)
def train_model(df, target):
    """Fit a logistic-regression pipeline on ``df``, save it and explain it.

    Object-dtype columns are one-hot encoded; all remaining columns are
    median-imputed and standardised.  The data is split 65/35 (stratified on
    ``target``, random_state=0), the pipeline is fitted on the training part
    and evaluated on the test part.

    Side effects: prints a classification report, writes the fitted pipeline
    to ``./outputs/classifier.pkl``, logs "accuracy" and uploads the model
    file to the current run, and uploads a global model explanation.

    Args:
        df: DataFrame of input features.
        target: Labels aligned with ``df``; also used for stratification.

    Returns:
        None.
    """
    # Collect the categorical (object dtype) columns.  DataFrame.items() is
    # the equivalent of iteritems(), which was removed in pandas 2.0.
    categorical = [col for col, values in df.items()
                   if values.dtype == 'object']

    # Every other column is treated as numerical
    numerical = df.columns.difference(categorical)

    numeric_transformations = [
        ([f], Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())]))
        for f in numerical
    ]
    categorical_transformations = [
        ([f], OneHotEncoder(handle_unknown='ignore', sparse=False))
        for f in categorical
    ]
    transformations = numeric_transformations + categorical_transformations

    # Append classifier to preprocessing pipeline
    clf = Pipeline(steps=[
        ('preprocessor', DataFrameMapper(transformations)),
        ('classifier', LogisticRegression(solver='lbfgs')),
    ])

    # Split data into train and test
    x_train, x_test, y_train, y_test = train_test_split(
        df, target, test_size=0.35, random_state=0, stratify=target)

    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(classification_report(y_test, y_pred))
    accu = accuracy_score(y_test, y_pred)

    # Save the model in the outputs folder so it automatically gets uploaded.
    # The folder is created first because joblib.dump does not create it, and
    # joblib.dump is called without an open() wrapper: the wrapper only left
    # an empty classifier.pkl in the working directory.
    model_file_name = 'classifier.pkl'
    os.makedirs('./outputs/', exist_ok=True)
    model_path = os.path.join('./outputs/', model_file_name)
    joblib.dump(value=clf, filename=model_path)

    run = Run.get_context()
    run.log("accuracy", accu)

    # We upload the model into the experiment artifact store, but do not
    # register it as a model until unit tests have successfully passed in the
    # next ML step.
    run.upload_file(model_file_name, model_path)

    # Interpret steps
    client = ExplanationClient.from_run(run)

    # Using SHAP TabularExplainer
    explainer = TabularExplainer(clf.steps[-1][1],
                                 initialization_examples=x_train,
                                 features=df.columns,
                                 classes=["Not leaving", "leaving"],
                                 transformations=transformations)

    # Explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(x_test)

    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))

    # Upload the global model explanation for storage or visualization in the
    # web UX; it can then be downloaded on any compute, and multiple
    # explanations can be uploaded.
    client.upload_model_explanation(global_explanation,
                                    comment='global explanation: all features')