Example #1
def shap_values(model, x_train, x_test, features=None, classes=None):
    """
    Provides feature importances to explain the model.

    Parameters:
    model: trained model
    x_train: input dataset used to train the model
    x_test: test dataset
    features: list of feature names. Optional
    classes: list of output class labels or names. Optional, used if doing classification

    Returns:
    explainer (object): explainer that computes the feature importances behind the model's predictions
    global_explanation (object): global feature importances over the evaluation dataset
    local_explanation (object): local feature importances for each evaluated example

    """
    explainer = TabularExplainer(model, x_train, features=features, classes=classes)

    # you can use the training data or the test data here
    global_explanation = explainer.explain_global(x_test)

    # explain local feature importances for the examples in the test set
    local_explanation = explainer.explain_local(x_test)

    return explainer, global_explanation, local_explanation
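
A minimal usage sketch for this helper, assuming a scikit-learn classifier on the iris data (all names below are illustrative, not part of the original example):

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from interpret.ext.blackbox import TabularExplainer

iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=0)
model = RandomForestClassifier().fit(x_train, y_train)

explainer, global_explanation, local_explanation = shap_values(
    model, x_train, x_test,
    features=iris.feature_names, classes=list(iris.target_names))
print(global_explanation.get_feature_importance_dict())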
Example #2
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--C', type=float, default=1.0, help="Inverse of regularization strength. Smaller values cause stronger regularization")
    parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge")

    args = parser.parse_args()

    run.log("Regularization Strength:", np.float(args.C))
    run.log("Max iterations:", np.int(args.max_iter))

    model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train)

    accuracy = model.score(x_test, y_test)
    run.log("Accuracy", np.float(accuracy))
    
    os.makedirs('outputs', exist_ok=True)
    # files saved in the "outputs" folder are automatically uploaded into run history
    joblib.dump(model, 'outputs/modelht.pkl')
    
    # model file name
    model_file_name = 'modelht.pkl'
    
    # register the model
    run.upload_file('original_model.pkl', os.path.join('./outputs/', model_file_name))
    original_model = run.register_model(model_name='model_explain',
                                        model_path='original_model.pkl')

    # Explain predictions on your local machine
    tabular_explainer = TabularExplainer(model, x_train, features=feature_names)
    global_explanation = tabular_explainer.explain_global(x_test)

    # The explanation can then be downloaded on any compute
    comment = 'Global explanation on regression model trained on bank marketing campaign dataset'
    client.upload_model_explanation(global_explanation, comment=comment, model_id=original_model.id)    
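
run, client, and the training data are assumed to exist at module scope in this example. A minimal setup sketch, assuming an Azure ML training script (the data variables are placeholders):

import argparse
import os

import joblib
from azureml.core.run import Run
from azureml.interpret import ExplanationClient
from interpret.ext.blackbox import TabularExplainer
from sklearn.linear_model import LogisticRegression

run = Run.get_context()
client = ExplanationClient.from_run(run)
# x_train, y_train, x_test, y_test and feature_names are assumed to be
# loaded from the run's input dataset before main() is called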
Example #3
def interpret_model(model, x_train, x_test, feature_names=None, classes=None):
    # Using SHAP TabularExplainer
    explainer = TabularExplainer(model,
                                 x_train,
                                 features=feature_names,
                                 classes=classes)
    # Generate global explanations
    global_explanation = explainer.explain_global(x_test)
    # Return the generated explanation dashboard
    return ExplanationDashboard(global_explanation, model, datasetX=x_test)
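
ExplanationDashboard is not imported in the snippet. A minimal sketch of the assumed import and call; note that raiwidgets names the parameter dataset, while the older interpret-community widget used datasetX as above:

from raiwidgets import ExplanationDashboard  # assumed package

dashboard = interpret_model(model, x_train, x_test,
                            feature_names=feature_names,
                            classes=classes)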
Example #4
def Global_Model_Explanation(model,
                             x_train,
                             x_test,
                             feature_names=None,
                             classes=None,
                             explanation_data=None):
    # Using SHAP TabularExplainer
    explainer = TabularExplainer(model,
                                 x_train,
                                 features=feature_names,
                                 classes=classes)
    # Generate global explanations
    if explanation_data == 'Training':
        global_explanation = explainer.explain_global(x_train)
    else:
        global_explanation = explainer.explain_global(x_test)
    # print the global importance rank data
    print('global importance rank: {}'.format(
        global_explanation.get_feature_importance_dict()))
    # Return the generated global explanation
    return global_explanation
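
A hedged usage sketch for the helper above; passing 'Training' explains on the training split, any other value falls through to the test split:

global_explanation = Global_Model_Explanation(model, x_train, x_test,
                                              feature_names=feature_names,
                                              classes=classes,
                                              explanation_data='Training')
sorted_values = global_explanation.get_ranked_global_values()
sorted_names = global_explanation.get_ranked_global_names()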
Example #5
def interpret_global(model,
                     train,
                     test,
                     features=None,
                     classes=None,
                     local=False,
                     task=None):
    # explain predictions on your local machine
    # "features" and "classes" fields are optional
    explainer = TabularExplainer(model,
                                 train,
                                 features=features,
                                 classes=classes,
                                 model_task=task)

    # explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(test)

    # the returned global explanation can be uploaded for storage or
    # visualization in webUX and downloaded on any compute;
    # multiple explanations can be uploaded per run
    return global_explanation
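
A hedged usage sketch; the ModelTask constants and their import path are an assumption about the installed interpret-community version:

from interpret_community.common.constants import ModelTask  # assumed path

global_explanation = interpret_global(model, x_train, x_test,
                                      features=feature_names,
                                      classes=classes,
                                      task=ModelTask.Classification)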
Example #6
model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)
# save model in the outputs folder so it automatically gets uploaded
joblib.dump(value=reg, filename=os.path.join(OUTPUT_DIR, model_file_name))

# register the model
run.upload_file('original_model.pkl',
                os.path.join('./outputs/', model_file_name))
original_model = run.register_model(
    model_name='model_explain_model_on_amlcomp',
    model_path='original_model.pkl')

# Explain predictions on your local machine
tabular_explainer = TabularExplainer(model,
                                     X_train,
                                     features=boston_data.feature_names)

# Explain overall model predictions (global explanation)
# Passing in the test dataset for evaluation examples - note it must be a
# representative sample of the original data
# x_train can be passed as well; with more examples the explanation takes
# longer to compute, though it may be more accurate
global_explanation = tabular_explainer.explain_global(X_test)

# Uploading model explanation data for storage or visualization in webUX
# The explanation can then be downloaded on any compute
comment = 'Global explanation on regression model trained on boston dataset'
client.upload_model_explanation(global_explanation,
                                comment=comment,
                                model_id=original_model.id)
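
client is not defined in the snippet. A minimal sketch of creating it and pulling the uploaded explanation back down on any compute, assuming the azureml-interpret package:

from azureml.interpret import ExplanationClient

client = ExplanationClient.from_run(run)
downloaded_explanation = client.download_model_explanation()
print(downloaded_explanation.get_feature_importance_dict())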
Example #7
def model_train(df):
    run = Run.get_context()

    df.drop("Sno", axis=1, inplace=True)

    y_raw = df['Risk']
    X_raw = df.drop('Risk', axis=1)

    categorical_features = X_raw.select_dtypes(include=['object']).columns
    numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value="missing")),
               ('onehotencoder',
                OneHotEncoder(categories='auto', sparse=False))])

    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    feature_engineering_pipeline = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)
        ],
        remainder="drop")

    # Encode Labels
    le = LabelEncoder()
    encoded_y = le.fit_transform(y_raw)

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X_raw,
                                                        encoded_y,
                                                        test_size=0.20,
                                                        stratify=encoded_y,
                                                        random_state=42)

    # Create sklearn pipeline
    lr_clf = Pipeline(steps=[('preprocessor', feature_engineering_pipeline),
                             ('classifier', LogisticRegression(solver="lbfgs"))])
    # Train the model
    lr_clf.fit(X_train, y_train)

    # Capture metrics
    train_acc = lr_clf.score(X_train, y_train)
    test_acc = lr_clf.score(X_test, y_test)
    print("Training accuracy: %.3f" % train_acc)
    print("Testing accuracy: %.3f" % test_acc)

    # Log to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)

    # Explain model
    explainer = TabularExplainer(lr_clf.steps[-1][1],
                                 initialization_examples=X_train,
                                 features=X_raw.columns,
                                 classes=["Good", "Bad"],
                                 transformations=feature_engineering_pipeline)

    # explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(X_test)

    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))

    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment='Global Explanation: All Features')

    return lr_clf
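
Because transformations=feature_engineering_pipeline is passed to the explainer, importances are reported against the raw input columns rather than the one-hot encoded features. A hedged usage sketch (the CSV path is a placeholder):

import pandas as pd

df = pd.read_csv('german_credit_data.csv')  # hypothetical input file
trained_clf = model_train(df)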
Example #8
# preds = reg.predict(X_test)
run.log('C', C)
run.log('gamma', gamma)


model_file_name = 'svc.pkl'
# save model in the outputs folder so it automatically gets uploaded
joblib.dump(value=reg, filename=os.path.join(OUTPUT_DIR, model_file_name))

# register the model
run.upload_file('original_model.pkl', os.path.join('./outputs/', model_file_name))
original_model = run.register_model(model_name='model_explain_model_on_amlcomp',
                                    model_path='original_model.pkl')

# Explain predictions on your local machine
tabular_explainer = TabularExplainer(model, X_train.to_pandas(), features=X_train.columns, use_gpu=True)

# Explain overall model predictions (global explanation)
# Passing in the test dataset for evaluation examples - note it must be a
# representative sample of the original data
# x_train can be passed as well; with more examples the explanation takes
# longer to compute, though it may be more accurate
global_explanation = tabular_explainer.explain_global(X_test.to_pandas()[:50])

# Uploading model explanation data for storage or visualization in webUX
# The explanation can then be downloaded on any compute
comment = 'Global explanation on SVC model'
client.upload_model_explanation(global_explanation, comment=comment, model_id=original_model.id)
Example #9
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="diabetes_model.pkl",
    )

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))

    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"))

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"))

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if dataset_name:
        if data_file_path == 'none':
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name,
                                          dataset_version)  # NOQA: E402, E501
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df0 = dataset.to_pandas_dataframe()
    df = prepare_data(df0)
    data = split_data(df)

    # Train the model
    model = train_model(data, train_args)
    explainer = TabularExplainer(model,
                                 data["train"]["X"],
                                 features=df0.drop(['car name', 'mpg'],
                                                   axis=1).columns)
    global_explanation = explainer.explain_global(data["test"]["X"])
    client = ExplanationClient.from_run(run)
    client.upload_model_explanation(global_explanation,
                                    comment='MPG Prediction Explanation')

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=model, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #10
# assumes the iris dataset has been loaded, e.g. iris = load_iris()
X = iris['data']
y = iris['target']
classes = iris['target_names']
feature_names = iris['feature_names']

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

clf = svm.SVC(gamma=0.001, C=100., probability=True)
model = clf.fit(x_train, y_train)

explainer = TabularExplainer(model,
                             x_train,
                             features=feature_names,
                             classes=classes)

global_explanation = explainer.explain_global(x_test)


instance_num = 0
local_explanation = explainer.explain_local(x_test[instance_num, :])

prediction_value = clf.predict(x_test)[instance_num]

sorted_local_importance_values = local_explanation.get_ranked_local_values()[
    prediction_value]
sorted_local_importance_names = local_explanation.get_ranked_local_names()[
    prediction_value]


ExplanationDashboard(global_explanation, model, dataset=x_test, true_y=y_test)
ModelPerformanceDashboard(model, dataset=x_test, true_y=y_test)
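
None of the imports appear in this snippet; a minimal sketch of what it assumes (raiwidgets is an assumption - older versions shipped the dashboards in interpret_community.widget):

from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from interpret.ext.blackbox import TabularExplainer
from raiwidgets import ExplanationDashboard, ModelPerformanceDashboard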
Example #11
# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
run.log('Accuracy', float(acc))

# calculate AUC
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
run.log('AUC', float(auc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/diabetes.pkl')

# Get explanation
explainer = TabularExplainer(model, X_train, features=features, classes=labels)
explanation = explainer.explain_global(X_test)

# Get an Explanation Client and upload the explanation
explain_client = ExplanationClient.from_run(run)
explain_client.upload_model_explanation(explanation,
                                        comment='Tabular Explanation')

# Complete the run
run.complete()
Example #12
# Evaluate the RFC model
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_predict)
score = rfc.score(X_test, Y_test)

# explainability

from interpret.ext.blackbox import TabularExplainer

classes = ['Not Greater than 50k', "Greater than 50k"]
features = list(X.columns)

tab_explainer = TabularExplainer(rfc,
                                 X_train,
                                 features=features,
                                 classes=classes)

# global explanation
global_explanation = tab_explainer.explain_global(X_train)

global_fi = global_explanation.get_feature_importance_dict()

# local explanation

X_explain = X_test[:5]

local_explanation = tab_explainer.explain_local(X_explain)

local_features = local_explanation.get_ranked_local_names()
local_importance = local_explanation.get_ranked_local_values()
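
A hedged sketch of reading the ranked local output; for a classifier the nested lists are indexed [class][instance][rank]:

# print the top 3 features for class index 1 for each explained instance
for i in range(len(X_explain)):
    top_names = local_features[1][i][:3]
    top_values = local_importance[1][i][:3]
    print('instance', i, list(zip(top_names, top_values)))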
run.log("Test R2 Score", test_r2)
run.log("RMSE", rmse)

print("Saving the model to outputs ...")

model_file_name = 'gbr_tickfund.pkl'
joblib.dump(value=gbr, filename='outputs/model.pkl')

joblib.dump(value=gbr, filename=os.path.join(OUTPUT_DIR, model_file_name))
# register the model
run.upload_file('dev_model.pkl', os.path.join('./outputs/', model_file_name))
original_model = run.register_model(model_name='gbr_model_train_msft',
                                    model_path='dev_model.pkl')

# Explain predictions on your local machine
tabular_explainer = TabularExplainer(gbr, train_features, features=df.columns)

# Explain overall model predictions (global explanation)
# Passing in the test dataset for evaluation examples - note it must be a
# representative sample of the original data
# x_train can be passed as well; with more examples the explanation takes
# longer to compute, though it may be more accurate
global_explanation = tabular_explainer.explain_global(test_features)

# Uploading model explanation data for storage or visualization in webUX
# The explanation can then be downloaded on any compute
comment = 'Global explanation on regression model trained on ticker fund dataset'
client.upload_model_explanation(global_explanation,
                                comment=comment,
                                model_id=original_model.id)
Example #14
# PFIExplainer
from interpret.ext.blackbox import PFIExplainer

pfi_explainer = PFIExplainer(model = loan_model,
                             features=['loan_amount','income','age','marital_status'],
                             classes=['reject', 'approve'])
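
mim_explainer and tab_explainer are used below but never constructed in this snippet. A hedged sketch of what their construction typically looks like, assuming the interpret-community blackbox API and the same loan_model:

from interpret.ext.blackbox import MimicExplainer, TabularExplainer
from interpret.ext.glassbox import DecisionTreeExplainableModel

mim_explainer = MimicExplainer(model=loan_model,
                               initialization_examples=X_train,
                               explainable_model=DecisionTreeExplainableModel,
                               features=['loan_amount', 'income', 'age', 'marital_status'],
                               classes=['reject', 'approve'])

tab_explainer = TabularExplainer(model=loan_model,
                                 initialization_examples=X_train,
                                 features=['loan_amount', 'income', 'age', 'marital_status'],
                                 classes=['reject', 'approve'])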



# MimicExplainer
global_mim_explanation = mim_explainer.explain_global(X_train)
global_mim_feature_importance = global_mim_explanation.get_feature_importance_dict()


# TabularExplainer
global_tab_explanation = tab_explainer.explain_global(X_train)
global_tab_feature_importance = global_tab_explanation.get_feature_importance_dict()


# PFIExplainer
global_pfi_explanation = pfi_explainer.explain_global(X_train, true_labels=y_train)
global_pfi_feature_importance = global_pfi_explanation.get_feature_importance_dict()


# MimicExplainer
local_mim_explanation = mim_explainer.explain_local(X_test[0:5])
local_mim_features = local_mim_explanation.get_ranked_local_names()
local_mim_importance = local_mim_explanation.get_ranked_local_values()


# TabularExplainer
local_tab_explanation = tab_explainer.explain_local(X_test[0:5])
local_tab_features = local_tab_explanation.get_ranked_local_names()
local_tab_importance = local_tab_explanation.get_ranked_local_values()
Example #15
def train_model(df, target):
    # Creating dummy columns for each categorical feature
    categorical = []
    for col, value in df.items():
        if value.dtype == 'object':
            categorical.append(col)
    # Store the numerical columns in a list numerical
    numerical = df.columns.difference(categorical)
    numeric_transformations = [
        ([f], Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                              ('scaler', StandardScaler())]))
        for f in numerical
    ]
    categorical_transformations = [([f],
                                    OneHotEncoder(handle_unknown='ignore',
                                                  sparse=False))
                                   for f in categorical]
    transformations = numeric_transformations + categorical_transformations
    # Append classifier to preprocessing pipeline
    clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)),
                          ('classifier', LogisticRegression(solver='lbfgs'))])
    # Split data into train and test
    x_train, x_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.35,
                                                        random_state=0,
                                                        stratify=target)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(classification_report(y_test, y_pred))
    accu = accuracy_score(y_test, y_pred)
    model_file_name = 'classifier.pkl'
    # save model in the outputs folder so it automatically get uploaded
    with open(model_file_name, 'wb') as file:
        joblib.dump(value=clf,
                    filename=os.path.join('./outputs/', model_file_name))
    run = Run.get_context()
    run.log("accuracy", accu)
    # upload the model into the experiment artifact store, but do not register
    # it as a model until unit tests pass successfully in the next ML step
    run.upload_file(model_file_name, os.path.join('./outputs/',
                                                  model_file_name))
    # Interpret steps
    client = ExplanationClient.from_run(run)
    # Using SHAP TabularExplainer
    explainer = TabularExplainer(clf.steps[-1][1],
                                 initialization_examples=x_train,
                                 features=df.columns,
                                 classes=["Not leaving", "leaving"],
                                 transformations=transformations)
    # explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(x_test)
    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))
    # uploading global model explanation data for storage or visualization in webUX
    # the explanation can then be downloaded on any compute
    # multiple explanations can be uploaded
    client.upload_model_explanation(global_explanation,
                                    comment='global explanation: all features')
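
A hedged usage sketch for train_model (the input file and target column are placeholders):

import pandas as pd

df = pd.read_csv('employee_attrition.csv')  # hypothetical input file
target = df.pop('Attrition')                # hypothetical target column
train_model(df, target)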