Code Example #1
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler


def model_train(ds_df, run):

    # Drop the serial-number column, which carries no predictive signal
    ds_df.drop("Sno", axis=1, inplace=True)

    # Separate the target label from the features
    y_raw = ds_df['Risk']
    X_raw = ds_df.drop('Risk', axis=1)

    # Detect categorical and numeric columns by dtype
    categorical_features = X_raw.select_dtypes(include=['object']).columns
    numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

    # Impute missing categorical values, then one-hot encode them
    # (sparse= was renamed sparse_output= in scikit-learn 1.2)
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
        ('onehotencoder', OneHotEncoder(categories='auto', sparse=False))])

    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

    feature_engineering_pipeline = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)],
        remainder="drop")

    # Encode Labels
    le = LabelEncoder()
    encoded_y = le.fit_transform(y_raw)

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X_raw,
                                                        encoded_y,
                                                        test_size=0.20,
                                                        stratify=encoded_y,
                                                        random_state=42)

    # Create sklearn pipeline
    lr_clf = Pipeline(steps=[
        ('preprocessor', feature_engineering_pipeline),
        ('classifier', LogisticRegression(solver="lbfgs"))])
    # Train the model
    lr_clf.fit(X_train, y_train)

    # Capture metrics
    train_acc = lr_clf.score(X_train, y_train)
    test_acc = lr_clf.score(X_test, y_test)
    print("Training accuracy: %.3f" % train_acc)
    print("Test data accuracy: %.3f" % test_acc)

    # Log to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)

    # Explain the model
    from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient
    from interpret.ext.blackbox import TabularExplainer

    client = ExplanationClient.from_run(run)

    # SHAP-based explainer over the raw features; the fitted preprocessing
    # pipeline is passed so explanations map back to the original columns
    explainer = TabularExplainer(lr_clf.steps[-1][1],
                                 initialization_examples=X_train,
                                 features=X_raw.columns,
                                 classes=["Good", "Bad"],
                                 transformations=feature_engineering_pipeline)

    # explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(X_test)

    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))

    client.upload_model_explanation(global_explanation,
                                    comment='global explanation: all features')

    return lr_clf
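
A hedged usage sketch: inside an Azure ML training script, the function above might be driven as follows. The CSV file name is an assumption for illustration; the original sample does not show how the data is loaded.

import pandas as pd
from azureml.core.run import Run

run = Run.get_context()
# Hypothetical input file; replace with the actual German credit dataset path
credit_df = pd.read_csv('german_credit_data.csv')
model = model_train(credit_df, run)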
Code Example #2
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from interpret.ext.blackbox import TabularExplainer
from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient
from azureml.core.run import Run
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
import os

OUTPUT_DIR = './outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Note: load_boston was removed in scikit-learn 1.2; this sample targets older releases
boston_data = datasets.load_boston()

run = Run.get_context()
client = ExplanationClient.from_run(run)

X_train, X_test, y_train, y_test = train_test_split(boston_data.data,
                                                    boston_data.target,
                                                    test_size=0.2,
                                                    random_state=0)
# Write X_test out as a pickle file for later visualization
x_test_pkl = 'x_test.pkl'
joblib.dump(value=X_test, filename=os.path.join(OUTPUT_DIR, x_test_pkl))
run.upload_file('x_test_boston_housing.pkl',
                os.path.join(OUTPUT_DIR, x_test_pkl))

alpha = 0.5
# Use the Ridge algorithm to create a regression model
reg = Ridge(alpha=alpha)
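
The snippet ends after constructing the estimator. A minimal continuation, sketched from the pattern in Code Example #1 (an assumption, not part of the original sample), would fit the model and upload a global explanation; no classes argument is needed for a regressor.

model = reg.fit(X_train, y_train)

# Explain the trained regressor with the SHAP-based TabularExplainer
explainer = TabularExplainer(model,
                             initialization_examples=X_train,
                             features=boston_data.feature_names)
global_explanation = explainer.explain_global(X_test)
client.upload_model_explanation(global_explanation,
                                comment='global explanation: all features')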
Code Example #3
import os

import joblib
from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient
from azureml.core.run import Run
from interpret.ext.blackbox import TabularExplainer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn_pandas import DataFrameMapper


def train_model(df, target):
    # Collect the names of the categorical (object-dtype) columns
    categorical = []
    for col, value in df.items():  # iteritems() was removed in pandas 2.0
        if value.dtype == 'object':
            categorical.append(col)
    # The remaining columns are numerical
    numerical = df.columns.difference(categorical)
    numeric_transformations = [
        ([f], Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                              ('scaler', StandardScaler())]))
        for f in numerical]
    categorical_transformations = [
        ([f], OneHotEncoder(handle_unknown='ignore', sparse=False))
        for f in categorical]
    transformations = numeric_transformations + categorical_transformations
    # Append the classifier to the preprocessing pipeline
    clf = Pipeline(steps=[
        ('preprocessor', DataFrameMapper(transformations)),
        ('classifier', LogisticRegression(solver='lbfgs'))])
    # Split data into train and test
    x_train, x_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        test_size=0.35,
                                                        random_state=0,
                                                        stratify=target)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(classification_report(y_test, y_pred))
    accu = accuracy_score(y_test, y_pred)
    model_file_name = 'classifier.pkl'
    # Save the model in the outputs folder so it is automatically uploaded
    os.makedirs('./outputs/', exist_ok=True)
    joblib.dump(value=clf,
                filename=os.path.join('./outputs/', model_file_name))
    run = Run.get_context()
    run.log("accuracy", accu)
    # Upload the model into the experiment artifact store, but do not register it
    # as a model until unit tests pass successfully in the next ML step
    run.upload_file(model_file_name, os.path.join('./outputs/',
                                                  model_file_name))
    # Interpretability steps
    client = ExplanationClient.from_run(run)
    # Using SHAP TabularExplainer
    explainer = TabularExplainer(clf.steps[-1][1],
                                 initialization_examples=x_train,
                                 features=df.columns,
                                 classes=["Not leaving", "leaving"],
                                 transformations=transformations)
    # explain overall model predictions (global explanation)
    global_explanation = explainer.explain_global(x_test)
    # Sorted SHAP values
    print('ranked global importance values: {}'.format(
        global_explanation.get_ranked_global_values()))
    # Corresponding feature names
    print('ranked global importance names: {}'.format(
        global_explanation.get_ranked_global_names()))
    # Feature ranks (based on original order of features)
    print('global importance rank: {}'.format(
        global_explanation.global_importance_rank))
    # uploading global model explanation data for storage or visualization in webUX
    # the explanation can then be downloaded on any compute
    # multiple explanations can be uploaded
    client.upload_model_explanation(global_explanation,
                                    comment='global explanation: all features')
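
For context, a minimal driver for this function might look like the sketch below. The CSV file name and target column are assumptions for illustration, not part of the original sample.

import pandas as pd

# Hypothetical employee-attrition dataset; adjust file name and target column
df = pd.read_csv('attrition.csv')
target = df.pop('Attrition')
train_model(df, target)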