def test_explain_model_pandas_string(self, tabular_explainer):
    """Explain a lightgbm model trained on a DataFrame with a string category column.

    Builds a small synthetic frame with three numeric columns and one
    pandas 'category' column, then checks the shapes of the local and
    global importance values produced by the explainer.

    :param tabular_explainer: Fixture providing the explainer class under test.
    """
    np.random.seed(777)
    num_rows = 100
    num_ints = 10
    num_cols = 4
    split_ratio = 0.2
    # Three numeric feature columns of different distributions
    A = np.random.randint(num_ints, size=num_rows)
    B = np.random.random(size=num_rows)
    C = np.random.randn(num_rows)
    # Use num_rows rather than a duplicated hard-coded 100 for the sample count
    cat = np.random.choice(['New York', 'San Francisco', 'Los Angeles', 'Atlanta',
                            'Denver', 'Chicago', 'Miami', 'DC', 'Boston'], num_rows)
    label = np.random.choice([0, 1], num_rows)
    df = pd.DataFrame(data={'A': A, 'B': B, 'C': C, 'cat': cat, 'label': label})
    # Column indexing instead of attribute assignment (df.cat = ...), which
    # pandas discourages because it can silently shadow DataFrame attributes
    df['cat'] = df['cat'].astype('category')
    X = df.drop('label', axis=1)
    y = df.label
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=split_ratio)
    clf = create_lightgbm_classifier(x_train, y_train)
    explainer = tabular_explainer(clf, initialization_examples=x_train,
                                  features=x_train.columns)
    global_explanation = explainer.explain_global(x_test)
    local_shape = global_explanation._local_importance_values.shape
    # Cast to int: shape tuples hold ints, and 0.2 * 100 is the float 20.0
    num_rows_expected = int(split_ratio * num_rows)
    # (num classes, num test rows, num features)
    assert local_shape == (2, num_rows_expected, num_cols)
    assert len(global_explanation.global_importance_values) == num_cols
    assert global_explanation.num_features == num_cols
def test_raw_timestamp_explanation(self, mimic_explainer):
    """Explain a model trained on data with raw timestamp columns.

    Splits the insurance-claims columns into string, numeric and datetime
    groups, builds a per-column ColumnTransformer, trains a lightgbm model
    on the transformed data, and renders the explanation in the dashboard.

    :param mimic_explainer: Fixture providing the mimic explainer class under test.
    """
    df = retrieve_dataset('insurance_claims.csv', na_values='?',
                          parse_dates=['policy_bind_date', 'incident_date'])
    label = 'fraud_reported'
    df_y = df[label]
    df_X = df.drop(columns=label)
    x_train, x_test, y_train, y_test = train_test_split(df_X, df_y,
                                                        test_size=0.2,
                                                        random_state=7)
    str_cols = df_X.select_dtypes(
        exclude=[np.number, np.datetime64]).columns.tolist()
    dt_cols = df_X.select_dtypes(include=[np.datetime64]).columns.tolist()
    numeric_cols = df_X.select_dtypes(include=[np.number]).columns.tolist()
    # One named transformer per column: impute+one-hot for strings,
    # impute+scale for numerics, scale only for datetimes.
    str_transforms = [
        (col,
         Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                         ('ohe', OneHotEncoder(sparse=False))]),
         [col])
        for col in str_cols]
    numeric_transforms = [
        (col,
         Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                         ('scaler', StandardScaler())]),
         [col])
        for col in numeric_cols]
    dt_transforms = [
        (col, Pipeline(steps=[('scaler', StandardScaler())]), [col])
        for col in dt_cols]
    transformations = ColumnTransformer(
        str_transforms + numeric_transforms + dt_transforms)
    x_train_transformed = transformations.fit_transform(x_train)
    model = create_lightgbm_classifier(x_train_transformed, y_train)
    explainer = mimic_explainer(model, x_train, LGBMExplainableModel,
                                transformations=transformations,
                                features=df_X.columns.tolist(),
                                model_task=ModelTask.Classification)
    explanation = explainer.explain_global(x_train)
    dashboard_pipeline = Pipeline(steps=[('preprocess', transformations),
                                         ('model', model)])
    ExplanationDashboard(explanation, dashboard_pipeline,
                         datasetX=x_train, trueY=y_train)
def test_explain_model_lightgbm_multiclass(self, tabular_explainer, iris):
    """Verify global explanation shapes for a multiclass lightgbm model on iris.

    :param tabular_explainer: Fixture providing the explainer class under test.
    :param iris: Fixture providing the iris train/test split, features and classes.
    """
    # Hoist repeated fixture lookups into locals
    x_train = iris[DatasetConstants.X_TRAIN]
    x_test = iris[DatasetConstants.X_TEST]
    classes = iris[DatasetConstants.CLASSES]
    # Fit a lightgbm model
    model = create_lightgbm_classifier(x_train, iris[DatasetConstants.Y_TRAIN])
    # Create tabular explainer
    exp = tabular_explainer(model, x_train,
                            features=iris[DatasetConstants.FEATURES],
                            classes=classes)
    test_logger.info('Running explain global for test_explain_model_lightgbm_multiclass')
    explanation = exp.explain_global(x_test)
    # One local importance row per test example, one block per class
    assert len(explanation.local_importance_values[0]) == len(x_test)
    assert explanation.num_examples == len(x_test)
    assert len(explanation.local_importance_values) == len(classes)
    assert explanation.num_classes == len(classes)
def test_explain_model_lightgbm_binary(self, tabular_explainer):
    """Verify global explanation shapes for a binary lightgbm model on the adult dataset.

    The unused ``iris`` fixture parameter was removed — this test loads
    ``shap.datasets.adult()`` and never touched the fixture, so pytest no
    longer needs to build it for this test.

    :param tabular_explainer: Fixture providing the explainer class under test.
    """
    X, y = shap.datasets.adult()
    x_train, x_test, y_train, _ = train_test_split(X, y, test_size=0.2,
                                                   random_state=7)
    # Fit a tree model
    model = create_lightgbm_classifier(x_train, y_train)
    classes = ["<50k", ">50k"]
    # Create local tabular explainer without run history
    exp = tabular_explainer(model, x_train, features=X.columns.values,
                            classes=classes)
    test_logger.info(
        'Running explain global for test_explain_model_lightgbm_binary')
    explanation = exp.explain_global(x_test)
    assert len(explanation.local_importance_values[0]) == len(x_test)
    # Mirror the num_examples / num_classes checks done in the multiclass test
    assert explanation.num_examples == len(x_test)
    assert len(explanation.local_importance_values) == len(classes)
    assert explanation.num_classes == len(classes)
def create_model(x_train, y_train):
    """Factory helper: train and return a lightgbm classifier on the given data.

    :param x_train: Training features.
    :param y_train: Training labels.
    :return: The fitted classifier produced by create_lightgbm_classifier.
    """
    classifier = create_lightgbm_classifier(x_train, y_train)
    return classifier