Example #1
def test_model_backcompat_local(mimic_explainer):
    class DummyModel:
        def predict(self, X):
            return X['TotalBalance']

    dummy_model = DummyModel()
    model_file = 'old_mimic_model2.json'
    if not path.exists(model_file):
        model_file = path.join('test', model_file)
    with open(model_file, 'r') as file:
        data = file.read()
    properties = json.loads(data)
    explainer = mimic_explainer._load(dummy_model, properties)
    eval_data = retrieve_dataset('backcompat_data.csv')
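    # Pad the evaluation data with random integer columns so its width matches
    # the 674 features the serialized surrogate model expects; only the last
    # 5 original columns are kept.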
    df = pd.DataFrame(
        np.random.randint(0,
                          eval_data.shape[0],
                          size=(eval_data.shape[0], 674 - 5)))
    eval_data = eval_data[eval_data.columns[-5:]]
    eval_data = pd.concat([df, eval_data], axis=1)
    local_explanation = explainer.explain_local(eval_data)
    assert (local_explanation._local_importance_values.shape[1] ==
            explainer.surrogate_model.model._n_features)
    global_explanation = explainer.explain_global(eval_data)
    assert (len(global_explanation.global_importance_values) ==
            explainer.surrogate_model.model._n_features)
Example #2
    def test_get_local_raw_explanations_sparse_regression(
            self, mimic_explainer):
        X, y = retrieve_dataset('a1a.svmlight')
        x_train, x_test, y_train, _ = train_test_split(X,
                                                       y,
                                                       test_size=0.2,
                                                       random_state=7)
        # Fit a linear regression model
        model = create_sklearn_linear_regressor(x_train, y_train)

        explainer = mimic_explainer(
            model,
            x_train,
            LinearExplainableModel,
            explainable_model_args={'sparse_data': True})
        global_explanation = explainer.explain_global(x_test)
        assert global_explanation.method == LINEAR_METHOD

        num_engineered_feats = x_train.shape[1]
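        # Identity-style map: the 5 raw features map one-to-one onto the
        # first 5 engineered features.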
        feature_map = np.eye(5, num_engineered_feats)

        global_raw_explanation = global_explanation.get_raw_explanation(
            [feature_map])
        self.validate_global_raw_explanation_regression(
            global_explanation, global_raw_explanation, feature_map)
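The identity map above works because the sparse features are not expanded by any transformation. When a transformation such as one-hot encoding does expand a raw feature into several engineered columns, the feature map becomes a 0/1 block matrix instead; a minimal sketch with hypothetical column counts (not taken from the test suite):

import numpy as np

# Hypothetical layout: raw feature 0 stays a single column, raw feature 1
# expands into three one-hot columns, raw feature 2 stays a single column.
num_raw, num_engineered = 3, 5
feature_map = np.zeros((num_raw, num_engineered))
feature_map[0, 0] = 1      # raw 0 -> engineered 0
feature_map[1, 1:4] = 1    # raw 1 -> engineered 1, 2, 3 (one-hot block)
feature_map[2, 4] = 1      # raw 2 -> engineered 4

Each row corresponds to a raw feature and each column to an engineered feature, matching the orientation of the np.eye map used above.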
Example #3
 def create_msx_data(self, test_size):
     sparse_matrix = retrieve_dataset('msx_transformed_2226.npz')
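     # All but the last two columns are features; the second-to-last column
     # is the regression target.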
     num_cols = sparse_matrix.shape[1]
     sparse_matrix_x = sparse_matrix[:, :num_cols - 2]
     sparse_matrix_y = sparse_matrix[:, num_cols - 2:num_cols - 1]
     return train_test_split(sparse_matrix_x,
                             sparse_matrix_y,
                             test_size=test_size,
                             random_state=7)
Example #4
def create_reviews_data(test_size):
    reviews_data = retrieve_dataset('reviews.json')
    papers = reviews_data['paper']
    reviews = []
    evaluation = []
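    # Keep only papers with at least one review; use the first review's text
    # as the input and its evaluation score as the label.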
    for paper in papers:
        if paper['review'] is None or not paper['review']:
            continue
        reviews.append(paper['review'][0]['text'])
        evaluation.append(paper['review'][0]['evaluation'])
    return train_test_split(reviews, evaluation, test_size=test_size, random_state=7)
Example #5
def create_cancer_data():
    # Import cancer dataset
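    # '?' entries are read as NaN, filled by interpolation, then the frame is
    # cast to int64.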
    cancer = retrieve_dataset('breast-cancer.train.csv', na_values='?').interpolate().astype('int64')
    cancer_target = cancer.iloc[:, 0]
    cancer_data = cancer.iloc[:, 1:]
    feature_names = cancer_data.columns.values
    target_names = ['no_cancer', 'cancer']
    # Split data into train and test
    x_train, x_test, y_train, y_validation = train_test_split(cancer_data, cancer_target,
                                                              test_size=0.2, random_state=0)
    return x_train, x_test, y_train, y_validation, feature_names, target_names
Example #6
def create_energy_data():
    # Import energy data
    energy_data = retrieve_dataset('energyefficiency2012_data.train.csv')
    # Get the Y1 column
    target = energy_data.iloc[:, len(energy_data.columns) - 2]
    energy_data = energy_data.iloc[:, :len(energy_data.columns) - 3]
    feature_names = energy_data.columns.values
    # Split data into train and test
    x_train, x_test, y_train, y_validation = train_test_split(energy_data, target,
                                                              test_size=0.2, random_state=0)
    return x_train, x_test, y_train, y_validation, feature_names
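Each of these helpers feeds the same downstream pattern: split the data, fit a model, then hand both to an explainer. A minimal sketch of that pattern (the random-forest helper and the explainer fixture call are illustrative assumptions, not code from the suite above):

x_train, x_test, y_train, y_validation, feature_names, target_names = create_cancer_data()
# Hypothetical model helper; any sklearn classifier fit on x_train would do here.
model = create_sklearn_random_forest_classifier(x_train, y_train)
explainer = tabular_explainer(model, x_train,
                              features=feature_names,
                              classes=target_names)
global_explanation = explainer.explain_global(x_test)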
Example #7
 def test_raw_timestamp_explanation(self, mimic_explainer):
     df = retrieve_dataset(
         'insurance_claims.csv',
         na_values='?',
         parse_dates=['policy_bind_date', 'incident_date'])
     label = 'fraud_reported'
     df_y = df[label]
     df_X = df.drop(columns=label)
     x_train, x_test, y_train, y_test = train_test_split(df_X,
                                                         df_y,
                                                         test_size=0.2,
                                                         random_state=7)
     str_cols = df_X.select_dtypes(
         exclude=[np.number, np.datetime64]).columns.tolist()
     dt_cols = df_X.select_dtypes(include=[np.datetime64]).columns.tolist()
     numeric_cols = df_X.select_dtypes(include=[np.number]).columns.tolist()
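     # Build per-column pipelines: impute + one-hot encode string columns,
     # impute + scale numeric columns, and scale datetime columns directly.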
     transforms_list = []
     for str_col in str_cols:
         transforms_list.append(
             (str_col,
              Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                              ('ohe', OneHotEncoder(sparse=False))]),
              [str_col]))
     for numeric_col in numeric_cols:
         transforms_list.append(
             (numeric_col,
              Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                              ('scaler', StandardScaler())]),
              [numeric_col]))
     for dt_col in dt_cols:
         transforms_list.append(
             (dt_col, Pipeline(steps=[('scaler', StandardScaler())]),
              [dt_col]))
     transformations = ColumnTransformer(transforms_list)
     x_train_transformed = transformations.fit_transform(x_train)
     model = create_lightgbm_classifier(x_train_transformed, y_train)
     model_task = ModelTask.Classification
     features = df_X.columns.tolist()
     explainer = mimic_explainer(model,
                                 x_train,
                                 LGBMExplainableModel,
                                 transformations=transformations,
                                 features=features,
                                 model_task=model_task)
     explanation = explainer.explain_global(x_train)
     dashboard_pipeline = Pipeline(steps=[('preprocess', transformations),
                                          ('model', model)])
     ExplanationDashboard(explanation,
                          dashboard_pipeline,
                          datasetX=x_train,
                          trueY=y_train)
Example #8
    def test_explain_model_sparse_tree(self, tabular_explainer):
        X, y = retrieve_dataset('a1a.svmlight')
        x_train, x_test, y_train, _ = train_test_split(X, y, test_size=0.002, random_state=7)
        # Fit a random forest regression model
        model = create_sklearn_random_forest_regressor(x_train, y_train)
        _, cols = x_train.shape
        shape = 1, cols
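        # An all-zeros sparse row serves as the background (reference) data
        # for the explainer.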
        background = csr_matrix(shape, dtype=x_train.dtype)

        # Create tabular explainer
        exp = tabular_explainer(model, background)
        test_logger.info('Running explain global for test_explain_model_sparse_tree')
        policy = SamplingPolicy(allow_eval_sampling=True)
        exp.explain_global(x_test, sampling_policy=policy)
Example #9
    def test_explain_model_string_classes(self, mimic_explainer):
        adult_census_income = retrieve_dataset('AdultCensusIncome.csv',
                                               skipinitialspace=True)
        X = adult_census_income.drop(['income'], axis=1)
        y = adult_census_income[['income']]
        features = X.columns.values.tolist()
        classes = y['income'].unique().tolist()
        pipe_cfg = {
            'num_cols': X.dtypes[X.dtypes == 'int64'].index.values.tolist(),
            'cat_cols': X.dtypes[X.dtypes == 'object'].index.values.tolist(),
        }
        num_pipe = Pipeline([('num_imputer', SimpleImputer(strategy='median')),
                             ('num_scaler', StandardScaler())])
        cat_pipe = Pipeline([
            ('cat_imputer', SimpleImputer(strategy='constant',
                                          fill_value='?')),
            ('cat_encoder', OneHotEncoder(handle_unknown='ignore',
                                          sparse=False))
        ])
        feat_pipe = ColumnTransformer([
            ('num_pipe', num_pipe, pipe_cfg['num_cols']),
            ('cat_pipe', cat_pipe, pipe_cfg['cat_cols'])
        ])
        X_train = X.copy()
        y_train = y.copy()
        X_train.reset_index(drop=True, inplace=True)
        y_train.reset_index(drop=True, inplace=True)
        X_train = feat_pipe.fit_transform(X_train)
        model = SGDClassifier()
        model = model.fit(X_train, y_train['income'])
        model_task = ModelTask.Classification
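        # augment_data oversamples the initialization examples, which helps the
        # surrogate fit the teacher model when the background set is small.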
        explainer = mimic_explainer(model,
                                    X.iloc[:1000],
                                    LinearExplainableModel,
                                    augment_data=True,
                                    max_num_of_augmentations=10,
                                    features=features,
                                    classes=classes,
                                    model_task=model_task,
                                    transformations=feat_pipe)
        global_explanation = explainer.explain_global(X.iloc[:1000])
        assert global_explanation.method == LINEAR_METHOD

        self._verify_predictions_and_replication_metric(
            explainer, X.iloc[:1000])
Example #10
 def test_explain_model_imbalanced_classes(self, mimic_explainer):
     model = retrieve_model('unbalanced_model.pkl')
     x_train = retrieve_dataset('unbalanced_dataset.npz')
     model_predictions = model.predict(x_train)
     # Assert the model's predictions are skewed
     assert len(np.unique(model_predictions)) == 2
     explainable_model = LGBMExplainableModel
     explainer = mimic_explainer(model, x_train, explainable_model, max_num_of_augmentations=10)
     global_explanation = explainer.explain_global(x_train, include_local=True)
     # There should be an explanation per feature
     assert len(global_explanation.global_importance_values) == 1585
     # We should get back an explanation for each class
     assert len(global_explanation.local_importance_values) == 3
     # Get the underlying multiclass model
     surrogate_predictions = explainer.surrogate_model.model.predict(x_train)
     assert len(np.unique(surrogate_predictions)) == 2
     assert len(np.unique(model_predictions)) == 2
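     # The surrogate should replicate the original model's predictions.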
     assert np.isclose(surrogate_predictions, model_predictions).all()
Example #11
def load_msx():
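    # Return features (all but the last two columns), the densified target
    # column, the dataset name, and an untrained estimator.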
    Z = retrieve_dataset('msx_transformed_2226.npz')

    return Z[:, :-2], Z[:, -2].toarray().flatten(), "msx", LinearRegression()
Example #12
    experiment = f'{args["dataset"]}_test' if args['name'] == '' else args['name']

    # Define the compute device (either GPU or CPU)
    compute_device = torch.device(args['gpu'] if torch.cuda.is_available() else 'cpu')

    # Set up a parameters object for saving hyperparameters, etc.
    parameters = parameters.Parameters(experiment, 'test', **args)
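    # The parameters pickled at training time overwrite the object built above.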
    with open(os.path.abspath(f'{args["network_dir"]}{experiment}_parameters.pkl'), 'rb') as f:
        parameters = pickle.load(f)

    # Create the data transforms for each respective set
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])

    # Retrieve the datasets
    _, val_dataset, test_dataset = retrieve_dataset(args['dataset'], args['image_dir'], transform, transform, test_equals_val=True)

    val_dataloader = DataLoader(val_dataset, batch_size=args['batch_size'], shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=args['batch_size'], shuffle=False)

    # Create the network, (potentially) load network state dictionary, and send the network to the compute device
    num_classes = val_dataset.num_classes()
    loader = retrieve_network(args['dataset'], args['network'])
    network = loader(num_classes=num_classes)
    network.load_state_dict(torch.load(os.path.abspath(f'{args["network_dir"]}{experiment}/{experiment}.pth'), map_location='cpu'))
    network.eval()

    # Send to GPU
    network = network.to(compute_device)

    # Get the batch size