Example #1
def train_titanic_binary_classification(interactions, with_categorical=False):
    df = pd.read_csv(
        os.path.join('examples', 'titanic_train.csv'),
        #dtype= {
        #    'Age': np.float32,
        #    'Fare': np.float32,
        #    'Pclass': np.float32, # np.int
        #}
    )
    df = df.dropna()
    df['Old'] = df['Age'] > 65
    feature_types = ['continuous', 'continuous', 'continuous', 'continuous']
    feature_columns = ['Age', 'Fare', 'Pclass', 'Old']
    if with_categorical is True:
        feature_columns.append('Embarked')
        feature_types.append('categorical')
    label_column = "Survived"

    y = df[label_column]  # 1-D Series; LabelEncoder expects a 1-D array, not a DataFrame
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    x = df[feature_columns]
    x_train, x_test, y_train, y_test = train_test_split(x, y_enc)
    model = ExplainableBoostingClassifier(interactions=interactions,
                                          feature_types=feature_types)
    model.fit(x_train, y_train)

    return model, x_test, y_test
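A minimal usage sketch, assuming the Titanic CSV is present under examples/; these imports are what the function body above needs but the excerpt omits:

import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

# Train without interaction terms, then inspect the per-feature shape functions.
model, x_test, y_test = train_titanic_binary_classification(interactions=0)
show(model.explain_global())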
Example #2
def tune_ebm(X_train, y_train):
    reslist = []
    metric_idx = 1  # column where the mean CV score (average precision, not AUC) is stored
    for interac in [50, 100, 500]:
        clf = ExplainableBoostingClassifier(random_state=seed, interactions=interac)
        cv_results = cross_validate(clf, X_train, y_train, cv=3, scoring='average_precision')
        reslist.append((interac, np.mean(cv_results['test_score'])))
    print(*reslist, sep='\n')
    reslist = np.asarray(reslist)
    bestid = np.argmax(reslist[:, metric_idx])
    # np.asarray turned the interaction counts into floats; cast back to int
    clf = ExplainableBoostingClassifier(random_state=seed, interactions=int(reslist[bestid, 0]))
    clf = clf.fit(X_train, y_train)
    return clf
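A minimal usage sketch; `seed` is referenced but never defined in the excerpt, so it is assumed to be a module-level constant, and the synthetic dataset here is only for illustration:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from interpret.glassbox import ExplainableBoostingClassifier

seed = 42  # assumption: the original module defines its own seed
X_demo, y_demo = make_classification(n_samples=300, n_features=40, random_state=seed)
best_clf = tune_ebm(X_demo, y_demo)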
Example #3
def run_training_process():
    df = load_and_clean_data()
    X = df[train_cols].reset_index(drop=True)
    y = df["target"].to_numpy()

    clf = ExplainableBoostingClassifier()

    for tr, tst in StratifiedKFold(n_splits=3).split(X, y):
        print("Shape of train data: {:d}\nShape of test data: {:d}\n".format(
            len(tr), len(tst)))
        print(
            "Sum of labels in train: {:d}\nSum of labels in test: {:d}".format(
                y[tr].sum(), y[tst].sum()))

        clf.fit(X.loc[tr], y[tr])
        print("ROC AUC Score: {:4f}".format(
            roc_auc_score(y[tst],
                          clf.predict_proba(X.loc[tst])[:, 1])))

    clf.fit(X, y)

    with open("model_file", "bw") as file:
        pickle.dump(clf, file)

    df.to_csv("features_file.csv", index=False)
    df["preds"] = clf.predict_proba(X)[:, 1]

    df[[
        "inn",
        "preds",
        "target",
    ]].to_csv("score.csv", index=False)
Example #4
def EBM(X,
        Y,
        learning_rate=None,
        depth=None,
        estimators=None,
        holdout_split=None,
        seed=None):

    ### model & parameters
    ebm = ExplainableBoostingClassifier(random_state=seed)
    c_grid = {
        "n_estimators": estimators,
        "max_tree_splits": depth,
        "learning_rate": learning_rate,
        "holdout_split": holdout_split
    }

    c_grid = {k: v for k, v in c_grid.items() if v is not None}

    summary = nested_cross_validate(X=X,
                                    Y=Y,
                                    estimator=ebm,
                                    c_grid=c_grid,
                                    seed=seed)
    return summary
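A hedged usage sketch; nested_cross_validate is a project-local helper, so only the call shape of EBM itself is taken from the snippet, and the grid values are assumptions:

summary = EBM(X, Y,
              learning_rate=[0.01, 0.1],
              depth=[2, 4],
              estimators=[16, 32],
              holdout_split=[0.15],
              seed=42)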
Example #5
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        from interpret.glassbox import (
            ExplainableBoostingClassifier,
            ExplainableBoostingRegressor,
        )

        # HACK - EBM can't handle our custom logger with unknown level 9 (DATA)
        logging.root.level = 10  # i.e. logging.DEBUG

        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            model = ExplainableBoostingClassifier(**self.params)
        else:
            model = ExplainableBoostingRegressor(**self.params)

        # Replace missing values with a value smaller than all observed values.
        # (X is a datatable Frame here, hence the .names / .min1() / dt.f API.)
        self.min = dict()
        for col in X.names:
            XX = X[:, col]
            self.min[col] = XX.min1()
            if self.min[col] is None or np.isnan(self.min[col]):
                self.min[col] = -1e10
            else:
                self.min[col] -= 1
            XX.replace(None, self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()

        model.fit(X, y)
        importances = self.get_importances(model, X.shape[1])
        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=importances,
            iterations=self.params["n_estimators"],
        )
Example #6
def EBM(KY_x, KY_y, FL_x, FL_y, learning_rate, depth, estimators, seed):

    KY_validation = []
    KY_score = []
    FL_score = []
    auc_diff = []
    best_param = []
    KY_x = KY_x.drop(['person_id'], axis=1)
    FL_x = FL_x.drop(['person_id'], axis=1)

    ### model & parameters
    gam = ExplainableBoostingClassifier(random_state=seed)
    c_grid = {
        "n_estimators": estimators,
        "max_tree_splits": depth,
        "learning_rate": learning_rate
    }

    ## cross validation set up
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

    for outer_train, outer_test in outer_cv.split(KY_x, KY_y):

        ## split KY data -- outer training uses 4 of the 5 folds
        outer_train_x, outer_train_y = KY_x.iloc[outer_train], KY_y[outer_train]
        outer_test_x, outer_test_y = KY_x.iloc[outer_test], KY_y[outer_test]

        ### cross validation on 4 folds
        clf = GridSearchCV(estimator=gam,
                           param_grid=c_grid,
                           scoring='roc_auc',
                           cv=inner_cv,
                           return_train_score=True).fit(
                               outer_train_x, outer_train_y)

        train_score = clf.cv_results_['mean_train_score']
        test_score = clf.cv_results_['mean_test_score']

        ## save results
        KY_validation.append(clf.best_score_)
        auc_diff.append(train_score[np.where(
            test_score == clf.best_score_)[0][0]] - clf.best_score_)
        best_param.append(clf.best_params_)

        ## best model
        FL_score.append(roc_auc_score(FL_y, clf.predict_proba(FL_x)[:, 1]))
        KY_score.append(
            roc_auc_score(outer_test_y,
                          clf.predict_proba(outer_test_x)[:, 1]))

    return {
        'auc_diff': auc_diff,
        'best_param': best_param,
        'KY_validation': KY_validation,
        'KY_score': KY_score,
        'FL_score': FL_score
    }
Example #7
def train_bank_churners_multiclass_classification():
    df = pd.read_csv(os.path.join('examples', 'BankChurners.csv'))
    df = df.dropna()
    feature_types = ['continuous', 'continuous', 'categorical', 'continuous']
    feature_columns = [
        'Customer_Age', 'Dependent_count', 'Education_Level', 'Credit_Limit'
    ]
    label_column = "Income_Category"

    y = df[label_column]  # 1-D Series; LabelEncoder expects a 1-D array, not a DataFrame
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    x = df[feature_columns]
    x_train, x_test, y_train, y_test = train_test_split(x, y_enc)
    model = ExplainableBoostingClassifier(interactions=0,
                                          feature_types=feature_types)
    model.fit(x_train, y_train)

    return model, x_test, y_test
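A minimal usage sketch for the multiclass case; interpret's EBM handles multiclass natively, so predict_proba returns one column per encoded income category:

model, x_test, y_test = train_bank_churners_multiclass_classification()
probs = model.predict_proba(x_test)  # shape: (n_samples, n_classes)
accuracy = (model.predict(x_test) == y_test).mean()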
Example #8
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        from interpret.glassbox import (
            ExplainableBoostingClassifier,
            ExplainableBoostingRegressor,
        )

        # HACK - EBM can't handle our custom logger with unknown level 9 (DATA)
        logging.root.level = 10  # i.e. logging.DEBUG

        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            model = ExplainableBoostingClassifier(**self.params)
        else:
            model = ExplainableBoostingRegressor(**self.params)

        X = self.basic_impute(X)
        X = X.to_numpy()

        model.fit(X, y)
        importances = self.get_importances(model, X.shape[1])
        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=importances,
            iterations=self.params["n_estimators"],
        )
        print("======================== ", interac, " ======================")
        splitwise_perf = []
        for split in range(0, 5):
            X_train_cov, X_test_cov = X_cov.iloc[
                train_idxes_cov[split], :], X_cov.iloc[
                    test_idxes_cov[split], :]
            y_train_cov, y_test_cov = y_cov[train_idxes_cov[split]], y_cov[
                test_idxes_cov[split]]

            #X_train_cov, y_train_cov = undersample_negatives(X_train_cov, y_train_cov, 50)

            y_train_cov = y_train_cov.ravel()
            #clf = tune_ebm(X_train_cov, y_train_cov)

            if interac == 0:
                clf = ExplainableBoostingClassifier()
            else:
                clf = ExplainableBoostingClassifier(interactions=interac)

            clf.fit(X_train_cov, y_train_cov)
            curr_perf = []
            y_pred_cov = clf.predict(X_test_cov)
            #curr_perf += [metrics.accuracy_score(y_test_cov, y_pred_cov)]
            print(metrics.confusion_matrix(y_test_cov, y_pred_cov))
            y_pred_cov = clf.predict_proba(X_test_cov)
            curr_perf += [get_aucpr_R(y_test_cov, y_pred_cov[:, 1])]
            curr_perf += [get_auc_R(y_test_cov, y_pred_cov[:, 1])]
            curr_perf += [get_fmax(y_test_cov, y_pred_cov[:, 1])]
            curr_perf += get_early_prec(y_test_cov, y_pred_cov[:, 1])
            print(curr_perf)
            splitwise_perf.append(curr_perf)
Example #10
def build_estimator(args, train_data=None):
    feature_names = [f"feature_{i}" for i in range(train_data[0].shape[1])]
    return ExplainableBoostingClassifier(random_state=RANDOM_STATE,
                                         feature_names=feature_names,
                                         **args)
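A hedged usage sketch; RANDOM_STATE is a module-level constant in the original, the (features, labels) tuple shape of train_data is inferred from the indexing above, and the demo arrays are assumptions:

import numpy as np

RANDOM_STATE = 42  # assumption: defined at module level in the original
X_demo = np.random.rand(100, 4)
y_demo = np.random.randint(0, 2, size=100)
clf = build_estimator({"interactions": 0}, train_data=(X_demo, y_demo))
clf.fit(X_demo, y_demo)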
Example #11
    # generate train/test splits (the positive-class split into
    # X_train_pos / X_test_pos is assumed to happen above this excerpt)
    X_train_neg, X_test_neg = train_test_split(X_neg, test_size=0.2)
    X_train = pd.DataFrame(np.row_stack((X_train_pos, X_train_neg)), columns=feat_names)
    X_test = pd.DataFrame(np.row_stack((X_test_pos, X_test_neg)), columns=feat_names)
    y_test = np.zeros((X_test.shape[0],1))
    y_train = np.zeros((X_train.shape[0],1))
    y_train[range(X_train_pos.shape[0])]=1
    y_test[range(X_test_pos.shape[0])]=1
    print("X size: ",X_train.shape[0],'x',X_train.shape[1])
    print("y size: ",y_train.shape[0],'x',y_train.shape[1])
    print("X-test size: ",X_test.shape[0],'x',X_test.shape[1])
    print("y-test size: ",y_test.shape[0],'x',y_test.shape[1])

    # train and test, performance output    
    #clf = tune_ebm(X_train, y_train)
    clf = ExplainableBoostingClassifier(random_state=seed, interactions=100)
    clf.fit(X_train, y_train.ravel())  # flatten the (n, 1) label column
    print("Finished training ...")
    curr_perf = []
    y_pred = clf.predict(X_test)
    curr_perf += [metrics.accuracy_score(y_test, y_pred)]
    print(metrics.confusion_matrix(y_test, y_pred))
    y_pred = clf.predict_proba(X_test)
    curr_perf += [get_aucpr(y_test, y_pred[:,1])]
    curr_perf += [get_auc(y_test, y_pred[:,1])]
    print("Performance: ",curr_perf)

    # predict on larger set, output predictions
    print("Predicting on all test pairs now... ")
    scores = (clf.predict_proba(X_neg_all))[:,1]
    neg_pps['score'] = scores   
Example #12
def fit_ga2m(configuration, res_dir, predicted_variable='Row', threshold=3):
    """
    Fits a ga2m model, using the data retrieved by the function get_data, and stores the fit object the training data
    and the test set in pickle files. Always fits a two class prediction model for a given predicted_variable, and a
    threshold to separate that variable by. The predicted_variable is assumed to be ordinal. The defaults
    predicted_variable and threshold are set up for predicting the LFS and WT mutation of p53 for the individuals in the
    dataset.

    :param configuration: a dictionary of list of str
    :param res_dir: path to directory to store resulting fit model, test split and train split
    :param predicted_variable: The column in the LFS data which will be predicted.
    :param threshold: threshold for the predicted_variable
    :return: dictionary with keys 'fit', 'train', 'test' with values corresponding to the paths to the respective files.
    """
    seed(7)
    dat = get_data()

    # Label "mutant" observations, comes from the original prediction task though mutant may not be an appropriate label
    # depending on the predicted_variable, but the mutant column will be the binary predicted classes for the fit model.
    dat['mutant'] = dat[predicted_variable] > threshold
    # dat['mutant'] = dat.Column >= (max(dat.Column) - min(dat.Column))/2 + min(dat.Column)

    # Apply given configuration
    if configuration['subset_features'][0] != 'None':
        dat = dat[configuration['subset_features'] + ['mutant']]

    # Drop labelling columns and shuffle data order.
    if configuration['test'][0] == 'random':
        dat = dat.drop(columns=['Row', 'Column', 'Time', 'S', 'M', 'FocusScore3', 'FocusScore4', 'FocusScore5', 'Centroid_1', 'Centroid_2', 'Orientation']).sample(frac=1)

    # Select random train and test sets.
        dat_train = dat.iloc[:floor(len(dat) * 0.9), :]
        dat_test = dat.iloc[floor(len(dat) * 0.9):, :]

    elif sum([b.isdigit() for b in configuration['test']]) == len(configuration['test']):
        # Assume the values in configuration['test'] refer to specific entries which will only be in the test set.
        # note: `in` on a bare Series tests the index, not the values, hence .values
        if not all(int(b) in dat[predicted_variable].values for b in configuration['test']):
            raise Exception('not all test values are rows in the data.')
        test_rows = [int(r) for r in configuration['test']]
        # Let the test set be a set of entries, for default predicted_variable this corresponds to individuals in our
        # data.
        dat_train = dat.loc[~dat[predicted_variable].isin(test_rows)]
        # These are all indicator/irrelevant variables we don't want to consider, so they are removed from train
        # and test sets.
        dat_train = dat_train.drop(columns=['Row', 'Column', 'Time', 'S', 'M', 'FocusScore3', 'FocusScore4',
                                            'FocusScore5', 'Centroid_1', 'Centroid_2', 'Orientation'])
        dat_test = dat.loc[dat[predicted_variable].isin(test_rows)]
        dat_test = dat_test.drop(columns=['Row', 'Column', 'Time', 'S', 'M', 'FocusScore3', 'FocusScore4',
                                          'FocusScore5', 'Centroid_1', 'Centroid_2', 'Orientation'])
    else:
        raise Exception('test = x, where x must be "random" or a comma-separated sequence of digits that are '
                        'valid entries in the predicted_variable column of the data')

    # Check that the original predicted_variable isn't in the training or testing data

    ebm = ExplainableBoostingClassifier(interactions=int(configuration['num_interaction'][0]))
    ebm.fit(X=dat_train.drop(columns='mutant'), y=dat_train['mutant'])

    with open(res_dir + 'ga2m_fit', 'wb') as ga2m_file:
        pk.dump(ebm, ga2m_file)

    with open(res_dir + 'dat_train', 'wb') as train_file:
        pk.dump(dat_train, train_file)

    with open(res_dir + 'dat_test', 'wb') as test_file:
        pk.dump(dat_test, test_file)

    return {'fit': res_dir + 'ga2m_fit', 'train': res_dir + 'dat_train', 'test': res_dir + 'dat_test'}
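A hedged usage sketch; the configuration keys follow the ones the function reads above, the values here are assumptions, and `pk` is the snippet's alias for pickle:

import pickle as pk

configuration = {'subset_features': ['None'],
                 'test': ['random'],
                 'num_interaction': ['10']}
paths = fit_ga2m(configuration, res_dir='results/')
with open(paths['fit'], 'rb') as ga2m_file:
    ebm = pk.load(ga2m_file)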
Example #13
    def __init__(self,
                 features,
                 cluster_labels,
                 feature_names=None,
                 clusters_to_analyze=None,
                 classifier='ebm',
                 score_threshold=0.8,
                 verbose=False):
        """Interpret-clusters is a utility that aims to provide cluster interpretations. This is done by using the cluster ids as labels 
        and training supervised learning models to predict the clusters. The given features do not need to be the same set of features 
        as what was used to calculate the clusters. By calculating the feature importance of the supervised model we can find the features 
        that are important to distinguishing a particular cluster. 
        
        Parameters
        ----------

        features: array or pandas.DataFrame
            The set of features to pass to the supervised learning model. This does not need to be the same set of features as what was used to calculate the clusters.
        
        cluster_labels: list
            The list of cluster labels that specify the cluster to which a point belongs. This must have the same dimension as features (i.e. there must be one label per data point).

        feature_names: list (optional, default None)
            The list of feature names which correspond to the columns of features. If None the column indices will be used.
        
        clusters_to_analyze: list (optional, None)
            The list of cluster labels to calculate feature importances for. If None then all clusters will be analyzed.
        
        classifier: string or callable (optional, default ebm)
            The classifier to use for predicting cluster labels. It must be a classifier from the interpret package. Built-in options are ["ebm", "logistic_regression"].

        score_threshold: float (optional, default 0.8)
            Warn if the trained model has a score below this threshold.
        
        verbose: bool (optional, default False)
            Display progress information.

        """
        self.features = features
        self.cluster_labels = np.array(cluster_labels)

        if feature_names is not None:
            self.feature_names = np.array(feature_names)
        else:
            self.feature_names = np.arange(features.shape[1])

        self.cluster_models = {}

        if clusters_to_analyze is None:
            self.clusters_to_analyze = list(set(self.cluster_labels))
        else:
            self.clusters_to_analyze = sorted(clusters_to_analyze)

        for cluster_id in self.clusters_to_analyze:
            # Build a fresh estimator per cluster. Reusing the name
            # `classifier` here would overwrite the string argument after
            # the first iteration, so bind the instance to a new variable.
            if classifier == 'ebm':
                base_estimator = ExplainableBoostingClassifier(
                    feature_names=self.feature_names)
            elif classifier == 'logistic_regression':
                base_estimator = LogisticRegression(
                    feature_names=self.feature_names,
                    penalty='l1',
                    solver='liblinear')
            else:
                base_estimator = classifier  # a user-supplied estimator

            cluster_model = ClusterModel(cluster_id,
                                         deepcopy(base_estimator),
                                         features,
                                         cluster_labels,
                                         score_threshold=score_threshold,
                                         verbose=verbose)
            self.cluster_models[cluster_id] = cluster_model

        self.verbose = verbose
        self.local_explanations = {}
        self.global_explanations = {}
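A hedged usage sketch; the class name is not visible in the excerpt, so InterpretClusters is assumed, as is a scikit-learn clustering step upstream:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(200, 5)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
explainer = InterpretClusters(X, labels, classifier='ebm')  # assumed class name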
Example #14
df_B = pd.DataFrame({'x': x_B, 'y': y_B}, columns=['x', 'y'])
df_A['category'] = 0
df_B['category'] = 1

#define training df (first 500 elements of each category)
training_columns = ['x', 'y']
training_df = pd.concat([df_A.iloc[:500], df_B.iloc[:500]],
                        ignore_index=True,
                        sort=True)

#define test df (second 500 elements of each category)
test_df = pd.concat([df_A.iloc[500:], df_B.iloc[500:]],
                    ignore_index=True,
                    sort=True)

ebm_clf = ExplainableBoostingClassifier()
ebm_clf.fit(training_df[training_columns], training_df['category'])

probabilities = ebm_clf.predict_proba(test_df[training_columns])
ebm_global = ebm_clf.explain_global()
show(ebm_global)

for prob in range(2):
    test_df['prob_{0}'.format(prob)] = probabilities[:, prob]

figcontur = plt.figure(figsize=(18, 7.5))
contourax = figcontur.add_subplot(111)
xx, yy = make_meshgrid(test_df['x'], test_df['y'])
plot_contours(contourax, ebm_clf, xx, yy, cmap='RdYlBu', alpha=0.8)
contourax.scatter(test_df.x,
                  test_df.y,
Example #15
train_data = pd.read_csv('../data/titanic_train.csv')
test_data = pd.read_csv('../data/titanic_test.csv')

train_data = train_data.fillna(
    train_data.groupby(['Pclass', 'Sex']).transform('mean'))
test_data = test_data.fillna(
    test_data.groupby(['Pclass', 'Sex']).transform('mean'))

train_data = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived']]
test_data = test_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]

X_train, X_validate, y_train, y_validate = train_test_split(
    train_data.drop('Survived', axis=1), train_data['Survived'], test_size=.25)

ebm = ExplainableBoostingClassifier()
lrm = LogisticRegression()

ebm.fit(X_train, y_train)

le = LabelEncoder()
X_train_lr = X_train.copy()  # copy so encoding 'Sex' doesn't mutate X_train in place
X_train_lr['Sex'] = le.fit_transform(X_train['Sex'])
lrm.fit(X_train_lr, y_train)

ebm_global = ebm.explain_global()
show(ebm_global)
ebm_local = ebm.explain_local(X_validate, y_validate)
show(ebm_local)

lrm_global = lrm.explain_global()  # assumes interpret's glassbox LogisticRegression
Example #16
# %% Logistic regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model = model.fit(X=X_train, y=y_train)
model.predict(X_train).mean()
model.coef_
X_train.columns
model.intercept_
model.get_params()

# %% Explainable gbm
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression
from interpret import show

ebm = ExplainableBoostingClassifier()
ebm.fit(X=X_train, y=y_train)

ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

# %%
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)
log_global = log_model.explain_global(name='LogReg')
show(log_global)

show([ebm_global, log_global], share_tables=True)

# %%
from interpret.data import ClassHistogram
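# A hedged sketch of how the ClassHistogram import above is typically used;
# explain_data is interpret's documented data-explainer entry point, and
# X_train / y_train are the splits from the cells above.
hist = ClassHistogram().explain_data(X_train, y_train, name='Train Data')
show(hist)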
Example #17
show(lr_global)

# %% Fit decision tree model
tree = ClassificationTree()
tree.fit(X_train, y_train)
print("Training finished.")
y_pred = tree.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

# %% Explain local prediction
tree_local = tree.explain_local(X_test[:100], y_test[:100], name='Tree')
show(tree_local)

# %% Fit Explainable Boosting Machine
ebm = ExplainableBoostingClassifier(random_state=2021)
ebm.fit(X_train, y_train) 
print("Training finished.")
y_pred = ebm.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

# %% Explain locally
ebm_local = ebm.explain_local(X_test[:100], y_test[:100], name='EBM')
show(ebm_local)

# %% Explain globally
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)
# %%
Example #18
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('RFTree_ROC')
plt.show()

# ### Explainable Boosting Machine

# In[9]:

from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()
ebm.fit(train_X, train_y)

# In[12]:

# display confusion matrices for train and test data
# (classificationSummary presumably comes from the dmba helper package)

classificationSummary(train_y, ebm.predict(train_X))
classificationSummary(test_y, ebm.predict(test_X))

# In[10]:

from interpret import show

ebm_global = ebm.explain_global()
show(ebm_global)
Example #19
kf = StratifiedKFold(n_splits=5, shuffle=True)
train_idxes = []
test_idxes = []
for train_index, test_index in kf.split(X, y):
    train_idxes.append(train_index)
    test_idxes.append(test_index)

splitwise_perf = []
for split in range(5):
    X_train, X_test = X.iloc[train_idxes[split], :], X.iloc[test_idxes[split], :]
    y_train, y_test = y[train_idxes[split]], y[test_idxes[split]]
    #X_train, X_test, X_cov = normalize_train_test_cov(X_train, X_test, X_cov)
    y_train = y_train.ravel()
    clf = ExplainableBoostingClassifier(
        random_state=seed)  #, interactions=100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(metrics.confusion_matrix(y_test, y_pred))
    curr_perf = []
    curr_perf += [metrics.accuracy_score(y_test, y_pred)]
    y_pred = clf.predict_proba(X_test)
    curr_perf += [get_aucpr(y_test, y_pred[:, 1])]
    curr_perf += [get_auc(y_test, y_pred[:, 1])]
    y_pred_cov = clf.predict(X_cov)
    print(metrics.confusion_matrix(y_cov, y_pred_cov))
    y_pred_cov = clf.predict_proba(X_cov)
    curr_perf += [get_aucpr(y_cov, y_pred_cov[:, 1])]
    curr_perf += [get_auc(y_cov, y_pred_cov[:, 1])]
    print(curr_perf)
    splitwise_perf.append(curr_perf)
Example #20
# %% [markdown]
# ### Training and Interpreting EBM
# Train a Explainable Boosting Machine (with [interpret.ml](https://github.com/interpretml/interpret/))
#
# For a tutorial see: [[Tutorial](https://nbviewer.jupyter.org/github/interpretml/interpret/blob/master/examples/python/notebooks/Interpretable%20Classification%20Methods.ipynb)]
#
# **Q7**. Report (global) feature importances for EBM as a table or figure (see the importance-table sketch after the performance cell below). What are the most important three features in EBM? Are they the same as in the linear model?
#
# w_1X + w_2Y + w_3(XY) = Z
# %%
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

train_features, train_labels, dev_features, dev_labels, test_features, test_labels = \
    prepare_load_classification_data()
ebm = ExplainableBoostingClassifier(n_jobs=-1)
ebm.fit(train_features, train_labels)
# EBM
#%% # Global Explanation
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)
#%% # Local Explanation
ebm_local = ebm.explain_local(dev_features[:5], dev_labels[:5], name='EBM')
show(ebm_local)
#%% # Performance
from interpret.perf import ROC
ebm_perf = ROC(ebm.predict_proba).explain_perf(dev_features,
                                               dev_labels,
                                               name='EBM')
show(ebm_perf)
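# %% A hedged sketch for Q7: tabulating EBM's global feature importances.
# ebm_global.data() returns the overall explanation dict with 'names' and
# 'scores'; the DataFrame wrapper and sorting are additions, not original code.
import pandas as pd

overall = ebm_global.data()
importances = (pd.DataFrame({'feature': overall['names'],
                             'importance': overall['scores']})
               .sort_values('importance', ascending=False))
print(importances.head(3))  # the three most important features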
# %% [markdown]
Example #21
iX = list(range(data_train.shape[0]))
X = data_train
y = labels_train.ravel()

iX_train,  iX_test, y_train, y_test = \
    train_test_split(iX, y, test_size=0.25, stratify=y, random_state=0)

X_train, X_test = X[iX_train], X[iX_test]

X_test_out = data_test_out
y_test_out = labels_test_out

#%%
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()
ebm.fit(data_pts_1, labels_pts_1)

labels_pt_2_pred = ebm.predict(data_pts_2)
#%%

# Try isolation forest for outlier detection
X = data_pts_1

from sklearn.ensemble import IsolationForest

clf = IsolationForest(random_state=0, n_jobs=-1, contamination=0.25).fit(X)

A = clf.predict(X)

print((A == -1).mean(), (labels != 0).mean(),
Example #22
# ## Defining models
# %% [markdown]
# We will be using 6 models:
# * LightGBM
# * XGBoost
# * CatBoost
# * 2-layer Neural Nets
# * Explainable Boosting Classifier
# * Hist Gradient Boosting Classifier

# %%
clf_lgb = lgb.LGBMClassifier(**models_common_params_GBM)
clf_xgb = xgb.XGBClassifier(**models_common_params_GBM)
clf_cat = cat.CatBoostClassifier(**models_common_params_GBM)
nn = neural_nets.NeuralNets(x_train, x_val, y_train, y_val)
clf_int = ExplainableBoostingClassifier(random_state=constants.RANDOM_STATE)
clf_hist = HistGradientBoostingClassifier(random_state=constants.RANDOM_STATE)
list_models = {
    'LightGBM': clf_lgb,
    'XGBoost': clf_xgb,
    'CatBoost': clf_cat,
    'Neural Nets': nn,
    'Explainable Boosting': clf_int,
    'Hist Gradient boosting': clf_hist
}

# %%
# Defining an instance of the classification class
classifiers = classification.Classification(list_models, x_train, x_val,
                                            y_train, y_val)
Example #23
def build_model():

    ucihd_attr = [
        "age",
        "sex",  # 0 = female 1 = male
        "cp",  # chest pain type 1: typical angina 2: atypical angina 3: non-anginal pain 4: asymptomatic
        # resting blood pressure (in mm Hg on admission to the hospital)
        "trestbps",
        "chol",  # serum cholestoral in mg/dl
        "fbs",  # (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
        "restecg",  # resting electrocardiographic results 0: normal 1: having ST-T wave abnormality 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
        "thalach",  # maximum heart rate achieved
        "exang",  # exercise induced angina (1 = yes; 0 = no)
        "oldpeak",  # ST depression induced by exercise relative to rest
        "slope",  # the slope of the peak exercise ST segment
        "ca",  # number of major vessels (0-3) colored by flouroscopy
        "thal",  # 3 = normal; 6 = fixed defect; 7 = reversable defect
        # diagnosis of heart disease (angiographic disease status) 0: < 50% diameter narrowing 1-4: > 50% diameter narrowing
        "label"
    ]

    ucihd_local_path = "../datasets/processed.cleveland.data"

    ucihd = pd.read_csv(ucihd_local_path,
                        header=None,
                        names=ucihd_attr,
                        na_values="?")

    categorical_attr = ["sex", "cp", "fbs", "restecg", "exang", "thal"]
    for col in categorical_attr:
        ucihd[col] = ucihd[col].astype("category")

    # Clean label.
    ucihd.loc[ucihd["label"] > 1, "label"] = 1

    # sklearn's implementation of RF doesn't allow missing value.
    # For categorical (as string) we can leave one special category for missing,
    # but for numerical we need to do some special encoding or imputation.
    ucihd_2 = ucihd.copy()
    ucihd_2.loc[ucihd_2["ca"].isna(), "ca"] = -1  # Encode missing numerical.

    ucihd_2 = pd.get_dummies(ucihd_2, columns=categorical_attr, dummy_na=True)
    ucihd_y = ucihd_2.pop("label")
    train, test, ucihd_y_train, _ = train_test_split(ucihd_2,
                                                     ucihd_y.values,
                                                     test_size=.3,
                                                     random_state=64)

    # horrible hack to reverse effect of pd.get_dummies
    _, test_display, _, _ = train_test_split(ucihd,
                                             ucihd_y.values,
                                             test_size=.3,
                                             random_state=64)

    ucihd_rf = RandomForestClassifier(n_estimators=100, random_state=64)
    _ = ucihd_rf.fit(train, ucihd_y_train)

    feature_names = ucihd_2.columns
    class_names = ["Negative", "Positive"]
    categorical_features = [
        i for i, col in enumerate(feature_names) if "_" in col
    ]
    feature_names_display = ucihd_attr

    ucihd_ebm = ExplainableBoostingClassifier(n_estimators=16,
                                              feature_names=ucihd_2.columns,
                                              n_jobs=1)
    _ = ucihd_ebm.fit(train, ucihd_y_train)

    return (ucihd_rf, train.values, test, feature_names, class_names,
            categorical_features, test_display, feature_names_display,
            ucihd_ebm)