Example #1
def train_titanic_binary_classification(interactions, with_categorical=False):
    df = pd.read_csv(
        os.path.join('examples', 'titanic_train.csv'),
        #dtype= {
        #    'Age': np.float32,
        #    'Fare': np.float32,
        #    'Pclass': np.float32, # np.int
        #}
    )
    df = df.dropna()
    df['Old'] = df['Age'] > 65
    feature_types = ['continuous', 'continuous', 'continuous', 'continuous']
    feature_columns = ['Age', 'Fare', 'Pclass', 'Old']
    if with_categorical is True:
        feature_columns.append('Embarked')
        feature_types.append('categorical')
    label_column = "Survived"

    y = df[label_column]  # 1-D Series; LabelEncoder expects a 1-D array, not a DataFrame
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    x = df[feature_columns]
    x_train, x_test, y_train, y_test = train_test_split(x, y_enc)
    model = ExplainableBoostingClassifier(interactions=interactions,
                                          feature_types=feature_types)
    model.fit(x_train, y_train)

    return model, x_test, y_test
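A minimal usage sketch, assuming the Titanic CSV is present under examples/; these imports are what the function body above needs but the excerpt omits:

import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

# Train without interaction terms, then inspect the per-feature shape functions.
model, x_test, y_test = train_titanic_binary_classification(interactions=0)
show(model.explain_global())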
Example #2
def tune_ebm(X_train, y_train):
    reslist = []
    metric_idx = 1  # column where the mean CV score (average precision, not AUC) is stored
    for interac in [50, 100, 500]:
        clf = ExplainableBoostingClassifier(random_state=seed, interactions=interac)
        cv_results = cross_validate(clf, X_train, y_train, cv=3, scoring='average_precision')
        reslist.append((interac, np.mean(cv_results['test_score'])))
    print(*reslist, sep='\n')
    reslist = np.asarray(reslist)
    bestid = np.argmax(reslist[:, metric_idx])
    # np.asarray turned the interaction counts into floats; cast back to int
    clf = ExplainableBoostingClassifier(random_state=seed, interactions=int(reslist[bestid, 0]))
    clf = clf.fit(X_train, y_train)
    return clf
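A minimal usage sketch; `seed` is referenced but never defined in the excerpt, so it is assumed to be a module-level constant, and the synthetic dataset here is only for illustration:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate
from interpret.glassbox import ExplainableBoostingClassifier

seed = 42  # assumption: the original module defines its own seed
X_demo, y_demo = make_classification(n_samples=300, n_features=40, random_state=seed)
best_clf = tune_ebm(X_demo, y_demo)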
Example #3
def run_training_process():
    df = load_and_clean_data()
    X = df[train_cols].reset_index(drop=True)
    y = df["target"].to_numpy()

    clf = ExplainableBoostingClassifier()

    for tr, tst in StratifiedKFold(n_splits=3).split(X, y):
        print("Shape of train data: {:d}\nShape of test data: {:d}\n".format(
            len(tr), len(tst)))
        print(
            "Sum of labels in train: {:d}\nSum of labels in test: {:d}".format(
                y[tr].sum(), y[tst].sum()))

        clf.fit(X.loc[tr], y[tr])
        print("ROC AUC Score: {:4f}".format(
            roc_auc_score(y[tst],
                          clf.predict_proba(X.loc[tst])[:, 1])))

    clf.fit(X, y)

    with open("model_file", "bw") as file:
        pickle.dump(clf, file)

    df.to_csv("features_file.csv", index=False)
    df["preds"] = clf.predict_proba(X)[:, 1]

    df[[
        "inn",
        "preds",
        "target",
    ]].to_csv("score.csv", index=False)
Example #4
def EBM(X,
        Y,
        learning_rate=None,
        depth=None,
        estimators=None,
        holdout_split=None,
        seed=None):

    ### model & parameters
    ebm = ExplainableBoostingClassifier(random_state=seed)
    c_grid = {
        "n_estimators": estimators,
        "max_tree_splits": depth,
        "learning_rate": learning_rate,
        "holdout_split": holdout_split
    }

    c_grid = {k: v for k, v in c_grid.items() if v is not None}

    summary = nested_cross_validate(X=X,
                                    Y=Y,
                                    estimator=ebm,
                                    c_grid=c_grid,
                                    seed=seed)
    return summary
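A hedged usage sketch; nested_cross_validate is a project-local helper, so only the call shape of EBM itself is taken from the snippet, and the grid values are assumptions:

summary = EBM(X, Y,
              learning_rate=[0.01, 0.1],
              depth=[2, 4],
              estimators=[16, 32],
              holdout_split=[0.15],
              seed=42)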
Example #5
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        from interpret.glassbox import (
            ExplainableBoostingClassifier,
            ExplainableBoostingRegressor,
        )

        # HACK - EBM can't handle our custom logger with unknown level 9 (DATA)
        logging.root.level = 10  # i.e. logging.DEBUG

        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            model = ExplainableBoostingClassifier(**self.params)
        else:
            model = ExplainableBoostingRegressor(**self.params)

        # Replace missing values with a value smaller than all observed values.
        # (X is a datatable Frame here, hence the .names / .min1() / dt.f API.)
        self.min = dict()
        for col in X.names:
            XX = X[:, col]
            self.min[col] = XX.min1()
            if self.min[col] is None or np.isnan(self.min[col]):
                self.min[col] = -1e10
            else:
                self.min[col] -= 1
            XX.replace(None, self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()

        model.fit(X, y)
        importances = self.get_importances(model, X.shape[1])
        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=importances,
            iterations=self.params["n_estimators"],
        )
Example #6
def EBM(KY_x, KY_y, FL_x, FL_y, learning_rate, depth, estimators, seed):

    KY_validation = []
    KY_score = []
    FL_score = []
    auc_diff = []
    best_param = []
    KY_x = KY_x.drop(['person_id'], axis=1)
    FL_x = FL_x.drop(['person_id'], axis=1)

    ### model & parameters
    gam = ExplainableBoostingClassifier(random_state=seed)
    c_grid = {
        "n_estimators": estimators,
        "max_tree_splits": depth,
        "learning_rate": learning_rate
    }

    ## cross validation set up
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

    for outer_train, outer_test in outer_cv.split(KY_x, KY_y):

        ## split KY data -- outer training uses 4 of the 5 folds
        outer_train_x, outer_train_y = KY_x.iloc[outer_train], KY_y[outer_train]
        outer_test_x, outer_test_y = KY_x.iloc[outer_test], KY_y[outer_test]

        ### cross validation on 4 folds
        clf = GridSearchCV(estimator=gam,
                           param_grid=c_grid,
                           scoring='roc_auc',
                           cv=inner_cv,
                           return_train_score=True).fit(
                               outer_train_x, outer_train_y)

        train_score = clf.cv_results_['mean_train_score']
        test_score = clf.cv_results_['mean_test_score']

        ## save results
        KY_validation.append(clf.best_score_)
        auc_diff.append(train_score[np.where(
            test_score == clf.best_score_)[0][0]] - clf.best_score_)
        best_param.append(clf.best_params_)

        ## best model
        FL_score.append(roc_auc_score(FL_y, clf.predict_proba(FL_x)[:, 1]))
        KY_score.append(
            roc_auc_score(outer_test_y,
                          clf.predict_proba(outer_test_x)[:, 1]))

    return {
        'auc_diff': auc_diff,
        'best_param': best_param,
        'KY_validation': KY_validation,
        'KY_score': KY_score,
        'FL_score': FL_score
    }
Example #7
def train_bank_churners_multiclass_classification():
    df = pd.read_csv(os.path.join('examples', 'BankChurners.csv'))
    df = df.dropna()
    feature_types = ['continuous', 'continuous', 'categorical', 'continuous']
    feature_columns = [
        'Customer_Age', 'Dependent_count', 'Education_Level', 'Credit_Limit'
    ]
    label_column = "Income_Category"

    y = df[label_column]  # 1-D Series; LabelEncoder expects a 1-D array, not a DataFrame
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    x = df[feature_columns]
    x_train, x_test, y_train, y_test = train_test_split(x, y_enc)
    model = ExplainableBoostingClassifier(interactions=0,
                                          feature_types=feature_types)
    model.fit(x_train, y_train)

    return model, x_test, y_test
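A minimal usage sketch for the multiclass case; interpret's EBM handles multiclass natively, so predict_proba returns one column per encoded income category:

model, x_test, y_test = train_bank_churners_multiclass_classification()
probs = model.predict_proba(x_test)  # shape: (n_samples, n_classes)
accuracy = (model.predict(x_test) == y_test).mean()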
Example #8
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        from interpret.glassbox import (
            ExplainableBoostingClassifier,
            ExplainableBoostingRegressor,
        )

        # HACK - EBM can't handle our custom logger with unknown level 9 (DATA)
        logging.root.level = 10  # i.e. logging.DEBUG

        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            model = ExplainableBoostingClassifier(**self.params)
        else:
            model = ExplainableBoostingRegressor(**self.params)

        X = self.basic_impute(X)
        X = X.to_numpy()

        model.fit(X, y)
        importances = self.get_importances(model, X.shape[1])
        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=importances,
            iterations=self.params["n_estimators"],
        )
        print("======================== ", interac, " ======================")
        splitwise_perf = []
        for split in range(0, 5):
            X_train_cov, X_test_cov = X_cov.iloc[
                train_idxes_cov[split], :], X_cov.iloc[
                    test_idxes_cov[split], :]
            y_train_cov, y_test_cov = y_cov[train_idxes_cov[split]], y_cov[
                test_idxes_cov[split]]

            #X_train_cov, y_train_cov = undersample_negatives(X_train_cov, y_train_cov, 50)

            y_train_cov = y_train_cov.ravel()
            #clf = tune_ebm(X_train_cov, y_train_cov)

            if interac == 0:
                clf = ExplainableBoostingClassifier()
            else:
                clf = ExplainableBoostingClassifier(interactions=interac)

            clf.fit(X_train_cov, y_train_cov)
            curr_perf = []
            y_pred_cov = clf.predict(X_test_cov)
            #curr_perf += [metrics.accuracy_score(y_test_cov, y_pred_cov)]
            print(metrics.confusion_matrix(y_test_cov, y_pred_cov))
            y_pred_cov = clf.predict_proba(X_test_cov)
            curr_perf += [get_aucpr_R(y_test_cov, y_pred_cov[:, 1])]
            curr_perf += [get_auc_R(y_test_cov, y_pred_cov[:, 1])]
            curr_perf += [get_fmax(y_test_cov, y_pred_cov[:, 1])]
            curr_perf += get_early_prec(y_test_cov, y_pred_cov[:, 1])
            print(curr_perf)
            splitwise_perf.append(curr_perf)
Example #10
def build_estimator(args, train_data=None):
    feature_names = [f"feature_{i}" for i in range(train_data[0].shape[1])]
    return ExplainableBoostingClassifier(random_state=RANDOM_STATE,
                                         feature_names=feature_names,
                                         **args)
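A hedged usage sketch; RANDOM_STATE is a module-level constant in the original, the (features, labels) tuple shape of train_data is inferred from the indexing above, and the demo arrays are assumptions:

import numpy as np

RANDOM_STATE = 42  # assumption: defined at module level in the original
X_demo = np.random.rand(100, 4)
y_demo = np.random.randint(0, 2, size=100)
clf = build_estimator({"interactions": 0}, train_data=(X_demo, y_demo))
clf.fit(X_demo, y_demo)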
Example #11
    # generate train/test splits (the positive-class split into
    # X_train_pos / X_test_pos is assumed to happen above this excerpt)
    X_train_neg, X_test_neg = train_test_split(X_neg, test_size=0.2)
    X_train = pd.DataFrame(np.row_stack((X_train_pos, X_train_neg)), columns=feat_names)
    X_test = pd.DataFrame(np.row_stack((X_test_pos, X_test_neg)), columns=feat_names)
    y_test = np.zeros((X_test.shape[0],1))
    y_train = np.zeros((X_train.shape[0],1))
    y_train[range(X_train_pos.shape[0])]=1
    y_test[range(X_test_pos.shape[0])]=1
    print("X size: ",X_train.shape[0],'x',X_train.shape[1])
    print("y size: ",y_train.shape[0],'x',y_train.shape[1])
    print("X-test size: ",X_test.shape[0],'x',X_test.shape[1])
    print("y-test size: ",y_test.shape[0],'x',y_test.shape[1])

    # train and test, performance output    
    #clf = tune_ebm(X_train, y_train)
    clf = ExplainableBoostingClassifier(random_state=seed, interactions=100)
    clf.fit(X_train, y_train.ravel())  # flatten the (n, 1) label column
    print("Finished training ...")
    curr_perf = []
    y_pred = clf.predict(X_test)
    curr_perf += [metrics.accuracy_score(y_test, y_pred)]
    print(metrics.confusion_matrix(y_test, y_pred))
    y_pred = clf.predict_proba(X_test)
    curr_perf += [get_aucpr(y_test, y_pred[:,1])]
    curr_perf += [get_auc(y_test, y_pred[:,1])]
    print("Performance: ",curr_perf)

    # predict on larger set, output predictions
    print("Predicting on all test pairs now... ")
    scores = (clf.predict_proba(X_neg_all))[:,1]
    neg_pps['score'] = scores   
Example #12
def fit_ga2m(configuration, res_dir, predicted_variable='Row', threshold=3):
    """
    Fits a ga2m model, using the data retrieved by the function get_data, and stores the fit object the training data
    and the test set in pickle files. Always fits a two class prediction model for a given predicted_variable, and a
    threshold to separate that variable by. The predicted_variable is assumed to be ordinal. The defaults
    predicted_variable and threshold are set up for predicting the LFS and WT mutation of p53 for the individuals in the
    dataset.

    :param configuration: a dictionary of list of str
    :param res_dir: path to directory to store resulting fit model, test split and train split
    :param predicted_variable: The column in the LFS data which will be predicted.
    :param threshold: threshold for the predicted_variable
    :return: dictionary with keys 'fit', 'train', 'test' with values corresponding to the paths to the respective files.
    """
    seed(7)
    dat = get_data()

    # Label "mutant" observations, comes from the original prediction task though mutant may not be an appropriate label
    # depending on the predicted_variable, but the mutant column will be the binary predicted classes for the fit model.
    dat['mutant'] = dat[predicted_variable] > threshold
    # dat['mutant'] = dat.Column >= (max(dat.Column) - min(dat.Column))/2 + min(dat.Column)

    # Apply given configuration
    if configuration['subset_features'][0] != 'None':
        dat = dat[configuration['subset_features'] + ['mutant']]

    # Drop labelling columns and shuffle data order.
    if configuration['test'][0] == 'random':
        dat = dat.drop(columns=['Row', 'Column', 'Time', 'S', 'M', 'FocusScore3', 'FocusScore4', 'FocusScore5', 'Centroid_1', 'Centroid_2', 'Orientation']).sample(frac=1)

    # Select random train and test sets.
        dat_train = dat.iloc[:floor(len(dat) * 0.9), :]
        dat_test = dat.iloc[floor(len(dat) * 0.9):, :]

    elif sum([b.isdigit() for b in configuration['test']]) == len(configuration['test']):
        # Assume the values in configuration['test'] refer to specific entries which will only be in the test set.
        # note: `in` on a bare Series tests the index, not the values, hence .values
        if not all(int(b) in dat[predicted_variable].values for b in configuration['test']):
            raise Exception('not all test values are rows in the data.')
        test_rows = [int(r) for r in configuration['test']]
        # Let the test set be a set of entries, for default predicted_variable this corresponds to individuals in our
        # data.
        dat_train = dat.loc[~dat[predicted_variable].isin(test_rows)]
        # These are all indicator/irrelevant variables we don't want to consider, so they are removed from train
        # and test sets.
        dat_train = dat_train.drop(columns=['Row', 'Column', 'Time', 'S', 'M', 'FocusScore3', 'FocusScore4',
                                            'FocusScore5', 'Centroid_1', 'Centroid_2', 'Orientation'])
        dat_test = dat.loc[dat[predicted_variable].isin(test_rows)]
        dat_test = dat_test.drop(columns=['Row', 'Column', 'Time', 'S', 'M', 'FocusScore3', 'FocusScore4',
                                          'FocusScore5', 'Centroid_1', 'Centroid_2', 'Orientation'])
    else:
        raise Exception('test = x, where x must be "random" or a comma-separated sequence of digits that are '
                        'valid entries in the predicted_variable column of the data')

    # Check that the original predicted_variable isn't in the training or testing data

    ebm = ExplainableBoostingClassifier(interactions=int(configuration['num_interaction'][0]))
    ebm.fit(X=dat_train.drop(columns='mutant'), y=dat_train['mutant'])

    with open(res_dir + 'ga2m_fit', 'wb') as ga2m_file:
        pk.dump(ebm, ga2m_file)

    with open(res_dir + 'dat_train', 'wb') as train_file:
        pk.dump(dat_train, train_file)

    with open(res_dir + 'dat_test', 'wb') as test_file:
        pk.dump(dat_test, test_file)

    return {'fit': res_dir + 'ga2m_fit', 'train': res_dir + 'dat_train', 'test': res_dir + 'dat_test'}
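A hedged usage sketch; the configuration keys follow the ones the function reads above, the values here are assumptions, and `pk` is the snippet's alias for pickle:

import pickle as pk

configuration = {'subset_features': ['None'],
                 'test': ['random'],
                 'num_interaction': ['10']}
paths = fit_ga2m(configuration, res_dir='results/')
with open(paths['fit'], 'rb') as ga2m_file:
    ebm = pk.load(ga2m_file)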
Example #13
    def __init__(self,
                 features,
                 cluster_labels,
                 feature_names=None,
                 clusters_to_analyze=None,
                 classifier='ebm',
                 score_threshold=0.8,
                 verbose=False):
        """Interpret-clusters is a utility that aims to provide cluster interpretations. This is done by using the cluster ids as labels 
        and training supervised learning models to predict the clusters. The given features do not need to be the same set of features 
        as what was used to calculate the clusters. By calculating the feature importance of the supervised model we can find the features 
        that are important to distinguishing a particular cluster. 
        
        Parameters
        ----------

        features: array or pandas.DataFrame
            The set of features to pass to the supervised learning model. This does not need to be the same set of features as what was used to calculate the clusters.
        
        cluster_labels: list
            The list of cluster labels that specify the cluster to which a point belongs. This must have the same dimension as features (i.e. there must be one label per data point).

        feature_names: list (optional, default None)
            The list of feature names which correspond to the columns of features. If None the column indices will be used.
        
        clusters_to_analyze: list (optional, None)
            The list of cluster labels to calculate feature importances for. If None then all clusters will be analyzed.
        
        classifier: string or callable (optional, default ebm)
            The classifier to use for predicting cluster labels. It must be a classifier from the interpret package. Built-in options are ["ebm", "logistic_regression"].

        score_threshold: float (optional, default 0.8)
            Warn if the trained model has a score below this threshold.
        
        verbose: bool (optional, default False)
            Display progress information.

        """
        self.features = features
        self.cluster_labels = np.array(cluster_labels)

        if feature_names is not None:
            self.feature_names = np.array(feature_names)
        else:
            self.feature_names = np.arange(features.shape[1])

        self.cluster_models = {}

        if clusters_to_analyze is None:
            self.clusters_to_analyze = list(set(self.cluster_labels))
        else:
            self.clusters_to_analyze = sorted(clusters_to_analyze)

        for cluster_id in self.clusters_to_analyze:
            # Build a fresh estimator per cluster. Reusing the name
            # `classifier` here would overwrite the string argument after
            # the first iteration, so bind the instance to a new variable.
            if classifier == 'ebm':
                base_estimator = ExplainableBoostingClassifier(
                    feature_names=self.feature_names)
            elif classifier == 'logistic_regression':
                base_estimator = LogisticRegression(
                    feature_names=self.feature_names,
                    penalty='l1',
                    solver='liblinear')
            else:
                base_estimator = classifier  # a user-supplied estimator

            cluster_model = ClusterModel(cluster_id,
                                         deepcopy(base_estimator),
                                         features,
                                         cluster_labels,
                                         score_threshold=score_threshold,
                                         verbose=verbose)
            self.cluster_models[cluster_id] = cluster_model

        self.verbose = verbose
        self.local_explanations = {}
        self.global_explanations = {}
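A hedged usage sketch; the class name is not visible in the excerpt, so InterpretClusters is assumed, as is a scikit-learn clustering step upstream:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(200, 5)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
explainer = InterpretClusters(X, labels, classifier='ebm')  # assumed class name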
Example #14
df_B = pd.DataFrame({'x': x_B, 'y': y_B}, columns=['x', 'y'])
df_A['category'] = 0
df_B['category'] = 1

#define training df (first 500 elements of each category)
training_columns = ['x', 'y']
training_df = pd.concat([df_A.iloc[:500], df_B.iloc[:500]],
                        ignore_index=True,
                        sort=True)

#define test df (second 500 elements of each category)
test_df = pd.concat([df_A.iloc[500:], df_B.iloc[500:]],
                    ignore_index=True,
                    sort=True)

ebm_clf = ExplainableBoostingClassifier()
ebm_clf.fit(training_df[training_columns], training_df['category'])

probabilities = ebm_clf.predict_proba(test_df[training_columns])
ebm_global = ebm_clf.explain_global()
show(ebm_global)

for prob in range(2):
    test_df['prob_{0}'.format(prob)] = probabilities[:, prob]

figcontur = plt.figure(figsize=(18, 7.5))
contourax = figcontur.add_subplot(111)
xx, yy = make_meshgrid(test_df['x'], test_df['y'])
plot_contours(contourax, ebm_clf, xx, yy, cmap='RdYlBu', alpha=0.8)
contourax.scatter(test_df.x,
                  test_df.y,
Example #15
train_data = pd.read_csv('../data/titanic_train.csv')
test_data = pd.read_csv('../data/titanic_test.csv')

train_data = train_data.fillna(
    train_data.groupby(['Pclass', 'Sex']).transform('mean'))
test_data = test_data.fillna(
    test_data.groupby(['Pclass', 'Sex']).transform('mean'))

train_data = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Survived']]
test_data = test_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]

X_train, X_validate, y_train, y_validate = train_test_split(
    train_data.drop('Survived', axis=1), train_data['Survived'], test_size=.25)

ebm = ExplainableBoostingClassifier()
lrm = LogisticRegression()

ebm.fit(X_train, y_train)

le = LabelEncoder()
X_train_lr = X_train.copy()  # copy so encoding 'Sex' doesn't mutate X_train in place
X_train_lr['Sex'] = le.fit_transform(X_train['Sex'])
lrm.fit(X_train_lr, y_train)

ebm_global = ebm.explain_global()
show(ebm_global)
ebm_local = ebm.explain_local(X_validate, y_validate)
show(ebm_local)

lrm_global = lrm.explain_global()  # assumes interpret's glassbox LogisticRegression
Example #16
# %% Logistic regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model = model.fit(X=X_train, y=y_train)
model.predict(X_train).mean()
model.coef_
X_train.columns
model.intercept_
model.get_params()

# %% Explainable gbm
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression
from interpret import show

ebm = ExplainableBoostingClassifier()
ebm.fit(X=X_train, y=y_train)

ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

# %%
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)
log_global = log_model.explain_global(name='LogReg')
show(log_global)

show([ebm_global, log_global], share_tables=True)

# %%
from interpret.data import ClassHistogram
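# A hedged sketch of how the ClassHistogram import above is typically used;
# explain_data is interpret's documented data-explainer entry point, and
# X_train / y_train are the splits from the cells above.
hist = ClassHistogram().explain_data(X_train, y_train, name='Train Data')
show(hist)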
Example #17
show(lr_global)

# %% Fit decision tree model
tree = ClassificationTree()
tree.fit(X_train, y_train)
print("Training finished.")
y_pred = tree.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

# %% Explain local prediction
tree_local = tree.explain_local(X_test[:100], y_test[:100], name='Tree')
show(tree_local)

# %% Fit Explainable Boosting Machine
ebm = ExplainableBoostingClassifier(random_state=2021)
ebm.fit(X_train, y_train) 
print("Training finished.")
y_pred = ebm.predict(X_test)
print(f"F1 Score {f1_score(y_test, y_pred, average='macro')}")
print(f"Accuracy {accuracy_score(y_test, y_pred)}")

# %% Explain locally
ebm_local = ebm.explain_local(X_test[:100], y_test[:100], name='EBM')
show(ebm_local)

# %% Explain globally
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)
# %%
Example #18
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('RFTree_ROC')
plt.show()

# ### Explainable Boosting Machine

# In[9]:

from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()
ebm.fit(train_X, train_y)

# In[12]:

# display confusion matrices for train and test data
# (classificationSummary presumably comes from the dmba helper package)

classificationSummary(train_y, ebm.predict(train_X))
classificationSummary(test_y, ebm.predict(test_X))

# In[10]:

from interpret import show

ebm_global = ebm.explain_global()
show(ebm_global)
Example #19
kf = StratifiedKFold(n_splits=5, shuffle=True)
train_idxes = []
test_idxes = []
for train_index, test_index in kf.split(X, y):
    train_idxes.append(train_index)
    test_idxes.append(test_index)

splitwise_perf = []
for split in range(5):
    X_train, X_test = X.iloc[train_idxes[split], :], X.iloc[test_idxes[split], :]
    y_train, y_test = y[train_idxes[split]], y[test_idxes[split]]
    #X_train, X_test, X_cov = normalize_train_test_cov(X_train, X_test, X_cov)
    y_train = y_train.ravel()
    clf = ExplainableBoostingClassifier(
        random_state=seed)  #, interactions=100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(metrics.confusion_matrix(y_test, y_pred))
    curr_perf = []
    curr_perf += [metrics.accuracy_score(y_test, y_pred)]
    y_pred = clf.predict_proba(X_test)
    curr_perf += [get_aucpr(y_test, y_pred[:, 1])]
    curr_perf += [get_auc(y_test, y_pred[:, 1])]
    y_pred_cov = clf.predict(X_cov)
    print(metrics.confusion_matrix(y_cov, y_pred_cov))
    y_pred_cov = clf.predict_proba(X_cov)
    curr_perf += [get_aucpr(y_cov, y_pred_cov[:, 1])]
    curr_perf += [get_auc(y_cov, y_pred_cov[:, 1])]
    print(curr_perf)
    splitwise_perf.append(curr_perf)
Example #20
# %% [markdown]
# ### Training and Interpreting EBM
# Train a Explainable Boosting Machine (with [interpret.ml](https://github.com/interpretml/interpret/))
#
# For a tutorial see: [[Tutorial](https://nbviewer.jupyter.org/github/interpretml/interpret/blob/master/examples/python/notebooks/Interpretable%20Classification%20Methods.ipynb)]
#
# **Q7**. Report (global) feature importances for EBM as a table or figure (see the importance-table sketch after the performance cell below). What are the most important three features in EBM? Are they the same as in the linear model?
#
# w_1X + w_2Y + w_3(XY) = Z
# %%
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

train_features, train_labels, dev_features, dev_labels, test_features, test_labels = \
    prepare_load_classification_data()
ebm = ExplainableBoostingClassifier(n_jobs=-1)
ebm.fit(train_features, train_labels)
# EBM
#%% # Global Explanation
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)
#%% # Local Explanation
ebm_local = ebm.explain_local(dev_features[:5], dev_labels[:5], name='EBM')
show(ebm_local)
#%% # Performance
from interpret.perf import ROC
ebm_perf = ROC(ebm.predict_proba).explain_perf(dev_features,
                                               dev_labels,
                                               name='EBM')
show(ebm_perf)
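# %% A hedged sketch for Q7: tabulating EBM's global feature importances.
# ebm_global.data() returns the overall explanation dict with 'names' and
# 'scores'; the DataFrame wrapper and sorting are additions, not original code.
import pandas as pd

overall = ebm_global.data()
importances = (pd.DataFrame({'feature': overall['names'],
                             'importance': overall['scores']})
               .sort_values('importance', ascending=False))
print(importances.head(3))  # the three most important features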
# %% [markdown]
Example #21
iX = list(range(data_train.shape[0]))
X = data_train
y = labels_train.ravel()

iX_train,  iX_test, y_train, y_test = \
    train_test_split(iX, y, test_size=0.25, stratify=y, random_state=0)

X_train, X_test = X[iX_train], X[iX_test]

X_test_out = data_test_out
y_test_out = labels_test_out

#%%
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()
ebm.fit(data_pts_1, labels_pts_1)

labels_pt_2_pred = ebm.predict(data_pts_2)
#%%

# Try isolation forest for outlier detection
X = data_pts_1

from sklearn.ensemble import IsolationForest

clf = IsolationForest(random_state=0, n_jobs=-1, contamination=0.25).fit(X)

A = clf.predict(X)

print((A == -1).mean(), (labels != 0).mean(),
Example #22
# ## Defining models
# %% [markdown]
# We will be using 6 models:
# * LightGBM
# * XGBoost
# * CatBoost
# * 2-layer Neural Nets
# * Explainable Boosting Classifier
# * Hist Gradient Boosting Classifier

# %%
clf_lgb = lgb.LGBMClassifier(**models_common_params_GBM)
clf_xgb = xgb.XGBClassifier(**models_common_params_GBM)
clf_cat = cat.CatBoostClassifier(**models_common_params_GBM)
nn = neural_nets.NeuralNets(x_train, x_val, y_train, y_val)
clf_int = ExplainableBoostingClassifier(random_state=constants.RANDOM_STATE)
clf_hist = HistGradientBoostingClassifier(random_state=constants.RANDOM_STATE)
list_models = {
    'LightGBM': clf_lgb,
    'XGBoost': clf_xgb,
    'CatBoost': clf_cat,
    'Neural Nets': nn,
    'Explainable Boosting': clf_int,
    'Hist Gradient boosting': clf_hist
}

# %%
# Defining an instance of the classification class
classifiers = classification.Classification(list_models, x_train, x_val,
                                            y_train, y_val)
Example #23
def build_model():

    ucihd_attr = [
        "age",
        "sex",  # 0 = female 1 = male
        "cp",  # chest pain type 1: typical angina 2: atypical angina 3: non-anginal pain 4: asymptomatic
        # resting blood pressure (in mm Hg on admission to the hospital)
        "trestbps",
        "chol",  # serum cholestoral in mg/dl
        "fbs",  # (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
        "restecg",  # resting electrocardiographic results 0: normal 1: having ST-T wave abnormality 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
        "thalach",  # maximum heart rate achieved
        "exang",  # exercise induced angina (1 = yes; 0 = no)
        "oldpeak",  # ST depression induced by exercise relative to rest
        "slope",  # the slope of the peak exercise ST segment
        "ca",  # number of major vessels (0-3) colored by flouroscopy
        "thal",  # 3 = normal; 6 = fixed defect; 7 = reversable defect
        # diagnosis of heart disease (angiographic disease status) 0: < 50% diameter narrowing 1-4: > 50% diameter narrowing
        "label"
    ]

    ucihd_local_path = "../datasets/processed.cleveland.data"

    ucihd = pd.read_csv(ucihd_local_path,
                        header=None,
                        names=ucihd_attr,
                        na_values="?")

    categorical_attr = ["sex", "cp", "fbs", "restecg", "exang", "thal"]
    for col in categorical_attr:
        ucihd[col] = ucihd[col].astype("category")

    # Clean label.
    ucihd.loc[ucihd["label"] > 1, "label"] = 1

    # sklearn's implementation of RF doesn't allow missing value.
    # For categorical (as string) we can leave one special category for missing,
    # but for numerical we need to do some special encoding or imputation.
    ucihd_2 = ucihd.copy()
    ucihd_2.loc[ucihd_2["ca"].isna(), "ca"] = -1  # Encode missing numerical.

    ucihd_2 = pd.get_dummies(ucihd_2, columns=categorical_attr, dummy_na=True)
    ucihd_y = ucihd_2.pop("label")
    train, test, ucihd_y_train, _ = train_test_split(ucihd_2,
                                                     ucihd_y.values,
                                                     test_size=.3,
                                                     random_state=64)

    # horrible hack to reverse effect of pd.get_dummies
    _, test_display, _, _ = train_test_split(ucihd,
                                             ucihd_y.values,
                                             test_size=.3,
                                             random_state=64)

    ucihd_rf = RandomForestClassifier(n_estimators=100, random_state=64)
    _ = ucihd_rf.fit(train, ucihd_y_train)

    feature_names = ucihd_2.columns
    class_names = ["Negative", "Positive"]
    categorical_features = [
        i for i, col in enumerate(feature_names) if "_" in col
    ]
    feature_names_display = ucihd_attr

    ucihd_ebm = ExplainableBoostingClassifier(n_estimators=16,
                                              feature_names=ucihd_2.columns,
                                              n_jobs=1)
    _ = ucihd_ebm.fit(train, ucihd_y_train)

    return (ucihd_rf, train.values, test, feature_names, class_names,
            categorical_features, test_display, feature_names_display,
            ucihd_ebm)