def select_by_features_random_weight(self, model=None):
        train_X, val_X, train_y, val_y = train_test_split(self.train_X, self.train_y, random_state=1)
        if not model:
            mymodel = LinearRegression()
        else:
            mymodel = model

        from eli5.sklearn import PermutationImportance
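        # cv=5: the model is cloned and refitted on each of 5 folds; importances are averaged across folds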
        perm = PermutationImportance(mymodel, n_iter=5, random_state=1024, cv=5)
        perm.fit(train_X.values, train_y.values)

        result_ = {'var': train_X.columns.values, 'feature_importances_': perm.feature_importances_,
                   'feature_importances_std_': perm.feature_importances_std_}
        feature_importances_ = pd.DataFrame(result_, columns=['var', 'feature_importances_', 'feature_importances_std_'])
        feature_importances_ = feature_importances_.sort_values('feature_importances_', ascending=False)

        # import eli5
        # display(eli5.show_weights(perm))
        # eli5.show_weights(perm, feature_names=train_X.columns.tolist())  # visualize the result

        sel = SelectFromModel(perm, threshold=0.00, prefit=True)
        X_train_ = sel.transform(train_X)
        X_valid_ = sel.transform(val_X)

        return feature_importances_, X_train_, X_valid_
Example No. 2
def compute_imp_score(pipe, clf_name, training_features, training_classes,
                      random_state, perm):
    # clf = pipe.named_steps[clf_name]
    clf = pipe
    if hasattr(clf, 'coef_'):
        coefs = np.abs(clf.coef_.flatten())
        coefs = coefs / np.sum(coefs)
    elif clf_name == 'ScaleLR':
        coefs = np.abs(clf.named_steps['lr'].coef_.flatten())
        coefs = coefs / np.sum(coefs)
    else:
        coefs = getattr(clf, 'feature_importances_', None)

    if coefs is None or perm:
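        # fall back to permutation importance when the model exposes no native scores (or when requested);
        # refit=False assumes clf has already been fitted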
        perm = PermutationImportance(estimator=clf,
                                     n_iter=5,
                                     random_state=random_state,
                                     refit=False)
        perm.fit(training_features, training_classes)
        coefs = perm.feature_importances_

    #return (coefs-np.min(coefs))/(np.max(coefs)-np.min(coefs))
    # return coefs/np.sum(coefs)
    return coefs
Example No. 3
def load_and_feature_analysis():

    from sklearn import preprocessing
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
    import xgboost

    ################### eli5
    import eli5  # pip install eli5
    from eli5.sklearn import PermutationImportance

    #features_filename = 'features_3_sec.csv'       # Test Accuracy: 0.90224
    #features_filename = 'data_adv_3_sec_no_var_hccho.csv'  # accidentally built without mfcc_var. Test Accuracy: 0.96663
    features_filename = 'data_adv_3_sec_hccho.csv'  # Test Accuracy: 0.95762

    data = pd.read_csv(f'{general_path}/{features_filename}')
    data = data.iloc[0:, 1:]  # the first column is the file name, so drop it
    print(data.shape, data.head(5))

    y = data['label']  # genre variable.
    X = data.loc[:, data.columns !=
                 'label']  #select all columns but not the labels

    #### NORMALIZE X ####

    # Normalize so everything is on the same scale.

    cols = X.columns
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(X)  # return numpy array (9990,58)

    # new data frame with the new scaled data.
    X = pd.DataFrame(np_scaled, columns=cols)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)  # Frame in Frame out

    ########## model load ##############
    xgb_classifier = pickle.load(open("my_xgb_model.pkl", "rb"))
    preds = xgb_classifier.predict(
        X_test)  # array(['hiphop', 'jazz', 'blues', ....],dtype=object)

    print('Accuracy', ':', round(accuracy_score(y_test, preds), 5), '\n')

    # feature F-score. Training used a pandas DataFrame, so feature names are shown; with a numpy array they would not be.
    xgboost.plot_importance(xgb_classifier)
    plt.show()

    #######   eli5 PermutationImportance  #################
    perm = PermutationImportance(estimator=xgb_classifier, random_state=1)
    perm.fit(X_test, y_test)
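    # each column of X_test is shuffled n_iter times (eli5 default: 5); importance = mean drop in score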

    # The returned values are changes in accuracy; since shuffling is repeated, each value has a +/- spread.
    weights = eli5.show_weights(estimator=perm,
                                feature_names=X_test.columns.tolist()
                                )  # weights.data is a string containing HTML
    with open('Permutation_Importance.htm', 'wb') as f:
        f.write(weights.data.encode("UTF-8"))
Example No. 4
def test_allow_nans(iris_train):
    xgboost = pytest.importorskip('xgboost')

    X, y, feature_names, target_names = iris_train
    X = X.copy()
    X[0, 0] = np.nan

    perm = PermutationImportance(xgboost.XGBClassifier(), cv=5)
    # No error should be thrown while fitting the model
    perm.fit(X, y)
Example No. 5
    def feature_extraction_method(self, method=Names.ELI5_PERMUTATION):
        print("Starting Feature Extraction...")
        start_time = time.time()

        if method is True:
            method = Names.ELI5_PERMUTATION

        if method == Names.ELI5_PERMUTATION:
            pi_object = PermutationImportance(self.base_run_instance.test_harness_model.model)
            pi_object.fit(self.base_run_instance.testing_data[self.base_run_instance.feature_cols_to_use],
                          self.base_run_instance.testing_data[self.base_run_instance.col_to_predict]
                          )
            feature_importances_df = pd.DataFrame()
            feature_importances_df["Feature"] = self.base_run_instance.feature_cols_to_use
            feature_importances_df["Importance"] = pi_object.feature_importances_
            feature_importances_df["Importance_Std"] = pi_object.feature_importances_std_
            feature_importances_df.sort_values(by='Importance', inplace=True, ascending=False)
            self.feature_importances = feature_importances_df.copy()
        elif method == Names.RFPIMP_PERMUTATION:
            pis = rfpimp.importances(self.base_run_instance.test_harness_model.model,
                                     self.base_run_instance.testing_data[self.base_run_instance.feature_cols_to_use],
                                     self.base_run_instance.testing_data[self.base_run_instance.col_to_predict])
            pis['Feature'] = pis.index
            pis.reset_index(inplace=True, drop=True)
            pis = pis[['Feature', 'Importance']]
            pis.sort_values(by='Importance', inplace=True, ascending=False)
            self.feature_importances = pis.copy()
        elif method == "sklearn_rf_default":
            pass  # TODO

        elif method == Names.BBA_AUDIT:
            self.bba_plots_dict = {}
            data = self.perform_bba_audit(training_data=self.base_run_instance.training_data.copy(),
                                          testing_data=self.base_run_instance.testing_data.copy(),
                                          features=self.base_run_instance.feature_cols_to_use,
                                          classifier=self.base_run_instance.test_harness_model.model,
                                          col_to_predict=self.base_run_instance.col_to_predict)
            feature_importances_df = pd.DataFrame(data, columns=["Feature", "Importance"])
            self.feature_importances = feature_importances_df.copy()

        elif method == Names.SHAP_AUDIT:
            self.shap_plots_dict = {}
            data = self.perform_shap_audit()
            feature_importances_df = pd.DataFrame(data, columns=["Feature", "Importance"])
            self.feature_importances = feature_importances_df.copy()

        print("Feature Extraction time with method {0} was: {1:.2f} seconds".format(method, time.time() - start_time))
Example No. 6
    def perm_importance(self):
        """
        Calculates feature importances for each treatment group, based on the permutation method.
        """
        importance_dict = {}
        for group, idx in self.classes.items():
            if self.r_learners is None:
                perm_estimator = self.model_tau
                cv = 3
            else:
                perm_estimator = self.r_learners[group]
                cv = 'prefit'
            perm_fitter = PermutationImportance(perm_estimator, cv=cv)
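            # cv='prefit' uses the already-fitted r_learner as-is; an integer cv clones and refits the estimator per fold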
            perm_fitter.fit(self.X, self.tau[:, idx])
            importance_dict[group] = perm_fitter.feature_importances_

        return importance_dict
Example No. 7
def compute_imp_score(model, model_name, training_features, training_classes,
                      random_state):
    clf = model.named_steps[model_name]
    if hasattr(clf, 'coef_'):
        coefs = np.abs(clf.coef_.flatten())

    else:
        coefs = getattr(clf, 'feature_importances_', None)
    if coefs is None:
        perm = PermutationImportance(estimator=model,
                                     n_iter=5,
                                     random_state=random_state,
                                     refit=False)
        perm.fit(training_features, training_classes)
        coefs = perm.feature_importances_

    #return (coefs-np.min(coefs))/(np.max(coefs)-np.min(coefs))
    return coefs / np.sum(coefs)
Example No. 8
def evaluate(model, X_tr, y_tr, X_vl, y_vl, metric=False, imp=False):
    model.fit(X_tr, y_tr)
    y_tr_pred = model.predict(X_tr)
    y_vl_pred = model.predict(X_vl)

    if metric:
        index = ['mae', 'r2']
        metrics = {
            'tr':
            [mean_absolute_error(y_tr, y_tr_pred),
             r2_score(y_tr, y_tr_pred)],
            'vl':
            [mean_absolute_error(y_vl, y_vl_pred),
             r2_score(y_vl, y_vl_pred)],
        }
        display(pd.DataFrame(metrics, index=index))

    if imp:
        pimp = PermutationImportance(model, random_state=42, n_iter=20)
        pimp.fit(X_vl, y_vl)
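        # importances are averaged over n_iter=20 shuffles per feature, scored on the validation set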
        display(eli5.show_weights(pimp))
Example No. 9
    def analyze_fi_pi(self):
        "Feature Importance - Permutation Importance"

        # we need to impute the data first before calculating permutation importance
        train_X_imp = self.imputer.transform(self.X)
        # set up the meta-estimator to calculate permutation importance on our training data
        perm_train = PermutationImportance(self.estimator,
                                           scoring=self.spearman_scorer,
                                           n_iter=50,
                                           random_state=RANDOM_STATE)
        # fit and inspect the permutation importances
        perm_train.fit(train_X_imp, self.y)
        eli5.explain_weights_df(perm_train, feature_names=self.features)

        # plot the distributions
        perm_train_feat_imp_df = pd.DataFrame(data=perm_train.results_,
                                              columns=self.features)
        sns.boxplot(data=perm_train_feat_imp_df).set(
            title='Permutation Importance Distributions (training data)',
            ylabel='Importance')
Example No. 10
def permutationImports(model, X_val, y_val):
    '''
    Get and display permutation importances
    '''
    
    # Look at the importances based on accuracy score
    permuter = PermutationImportance(
        model,
        scoring='accuracy',
        random_state=42
    )

    permuter.fit(X_val, y_val)
    
    print('Permutation Importances\n')
    permute_scores = pd.Series(permuter.feature_importances_, X_val.columns)
    display(permute_scores.sort_values(ascending=False))
    print('\n')

    plt.figure(figsize=(10, len(X_val.columns) / 2))
    permute_scores.sort_values().plot.barh()
    plt.show()  
Example No. 11
def permuter(model, X,y, **kwargs):
    """
    Uses eli5 package to plot permutation importance

    Scoring parameter keyword argument takes string arguments available here:
    https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    If no arguments are passed, defaults are:
    scoring = "accuracy",
    cv = "prefit",
    n_iter = 5
    """

    scoring = kwargs.get('scoring', 'accuracy')
    cv = kwargs.get('cv', 'prefit')
    n_iter = kwargs.get('n_iter', 5)

    perm = PermutationImportance(model,
                                 scoring=scoring,
                                 cv=cv,
                                 n_iter=n_iter,
                                 random_state=42)

    #fit 
    perm.fit(X,y)

    #show weights based on feature names
    feature_names = X.columns.tolist()
    display(show_weights(perm, top=None, feature_names=feature_names))
Example No. 12
def get_permutation_imp(m,
                        X,
                        y,
                        feats,
                        random_state=random_state,
                        scoring='roc_auc'):
    perm_train = PermutationImportance(m,
                                       random_state=random_state,
                                       scoring=scoring)
    _ = perm_train.fit(X, y)
    all_feat_imp_df = eli5.explain_weights_df(perm_train, feature_names=feats)

    perm_train_feat_imp_df = pd.DataFrame(data=perm_train.results_,
                                          columns=feats)
    perm_train_feat_imp_df = perm_train_feat_imp_df[list(
        all_feat_imp_df.feature)]
    ax = perm_train_feat_imp_df.iloc[:, :15].boxplot(figsize=(9, 7))
    ax.set(title='Permutation Importance Distributions (training data)',
           ylabel='Importance')
    plt.xticks(rotation=90)
    plt.show()
    display(all_feat_imp_df[:15])

    return all_feat_imp_df
plt.yticks(fontsize = 12)
plt.ylabel('features', fontsize = 20)

# More balanced results - the importance is not extremely spread out.



### Permutation feature importance - (or Mean Decrease Accuracy)

# Import special library, designed for interpretation tasks
import eli5
from eli5.sklearn import PermutationImportance

#Fit and see permutation importance on our training data
perm_train = PermutationImportance(classifier)
perm_train.fit(X_train, y_train)
eli5.explain_weights_df(perm_train, feature_names=features)

#Fit and see permutation importance on our test data
perm_test = PermutationImportance(classifier)
perm_test.fit(X_test, y_test)
eli5.explain_weights_df(perm_test, feature_names=features)

# For this method, it is not obvious which set it should be applied to (train or test).
# In both cases, we get a table where features are ranked according to their importance.
# The output takes the form of a weight, along with a standard deviation measure.



# Results vary according to the method used, so take the limitations / biases of each method into account.
# Combining them gives a more objective view of the true feature importance, which is a great explanation factor; a merging sketch follows below.
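# As a minimal sketch (assuming perm_train, perm_test, and features from the cells
# above, and pandas imported as pd), the two rankings can be merged by averaging ranks:
train_df = eli5.explain_weights_df(perm_train, feature_names=features).set_index('feature')
test_df = eli5.explain_weights_df(perm_test, feature_names=features).set_index('feature')
ranks = pd.DataFrame({'train_rank': train_df['weight'].rank(ascending=False),
                      'test_rank': test_df['weight'].rank(ascending=False)})
ranks['mean_rank'] = ranks.mean(axis=1)
print(ranks.sort_values('mean_rank'))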
Example No. 14
print(classification_report(y_train_svm, easy_lgbm.predict(X_train_svm)))
print(confusion_matrix(y_train_svm, easy_lgbm.predict(X_train_svm)))
print('Recall Score = ',
      recall_score(y_train_svm, easy_lgbm.predict(X_train_svm)))
print('Precision Score = ',
      precision_score(y_train_svm, easy_lgbm.predict(X_train_svm)))

print(f1_score(y_train_svm, easy_lgbm.predict(X_train_svm)))
print(f1_score(y_test_svm, easy_lgbm.predict(X_test_svm)))

eli5_permutation = PermutationImportance(estimator=easy_lgbm,
                                         scoring='f1',
                                         random_state=42,
                                         n_iter=5)
eli5_permutation.fit(X_test_svm, y_test_svm)
print(eli5_permutation.feature_importances_.T.reshape(-1, 1))  # peek at the raw importances

feature_importance_with_eli5 = pd.DataFrame(np.hstack(
    (np.array([X.columns[0:]]).T,
     eli5_permutation.feature_importances_.T.reshape(-1, 1))),
                                            columns=['feature', 'importance'])
feature_importance_with_eli5['importance'] = pd.to_numeric(
    feature_importance_with_eli5['importance'])
feature_importance_with_eli5.sort_values(by='importance', ascending=False)

fig = plt.figure(figsize=(15, 8))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
sns.barplot(x='importance',
            y='feature',
Example No. 15
                     'test':'deepskyblue'})
plt.legend(loc=9)
plt.title('Distributions of Feature Contributions');

!pip install eli5
from eli5.sklearn import PermutationImportance

import eli5
# let's check the importance of each attribute
perm = PermutationImportance(model, random_state = 0).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

perm_train = PermutationImportance(model, scoring='accuracy',
                                   n_iter=100, random_state=1)
# fit and see the permutation importances
perm_train.fit(X_train, y_train)
eli5.explain_weights_df(perm_train, feature_names=X_train.columns.tolist()).head()

# figure size in inches
from matplotlib import rcParams
rcParams['figure.figsize'] = 25,5

perm_train_df = pd.DataFrame(data=perm_train.results_,
                             columns=X.columns)
(sns.boxplot(data=perm_train_df)
        .set(title='Permutation Importance Distributions (training data)',
             ylabel='Importance'));
plt.xticks(rotation=90)
plt.show()

!pip install pdpbox
Example No. 16
import eli5
from eli5.sklearn import PermutationImportance

encoder = GDB_pipeline.named_steps.ordinalencoder
X_train_encoded = encoder.fit_transform(X_train_cut)
X_val_encoded = encoder.transform(X_val_cut)

imputer = GDB_pipeline.named_steps.iterativeimputer
X_train_imputed = imputer.fit_transform(X_train_encoded)
X_val_imputed = imputer.transform(X_val_encoded)  # transform only; the imputer was fitted on the training data

model = GDB_pipeline.named_steps.gradientboostingclassifier
# model.fit(X_train_imputed,y_train)

permuter = PermutationImportance(model, scoring='accuracy', n_iter=2)
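# n_iter=2 keeps runtime low at the cost of noisier importance estimates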
permuter.fit(X_val_imputed, y_val)
feature_names = X_val_encoded.columns.tolist()
eli5.show_weights(permuter, top=None, feature_names=feature_names)

# In[78]:

from pdpbox import pdp

plt.style.use('seaborn-dark-palette')
feature = 'down'
model = GDB_pipeline.named_steps['gradientboostingclassifier']
model_features = X_train_cut.columns
X_train_imputed = pd.DataFrame(X_train_imputed)
X_train_imputed.columns = X_train_cut.columns
pdp_dist = pdp.pdp_isolate(model=model,
                           dataset=X_train_imputed,

# Random forest classifier
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)
# fitting the model
rf_model.fit(X_train, y_train)


# In[41]:


# Permutation importance
from eli5.sklearn import PermutationImportance
perm_imp = PermutationImportance(rf_model, random_state=42)
# fitting the permuter on the validation set
perm_imp.fit(X_valid, y_valid)


# In[44]:


# Important features
eli5.show_weights(perm_imp, feature_names=X_valid.columns.tolist(), top=200)


# Findings:
# * The variables in green rows have a positive importance: shuffling them hurts the prediction
# * The variables in white rows have no measurable impact on our prediction
# * The variables in red rows have a negative importance: shuffling them slightly improves the score
# A programmatic selection sketch follows below.

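# As a minimal sketch (a hypothetical follow-up, reusing the fitted permuter above),
# the positive-importance features can be selected programmatically:
positive_mask = perm_imp.feature_importances_ > 0
selected_features = X_valid.columns[positive_mask]
print(selected_features.tolist())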
# ### Handling of imbalanced data
                        learning_rate=.005,
                        reg_lambda=.01,
                        verbosity=1)
print('fitting...')
model.fit(X_train, y_train, eval_set=eval_set, eval_metric='auc', verbose=True)

y_pred_proba = model.predict_proba(X_val)[:, 1]
print(f'Validation ROC AUC score: {roc_auc_score(y_val, y_pred_proba)}')

print('permuting...')
permuter = PermutationImportance(model,
                                 cv='prefit',
                                 n_iter=5,
                                 scoring='roc_auc',
                                 random_state=42)
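# cv='prefit': the already-fitted model is scored on the validation data, one shuffled column at a time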
permuter.fit(X_val, y_val)
features_of_import = pd.Series(permuter.feature_importances_,
                               val.columns).sort_values(ascending=True)
print('importance', features_of_import)

print('plotting...')
fig1 = go.Figure()
fig1.add_trace(go.Bar(x=features_of_import.values,
                      y=features_of_import.index,
                      orientation='h'))
py.iplot(fig1, filename='features1')

# features_of_import is sorted, so select by its own index rather than by position
trimmed_columns = features_of_import[features_of_import > 0].index
train_trimmed = train[trimmed_columns]
val_trimmed = val[trimmed_columns]
test_trimmed = test[trimmed_columns]
Example No. 19
shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values,
    features=row,
    link='logit'               # For classification, this returns predicted probs
)

permuter = PermutationImportance(
    model1,
    scoring='accuracy',
    n_iter=5,
    random_state=42
)
permuter.fit(X_val_processed, y_val)

permuter.feature_importances_

eli5.show_weights(
    permuter,
    top=None,
    feature_names=X_val.columns.tolist()
)

"""## Logistic Regression Model for Classification: Models 3 and 4"""

# 3rd Model
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
Example No. 20
def RFE_perm(model,
             X,
             y,
             feats,
             cv=5,
             scoring='neg_mean_absolute_error',
             timing=False):
    #def RFE_perm(model,X,y,min_features=1,step=1,cv=5,scoring='neg_mean_absolute_error',timing=False):
    from eli5.sklearn import PermutationImportance
    from types import GeneratorType
    import time

    # if pandas data then convert to numpy arrays
    if isinstance(X, pd.DataFrame): X = X.to_numpy()
    if isinstance(y, pd.Series): y = y.to_numpy()

    # if cv is a generator convert to list so it doesn't disappear after first iter
    if isinstance(cv, GeneratorType): cv = list(cv)

    nfeat = np.shape(X)[1]
    index = np.arange(nfeat)
    bestscore = -np.inf  # -inf so any score (e.g., a large negative MAE) can become the best
    niter = len(feats)
    #    niter = int(np.floor((nfeat - min_features)/step)+1)
    scores = np.empty(niter)
    nfeats = np.empty(niter)
    traintime = np.empty(niter)
    predtime = np.empty(niter)
    featsets = np.zeros([niter, nfeat])
    #    for i, n in enumerate(range(nfeat,min_features-1,-step)):
    for i, n in enumerate(feats):
        if n == nfeat:  # first iter
            newfeat = index
            Xcut = X
        else:
            newfeat = sortimport[:n]  # take the n most important features from the previous iter
            Xcut = Xcut[:, newfeat]
        index = index[newfeat]

        # Get train time and prediction time
        if timing:
            start = time.time()
            model.fit(Xcut, y)
            end = time.time()
            traintime[i] = end - start
            start = time.time()
            model.predict(Xcut)
            end = time.time()
            predtime[i] = end - start

        perm = PermutationImportance(model,
                                     random_state=42,
                                     scoring=scoring,
                                     cv=cv)
        perm.fit(Xcut, y)
        featimport = perm.feature_importances_
        sortimport = np.argsort(featimport)[::-1]

        score = np.mean(perm.scores_)
        print('Number of features: %i, score: %.2f %%' %
              (n, 100 * np.abs(score)))

        scores[i] = score
        nfeats[i] = n
        featsets[i, index] = 1

        if score >= bestscore:  # >= because an equal score with fewer features is better
            bestscore = score
            bestfeat = index

    if timing:
        return [nfeats, scores, traintime,
                predtime], bestscore, bestfeat, featsets
    else:
        return [nfeats, scores], bestscore, bestfeat, featsets
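# A hypothetical invocation (the estimator and the feats schedule are illustrative
# assumptions; note feats must start with the full feature count):
from sklearn.ensemble import RandomForestRegressor
(nfeats, scores), bestscore, bestfeat, featsets = RFE_perm(
    RandomForestRegressor(n_estimators=100, random_state=42),
    X, y, feats=[X.shape[1], 20, 10, 5])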
Example No. 21
prediction = pipeline.predict(X)

y_test = numpy.asanyarray(Y)

r, p_value = scipy.stats.pearsonr(y_test, prediction)

error = 0
tot_samples = len(prediction)
for j in range(0, tot_samples):
    print("%s - %s" % (y_test[j], prediction[j]))
    error = error + (prediction[j] - y_test[j])**2

mse = error / len(prediction)
rmse = numpy.math.sqrt(mse)

perm = PermutationImportance(pipeline, random_state=1)
res = perm.fit(X, y_test)
ret = eli5.format_as_dict(
    eli5.explain_weights(res, top=180, feature_names=X.columns.tolist()))
print(ret)

print("---------------")
print("MEAN SQUARED ERROR:", mse)
print("ROOT MEAN SQUARED ERROR:", rmse)
print("PEARSON'S CORRELATION COEFFICIENT:", r, "p-value", p_value)

print("---------------")

for i in ret['feature_importances']['importances']:
    print(i)
    SimpleImputer(strategy='median'))

# fit the model
Rand_pipeline.fit(X_train, y_train)

# transform the model
TT_val = Rand_pipeline.transform(X_val)

model_permuter = PermutationImportance(
    model_predictor,
    scoring='accuracy',
    n_iter=7,
    random_state=42
)

model_permuter.fit(TT_val, y_val);

# eli5 graph of weights and features for my 14 selected features
eli5.show_weights(
    model_permuter,
    top=None,
    feature_names=X_val.columns.tolist()
)

"""### Model Interpretation

### Isolated Partial Dependence Plots with 1 feature
"""

plt.rcParams['figure.dpi']=70
pdf_feature = 'lead_time'
Example No. 23
cm = confusion_matrix(y_test, y_pred)
# sklearn convention: rows are actual classes, columns are predictions,
# so cm[0, 0] = TN, cm[0, 1] = FP, cm[1, 0] = FN, cm[1, 1] = TP
print("True negatives: {}\nFalse positives: {}".format(cm[0, 0], cm[0, 1]))
print("True positives: {}\nFalse negatives: {}".format(cm[1, 1], cm[1, 0]))

# visualize confusion matrix with seaborn heatmap
cm_matrix = pd.DataFrame(data=cm,
                         index=['Actual Negative:0', 'Actual Positive:1'],
                         columns=['Predict Negative:0', 'Predict Positive:1'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

X_list = X_test.columns.tolist()

clf = xgb.XGBClassifier(n_estimators=150, random_state=2020)
clf.fit(X_train, y_train)
perm = PermutationImportance(clf, random_state=2010)
perm.fit(X_test, y_test)
# Store feature weights in an object
html_obj = eli5.show_weights(perm, feature_names=X_list)
# Write html object to a file (adjust file path; Windows path is used here)
with open(
        r'C:\Users\lukem\Desktop\Github AI Projects\Higgs-Boson-machine-learning-challenge\boson-importance.htm',
        'wb') as f:
    f.write(html_obj.data.encode("UTF-8"))

lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)
mae = mean_absolute_error(y_test, pred)
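# with binary 0/1 labels, MAE equals the misclassification rate (1 - accuracy)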
print(f"logistic regression, mae: {mae}")

Example No. 24
del train0,train1,train2,train3,train4,train5,train6,train7,train8,train9,
label = pd.read_csv("../Documents/safety/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv")
label = pd.DataFrame(label.groupby('bookingID')['label'].sum())
label = label.reset_index()
train = preprocessing(train)
train_agg = all_features(train)
cols = ['Bearing_zero_crossing','gyro_tot_zero_crossing','Accuracy_zero_crossing',
        'tilt_angle_zero_crossing','acc_tot_zero_crossing','second_zero_crossing',
        'second_ssc','is_negative_speed_zero_crossing','roll_cpt5','tilt_angle_cpt5',
       'second_cpt5','Accuracy_cpt5','Bearing_cpt5','Speed_cpt5','pitch_cpt5','gyro_z_cpt5','gyro_y_cpt5']
train_agg.drop(cols,axis=1,inplace=True)
train_agg = pd.merge(train_agg,label,on='bookingID')
#Calculated permutation importance using XGBoost
X = train_agg.drop(['bookingID','label'],axis=1)
y = train_agg['label']  # define the target before fitting the permuter
perm = PermutationImportance(xgb.XGBClassifier(), cv=skf)
perm.fit(X.values, y)
#put feature importances in dataframe
importances = pd.DataFrame()
importances['features'] = X.columns
importances['value'] = perm.feature_importances_
importances = importances.sort_values(by=['value'],ascending=False)
importances = importances.reset_index()
#Building weighted ensemble model
scaler = StandardScaler()
X = train_agg.drop(['bookingID','label'],axis=1)[importances[:150]['features']]
y = train_agg['label']
X_scaling = scaler.fit_transform(X)
model_xgb = xgb.XGBClassifier(n_estimators = 100)
model_lgb = lgb.LGBMClassifier()
model_lr = LogisticRegression()
model_rf = RandomForestClassifier()
X = df.iloc[:, 3:194]
Y_tmp = df.iloc[:, 0]
Y = []

total_sents = len(Y_tmp)
for i in range(0,total_sents):
    Y.append(Y_tmp[i]/total_sents)

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
Y = numpy.asanyarray(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)


perm = PermutationImportance(pipeline, random_state=1)

res = perm.fit(X_test,y_test)

#ret = eli5.format_as_text(eli5.explain_weights(perm))
ret = eli5.format_as_dict(eli5.explain_weights(res))

#ret = eli5.show_weights(perm, feature_names = X.columns.tolist())
print(ret)

for i in ret['feature_importances']['importances']:
    print(i)

print('------')
print(perm.feature_importances_)
Example No. 26
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.svm import SVC
from sklearn.feature_selection import SelectFromModel

# ... load data

perm = PermutationImportance(SVC(), cv=5)
perm.fit(TL.drop('Telo.Length', axis=1).loc[training_indices],
         TL.loc[training_indices, 'Telo.Length'])

# perm.feature_importances_ attribute is now available, it can be used
# for feature selection - let's e.g. select features which increase
# accuracy by at least 0.05:
sel = SelectFromModel(perm, threshold=0.05, prefit=True)
X_trans = sel.transform(X)

# It is possible to combine SelectFromModel and
# PermutationImportance directly, without fitting
# PermutationImportance first:
sel = SelectFromModel(
    PermutationImportance(SVC(), cv=5),
    threshold=0.05,
).fit(X, y)
X_trans = sel.transform(X)
model1.fit(X_train_transformed, y_train)

# Get permutation importances
! pip install eli5
from eli5.sklearn import PermutationImportance
import eli5

permuter = PermutationImportance(
    model1,
    scoring='r2',
    n_iter=2,
    random_state=42
)

permuter.fit(X_val_transformed, y_val)
feature_names = X_val.columns.tolist()

eli5.show_weights(
    permuter,
    top=None, # show permutation importances for all features
    feature_names=feature_names
)

from sklearn.metrics import mean_squared_error, r2_score

# Coefficient of determination r2 for the training set
pipeline_score = permuter.score(X_train_transformed,y_train)
print("Coefficient of determination r2 for the training set.: ", pipeline_score)

# Coefficient of determination r2 for the validation set
Example No. 28
    for i in range(5):
        if key == 'xgboost':
            model = XGBClassifier(**params_XGboost[str(i)])
        elif key == 'catboost':
            model = CatBoostClassifier(**paramsCatBoost[str(i)])

        y = target.iloc[:, i]
        train_X, val_X, train_y, val_y = train_test_split(
            X_train, y, random_state=SEED, shuffle=True)
        model.fit(train_X.values, train_y.values)
        perm = PermutationImportance(model,
                                     cv=5,
                                     scoring='roc_auc',
                                     random_state=SEED)
        perm.fit(val_X.values, val_y.values)
        sel = SelectFromModel(perm, threshold=value['threshold'], prefit=True)
        X_train_transformed = sel.transform(X_train)
        X_test_transformed = sel.transform(X_test)

        prediction, cv_scores_mean = train_and_predict(model,
                                                       X_train_transformed,
                                                       y.values,
                                                       X_test_transformed, cv)

        cv_scores.append(cv_scores_mean)
        predictions.append(prediction)

    print(round(np.array(cv_scores).mean(), 5))
    write_to_submission_file(predictions, ID, value['filename'])