Example #1
0
def explore_final_model():
  """Print the label distribution, load the pickled XGBoost model, and show
  eli5 global weight + per-instance explanations for the first test rows.

  Relies on module-level globals: y, filename_model, test_filename,
  FeatureExtractor, eli5, pd, pickle, json.
  """
  #https://github.com/gameofdimension/xgboost_explainer/blob/master/xgboost_explainer_demo.ipynb

  nr_labels = len(y)
  value_counts = y.value_counts()
  perc_per_label = {k:round(100 * v/float(nr_labels),2) for k,v in value_counts.items()}
  print('value counts:', y.value_counts())
  print('perc per label:', perc_per_label)

  model = pickle.load(open(filename_model, "rb"))
  model_feature_names = model.attr('feature_names').split('|')
  index_to_class = json.loads(model.attr('index_to_class'))
  print(index_to_class)
  # NOTE(review): keys come from JSON so this is a lexicographic sort; with
  # 10+ classes '10' sorts before '2' — confirm the key format.
  classes = [index_to_class[k] for k in sorted(index_to_class.keys())]
  print(classes)

  print('eli5 explain weights (gain):\n',eli5.format_as_text(eli5.explain_weights(model, top=10))) #gain

  df_test = pd.read_json(open(test_filename, "r"))
  df_test = df_test.head(5)
  feature_extractor = FeatureExtractor(df_test)
  X_test, X_test_featurenames = feature_extractor.get_features_pred_instances(df_test, model_feature_names)

  # Bug fix: these two prints referenced an undefined global `X`; the
  # feature matrix built above is X_test.
  print(X_test)
  print(set(X_test.dtypes))
#   print(X.iloc[0])
  print(eli5.format_as_text(eli5.explain_prediction(model, X_test.head(1), target_names = classes, top = 10, feature_names = X_test_featurenames)))
Example #2
0
def explain(model_path):
    """Load the ranking model and its feature resources, print the global
    eli5 weight explanation, then stream TSV rows from stdin and emit a
    prediction plus a per-row eli5 explanation for each chunk.

    Ported to Python 3 (the rest of the file targets Python 3): ``print``
    statements became function calls, and the lazy ``map`` result is
    materialised with ``list`` before being handed to ``DataFrame.assign``.
    Both changes are also valid Python 2.
    """
    wordseg = Wordsegmenter("./bin/pyseg.so", "./bin/qsegconf.ini")
    sent_word2vec_path = "./data/word2vec.query.bin"
    sent_vocab_path = "./data/word2vec.query.vocab"
    sent_model_path = "./data/sif.model"

    sent_word2vec = KeyedVectors.load_word2vec_format(sent_word2vec_path,
                                                      binary=True)
    sent_vocab_dict = load_vocab(sent_vocab_path)
    sent_model = joblib.load(sent_model_path)

    tfidf_count_hash_vectorModels = VectorModels()

    ner_dict_path = "./data/ner.dict"
    syn_dict_path = "./data/syn.dict"
    ner_dict, syn_dict = load_ner_dict(ner_dict_path, syn_dict_path)

    model = joblib.load(model_path)

    pd.set_option('display.max_rows', None)

    # Global feature weights of the loaded model (renamed from `explain`,
    # which shadowed this function's own name).
    explanation = eli5.explain_weights(model, top=None)
    print(eli5.format_as_text(explanation))

    feature_names = []
    column_names = ["qid", "ql", "qr"]
    #reader = pd.read_csv(in_path, sep="\t", dtype="str", names=column_names, chunksize=100)
    reader = pd.read_csv(sys.stdin,
                         sep="\t",
                         dtype="str",
                         names=column_names,
                         chunksize=1)
    first_chunk = True
    feature_extractor = lambda row: extract_features(
        wordseg, row["ql"], row["qr"], tfidf_count_hash_vectorModels,
        sent_word2vec, sent_vocab_dict, sent_model, ner_dict, syn_dict)
    for data in reader:
        _ = data.fillna("", inplace=True)

        X = data[["ql", "qr"]].apply(feature_extractor, axis=1)
        X_features = X.apply(pd.Series)
        feature_names = X_features.columns.values.tolist()
        X_features = X_features[feature_names]
        y_preds = model.predict_proba(X_features,
                                      ntree_limit=model.best_ntree_limit)
        # Probability of the positive class for each row.
        y_preds = list(map(lambda o: o[1], y_preds))
        data = pd.concat([data, X_features], axis=1)
        data = data.assign(predict=y_preds)

        #if first_chunk:
        #    data.to_csv(in_path + ".predict", header=True, sep="\t", mode="w")
        #    first_chunk = False
        #else:
        #    data.to_csv(in_path + ".predict", header=False, sep="\t", mode="a")
        data.to_csv(sys.stdout, header=False, sep="\t")
        explanation = eli5.explain_prediction(model, X_features.iloc[0])
        print(eli5.format_as_text(explanation))
        print(X_features.iloc[0])
Example #3
0
def test_transition_features():
    """format_as_dataframes must expose transition features, and the lone
    format_as_dataframe call must warn and fall back to them."""
    def _target(name):
        # Both classes carry the same single positive feature weight.
        return TargetExplanation(
            name,
            feature_weights=FeatureWeights(
                pos=[FeatureWeight('pos', 13, value=1)],
                neg=[],
            ))

    expl = Explanation(
        estimator='some estimator',
        targets=[_target('class1'), _target('class2')],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ))
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert set(df_dict) == {'targets', 'transition_features'}
    assert df_dict['targets'].equals(format_as_dataframe(expl.targets))
    df = df_dict['transition_features']
    print(df)
    print(format_as_text(expl))
    expected_repr = '\n'.join([
        'to      class2  class1',
        'from                  ',
        'class2     1.5     2.5',
        'class1     3.5     4.5',
    ])
    assert str(df) == expected_repr

    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(expl)
    assert single_df.equals(df)
Example #4
0
def result():
    """Explain the gnb model's prediction for the tweet in the request body."""
    tweet = str(request.data)
    explanation = explain_prediction(
        gnb,
        tweet,
        vec=tfid,
        target_names=['known weird', 'less weird'])
    return str(format_as_text(explanation))
Example #5
0
def visualise_feature_importance(model, title_variable, x_test, y_test):
    """Print permutation importance for *model* and save a donut chart of
    its built-in feature importances to plots/fi<title_variable>.png.

    Generalized: the legend previously hard-coded 7 features
    (``range(1, 8)``); it now adapts to ``len(model.feature_importances_)``.
    """
    print(
        eli5.format_as_text(
            eli5.explain_weights(
                PermutationImportance(model,
                                      random_state=42).fit(x_test, y_test))))

    importances = model.feature_importances_
    # One "Xi - pct%" legend entry per feature, 1-based to match the old labels.
    labels = [
        "X{} - {:4.1f}%".format(i + 1, imp * 100)
        for i, imp in enumerate(importances)
    ]

    patches, texts = plt.pie(importances,
                             wedgeprops=dict(width=0.5),
                             startangle=90,
                             radius=1.2)

    plt.legend(patches,
               labels,
               prop={'size': 12},
               bbox_to_anchor=(0.74, 0.5),
               loc="center right",
               fontsize=8)
    plt.title("Feature Importance for {}".format(title_variable))
    plt.savefig("plots/fi{}.png".format(title_variable))
    plt.show()
Example #6
0
def performance_measurement(crf_model, x, y, g_sentences):
    """Utilizes different functions to measure the model's performance and saves the results to files for review."""
    # Cross-validating the model
    cross_val_predictions = cross_val_predict(estimator=crf_model,
                                              X=x,
                                              y=y,
                                              cv=5)
    report = flat_classification_report(y_pred=cross_val_predictions, y_true=y)
    # Open in 'w' (truncating) mode under a context manager instead of the
    # previous 'a' + seek(0) + truncate() sequence, which also leaked the
    # handle if any print2both call raised.
    with open(
            f'results/performance_measurement_results_{datetime.datetime.today().date()}.txt',
            'w',
            encoding='utf-8') as file:
        print2both('created on:',
                   str(datetime.datetime.today().date()),
                   '\n',
                   file=file)
        print2both('flat_classification_report:\n\n', report, '\n\n', file=file)
        print2both('cross_val_predict:\n\n',
                   cross_val_predictions,
                   '\n\n',
                   file=file)
        # Showing the weights assigned to each feature
        print2both('eli5.explain_weights(crf, top=100):\n\n',
                   eli5.format_as_text(eli5.explain_weights(crf_model, top=100)),
                   '\n\n',
                   file=file)
    # Saving the potentially correct and the incorrect classifications in separate CSV files for review
    categorize_predictions(gold_sents=g_sentences,
                           y_hat=cross_val_predictions,
                           y_actual=y)
Example #7
0
def SGD():
    """Train SGD classifiers on the wine descriptions: first on word TF-IDF
    only (with eli5 weight/prediction explanations), then on stacked
    word+char TF-IDF, reporting accuracy on the held-out split."""
    train_text, test_text, ytrain, ytest = train_test_split(
        df['description'], df['category'], random_state=42)

    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 8))
    word_vectorizer.fit(train_text)

    char_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='char',
        ngram_range=(1, 5))
    char_vectorizer.fit(train_text)

    sgd_cls = SGDClassifier(max_iter=2)
    sgd_cls.fit(word_vectorizer.transform(train_text), ytrain)

    # Global weights of the word-only model.
    weights_expl = eli5.explain_weights(sgd_cls, vec=word_vectorizer)
    print(eli5.format_as_text(weights_expl))

    # Explain one low-scoring (<= 81 points) description.
    low_score_doc = df['description'][df['points'] <= 81].values[0]
    pred_expl = eli5.explain_prediction(sgd_cls, low_score_doc,
                                        vec=word_vectorizer)
    print(eli5.format_as_text(pred_expl))

    # Retrain on word + char features stacked side by side.
    X = hstack([
        word_vectorizer.transform(train_text),
        char_vectorizer.transform(train_text)
    ])

    sgd_cls = SGDClassifier(max_iter=2)
    sgd_cls.fit(X, ytrain)
    stacked_test = hstack([
        word_vectorizer.transform(test_text),
        char_vectorizer.transform(test_text)
    ])
    predict = sgd_cls.predict(stacked_test)
    acc = np.mean(ytest == np.around(predict))
    print('Dokladnosc: {0:.3}'.format(acc))
Example #8
0
def process_xgb():
    """Average log1p-target XGBoost predictions over 20 random splits and
    write the blended submission to subxgb.csv.

    Assumes load_data() returns (feature columns, train df, test df,
    reference df) and that test['target'] starts at 0 — TODO confirm.
    """
    col, train, test, test_ref = load_data()
    print(train.shape, test.shape, test_ref.shape)

    params = {
        'colsample_bytree': 0.055,
        'colsample_bylevel': 0.4,
        'gamma': 1.5,
        'learning_rate': 0.01,
        'max_depth': 5,
        'objective': 'reg:linear',
        'booster': 'gbtree',
        'min_child_weight': 10,
        'n_estimators': 1800,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'eval_metric': 'rmse',
        'subsample': 0.7,
        'silent': True,
        'seed': 7,
    }
    folds = 20
    full_score = 0.0
    xg_test = xgb.DMatrix(test[col])
    # Hand toggle between the sklearn wrapper and the native API; the second
    # assignment deliberately forces the native (xgb.train) path.
    use_regressor = True
    use_regressor = False
    for fold in range(folds):
        # Tiny (0.1%) validation slice; the fold index doubles as split seed.
        x1, x2, y1, y2 = model_selection.train_test_split(train[col], np.log1p(train.target.values), test_size=0.0010, random_state=fold)

        if use_regressor:
            p = params
            model = xgb.XGBRegressor(colsample_bytree=p['colsample_bytree'], colsample_bylevel=p['colsample_bylevel'], gamma=p['gamma'], learning_rate=p['learning_rate'], max_depth=p['max_depth'], objective=p['objective'], booster=p['booster'], min_child_weight=p['min_child_weight'], n_estimators=p['n_estimators'], reg_alpha=p['reg_alpha'], reg_lambda=p['reg_lambda'], eval_metric=p['eval_metric'] , subsample=p['subsample'], silent=1, n_jobs = -1, early_stopping_rounds = 100, random_state=7, nthread=-1)
            model.fit(x1, y1)
            score = np.sqrt(mean_squared_error(y2, model.predict(x2)))
            test['target'] += np.expm1(model.predict(test[col]))
        else:
            xg_valid = xgb.DMatrix(x2, label=y2)
            xg_train = xgb.DMatrix(x1, label=y1)
            model = xgb.train(params, xg_train, params['n_estimators'])
            # RMSE on the held-out slice (still in log space).
            score = np.sqrt(mean_squared_error(y2, model.predict(xg_valid)))
            test['target'] += np.expm1(model.predict(xg_test))

        print('Fold', fold, 'Score', score)
        full_score += score

    full_score /= folds
    print('Full score', full_score)

    # Mean of the accumulated per-fold predictions.
    test['target'] /= folds

    # Rows with a known reference target keep it verbatim.
    test.loc[test_ref.target > 0, 'target'] = test_ref[test_ref.target > 0].target.values

    test[['ID', 'target']].to_csv('subxgb.csv', index=False)

    # Optional eli5 dump of the trained booster's top features.
    explain=False
    #explain=True
    if explain and not use_regressor:
        print(eli5.format_as_text(eli5.explain_weights(model, top=200)))
Example #9
0
def interpret_model(model: Pipeline, all_features: list):
    """Log an eli5 weight explanation for the pipeline's "model" step."""
    log.debug("All features: {}".format(all_features))

    estimator = model.named_steps["model"]
    explanation = eli5.explain_weights(estimator, feature_names=all_features)
    log.info(eli5.format_as_text(explanation))
Example #10
0
 def show_feature_importance(self):
     """Print the eli5 permutation-importance report for this wrapped model."""
     print("\n\n\n\n+++++++++++++++++++++++++++++++")
     print('Calculating feature importance for model ', self.name, "...")
     estimator = self.get_eli5_model()
     perm = PermutationImportance(estimator, random_state=1).fit(
         self.x_test, self.y_test)
     print(self.name, 'model feature importance')
     report = eli5.explain_weights(perm, feature_names=self.feature_names)
     print(eli5.format_as_text(report))
def test_transition_features():
    """format_as_dataframes must expose transition features as a tidy
    (from, to, coef) frame, and the lone format_as_dataframe call must warn
    and fall back to that frame."""
    def _target(name):
        # Both classes carry the same single positive feature weight.
        return TargetExplanation(
            name,
            feature_weights=FeatureWeights(
                pos=[FeatureWeight('pos', 13, value=1)],
                neg=[],
            ))

    expl = Explanation(
        estimator='some estimator',
        targets=[_target('class1'), _target('class2')],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ))
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert set(df_dict) == {'targets', 'transition_features'}
    assert df_dict['targets'].equals(format_as_dataframe(expl.targets))
    df = df_dict['transition_features']
    print(df)
    print(format_as_text(expl))
    expected = pd.DataFrame(
        [('class2', 'class2', 1.5),
         ('class2', 'class1', 2.5),
         ('class1', 'class2', 3.5),
         ('class1', 'class1', 4.5)],
        columns=['from', 'to', 'coef'])
    assert df.equals(expected)
    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(expl)
    assert single_df.equals(df)
Example #12
0
 def computePermutationImportance(data_test, target_test, clf):
     """Compute permutation importance for *clf* on the test split and save a
     3-column CSV (importance, error, feature name) for later review.

     Relies on module-level globals: tree_evaluations_out, classifierName.
     """
     perm = PermutationImportance(clf, random_state=1).fit(data_test, target_test)
     # eli5's plain-text report is re-parsed below; every slice/index choice
     # depends on that exact layout, so treat this parsing as fragile.
     permString = (eli5.format_as_text(eli5.explain_weights(perm, feature_names=data_test.columns.tolist())))
     # Drop the first 9 header lines of the report.
     permString = permString.split('\n', 9)[-1]
     all_rows = permString.split("\n")
     # split(' ') keeps empty strings for runs of spaces; the fixed indices
     # 0/2/4 below rely on that spacing — TODO confirm across eli5 versions.
     all_cols = [row.split(' ') for row in all_rows]
     all_cols.pop(0)
     fimp = [row[0] for row in all_cols]   # importance value
     errot = [row[2] for row in all_cols]  # "+/- error" column
     name = [row[4] for row in all_cols]   # feature name
     dfvals = pd.DataFrame(list(zip(fimp, errot, name)), columns=['A', 'B', 'C'])
     fname = os.path.join(tree_evaluations_out, str(classifierName) + '_permutations_importances.csv')
     dfvals.to_csv(fname, index=False)
Example #13
0
def permutation_importance(clf,X,Y,features,random_state=42,scoring=None):
    """Compute and print permutation feature importances for *clf* on (X, Y)."""
    from eli5 import explain_weights, format_as_text
    from eli5.sklearn import PermutationImportance

    # Silence the estimator while the importances are being computed.
    clf.verbose = False

    # TODO - how to pick out label from clf to print feature importances and
    # pdp's for specified label
    importances = PermutationImportance(
        clf, random_state=random_state, scoring=scoring).fit(X, Y)
    explanation = explain_weights(importances, feature_names=features)
    print(format_as_text(explanation, show=['feature_importances']))

    clf.verbose = True  # reset
Example #14
0
def permutation_importance(dataset, Processing_Unit):
    """Fit a random forest on the int64 feature columns of *dataset* and
    write its permutation importance (eli5 text report) to settings.aux_perm.
    """
    data = dataset
    y = data.author
    X = data.drop("author", axis=1)
    if Processing_Unit == "FUNCTION":
        X = X.drop("function", axis=1)

    # NOTE(review): this reassignment discards the drops above — int64
    # columns are taken straight from `data`, so "author"/"function" would
    # come back if they are int64. Preserved as-is; confirm intent.
    feature_names = [i for i in data.columns if data[i].dtype in [np.int64]]
    X = data[feature_names]
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    my_model = RandomForestClassifier(n_estimators=100,
                                      random_state=0).fit(train_X, train_y)

    perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)

    #print(eli5.format_as_text(eli5.explain_weights(perm,feature_names = val_X.columns.tolist())))
    # Context manager closes the handle (the original leaked it); mode 'w'
    # already truncates, so the explicit truncate(0) was redundant.
    with open(settings.aux_perm, 'w') as w:
        w.write(
            eli5.format_as_text(
                eli5.explain_weights(perm,
                                     feature_names=val_X.columns.tolist())))
    def _train(self):
        """
        Build the model with the experiment configuration represented by this object
        """
        self._logger.debug("---Building model for %s", self._signature)
        assert self._regression_inputs
        xdata, ydata = self._regression_inputs.get_xy_data(
            self._regression_inputs.inputs_split["training"])
        with warnings.catch_warnings():
            # sklearn/xgboost fit warnings are intentionally silenced here.
            warnings.simplefilter("ignore")
            self._regressor.fit(xdata, ydata)
        self._logger.debug("---Model built")

        # From Simone

        expl = eli5.xgboost.explain_weights_xgboost(
            self._regressor,
            top=None)  # feature_names= XXX self.feature_names XXX
        expl_weights = eli5.format_as_text(expl)
        self._logger.debug("---Features Importance Computed")  # OK
        # Write the textual explanation next to the experiment artifacts;
        # the context manager replaces the leak-prone open()/close() pair.
        with open(
                os.path.join(self._experiment_directory,
                             "explanations.txt"), 'w') as target:
            target.write(expl_weights)
Example #16
0
# --- Decision tree vs SVC: holdout accuracy, 5-fold CV scores, and (for the
# tree) eli5 permutation importance. Relies on X_train/X_test/y_train/y_test
# and X/y defined earlier in the script.
print("Decided Tree Classification")
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)
print('Accuracy: ',accuracy_score(y_test,y_pred))

scores = cross_val_score(clf,X,y,cv=5)
print('Scores: ',scores)
print('Final Score: ',scores.mean())

# Permutation importance of the fitted tree on the test split.
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(clf,random_state=1).fit(X_test,y_test)
print(eli5.format_as_text(eli5.explain_weights(perm)))

# NOTE(review): bare string literal below is a no-op statement (probably a
# misplaced section header); kept as-is.
"Support Vector Classification"


print("Support Vector Classification")
from sklearn.svm import SVC
clf = SVC()
clf = clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)
print('Accuracy: ',accuracy_score(y_test,y_pred))

scores = cross_val_score(clf,X,y,cv=5)
print("Scores: ",scores)
print('Final Score: ',scores.mean())
Example #17
0
    accuracy_score(yTest, valid_pred_GBoost)))
print('Accuracy of Mnb classifier on test set: {:.3f}'.format(
    accuracy_score(yTest, valid_pred_Mnb)))

# save the model to disk
from joblib import dump
dump(tfidf_logit_pipeline, './models/LinearRegression-model.joblib')
dump(tfidf_logit_pipeline_RandomForestClassifier,
     './models/RandomForestClassifier-model.joblib')
dump(tfidf_logit_pipeline_SVC, './models/SVC-model.joblib')
dump(tfidf_logit_pipeline_GBoost, './models/GBoost-model.joblib')
dump(tfidf_logit_pipeline_Mnb, './models/Mnb-model.joblib')

# Confusion matrix for the SVC predictions, rendered as a heatmap.
# NOTE(review): confusion_matrix(y_true, y_pred) puts true labels on rows,
# but the y-axis below says 'Predicted label' — the axis titles look
# swapped; confirm.
cm_lrc = confusion_matrix(yTest, valid_pred_SVC)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm_lrc,
            annot=True,
            linewidths=0.5,
            linecolor="gray",
            fmt=".0f",
            ax=ax)
plt.title('Confusion matrix of SVC')
plt.ylabel('Predicted label')
plt.xlabel('True label')

#showing weights
import eli5
print(eli5.format_as_text(eli5.explain_weights(tfidf_logit_pipeline)))

plt.show()
# --- Taxi-fare features: baseline forest on base_features, permutation
# importance, then engineered absolute lat/lon deltas for a second model.
X = data[base_features]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
first_model = RandomForestRegressor(n_estimators=50,
                                    random_state=1).fit(train_X, train_y)

# show data
print("Data sample:")
print(data.head())

# Show permutation importance
perm = PermutationImportance(first_model, random_state=1).fit(val_X, val_y)
# NOTE(review): show_weights returns a display object; outside a notebook
# its return value is discarded — the format_as_text print below is what
# actually reaches stdout.
eli5.show_weights(perm, feature_names=val_X.columns.tolist())
print(
    eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=val_X.columns.tolist())))

############
### Creating new features
############

# Absolute coordinate deltas between pickup and dropoff.
data['abs_lon_change'] = abs(data.dropoff_longitude - data.pickup_longitude)
data['abs_lat_change'] = abs(data.dropoff_latitude - data.pickup_latitude)

features_2 = [
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
    'dropoff_latitude', 'abs_lat_change', 'abs_lon_change'
]

X = data[features_2]
new_train_X, new_val_X, new_train_y, new_val_y = train_test_split(
Example #19
0
      -1]  #[::-1] reverses the ascending result of argsort, indices of arrays sorted
    # `rankedFeatureIds` comes from an argsort computed above this chunk
    # (features ordered by descending permutation importance).
    rankedFeatures = features[rankedFeatureIds]
    numRanks = 15  # RF: At 15 (score:0.786) and 30 max scores. Score declines with decreasing features
    # LR: at 10 scores were less than full-set but at 15 features, scores halved.
    # With PermutationInporatance RF performs better than LR
    featuresTopNRanks = list(rankedFeatures[0:numRanks])
    # NOTE(review): [numRanks:-1] excludes the very last ranked feature from
    # the drop list — confirm whether [numRanks:] was intended.
    featuresToDrop = list(rankedFeatures[numRanks:-1])
    print('Selected features \n', featuresTopNRanks)
    print('features to drop:\n', repr(featuresToDrop))
    # print('Feature importance in accordance to weights\n')
    # print(features[permFeatureRanks])

    # only for printing in readable format
    permExpWghts = eli5.explain_weights(
        perm, feature_names=X_train.columns.to_list())
    permFeatureRanksText = eli5.format_as_text(
        permExpWghts)  # only for printing
    print(permFeatureRanksText)

    # based on importance, select only top 10 columns for building model

    # Rebuild the train/test split using only the top-ranked features.
    dataDFp = dataDF.copy()
    fdataDFp = dataDFp[featuresTopNRanks]
    X_train, X_test, y_train, y_test = train_test_split(fdataDFp,
                                                        outcomeVarDF,
                                                        test_size=0.2,
                                                        random_state=42)
    clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

    print('RF Model fitted with' + repr(numRanks) +
          'features\n RF feature importances\n')
    important_features = pd.Series(data=clf.feature_importances_[0:numRanks],
Example #20
0
    'random_state': [0],
}

# Instantiate the grid search model
hyperp_srch = GridSearchCV(estimator=rf_model,
                           param_grid=group_param,
                           cv=5,
                           return_train_score=False)

hyperp_srch.fit(x_train, y_train)
#print(hyperp_srch.best_params_)
# Refit a fresh forest with the best hyper-parameters found by the search.
best_hyper = hyperp_srch.best_estimator_
rf_model = RandomForestClassifier(**best_hyper.get_params())
rf_model.fit(x_train, y_train)

y_pred_train = rf_model.predict(x_train)
y_pred_val = rf_model.predict(x_val)

## End
print('Classification Report: \n')
print(classification_report(y_val, y_pred_val))
print('\nConfusion Matrix: \n')
print(confusion_matrix(y_val, y_pred_val))

# Permutation importance on the TRAINING split — a validation split is more
# usual; NOTE(review): confirm this is intended.
permutation = PermutationImportance(rf_model,
                                    random_state=2).fit(x_train, y_train)
# NOTE(review): the bare explain_weights call below discards its result
# outside a notebook; the print underneath does the actual reporting.
eli5.explain_weights(permutation, feature_names=x.columns.tolist())
print(
    eli5.format_as_text(
        eli5.explain_weights(permutation, feature_names=x.columns.tolist())))
Example #21
0
##################################################################################

## Random Forest Regressor for permutation importance

rf = RandomForestRegressor(n_estimators=100,
                           n_jobs=-1,
                           oob_score=True,
                           bootstrap=True,
                           random_state=42)
rf.fit(X_train, Y_train)

# Importances measured on the held-out validation split.
perm = PermutationImportance(rf, random_state=1).fit(X_validation,
                                                     Y_validation)
results.write('\n\n\nRANDOM FOREST REGRESSOR PERMUTATION IMPORTANCE\n\n\n')
print(
    eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=data.columns.tolist())))

# The same report is appended to the results file.
results.write(
    eli5.format_as_text(
        eli5.explain_weights(perm, feature_names=data.columns.tolist())))

##########################################################
#### CREATE SHADOW MODEL IN FORM OF RULE FIT ALGORITHM ###
##########################################################

# NOTE(review): `rf` is rebound here from the forest to the RuleFit model.
rf = RuleFit()
rf.fit(X_train, [int(i) for i in Y_train],
       feature_names=[
           'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
           'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'
       ])
Example #22
0
    def process(self, inputs):
        """Select features by running an XGBoost design-space exploration and
        keeping the smallest prefix of gain-ranked features whose cumulative
        weight reaches the configured XGBoost_tolerance.

        Returns *inputs* with x_columns replaced by the selected features.
        """
        max_features = self._campaign_configuration['FeatureSelection']['max_features']

        # setting parameters for XGboost design space expoloration
        xgboost_parameters = copy.deepcopy(self._campaign_configuration)

        xgboost_parameters['General']['techniques'] = ['XGBoost']

        xgboost_parameters['General']['run_num'] = 1

        # Nest the exploration output under this stage's prefix path.
        local_root_directory = self._campaign_configuration['General']['output']
        for token in self._prefix:
            local_root_directory = os.path.join(local_root_directory, token)
        xgboost_parameters['General']['output'] = local_root_directory

        # Avoid recursing into feature selection inside the sub-run.
        del xgboost_parameters['FeatureSelection']

        model_building_var = model_building.model_building.ModelBuilding(0)

        if 'XGBoost' not in xgboost_parameters:
            # default parameters if not provided in the ini file
            xgboost_parameters['XGBoost'] = {}
            xgboost_parameters['XGBoost']['min_child_weight'] = [1, 3]
            xgboost_parameters['XGBoost']['gamma'] = [0, 1]
            xgboost_parameters['XGBoost']['n_estimators'] = [50, 100, 150, 250]
            xgboost_parameters['XGBoost']['learning_rate'] = [0.01, 0.05, 0.1]
            xgboost_parameters['XGBoost']['max_depth'] = [1, 2, 3, 5, 9, 13]

        best_conf = model_building_var.process(xgboost_parameters, inputs, int(self._campaign_configuration['General']['j']))

        # best_conf is a XGBoost configuration exeperiment
        xgb_regressor = best_conf.get_regressor()

        # top = None means all
        expl = eli5.xgboost.explain_weights_xgboost(xgb_regressor, feature_names=inputs.x_columns, top=max_features, importance_type='gain')

        # text version
        expl_weights = eli5.format_as_text(expl)

        self._logger.debug("XGBoost feature scores:\n%s", str(expl_weights))

        df = eli5.format_as_dataframe(expl)  # data frame version

        xgb_sorted_features = df['feature'].values.tolist()  # features list

        features_sig = df['weight'].values.tolist()  # significance score weights

        cumulative_significance = 0

        tolerance = self._campaign_configuration['FeatureSelection']['XGBoost_tolerance']

        index = 0

        # Walk the gain-sorted features until their cumulative weight
        # reaches the tolerance threshold (or the list is exhausted).
        while cumulative_significance < tolerance and index < len(features_sig):
            cumulative_significance = cumulative_significance + features_sig[index]
            index = index + 1

        feat_res = xgb_sorted_features[0:index]

        self._logger.info("XGBoost selected features: %s", str(feat_res))

        data = inputs
        data.x_columns = feat_res

        return data
def perm_import(model, features, X_val, y_val):
    """Fit permutation importance for *model* on the validation split and
    print it as an eli5 text report."""
    perm = PermutationImportance(model, random_state=1).fit(X_val, y_val)
    # eli5.show_weights(perm, feature_names = features)
    # Bug fix: the fitted PermutationImportance was computed but the raw
    # model was explained instead; explain `perm` with the feature names,
    # matching the show_weights call above.
    print(eli5.format_as_text(eli5.explain_weights(perm, feature_names=features)))
Example #24
0
def process_xgb():
    """Average log1p-target XGBoost predictions over 20 random splits and
    write the blended submission to subxgb.csv.

    Assumes load_data() returns (feature columns, train df, test df,
    reference df) and that test['target'] starts at 0 — TODO confirm.
    """
    col, train, test, test_ref = load_data()
    print(train.shape, test.shape, test_ref.shape)

    params = {
        'colsample_bytree': 0.055,
        'colsample_bylevel': 0.4,
        'gamma': 1.5,
        'learning_rate': 0.01,
        'max_depth': 5,
        'objective': 'reg:linear',
        'booster': 'gbtree',
        'min_child_weight': 10,
        'n_estimators': 1800,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'eval_metric': 'rmse',
        'subsample': 0.7,
        'silent': True,
        'seed': 7,
    }
    folds = 20
    full_score = 0.0
    xg_test = xgb.DMatrix(test[col])
    # Hand toggle between the sklearn wrapper and the native API; the second
    # assignment deliberately forces the native (xgb.train) path.
    use_regressor = True
    use_regressor = False
    for fold in range(folds):
        # Tiny (0.1%) validation slice; the fold index doubles as split seed.
        x1, x2, y1, y2 = model_selection.train_test_split(
            train[col],
            np.log1p(train.target.values),
            test_size=0.0010,
            random_state=fold)

        if use_regressor:
            p = params
            model = xgb.XGBRegressor(colsample_bytree=p['colsample_bytree'],
                                     colsample_bylevel=p['colsample_bylevel'],
                                     gamma=p['gamma'],
                                     learning_rate=p['learning_rate'],
                                     max_depth=p['max_depth'],
                                     objective=p['objective'],
                                     booster=p['booster'],
                                     min_child_weight=p['min_child_weight'],
                                     n_estimators=p['n_estimators'],
                                     reg_alpha=p['reg_alpha'],
                                     reg_lambda=p['reg_lambda'],
                                     eval_metric=p['eval_metric'],
                                     subsample=p['subsample'],
                                     silent=1,
                                     n_jobs=-1,
                                     early_stopping_rounds=100,
                                     random_state=7,
                                     nthread=-1)
            model.fit(x1, y1)
            score = np.sqrt(mean_squared_error(y2, model.predict(x2)))
            test['target'] += np.expm1(model.predict(test[col]))
        else:
            xg_valid = xgb.DMatrix(x2, label=y2)
            xg_train = xgb.DMatrix(x1, label=y1)
            model = xgb.train(params, xg_train, params['n_estimators'])
            # RMSE on the held-out slice (still in log space).
            score = np.sqrt(mean_squared_error(y2, model.predict(xg_valid)))
            test['target'] += np.expm1(model.predict(xg_test))

        print('Fold', fold, 'Score', score)
        full_score += score

    full_score /= folds
    print('Full score', full_score)

    # Mean of the accumulated per-fold predictions.
    test['target'] /= folds

    # Rows with a known reference target keep it verbatim.
    test.loc[test_ref.target > 0,
             'target'] = test_ref[test_ref.target > 0].target.values

    test[['ID', 'target']].to_csv('subxgb.csv', index=False)

    # Optional eli5 dump of the trained booster's top features.
    explain = False
    #explain=True
    if explain and not use_regressor:
        print(eli5.format_as_text(eli5.explain_weights(model, top=200)))
Example #25
0
#%%
#parseNER('Data/Train/DrugBank/Aciclovir_ddi.xml')
#buildTrainTestNER()

if __name__ == "__main__":

    # Train a CRF NER tagger and report per-label metrics on the test split.
    train_x, train_y, test_x, test_y, testfull = prepareTrainTestforTraining()

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=10,
                               all_possible_transitions=True)

    crf.fit(train_x, train_y)
    # Textual eli5 explanation of the top 30 CRF weights (kept but unused
    # in this chunk).
    weight_explined = eli5.format_as_text(eli5.explain_weights(crf, top=30))

    # The 'O' (outside) tag dominates, so exclude it from the report.
    labels = list(crf.classes_)
    labels.remove('O')
    labels  # no-op expression (leftover from a notebook cell)
    y_pred = crf.predict(test_x)

    # Group B-/I- variants of the same entity type next to each other.
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

    print(
        metrics.flat_classification_report(test_y,
                                           y_pred,
                                           labels=sorted_labels,
                                           digits=3))
    classification_report = metrics.flat_classification_report(
        test_y, y_pred, labels=sorted_labels, digits=3)
Example #26
0
# --- House-price tree pipeline: impute most-frequent, one-hot encode, fit a
# depth-limited decision tree, then report R^2 and eli5 feature weights.
train_y = train.SalePrice
train_X = train[train.columns.difference(['SalePrice', 'Id'])].values

preprocessing = Pipeline([('impute',
                           SimpleImputer(missing_values=np.nan,
                                         strategy='most_frequent')),
                          ('onehot', OneHotEncoder())])

#visualizer = rank2d(preprocessing.fit_transform(train_X).todense(),  train_y)
#plt.show()

# NOTE(review): the "test" data below is the training frame again, so the
# R^2 printed further down is in-sample — confirm this is intended.
y_test = train.SalePrice
X_text = train[train.columns.difference(['SalePrice', 'Id'])].values

pipeline = Pipeline([('preprocessing', preprocessing),
                     ('tree',
                      DecisionTreeRegressor(criterion='mse',
                                            random_state=1,
                                            max_leaf_nodes=100))])

# Fit Model
pipeline.fit(train_X, train_y)

predictions = pipeline.predict(X_text)
print(r2_score(y_test, predictions))

import eli5

print(eli5.format_as_text(eli5.explain_weights(pipeline.named_steps['tree'])))
Example #27
0
#count_vectorizer.fit(data["text"])
#feature_names = count_vectorizer.get_feature_names()
#tokens_with_weights = sorted(zip(classifier.coef_[0], feature_names))[:20]
#counts = count_vectorizer.fit_transform(data["text"].values)
#feature_names = count_vectorizer.get_feature_names()


# 3-fold cross-validation: fit a TF-IDF + logistic-regression pipeline per
# fold, print a per-fold report, and dump the most influential terms.
kf = KFold(3)
for train_idx, test_idx in kf.split(data):
    print("TRAIN:", train_idx, "TEST:", test_idx)
    # Column 0 holds the text, column 1 the label.
    fold_train = data.iloc[train_idx]
    fold_test = data.iloc[test_idx]
    vec = TfidfVectorizer(stop_words=skt.ENGLISH_STOP_WORDS)
    clf = LogisticRegressionCV()
    pipe = make_pipeline(vec, clf)
    pipe.fit(fold_train.iloc[:, 0].values, fold_train.iloc[:, 1].values)
    print_report(pipe, fold_test.iloc[:, 0].values, fold_test.iloc[:, 1].values)
    print(eli5.format_as_text(eli5.explain_weights(clf, vec=vec, target_names=("UNRELIABLE", "RELIABLE"))))

#counts = count_vectorizer.fit_transform(data["text"].values)
#classifier.fit(counts,data.iloc[:,1].values)
#pizza_data = pd.read_csv("pizzagate.csv")
#test_counts = count_vectorizer.transform(pizza_data["text"].values)
#print(pizza_data)
#print(classifier.classes_)
#print(classifier.coef_)
#print(classifier.predict_proba(test_counts))

Example #28
0
# Notebook-style residue: AUC of the ROC computed earlier in the file
# (the return value is discarded outside a REPL).
auc(fpr, tpr)

# Number of correct predictions (normalized TP+TN), i.e. mean accuracy.
rf.score(X_test,y_test)




"""Permutation importance"""

# Permutation importance of the fitted random forest on the held-out set.
perm = PermutationImportance(rf, random_state=1).fit(X_test, y_test)
# show_weights returns an HTML display object; outside a notebook this
# bare expression is a no-op.
eli5.show_weights(perm, feature_names = X_test.columns.tolist())


# Print to the IDE console (method 1).
print(eli5.format_as_text(eli5.explain_weights(perm, feature_names=X_test.columns.tolist())))


# Display via the IDE (method 2): re-fit an identical permutation
# importance (duplicates the fit above) and write the HTML rendering to
# a file.
perm = PermutationImportance(rf, random_state=1).fit(X_test, y_test)
html_obj = eli5.show_weights(perm, feature_names = X_test.columns.tolist())
with open('permutation-importance.htm','wb') as f:
    f.write(html_obj.data.encode("UTF-8"))




# Sensitivity analysis.
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
# NOTE(review): y_pred is not defined anywhere in this chunk — this line
# presumably relies on an earlier cell; verify it doesn't raise NameError.
print(metrics.flat_classification_report(y_test, y_pred, digits=3))

from collections import Counter

# def print_transitions(trans_features):
#     for (label_from, label_to), weight in trans_features:
#         print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))
#
# print("Top likely transitions:")
# print_transitions(Counter(crf.transition_features_).most_common(20))
#
# print("\nTop unlikely transitions:")
# print_transitions(Counter(crf.transition_features_).most_common()[-20:])
#
#
#
# def print_state_features(state_features):
#     for (attr, label), weight in state_features:
#         print("%0.6f %-8s %s" % (weight, label, attr))
#
# print("Top positive:")
# print_state_features(Counter(crf.state_features_).most_common(30))
#
# print("\nTop negative:")
# print_state_features(Counter(crf.state_features_).most_common()[-30:])

# eli5.show_weights(crf, top=30)
# Text dump of the top-5 CRF features per label/transition.
expl = eli5.explain_weights(crf, top=5)
print(eli5.format_as_text(expl))
#
Example #30
0
# Fit the CRF and evaluate it on the held-out split.
crf.fit(X_train, y_train)

labels = list(crf.classes_)
#labels.remove('O')

print(labels)

y_pred = crf.predict(X_test)
# Weighted F1 over all labels (return value discarded — notebook residue).
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

# Group B-/I- tags of the same entity type together in the report.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))
import sys
sys.exit()
# NOTE(review): unreachable — sys.exit() above terminates the process before
# this eli5 weight dump can run.
print(eli5.format_as_text((eli5.explain_weights(crf, top=30))))

'''''
eli5.show_weights(crf, top=5, show=['transition_features'])
eli5.show_weights(crf, top=10, targets=['O', 'B-ORG', 'I-ORG'])
eli5.show_weights(crf, top=10, feature_re='^word\.is',
                  horizontal_layout=False, show=['targets'])
expl = eli5.explain_weights(crf, top=5, targets=['O', 'B-LOC', 'I-LOC'])
print(eli5.format_as_text(expl))
'''''
    # Split entries/labels/original text together; '_' is the unused
    # train-side original text.
    trainEntries, testEntries, \
    trainLabels, testLabels, \
    _, testOrigText = train_test_split(allEntries, allLabels, allOrigText)

    pipeCV.fit(trainEntries, trainLabels)
    pred = pipeCV.predict(testEntries)

    # Collect misclassified samples, keyed by (true label, predicted label),
    # each with an eli5 explanation of the (wrong) prediction.
    for i in range(len(pred)):
        if pred[i] != testLabels[i]:
            key = (testLabels[i], pred[i])
            if key not in confSampleDict:
                confSampleDict[key] = []
            predExplan = eli5.format_as_text(
                eli5.explain_prediction(clfCV,
                                        testEntries[i],
                                        top=TOP_K_MODEL_PRED_FEATURES,
                                        vec=vectorizer))
            confSampleDict[key].append(
                (testOrigText[i], testEntries[i], pred[i], predExplan))

    # NOTE(review): dividing by CV_FOLDS * len(testLabels) presumably yields a
    # per-sample, fold-averaged confusion matrix — confirm the intent.
    cm = confusion_matrix(testLabels, pred) / (CV_FOLDS * len(testLabels))

    # Accumulate the confusion matrix across folds.
    if confMatrix is None:
        confMatrix = cm
    else:
        confMatrix += cm

    # Average F1 across folds; recall is computed per fold as well.
    f1 = f1_score(testLabels, pred, average=AVG_TYPE)
    multiClassF1Avg += f1 / CV_FOLDS
    recall = recall_score(testLabels, pred, average=AVG_TYPE)