def display_logistic_regression(df, train_x, valid_x, train_y, valid_y):
    """Fit a logistic regression and print its coefficients and metrics.

    The model is fitted on ``train_x``/``train_y``. The report, written to
    stdout, contains in order: the intercept, the absolute coefficient
    values (largest first, labelled with their predictor names), an AIC
    score computed from the validation predictions, classification
    summaries for the training and validation sets, and precision, recall,
    F1 and accuracy figures.

    Args:
        df: Not used by this function; kept so existing callers keep working.
        train_x: Training predictors; must expose ``.columns``.
        valid_x: Validation predictors.
        train_y: Training target values.
        valid_y: Validation target values.

    Returns:
        None. All results are printed.
    """
    print('(8) Display logistic regression\n')

    # fit a logistic regression (set penalty=l2 and C=1e42 to avoid regularization)
    logit_reg = LogisticRegression(penalty='l2', C=1e42, solver='liblinear')
    logit_reg.fit(train_x, train_y)

    # Predict once per data set and reuse the result in every metric below.
    prediction_train = logit_reg.predict(train_x)
    prediction_valid = logit_reg.predict(valid_x)

    print('intercept ', logit_reg.intercept_[0])

    # Attach the predictor names BEFORE sorting. Sorting the raw values first
    # and then applying index=PREDICTORS would pair each magnitude with the
    # wrong predictor name.
    coefficients = pd.DataFrame({'coeff': abs(logit_reg.coef_[0])},
                                index=PREDICTORS)
    print(coefficients.sort_values('coeff', ascending=False), '\n')

    # One degree of freedom per predictor column plus one for the intercept.
    print('AIC',
          AIC_score(valid_y, prediction_valid, df=len(train_x.columns) + 1))

    classificationSummary(train_y, prediction_train)
    classificationSummary(valid_y, prediction_valid)

    print('precision on test is:', precision_score(valid_y, prediction_valid))
    print('recall on test is:', recall_score(valid_y, prediction_valid))
    print('f1 on test is:', f1_score(valid_y, prediction_valid))
    print('Logistic Regression:Accuracy on train is:',
          accuracy_score(train_y, prediction_train))
    print('Logistic Regression:Accuracy on test is:',
          accuracy_score(valid_y, prediction_valid), '\n')
def test_classificationSummary(self):
    """Check that ``classificationSummary`` prints the expected report pieces.

    Six labels are compared with six predictions using the class names
    ``'a'`` and ``'b'``; the captured stdout must contain the title, the
    prediction header and one row of counts per class.
    """
    actual = [1, 0, 0, 1, 1, 1]
    predicted = [1, 0, 1, 1, 0, 0]

    # Capture everything the function writes to stdout.
    buffer = StringIO()
    with redirect_stdout(buffer):
        classificationSummary(actual, predicted, class_names=['a', 'b'])
    report = buffer.getvalue()

    expected_fragments = (
        'Confusion Matrix',
        ' Prediction',
        'a 1 1',
        'b 2 2',
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, report)
def test_classificationSummary(self):
    """Check the text report printed by ``classificationSummary``.

    Six labels are compared with six predictions using the class names
    ``'a'`` and ``'b'``. The captured stdout is checked twice: first for
    required substrings, then for the exact content of selected lines.
    """
    actual = [1, 0, 0, 1, 1, 1]
    predicted = [1, 0, 1, 1, 0, 0]

    # Capture everything the function writes to stdout.
    buffer = StringIO()
    with redirect_stdout(buffer):
        classificationSummary(actual, predicted, class_names=['a', 'b'])
    report = buffer.getvalue()

    # Loose check: each fragment must appear somewhere in the output.
    for fragment in ('Confusion Matrix', ' Prediction', 'a 1 1', 'b 2 2'):
        self.assertIn(fragment, report)

    # Strict check: exact text at fixed line positions.
    # NOTE(review): these comparisons are whitespace-sensitive - confirm the
    # padding of the expected strings against the function's real output.
    rows = report.split('\n')
    expected_rows = (
        (0, 'Confusion Matrix (Accuracy 0.5000)'),
        (3, 'Actual a b'),
        (4, ' a 1 1'),
    )
    for position, text in expected_rows:
        self.assertEqual(rows[position], text)
# Build a confusion matrix by hand from two boolean masks.
# `true_y` marks the observations whose label equals 'default'.
# NOTE(review): `pred_y` is defined outside this excerpt - presumably the
# matching boolean mask of predicted defaults; confirm before relying on it.
true_y = y == 'default'
true_pos = true_y & pred_y
true_neg = ~true_y & ~pred_y
false_pos = ~true_y & pred_y
false_neg = true_y & ~pred_y

# Rows are labelled with the actual outcome (Y), columns with the predicted
# outcome (Yhat).
conf_mat = pd.DataFrame([[np.sum(true_pos), np.sum(false_neg)],
                         [np.sum(false_pos), np.sum(true_neg)]],
                        index=['Y = default', 'Y = paid off'],
                        columns=['Yhat = default', 'Yhat = paid off'])
print(conf_mat)

# Print the result of `confusion_matrix` for the same labels, this time using
# the model's own predictions on X.
print(confusion_matrix(y, logit_reg.predict(X)))

# The package _dmba_ contains the function `classificationSummary`, which
# prints the confusion matrix and accuracy for a classification model.
classificationSummary(y, logit_reg.predict(X), class_names=logit_reg.classes_)

### Precision, Recall, and Specificity
# The _scikit-learn_ function `precision_recall_fscore_support` returns
# precision, recall, fbeta_score and support.

# `conf_mat` is rebound here: it now holds the result of `confusion_matrix`
# instead of the hand-built DataFrame above.
conf_mat = confusion_matrix(y, logit_reg.predict(X))
# Precision: cell [0, 0] divided by the total of column 0.
# Recall: cell [0, 0] divided by the total of row 0.
# Specificity: cell [1, 1] divided by the total of row 1.
# NOTE(review): this assumes index 0 is the 'default' class and index 1 is
# 'paid off' - confirm the label order used by `confusion_matrix`.
print('Precision', conf_mat[0, 0] / sum(conf_mat[:, 0]))
print('Recall', conf_mat[0, 0] / sum(conf_mat[0, :]))
print('Specificity', conf_mat[1, 1] / sum(conf_mat[1, :]))

# The return value of this call is neither assigned nor printed.
precision_recall_fscore_support(y, logit_reg.predict(X),
                                labels=['default', 'paid off'])

### ROC Curve
# The function `roc_curve` in _scikit-learn_ calculates all the information
# that is required for plotting a ROC curve.
# Fit a cross-validated logistic regression on the training data.
# The estimator is configured with an L2 penalty, the liblinear solver,
# balanced class weights and accuracy as the scoring metric; Cs, cv and
# max_iter are passed through as given.
logit_reg = LogisticRegressionCV(penalty="l2", Cs=100, solver='liblinear',
                                 cv=10, class_weight='balanced',
                                 scoring='accuracy', max_iter=1000)
logit_reg.fit(train_X, train_y)


# In[20]:


# Display confusion matrices for the training data and then the test data.
classificationSummary(train_y, logit_reg.predict(train_X))
classificationSummary(test_y, logit_reg.predict(test_X))


# In[21]:


# Display the classification report for the test data.
# `classes` holds the model's predictions for test_X.
classes = logit_reg.predict(test_X)
print(metrics.classification_report(test_y, classes))


# ### Build a default RandomForest classifier

# In[22]:


# Rerun the same train/test split as before
# Subset a specific set/ predicting for new data df = pd.concat([ pd.DataFrame({ 'actual': y_valid, 'predicted': y_valid_pred }), pd.DataFrame(predProb_valid, index=y_valid.index) ], axis=1) mask = ((X_valid.inventorygrowthabovefive_YES == 1) & (X_valid.populationgrowthabove_YES == 1)) print(df[mask]) #Confusionmatrix classificationSummary(y_train, y_train_pred, class_names=classes) print() classificationSummary(y_valid, y_valid_pred, class_names=classes) # In[47]: #Regressiontree get_ipython().run_line_magic('matplotlib', 'inline') from pathlib import Path import pandas as pd import numpy as np