# predicted probability of switching to OCC
attwork['pred_logit_prob'] = my_logit_model_fit.predict(linear=False)


def prob_to_pred(x, cutoff=0.5):
    """Map a predicted probability to a class label: ATT (0) or OCC (1).

    Args:
        x: predicted probability of OCC for one observation.
        cutoff: threshold that ``x`` must strictly exceed to be labelled
            OCC. Defaults to 0.5, the value previously hard-coded here.

    Returns:
        1 when ``x > cutoff``, otherwise 0.
    """
    return 1 if x > cutoff else 0


# the function is passed directly; no lambda wrapper is needed
attwork['pred_logit'] = attwork['pred_logit_prob'].apply(prob_to_pred)

# Element 4 of the evaluate_classifier result is reported as the share of
# choices correctly classified. Scale to percent BEFORE rounding:
# multiplying an already-rounded float by 100 can print binary
# floating-point noise (e.g. 100 * 0.57 evaluates to 56.99999999999999).
print('\n Logistic Regression Performance\n',
      'Percentage of Choices Correctly Classified:',
      round(100 * eval.evaluate_classifier(attwork['pred_logit'],
                                           attwork['pick'])[4], 1),
      '\n')

# --------------------------------------
# Support vector machines
# --------------------------------------
my_svm = svm.SVC()
# np.ravel flattens y to the 1-D target vector passed to fit
my_svm_fit = my_svm.fit(x, np.ravel(y))
# predictions are made on the same x that was used for fitting
attwork['pred_svm_binary'] = my_svm_fit.predict(x)

print('\n Support Vector Machine Performance\n',
      'Percentage of Choices Correctly Classified:',
      round(100 * eval.evaluate_classifier(attwork['pred_svm_binary'],
                                           attwork['pick'])[4], 1),
      '\n')

# --------------------------------------
# Random forests
bankwork['pred_logit_prob'].\
    apply(lambda d: prob_to_pred(d, cutoff = 0.50))
print('\nConfusion matrix for 0.50 cutoff\n',
      pd.crosstab(bankwork.pred_logit_50, bankwork.response, margins=True))

# cutoff 0.50 does not work for targeting... all predictions 0 or No
# try cutoff set at 0.10
bankwork['pred_logit_10'] = bankwork['pred_logit_prob'].apply(
    lambda d: prob_to_pred(d, cutoff=0.10))
print('\nConfusion matrix for 0.10 cutoff\n',
      pd.crosstab(bankwork.pred_logit_10, bankwork.response, margins=True))

# Element 4 of the evaluate_classifier result is reported as the share of
# targets correctly classified. Scale to percent BEFORE rounding:
# multiplying an already-rounded float by 100 can print binary
# floating-point noise (e.g. 100 * 0.57 evaluates to 56.99999999999999).
print('\n Logistic Regression Performance (0.10 cutoff)\n',
      'Percentage of Targets Correctly Classified:',
      round(100 * eval.evaluate_classifier(bankwork['pred_logit_10'],
                                           bankwork['response'])[4], 1),
      '\n')

# direct calculation of lift
# decile labels from highest to lowest: 'Decile_10' first, 'Decile_1' last.
# pd.qcut assigns labels to bins in ascending order of predicted
# probability, so 'Decile_1' marks the highest predicted probabilities.
decile_label = ['Decile_' + str(10 - i) for i in range(10)]


# draws on baseline response rate computed earlier
def lift(x):
    """Return x expressed as a multiple of the baseline response rate."""
    return x / baseline_response_rate


prediction_deciles = pd.qcut(bankwork.pred_logit_prob, 10,
                             labels=decile_label)
decile_groups = bankwork.response.groupby(prediction_deciles)

# mean response per decile, computed once and reused for the lift values
decile_response_rates = decile_groups.mean()
print(decile_response_rates)

lift_values = lift(decile_response_rates)
print('\nLift Chart Values by Decile:\n', lift_values, '\n')
bankwork['pred_logit_prob'].\
    apply(lambda d: prob_to_pred(d, cutoff = 0.50))
print('\nConfusion matrix for 0.50 cutoff\n',
      pd.crosstab(bankwork.pred_logit_50, bankwork.response, margins=True))

# cutoff 0.50 does not work for targeting... all predictions 0 or No
# try cutoff set at 0.10
bankwork['pred_logit_10'] = bankwork['pred_logit_prob'].apply(
    lambda d: prob_to_pred(d, cutoff=0.10))
print('\nConfusion matrix for 0.10 cutoff\n',
      pd.crosstab(bankwork.pred_logit_10, bankwork.response, margins=True))

# Element 4 of the evaluate_classifier result is reported as the share of
# targets correctly classified. Scale to percent BEFORE rounding:
# multiplying an already-rounded float by 100 can print binary
# floating-point noise (e.g. 100 * 0.57 evaluates to 56.99999999999999).
print('\n Logistic Regression Performance (0.10 cutoff)\n',
      'Percentage of Targets Correctly Classified:',
      round(100 * eval.evaluate_classifier(bankwork['pred_logit_10'],
                                           bankwork['response'])[4], 1),
      '\n')

# direct calculation of lift
# decile labels from highest to lowest: 'Decile_10' first, 'Decile_1' last.
# pd.qcut assigns labels to bins in ascending order of predicted
# probability, so 'Decile_1' marks the highest predicted probabilities.
decile_label = ['Decile_' + str(10 - i) for i in range(10)]


# draws on baseline response rate computed earlier
def lift(x):
    """Return x expressed as a multiple of the baseline response rate."""
    return x / baseline_response_rate


prediction_deciles = pd.qcut(bankwork.pred_logit_prob, 10,
                             labels=decile_label)
# response values grouped by the decile of their predicted probability
decile_groups = bankwork.response.groupby(prediction_deciles)
'freq_table_cosine', 'freq_table_LL', 'freq_table_CR', 'freq_table_HC', 'multinomial_NB', 'KNN_5', 'KNN_2', 'SVM' ] vocab_sizes=[250, 1000, 3000] df = pd.DataFrame() n_split = 10 no_clf = 9 no_vocab_sizes = 3 for ic in range(no_clf) : for iv in range(no_vocab_sizes) : vocab_size = vocab_sizes[iv] clf_name = clf_names[ic] print("classifier = {}".format(clf_name)) print("vocab size = {}".format(vocab_size)) acc = evaluate_classifier(clf_name, vocab_size, n_split) print("accuracy = {}".format(acc)) df = df.append({'clf_name' : clf_name, 'vocab_size' : vocab_size, 'accuracy' : acc, 'n_split' : n_split }, ignore_index = True) df.to_csv('results.csv')