# Store the fitted logit model's predictions as the probability of
# switching to OCC (predict is called with linear=False).
attwork['pred_logit_prob'] = my_logit_model_fit.predict(linear=False)

# map from probability to ATT (0) or OCC (1)
def prob_to_pred(x, cutoff=0.5):
    """Map a predicted probability to a binary class label.

    Args:
        x: Predicted probability of the positive class (coded 1).
        cutoff: Decision threshold. Defaults to 0.5, which reproduces the
            original one-argument behaviour; other call sites in this file
            pass an explicit ``cutoff`` keyword.

    Returns:
        1 when ``x`` is strictly greater than ``cutoff``, otherwise 0.
        A probability exactly equal to the cutoff maps to 0.
    """
    return 1 if x > cutoff else 0

# Turn each fitted probability into a 0/1 class prediction.
attwork['pred_logit'] = attwork['pred_logit_prob'].apply(prob_to_pred)

# Element [4] of the evaluate_classifier result is reported as the share of
# correct classifications (presumably a proportion -- confirm against the
# eval module). Scale to a percentage *before* rounding so the printed value
# is free of binary floating-point noise (100 * 0.07 prints as
# 7.000000000000001, whereas round(100 * 0.07, 1) prints as 7.0).
print('\n Logistic Regression Performance\n',
      'Percentage of Choices Correctly Classified:',
      round(100 * eval.evaluate_classifier(attwork['pred_logit'],
                                           attwork['pick'])[4], 1),
      '\n')

# --------------------------------------
# Support vector machines
# --------------------------------------
# Fit an SVC with default settings; y is flattened with np.ravel before
# fitting. Predictions are made on the same x used for fitting.
my_svm = svm.SVC()
my_svm_fit = my_svm.fit(x, np.ravel(y))
attwork['pred_svm_binary'] = my_svm_fit.predict(x)

print('\n Support Vector Machine Performance\n',
      'Percentage of Choices Correctly Classified:',
      round(100 * eval.evaluate_classifier(attwork['pred_svm_binary'],
                                           attwork['pick'])[4], 1),
      '\n')

# --------------------------------------
# Random forests
    bankwork['pred_logit_prob'].\
    apply(lambda d: prob_to_pred(d, cutoff = 0.50))    
# Cross-tabulate the 0.50-cutoff predictions against the observed response.
print('\nConfusion matrix for 0.50 cutoff\n',
      pd.crosstab(bankwork.pred_logit_50, bankwork.response, margins=True))
# cutoff 0.50 does not work for targeting... all predictions 0 or No

# try cutoff set at 0.10
# Series.apply forwards extra keyword arguments to the function, so no
# lambda wrapper is needed to pass the cutoff.
bankwork['pred_logit_10'] = bankwork['pred_logit_prob'].apply(
    prob_to_pred, cutoff=0.10)
print('\nConfusion matrix for 0.10 cutoff\n',
      pd.crosstab(bankwork.pred_logit_10, bankwork.response, margins=True))

# Element [4] of the evaluate_classifier result is reported as the share
# correctly classified (presumably a proportion -- confirm against the eval
# module). Scale to a percentage before rounding so the printed value is
# free of binary floating-point noise.
print('\n Logistic Regression Performance (0.10 cutoff)\n',
      'Percentage of Targets Correctly Classified:',
      round(100 * eval.evaluate_classifier(bankwork['pred_logit_10'],
                                           bankwork['response'])[4], 1),
      '\n')

# direct calculation of lift
# decile labels from highest to lowest: 'Decile_10', 'Decile_9', ..., 'Decile_1'
decile_label = ['Decile_' + str(10 - i) for i in range(10)]
# draws on baseline response rate computed earlier    
def lift(x):
    """Return ``x`` divided by the module-level ``baseline_response_rate``."""
    return x / baseline_response_rate

# Cut the predicted probabilities into ten quantile bins. qcut assigns the
# labels in ascending bin order, so the lowest-probability bin receives the
# first label in decile_label.
prediction_deciles = pd.qcut(
    bankwork['pred_logit_prob'], 10, labels=decile_label)
decile_groups = bankwork['response'].groupby(prediction_deciles)

# Mean response within each decile, then the same means expressed relative
# to the baseline response rate.
print(decile_groups.mean())
lift_values = lift(decile_groups.mean())
print('\nLift Chart Values by Decile:\n', lift_values, '\n')
# Example #3
# 0
# Store the fitted logit model's predictions as the probability of
# switching to OCC (predict is called with linear=False).
attwork['pred_logit_prob'] = my_logit_model_fit.predict(linear=False)

# map from probability to ATT (0) or OCC (1)
def prob_to_pred(x, cutoff=0.5):
    """Map a predicted probability to a binary class label.

    Args:
        x: Predicted probability of the positive class (coded 1).
        cutoff: Decision threshold. Defaults to 0.5, which reproduces the
            original one-argument behaviour; other call sites in this file
            pass an explicit ``cutoff`` keyword.

    Returns:
        1 when ``x`` is strictly greater than ``cutoff``, otherwise 0.
        A probability exactly equal to the cutoff maps to 0.
    """
    return 1 if x > cutoff else 0

# Turn each fitted probability into a 0/1 class prediction.
attwork['pred_logit'] = attwork['pred_logit_prob'].apply(prob_to_pred)

# Element [4] of the evaluate_classifier result is reported as the share of
# correct classifications (presumably a proportion -- confirm against the
# eval module). Scale to a percentage *before* rounding so the printed value
# is free of binary floating-point noise (100 * 0.07 prints as
# 7.000000000000001, whereas round(100 * 0.07, 1) prints as 7.0).
print('\n Logistic Regression Performance\n',
      'Percentage of Choices Correctly Classified:',
      round(100 * eval.evaluate_classifier(attwork['pred_logit'],
                                           attwork['pick'])[4], 1),
      '\n')

# --------------------------------------
# Support vector machines
# --------------------------------------
# Fit an SVC with default settings; y is flattened with np.ravel before
# fitting. Predictions are made on the same x used for fitting.
my_svm = svm.SVC()
my_svm_fit = my_svm.fit(x, np.ravel(y))
attwork['pred_svm_binary'] = my_svm_fit.predict(x)

print('\n Support Vector Machine Performance\n',
      'Percentage of Choices Correctly Classified:',
      round(100 * eval.evaluate_classifier(attwork['pred_svm_binary'],
                                           attwork['pick'])[4], 1),
      '\n')

# --------------------------------------
# Random forests
# Example #4
# 0
    bankwork['pred_logit_prob'].\
    apply(lambda d: prob_to_pred(d, cutoff = 0.50))
# Cross-tabulate the 0.50-cutoff predictions against the observed response.
print('\nConfusion matrix for 0.50 cutoff\n',
      pd.crosstab(bankwork.pred_logit_50, bankwork.response, margins=True))
# cutoff 0.50 does not work for targeting... all predictions 0 or No

# try cutoff set at 0.10
# Series.apply forwards extra keyword arguments to the function, so no
# lambda wrapper is needed to pass the cutoff.
bankwork['pred_logit_10'] = bankwork['pred_logit_prob'].apply(
    prob_to_pred, cutoff=0.10)
print('\nConfusion matrix for 0.10 cutoff\n',
      pd.crosstab(bankwork.pred_logit_10, bankwork.response, margins=True))

# Element [4] of the evaluate_classifier result is reported as the share
# correctly classified (presumably a proportion -- confirm against the eval
# module). Scale to a percentage before rounding so the printed value is
# free of binary floating-point noise.
print('\n Logistic Regression Performance (0.10 cutoff)\n',
      'Percentage of Targets Correctly Classified:',
      round(100 * eval.evaluate_classifier(bankwork['pred_logit_10'],
                                           bankwork['response'])[4], 1),
      '\n')

# direct calculation of lift
# decile labels from highest to lowest: 'Decile_10', 'Decile_9', ..., 'Decile_1'
decile_label = ['Decile_' + str(10 - i) for i in range(10)]


# draws on baseline response rate computed earlier
def lift(x):
    """Return ``x`` divided by the module-level ``baseline_response_rate``."""
    return x / baseline_response_rate


# Cut the predicted probabilities into ten quantile bins. qcut assigns the
# labels in ascending bin order, so the lowest-probability bin receives the
# first label in decile_label. Responses are then grouped by bin.
prediction_deciles = pd.qcut(
    bankwork['pred_logit_prob'], 10, labels=decile_label)
decile_groups = bankwork['response'].groupby(prediction_deciles)
'freq_table_cosine',
'freq_table_LL',
'freq_table_CR',
'freq_table_HC',
'multinomial_NB',
'KNN_5',
'KNN_2',
'SVM'
]

vocab_sizes = [250, 1000, 3000]
df = pd.DataFrame()

n_split = 10
no_clf = 9
no_vocab_sizes = 3

# Evaluate the first no_clf classifiers at each of the first no_vocab_sizes
# vocabulary sizes. Result rows are gathered in a plain list and the frame is
# rebuilt from it: DataFrame.append was deprecated in pandas 1.4 and removed
# in pandas 2.0, so the previous df.append(...) call fails on current pandas.
result_rows = []
for ic in range(no_clf):
    for iv in range(no_vocab_sizes):
        vocab_size = vocab_sizes[iv]
        clf_name = clf_names[ic]
        print("classifier = {}".format(clf_name))
        print("vocab size = {}".format(vocab_size))
        acc = evaluate_classifier(clf_name, vocab_size, n_split)
        print("accuracy = {}".format(acc))
        result_rows.append({
            'clf_name': clf_name,
            'vocab_size': vocab_size,
            'accuracy': acc,
            'n_split': n_split,
        })
        df = pd.DataFrame(result_rows)
        # The CSV is rewritten after every evaluation, as in the original
        # loop (presumably so partial results survive an interrupted run).
        df.to_csv('results.csv')