def _build_models(input_size):
    """Build the dict of classifiers compared for one dataset.

    The same five models are evaluated on both the credit-fraud and the
    shopper-intention data; only the MLP's layer sizes depend on the
    dataset's feature count, so that is the single parameter.

    :param input_size: number of input features, used to size the MLP's
        hidden layers (2x and 0.5x the input width).
    :return: dict mapping model name -> unfitted estimator.
    """
    return dict(
        knn=KNeighborsClassifier(p=8, n_neighbors=35, leaf_size=70,
                                 algorithm='kd_tree'),
        mlp=skorch_murder_bot(
            input_size=input_size,
            hidden_layers=np.array([input_size * 2, input_size // 2])),
        svm=LinearSVC(class_weight='balanced'),
        baggingTrees=BaggingClassifier(
            DecisionTreeClassifier(criterion='entropy', ccp_alpha=0.005)),
        boosting=GradientBoostingClassifier(ccp_alpha=0.005),
    )


# Credit-fraud dataset: build, then compare all models on it.
models_cr = _build_models(cr_input_size)
compare_models_all_metrics(models_cr, x_cr, y_cr, train_sizes)

# Shopper-intention dataset: same model line-up, MLP sized to its features.
models_sh = _build_models(sh_input_size)
if __name__ == "__main__":
    # Load the data (paths are relative to the script's directory).
    x_cr, y_cr = load_normalized_credit_fraud_numpy(
        filepath='../data/creditcard.csv')
    x_sh, y_sh = load_normalized_shopper_intention_numpy(
        filepath='../data/online_shoppers_intention.csv')
    # Four learning-curve fractions between 30% and 60% of the training set.
    train_sizes = np.linspace(0.3, 0.6, 4)

    # Shopper-intention run: torch/skorch expects float32 features.
    x_sh = x_sh.astype(np.float32)
    input_size = x_sh.shape[1]
    nets = {
        'murderbot': skorch_murder_bot(
            input_size=input_size,
            hidden_layers=np.array([input_size * 2, input_size // 2])),
    }
    # compare_models_all_metrics(nets, x_sh, y_sh, train_sizes)

    # Credit-fraud run: float32 features, integer labels.
    x_cr = x_cr.astype(np.float32)
    # BUGFIX: `np.int` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `int` is the documented drop-in replacement.
    y_cr = y_cr.astype(int)
    input_size = x_cr.shape[1]
    nets = {
        'murderbot': skorch_murder_bot(
            input_size=input_size,
            hidden_layers=np.array([input_size * 2, input_size // 2])),
    }
    compare_models_all_metrics(nets, x_cr, y_cr, train_sizes)
# NOTE(review): this chunk is whitespace-mangled — the original newlines were
# collapsed onto one line, so the first inline '#' now comments out everything
# after it. The leading "DecisionTreeClassifier(...)))" is the TAIL of a
# statement whose start lies outside this view (presumably an `ensembles`
# dict built over ccp_alpha values `a` — TODO confirm against the original
# file). Restore the original line breaks before attempting to run or edit
# this code; left byte-identical here.
DecisionTreeClassifier(criterion='entropy', class_weight='balanced', ccp_alpha=a))) train_sizes = np.linspace(0.2, 1, 7) # Show learning curve comparison of gini vs entropy # TODO: the P-R and ROC curves aren't too useful, why? # compare_models_all_metrics(models_criterion, x_cr, y_cr, train_sizes=train_sizes, title_prefix="Credit Fraud") # Show learning curve comparison of balanced vs unbalanced # compare_models_all_metrics(models_balancing, x_cr, y_cr, train_sizes=train_sizes, title_prefix="Credit Fraud") compare_models_all_metrics(ensembles, x_cr, y_cr, train_sizes=train_sizes, title_prefix="Credit Fraud") x_sub, x_val, y_sub, y_val = train_test_split(x_cr, y_cr, test_size=0.3, random_state=0) x_train, x_test, y_train, y_test = train_test_split(x_sub, y_sub, test_size=0.6, random_state=0) # Show how to determine an optimal ccpa value for post-pruning # tree = DecisionTreeClassifier(criterion='entropy', class_weight='balanced') # plot_nodes_vs_alpha(tree, x_train, y_train)
# NOTE(review): this chunk is whitespace-mangled (newlines collapsed; the
# first inline '#' comments out the remainder of the line). The leading
# "algorithm='kd_tree', weights='distance', n_jobs=-1)" is the TAIL of a call
# whose start is outside this view (presumably `base = KNeighborsClassifier(`
# — TODO confirm), and the chunk is also truncated mid-call after the final
# `title_prefix="Credit Fraud",`. Restore the original line breaks before
# running; left byte-identical here. The visible logic sweeps n_neighbors
# (15..55 step 5) and leaf_size (5..65 step 10) over deep copies of `base`.
algorithm='kd_tree', weights='distance', n_jobs=-1) train_sizes = np.linspace(0.1, 1, 10) # Compare different n_neighbors values clfs_neighbors = dict() for n_neighbors in range(15, 56, 5): clf = copy.deepcopy(base) clf.n_neighbors = n_neighbors clfs_neighbors['{}-nn'.format(n_neighbors)] = clf compare_models_all_metrics(clfs_neighbors, x_cr, y_cr, train_sizes=train_sizes, title_prefix="Credit Fraud", plot_learning_curve=False) # Compare different leaf sizes clfs_leaf_size = dict() for leaf_size in range(5, 71, 10): clf = copy.deepcopy(base) clf.leaf_size = leaf_size clfs_leaf_size['{}-leaf_size'.format(leaf_size)] = clf compare_models_all_metrics(clfs_leaf_size, x_cr, y_cr, train_sizes=train_sizes, title_prefix="Credit Fraud",