models_cr = dict(knn=KNeighborsClassifier(p=8,
                                              n_neighbors=35,
                                              leaf_size=70,
                                              algorithm='kd_tree'),
                     mlp=skorch_murder_bot(input_size=cr_input_size,
                                           hidden_layers=np.array([
                                               cr_input_size * 2,
                                               cr_input_size // 2
                                           ])),
                     svm=LinearSVC(class_weight='balanced'),
                     baggingTrees=BaggingClassifier(
                         DecisionTreeClassifier(criterion='entropy',
                                                ccp_alpha=0.005)),
                     boosting=GradientBoostingClassifier(ccp_alpha=0.005))

    compare_models_all_metrics(models_cr, x_cr, y_cr, train_sizes)

    models_sh = dict(knn=KNeighborsClassifier(p=8,
                                              n_neighbors=35,
                                              leaf_size=70,
                                              algorithm='kd_tree'),
                     mlp=skorch_murder_bot(input_size=sh_input_size,
                                           hidden_layers=np.array([
                                               sh_input_size * 2,
                                               sh_input_size // 2
                                           ])),
                     svm=LinearSVC(class_weight='balanced'),
                     baggingTrees=BaggingClassifier(
                         DecisionTreeClassifier(criterion='entropy',
                                                ccp_alpha=0.005)),
                     boosting=GradientBoostingClassifier(ccp_alpha=0.005))
Beispiel #2
0

if __name__ == "__main__":
    # Script entry point: load both datasets, build the neural-net classifier
    # for each, and run the metric comparison on the credit-fraud data.
    x_cr, y_cr = load_normalized_credit_fraud_numpy(
        filepath='../data/creditcard.csv')
    x_sh, y_sh = load_normalized_shopper_intention_numpy(
        filepath='../data/online_shoppers_intention.csv')
    # Four training-set fractions evenly spaced over [0.3, 0.6].
    train_sizes = np.linspace(0.3, 0.6, 4)

    # --- Shopper intention dataset ---
    # NOTE(review): the float32 cast is presumably needed by the torch-backed
    # net -- confirm against skorch_murder_bot.
    x_sh = x_sh.astype(np.float32)
    input_size = x_sh.shape[1]
    # Two hidden layers: twice the input width, then half of it (floored).
    nets = {
        'murderbot':
        skorch_murder_bot(input_size=input_size,
                          hidden_layers=np.array(
                              [input_size * 2, input_size // 2])),
    }
    # The comparison on this dataset is currently disabled.
    # compare_models_all_metrics(nets, x_sh, y_sh, train_sizes)

    # --- Credit fraud dataset ---
    x_cr = x_cr.astype(np.float32)
    # `np.int` was only an alias for the builtin `int`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so use the builtin directly.
    y_cr = y_cr.astype(int)
    input_size = x_cr.shape[1]
    # Same architecture as above, sized for the credit-fraud feature count.
    nets = {
        'murderbot':
        skorch_murder_bot(input_size=input_size,
                          hidden_layers=np.array(
                              [input_size * 2, input_size // 2])),
    }
    compare_models_all_metrics(nets, x_cr, y_cr, train_sizes)
Beispiel #3
0
                         DecisionTreeClassifier(criterion='entropy',
                                                class_weight='balanced',
                                                ccp_alpha=a)))

    train_sizes = np.linspace(0.2, 1, 7)

    # Show learning curve comparison of gini vs entropy
    # TODO: the P-R and ROC curves aren't too useful, why?
    # compare_models_all_metrics(models_criterion, x_cr, y_cr, train_sizes=train_sizes, title_prefix="Credit Fraud")

    # Show learning curve comparison of balanced vs unbalanced
    # compare_models_all_metrics(models_balancing, x_cr, y_cr, train_sizes=train_sizes, title_prefix="Credit Fraud")

    compare_models_all_metrics(ensembles,
                               x_cr,
                               y_cr,
                               train_sizes=train_sizes,
                               title_prefix="Credit Fraud")

    x_sub, x_val, y_sub, y_val = train_test_split(x_cr,
                                                  y_cr,
                                                  test_size=0.3,
                                                  random_state=0)
    x_train, x_test, y_train, y_test = train_test_split(x_sub,
                                                        y_sub,
                                                        test_size=0.6,
                                                        random_state=0)

    # Show how to determine an optimal ccp_alpha value for post-pruning
    # tree = DecisionTreeClassifier(criterion='entropy', class_weight='balanced')
    # plot_nodes_vs_alpha(tree, x_train, y_train)
                                algorithm='kd_tree',
                                weights='distance',
                                n_jobs=-1)

    train_sizes = np.linspace(0.1, 1, 10)

    # Compare different n_neighbors values
    clfs_neighbors = dict()
    for n_neighbors in range(15, 56, 5):
        clf = copy.deepcopy(base)
        clf.n_neighbors = n_neighbors
        clfs_neighbors['{}-nn'.format(n_neighbors)] = clf

    compare_models_all_metrics(clfs_neighbors,
                               x_cr,
                               y_cr,
                               train_sizes=train_sizes,
                               title_prefix="Credit Fraud",
                               plot_learning_curve=False)

    # Compare different leaf sizes
    clfs_leaf_size = dict()
    for leaf_size in range(5, 71, 10):
        clf = copy.deepcopy(base)
        clf.leaf_size = leaf_size
        clfs_leaf_size['{}-leaf_size'.format(leaf_size)] = clf

    compare_models_all_metrics(clfs_leaf_size,
                               x_cr,
                               y_cr,
                               train_sizes=train_sizes,
                               title_prefix="Credit Fraud",