コード例 #1
0
# Run the neural-network search. The other candidates used below (logit,
# perceptron, dtree, rforest) are created before this excerpt.
mlp = consider_neural_net()

# Report the selected result object for every model family.
print("Best Logistic Regression", logit)
print("Best Perceptron", perceptron)
print("Best DTree", dtree)
print("Best RForest", rforest)
print("Best MLP", mlp)

#%% Plot Results

# Helper method to make a series of box-plots from a dictionary:
# each key labels one box, each value is the result of bootstrap_accuracy for
# that model's `.model` attribute on the validation data (X_vali, y_vali).
# The figure is written to "model-cmp.png".
simple_boxplot(
    {
        "Logistic Regression": bootstrap_accuracy(logit.model, X_vali, y_vali),
        "Perceptron": bootstrap_accuracy(perceptron.model, X_vali, y_vali),
        "Decision Tree": bootstrap_accuracy(dtree.model, X_vali, y_vali),
        "RandomForest": bootstrap_accuracy(rforest.model, X_vali, y_vali),
        "MLP/NN": bootstrap_accuracy(mlp.model, X_vali, y_vali),
    },
    title="Validation Accuracy",
    xlabel="Model",
    ylabel="Accuracy",
    save="model-cmp.png",
)

# Remaining exercises for the reader.
# NOTE(review): TODO is defined outside this excerpt; confirm whether it only
# prints the message or also stops execution.
TODO("1. Understand consider_decision_trees; I have 'tuned' it.")
TODO("2. Find appropriate max_iter settings to stop warning messages.")
TODO(
    "3. Pick a model: {perceptron, logistic regression, neural_network} and optimize it!"
)
コード例 #2
0
ファイル: features.py プロジェクト: lizeth25/ml_project
from sklearn.tree import DecisionTreeRegressor

# Fit a 100-tree random forest so every tree can report its own view of how
# important each feature was.
rf = RandomForestRegressor(n_estimators=100, random_state=RAND)
rf.fit(train_X, train_y)

# Start with one empty list per feature name, then append the importance that
# each individual tree of the forest assigned to that feature.
importances = {name: [] for name in numberer.feature_names_}
for tree in rf.estimators_:
    named_weights = zip(numberer.feature_names_, tree.feature_importances_)
    for name, weight in named_weights:
        importances[name].append(weight)

# One box per feature, showing the spread of its importance across the trees.
# Worth asking: does this criterion measure usefulness, or something else?
simple_boxplot(
    importances,
    save="graphs/DecisionTree-importances.png",
    title="Tree Importances",
    ylabel="Decision Tree Criterion Importances",
)


#%%

# graphs: T.Dict[str, T.List[float]] = {}


# @dataclass
# class Model:
#     vali_score: float
#     m: T.Any

コード例 #3
0
        linear, roc_auc_score(y_score=scores, y_true=y_vali)))
    # bootstrap AUC: (doing this manually because the helper function doesn't accept scores out of nowhere!)
    dist = []
    # do the bootstrap:
    for trial in range(100):
        sample_pred, sample_truth = resample(scores,
                                             y_vali,
                                             random_state=trial +
                                             RANDOM_SEED)  # type:ignore
        score = roc_auc_score(y_true=sample_truth,
                              y_score=sample_pred)  # type:ignore
        dist.append(score)
    results["Linear[{}]".format(linear)] = dist

#%% Boxplot all AUC results:
# `results` maps a label to a list of AUC values (filled in above); the output
# file name is prefixed with the current `dataset` value.
simple_boxplot(results, ylabel="AUC", save="{}-text-AUC.png".format(dataset))

from shared import TODO

# Exercises for the reader.
# NOTE(review): TODO's behaviour lives in shared.py -- confirm whether it only
# prints or also halts the script.
TODO(
    "1. Explore alpha and linear parameters; make a decision about what a good choice for this dataset might be."
)

# 2 is once again a choose-your-own:
TODO(
    "2A. Explore ngrams, lowercase v. uppercase, etc. (how changing CountVectorizer changes performance, or not)"
)
TODO(
    "2B. Explore the difference between today's approaches to the WIKI dataset and yesterday's."
)
TODO(
コード例 #4
0
    ca = ca_restart()
    train_score = ca.score(X_vali, y_vali)
    if train_score > best_score or best_model is None:
        best_score = train_score
        best_model = ca
        print("ca[{}] = {:.3}".format(i, train_score))

# Record bootstrap_accuracy / bootstrap_auc results for the best restart found
# by the loop above, evaluated on the validation data.
accuracy_graphs["CoordinateAscent"] = bootstrap_accuracy(
    best_model, X_vali, y_vali)
auc_graphs["CoordinateAscent"] = bootstrap_auc(best_model, X_vali, y_vali)

# Optional second experiment (disabled by default): 20 restarts where
# ca_restart is given a measure that calls m.compute_auc(X, y).
do_slow_AUC_experiment = False
if do_slow_AUC_experiment:
    best_score = 0.0
    best_model = None
    for i in range(20):
        ca = ca_restart(measure=lambda m, X, y: m.compute_auc(X, y))
        # NOTE(review): despite the name, train_score is computed on the
        # validation data, and the winner is still chosen with .score() rather
        # than with compute_auc -- confirm this is the intended criterion.
        train_score = ca.score(X_vali, y_vali)
        # `best_model is None` guarantees the first restart is always kept.
        if train_score > best_score or best_model is None:
            best_score = train_score
            best_model = ca
            # Only improvements are printed.
            print("ca-AUC[{}] = {:.3}".format(i, train_score))
    accuracy_graphs["CoordinateAscent-AUC"] = bootstrap_accuracy(
        best_model, X_vali, y_vali)
    auc_graphs["CoordinateAscent-AUC"] = bootstrap_auc(best_model, X_vali,
                                                       y_vali)

# Draw one figure per metric and save both under graphs/.
simple_boxplot(auc_graphs, "Linear Model AUC", save="graphs/p11-AUC.png")
simple_boxplot(accuracy_graphs,
               "Linear Model Accuracy",
               save="graphs/p11-Accuracy.png")
コード例 #5
0
)
# Stacked feature matrices: the textual model's predict_proba output placed
# side by side with the numeric features, for the validation and test data.
# (train_sX is presumably built the same way just above this excerpt.)
vali_sX = np.hstack(
    [
        best_textual.m.predict_proba(vali_xd["textual"]),
        vali_xd["numeric"],
    ]
)
test_sX = np.hstack(
    [
        best_textual.m.predict_proba(test_xd["textual"]),
        test_xd["numeric"],
    ]
)


# Second-stage model: a logistic regression trained on the stacked features.
stacked = LogisticRegression(random_state=RAND)
stacked.fit((train_sX), train_y)

# Compare the four approaches with bootstrap_auc; note that every entry is
# evaluated on the *test* data (test_xd / test_sX with test_y).
graphs = {
    "textual": bootstrap_auc(best_textual.m, test_xd["textual"], test_y),
    "numeric": bootstrap_auc(best_numeric.m, test_xd["numeric"], test_y),
    "merged": bootstrap_auc(best_merged.m, test_xd["merged"], test_y),
    "stacked": bootstrap_auc(stacked, test_sX, test_y),
}

# One box per method, saved under graphs/.
simple_boxplot(
    graphs, ylabel="AUC", xlabel="method", save="graphs/p10-early-vs-stacked.png"
)

# %%
コード例 #6
0
# plot area from means & stddev
# (translucent band from means - std to means + std over sample_subset)
plt.fill_between(sample_subset, means - std, means + std, alpha=0.2)
# Manage axes/show:
plt.xlabel("Training Data by 50")
plt.ylabel("Mean Accuracy")
# x-axis runs from 0 to N (N is defined above this excerpt).
plt.xlim([0, N])
plt.title("Shaded Accuracy Plot")
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig("graphs/p09-area-Accuracy.png")
plt.show()


# Second look at the boxplots in-order: (I like this better, IMO)
# `scores` is passed as the data dictionary, "Learning Curve" as the second
# positional argument (used as the title in the other calls of this helper).
simple_boxplot(
    scores,
    "Learning Curve",
    xlabel="Percent Training Data",
    ylabel="Accuracy",
    save="graphs/p09-boxplots-Accuracy.png",
)



# TODO: (practical tasks)
# 1. Swap in a better, but potentially more expensive classifier.
#    - Even DecisionTreeClassifier has some more interesting behavior on these plots.
# 2. Change the plots to operate over multiples of 50 samples, instead of percentages.
#    - This will likely be how you want to make these plots for your project.

# OPTIONAL CHALLENGE:
#  Refactor the code so that you can evaluate multiple models in this fashion.
#  Two different models at the same time will likely max out the visual utility of the plot.
コード例 #7
0

# TODO: C is the most important value for a SVM.
#       1/C is how important it is that the model stays small.
# TODO: RBF Kernel is the best; explore its 'gamma' parameter.

# Grid-search SVM hyper-parameters for every kernel configuration in `configs`.
# For each configuration the variant with the highest validation accuracy is
# kept and its bootstrap_accuracy result is stored in `graphs` under its name.
for cfg in configs:
    # This guard only depends on cfg, so it is evaluated once per configuration
    # instead of inside the innermost loops. Skipping the configuration here
    # also prevents max() below from being called on an empty list.
    # NOTE(review): "gamma" is not a kernel name -- confirm what this guard was
    # originally meant to skip.
    if cfg["kernel"] == "gamma":
        continue
    variants: T.List[ModelInfo] = []
    for class_weights in [None, "balanced"]:
        for c_val in [10, 25, 30, 50]:
            for gam in ["scale", "auto"]:
                # Pass the gamma under test to the classifier. Previously `gam`
                # only appeared in the variant name, so the "scale" and "auto"
                # variants were the same model trained twice. Merging into cfg
                # keeps this working even if cfg carries its own "gamma" key.
                svm = SVMClassifier(
                    C=c_val,
                    class_weight=class_weights,
                    **{**cfg, "gamma": gam},
                )
                svm.fit(X_train, y_train)
                name = "k={}{} C={} {}, gamma={}".format(
                    cfg["kernel"],
                    cfg.get("degree", ""),
                    c_val,
                    class_weights or "",
                    gam,
                )
                accuracy = svm.score(X_vali, y_vali)
                print("{}. score= {:.3}".format(name, accuracy))
                variants.append(ModelInfo(name, accuracy, svm))
    # Keep only the most accurate variant of this configuration for the plot.
    best = max(variants, key=lambda x: x.accuracy)
    graphs[best.name] = bootstrap_accuracy(best.model, X_vali, y_vali)

# One box per kernel configuration (its best variant), saved under graphs/.
simple_boxplot(
    graphs,
    title="Kernelized Models for Poetry",
    ylabel="Accuracy",
    save="graphs/p15-kernel-cmp.png",
)
コード例 #8
0
# One list per feature name; each tree in the forest then contributes the
# importance it assigned to that feature.
importances = {name: [] for name in feature_numbering.feature_names_}
for tree in rf.estimators_:
    for name, weight in zip(feature_numbering.feature_names_,
                            tree.feature_importances_):
        importances[name].append(weight)

import statistics as st

# Keep only the features whose mean importance across all trees is above 0.04,
# so the plot is not crowded by features the forest barely uses.
im = {
    feature: weights
    for feature, weights in importances.items()
    if st.mean(weights) > 0.04
}

from shared import simple_boxplot, bootstrap_r2

# Box-plot of the per-tree importances of the retained features.
simple_boxplot(
    im,
    title="Tree Importances",
    ylabel="Decision Tree Criterion Importances",
    save="graphs/project/feature-importance",
)

import typing as T
from dataclasses import dataclass
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

# Label -> list of floats. Presumably the series later handed to
# simple_boxplot -- confirm against the code that fills it.
graphs: T.Dict[str, T.List[float]] = {}


@dataclass
class Model:
    """Pair a model object with a score recorded for it."""

    # Presumably the score obtained on the validation data -- confirm where
    # instances are constructed.
    vali_score: float
    # The model object itself; typed Any, so no particular estimator class is
    # required.
    m: T.Any
コード例 #9
0
# Coordinate ascent; try them:
# three restarts, keeping the one with the highest .score() on validation data.
best_score = 0.0
best_model = None
for i in range(3):
    ca = ca_restart()
    # NOTE(review): named train_score but measured on (X_vali, y_vali).
    train_score = ca.score(X_vali, y_vali)
    # `best_model is None` keeps the first restart even if its score is <= 0.0.
    if train_score > best_score or best_model is None:
        best_score = train_score
        best_model = ca
    # Every restart is printed, not only the improvements.
    print("ca[{}] = {:.3}".format(i, train_score))

graphs["CoordinateAscent"] = bootstrap_r2(best_model, X_vali, y_vali)

# --- now try some nonlinear models --- #
# Both are created with their default hyper-parameters, fitted on the training
# data and evaluated with bootstrap_r2 on the validation data.
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
graphs["RF"] = bootstrap_r2(rf, X_vali, y_vali)
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
graphs["KNN"] = bootstrap_r2(knn, X_vali, y_vali)

# Graph everything:
# the second positional argument is built from PREDICT_COL, the column name
# defined outside this excerpt.
simple_boxplot(graphs,
               "{} R**2".format(PREDICT_COL),
               save="graphs/p11-r2-score.png")

##
# TODO:
# 1. remove the 'best-of-random' graph, so you can see the other ones!
# 2. See if there's anything here that might help your project.
コード例 #10
0
# Train a single model once and predict the validation data once; the
# bootstrap below resamples these predictions instead of retraining.
f_single.fit(X_train, y_train)
y_pred = f_single.predict(X_vali)
# Sanity check: the model's own score must equal accuracy_score of y_pred.
assert f_single.score(X_vali, y_vali) == accuracy_score(y_vali,
                                                        y_pred)  # type:ignore

# do the bootstrap:
# predictions and truths are resampled together so the pairs stay aligned;
# each trial uses its own seed (trial + RANDOM_SEED).
for trial in range(N_SAMPLES):
    sample_pred, sample_truth = resample(y_pred,
                                         y_vali,
                                         random_state=trial +
                                         RANDOM_SEED)  # type:ignore
    score = accuracy_score(y_true=sample_truth,
                           y_pred=sample_pred)  # type:ignore
    bootstrap_based_accuracies.append(score)

# show=False so the y-axis can be adjusted on the returned object before the
# figure is displayed. NOTE(review): `plot` is used like matplotlib.pyplot
# (ylim/show) -- confirm what simple_boxplot returns.
plot = simple_boxplot(
    {
        "Seed-Based (Best)": splitter_best_seeds,
        "Seed-Based (Random)": splitter_random_seeds,
        "Bootstrap-Based": bootstrap_based_accuracies,
    },
    xlabel="Sampling Method",
    ylabel="Accuracy",
    show=False,
    save="split-compare.png",
)
# Restrict the y-axis to the 0.8 - 1.0 range.
plot.ylim([0.8, 1.0])
plot.show()
# if plt.show is not working, try opening the result of plt.savefig instead!
# plot.savefig("dtree-variance.png")  # This doesn't work well on repl.it.
コード例 #11
0
# ExperimentResult(vali_acc=0.7033403492268693, params={'n_neighbors': 8, 'weights': 'uniform'}, model=KNeighborsRegressor(n_neighbors=8))

# A linear model does not work for this dataset. The features are highly correlated, which makes
# the linear model very unstable. A large number of iterations is needed for linear models to
# converge, so they are too slow to train.

# Drop the temporary X_temp / y_temp names.
del X_temp, y_temp

from shared import simple_boxplot, bootstrap_regressor

# Compare the two regressors with bootstrap_regressor on the validation data.
# NOTE(review): the title says "Validation Accuracy" while the y-axis label
# says "Mean Squared Error" -- confirm which metric bootstrap_regressor
# actually reports and make the two labels agree.
simple_boxplot(
    {
        "Decision Tree": bootstrap_regressor(dtree.model, X_vali, y_vali),
        "knn": bootstrap_regressor(knn.model, X_vali, y_vali),
        #"MLP/NN": bootstrap_accuracy(mlp.model, X_vali, y_vali),
    },
    title="Validation Accuracy",
    xlabel="Model",
    ylabel="Mean Squared Error",
    save="graphs/project/model-cmp.png",
)

## The decision tree performs better than knn for this dataset. The bootstrapped boxplot shows
# that this dataset is of rather high quality, without many outliers or much variance.

# Drop the dtree / knn names after plotting.
del dtree, knn



#%% Is my dataset large enough?
#%% Compute performance for each % of training data