Example #1

import typing as T
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Assumed context for this snippet (not shown on the original page):
# a RAND seed constant, a module-level `graphs` dict, and a bootstrap_accuracy
# helper from the course's `shared` module. The Model wrapper's shape is
# inferred from how it is used below:
class Model(T.NamedTuple):
    vali_score: float
    m: T.Any


def train_and_eval(name, x, y, vx, vy):
    """Train a family of candidate models and keep the best by validation score."""
    options: T.List[Model] = []
    for i in range(5):
        m = SGDClassifier(class_weight="balanced", random_state=RAND + i)
        m.fit(x, y)
        options.append(Model(m.score(vx, vy), m))

    for d in range(3, 15):
        m = DecisionTreeClassifier(
            max_depth=d, class_weight="balanced", random_state=RAND
        )
        m.fit(x, y)
        options.append(Model(m.score(vx, vy), m))

    for d in range(3, 15):
        for c in ["balanced", "balanced_subsample"]:
            m = RandomForestClassifier(
                max_depth=d, class_weight=c, random_state=RAND
            )
            m.fit(x, y)
            options.append(Model(m.score(vx, vy), m))

    for k in range(6, 12):
        for a in ["ball_tree", "brute"]:
            m = KNeighborsClassifier(n_neighbors=k, algorithm=a)
            m.fit(x, y)
            options.append(Model(m.score(vx, vy), m))

    #print("Perceptron", m.score(vx, vy) )
    # m = GaussianNB()
    # m.fit(x, y)
    # options.append(Model(m.score(vx, vy), m))

    #for d in range(3, 15):
    #    m = MLPClassifier(max_iter=1000, random_state=RAND)
    #    m.fit(x, y)
    #    options.append(Model(m.score(vx, vy), m))

    # pick the best model:
    best = max(options, key=lambda m: m.vali_score)
    # bootstrap its output:
    graphs[name] = bootstrap_accuracy(best.m, vx, vy)
    # record our progress:
    print("{:20}\t{:.3}\t{}".format(name, np.mean(graphs[name]), best.m))
Example #2
def train_and_eval(name, x, y, vx, vy):
    """ Train and Eval a single model. """
    options: T.List[Model] = []
    for i in range(5):
        m = SGDClassifier(class_weight="balanced", random_state=RAND + i)
        m.fit(x, y)
        options.append(Model(m.score(vx, vy), m))

    for d in range(3, 15):
        m = DecisionTreeClassifier(max_depth=d,
                                   class_weight="balanced",
                                   random_state=RAND)
        m.fit(x, y)
        options.append(Model(m.score(vx, vy), m))

    # pick the best model:
    best = max(options, key=lambda m: m.vali_score)
    # bootstrap its output:
    graphs[name] = bootstrap_accuracy(best.m, vx, vy)
    # record our progress:
    print("{:20}\t{:.3}\t{}".format(name, np.mean(graphs[name]), best.m))
Example #3

def train_and_eval(name, x, y, vx, vy):
    """Train a family of candidate models and keep the best by validation score."""
    options: T.List[Model] = []
    for i in range(5):
        m = SGDClassifier(class_weight="balanced", random_state=RAND + i)
        m.fit(x, y)
        options.append(Model(m.score(vx, vy), m))

    for d in range(3, 15):
        m = DecisionTreeClassifier(max_depth=d,
                                   class_weight="balanced",
                                   random_state=RAND)
        m.fit(x, y)
        options.append(Model(m.score(vx, vy), m))

    for rnd in range(3):
        m = Perceptron(random_state=rnd, penalty=None, max_iter=1000)
        m.fit(x, y)
        options.append(Model(m.score(vx, vy), m))

    for rnd in range(3):
        for d in range(4, 9):
            params = {
                "criterion": "entropy",
                "max_depth": d,
                "random_state": rnd,
            }
            m = RandomForestClassifier(**params)
            m.fit(x, y)
            options.append(Model(m.score(vx, vy), m))

    # pick the best model:
    best = max(options, key=lambda m: m.vali_score)
    # bootstrap its output:
    graphs[name] = bootstrap_accuracy(best.m, vx, vy)
    # record our progress:
    print("{:20}\t{:.3}\t{}".format(name, np.mean(graphs[name]), best.m))
Example #4
# (This excerpt omits its beginning; `logit` and `perceptron` below are
#  presumably built by matching consider_* helpers earlier in the file.)
dtree = consider_decision_trees()
rforest = consider_random_forest()
mlp = consider_neural_net()

print("Best Logistic Regression", logit)
print("Best Perceptron", perceptron)
print("Best DTree", dtree)
print("Best RForest", rforest)
print("Best MLP", mlp)

#%% Plot Results

# Helper method to make a series of box-plots from a dictionary:
simple_boxplot(
    {
        "Logistic Regression": bootstrap_accuracy(logit.model, X_vali, y_vali),
        "Perceptron": bootstrap_accuracy(perceptron.model, X_vali, y_vali),
        "Decision Tree": bootstrap_accuracy(dtree.model, X_vali, y_vali),
        "RandomForest": bootstrap_accuracy(rforest.model, X_vali, y_vali),
        "MLP/NN": bootstrap_accuracy(mlp.model, X_vali, y_vali),
    },
    title="Validation Accuracy",
    xlabel="Model",
    ylabel="Accuracy",
    save="model-cmp.png",
)

TODO("1. Understand consider_decision_trees; I have 'tuned' it.")
TODO("2. Find appropriate max_iter settings to stop warning messages.")
TODO(
    "3. Pick a model: {perceptron, logistic regression, neural_network} and optimize it!"
)
Example #5

accuracy_graphs = {}
auc_graphs = {}

best_score = 0.0
best_model = None
# Random-search baseline: sample random linear models and keep whichever
# scores best on the validation split (LinearModel is course-provided code,
# not shown in this excerpt).
for i in range(1000):
    m = LinearModel.random(D)
    vali_score = m.score(X_vali, y_vali)
    if vali_score > best_score or best_model is None:
        best_score = vali_score
        best_model = m
        print("rand[{}] = {:.3}".format(i, vali_score))

print(["{:1.3f}".format(x[0]) for x in best_model.weights.tolist()])

accuracy_graphs["Random"] = bootstrap_accuracy(best_model, X_vali, y_vali)
auc_graphs["Random"] = bootstrap_auc(best_model, X_vali, y_vali)

for i in range(20):
    sgd = SGDClassifier(random_state=i + RANDOM_SEED)
    sgd.fit(X_train, y_train)
    vali_score = sgd.score(X_vali, y_vali)
    # Note: best_score/best_model carry over from the random search above, so
    # "SGD" below records the overall winner, not necessarily an SGD model.
    if vali_score > best_score or best_model is None:
        best_score = vali_score
        best_model = sgd
        print("sgd[{}] = {:.3}".format(i, vali_score))

accuracy_graphs["SGD"] = bootstrap_accuracy(best_model, X_vali, y_vali)
auc_graphs["SGD"] = bootstrap_auc(best_model, X_vali, y_vali)

Example #6
X_train = Xd_train["numeric"]
X_vali = Xd_vali["numeric"]

(N, D) = X_train.shape
#%% Train up Forest models:

forest = RandomForestClassifier()
forest.fit(X_train, y_train)
print("Forest.score = {:.3}".format(forest.score(X_vali, y_vali)))

lr = LogisticRegression()
lr.fit(X_train, y_train)
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
graphs = {
    "RF": bootstrap_accuracy(forest, X_vali, y_vali),
    "SGD": bootstrap_accuracy(sgd, X_vali, y_vali),
    "LR": bootstrap_accuracy(lr, X_vali, y_vali),
}

#%% SVM
from sklearn.svm import SVC as SVMClassifier

configs = []
configs.append({"kernel": "linear"})
configs.append({"kernel": "poly", "degree": 2})
configs.append({"kernel": "poly", "degree": 3})
configs.append({"kernel": "rbf"})
# configs.append({"kernel": "sigmoid"}) # just awful.
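
# The original snippet is truncated here; a plausible continuation that
# consumes `configs`, following the graphs-dict pattern above (sketch only):
for cfg in configs:
    svm = SVMClassifier(**cfg)
    svm.fit(X_train, y_train)
    graphs["SVM:" + cfg["kernel"]] = bootstrap_accuracy(svm, X_vali, y_vali)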

Example #7
    # This fragment is from inside a loop over fitted candidate models `m`:
    # margin-based models expose decision_function; the rest expose predicted
    # probabilities for the positive class.
    if hasattr(m, "decision_function"):
        scores = m.decision_function(X_vali)
    else:
        scores = m.predict_proba(X_vali)[:, 1]
    print("\tVali-AUC: {:.3}".format(
        roc_auc_score(y_score=scores, y_true=y_vali)))

from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score
import matplotlib.pyplot as plt

# Is it randomness? Use simple_boxplot and bootstrap_auc/bootstrap_acc to see if the differences are meaningful!
from shared import bootstrap_accuracy, bootstrap_auc
f = DecisionTreeClassifier()
f.fit(X_train, y_train)
# Use distinct names so we don't shadow the imported bootstrap_auc helper:
acc_samples = bootstrap_accuracy(f=f, X=X_vali, y=y_vali)
auc_samples = bootstrap_auc(f=f, X=X_vali, y=y_vali)

print(acc_samples[:1])
print(auc_samples[:1])

plt.boxplot([acc_samples, auc_samples])
plt.xticks(ticks=[1, 2], labels=["accuracy", "AUC"])
plt.xlabel("DecisionTree bootstraps")
plt.ylabel("Score")
plt.ylim([0.3, 1.0])
plt.show()

# 2.D. Is it randomness? Control for random_state parameters!
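# A minimal sketch of one way to control for it (assumes the same splits as
# above; the seed range is chosen arbitrarily for illustration):
seed_scores = []
for seed in range(10):
    tree = DecisionTreeClassifier(random_state=seed)
    tree.fit(X_train, y_train)
    seed_scores.append(tree.score(X_vali, y_vali))
print("DTree vali-acc over 10 seeds: min={:.3} max={:.3}".format(
    min(seed_scores), max(seed_scores)))
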
"""
Results should be something like: