def train_and_eval(name, x, y, vx, vy):
    """
    Fit a pool of candidate classifiers and report on the best one.

    Each candidate is fit on ``(x, y)`` and scored on ``(vx, vy)``. The
    candidate with the highest validation score is selected (``max`` keeps
    the earliest one on ties). The result of ``bootstrap_accuracy`` for that
    model is stored in the module-level ``graphs`` dict under ``name`` and a
    one-line summary is printed.

    Args:
        name: key used in ``graphs`` and label used in the printed summary.
        x: training features.
        y: training labels.
        vx: validation features.
        vy: validation labels.

    Returns:
        None. Results are recorded in ``graphs`` and written to stdout.
    """

    def candidates():
        # Yield lazily so every estimator is constructed right before it is
        # fit, in the same order the pool is searched.
        for offset in range(5):
            yield SGDClassifier(class_weight="balanced", random_state=RAND + offset)
        for depth in range(3, 15):
            yield DecisionTreeClassifier(
                max_depth=depth, class_weight="balanced", random_state=RAND
            )
        for depth in range(3, 15):
            for weighting in ("balanced", "balanced_subsample"):
                yield RandomForestClassifier(
                    max_depth=depth, class_weight=weighting, random_state=RAND
                )
        for neighbors in range(6, 12):
            for algorithm in ("ball_tree", "brute"):
                yield KNeighborsClassifier(n_neighbors=neighbors, algorithm=algorithm)
        # NOTE: GaussianNB and MLPClassifier candidates are disabled
        # (they were commented out) and are not part of the pool.

    options: T.List[Model] = []
    for clf in candidates():
        clf.fit(x, y)
        options.append(Model(clf.score(vx, vy), clf))

    # Pick the candidate with the highest validation score.
    best = max(options, key=lambda candidate: candidate.vali_score)
    # Bootstrap the winner's output on the validation split.
    graphs[name] = bootstrap_accuracy(best.m, vx, vy)
    # Record our progress: label, mean of the bootstrap values, chosen model.
    print("{:20}\t{:.3}\t{}".format(name, np.mean(graphs[name]), best.m))
def train_and_eval(name, x, y, vx, vy):
    """
    Fit SGD and decision-tree candidates and report on the best one.

    Five ``SGDClassifier`` instances (seeds ``RAND`` .. ``RAND + 4``) and
    twelve ``DecisionTreeClassifier`` instances (``max_depth`` 3..14) are fit
    on ``(x, y)`` and scored on ``(vx, vy)``. The highest validation scorer
    (the earliest on ties) is passed to ``bootstrap_accuracy``; the result is
    stored in the module-level ``graphs`` dict under ``name`` and a summary
    line is printed.

    Args:
        name: key used in ``graphs`` and label used in the printed summary.
        x: training features.
        y: training labels.
        vx: validation features.
        vy: validation labels.

    Returns:
        None. Results are recorded in ``graphs`` and written to stdout.
    """
    linear_models = [
        SGDClassifier(class_weight="balanced", random_state=RAND + seed_shift)
        for seed_shift in range(5)
    ]
    tree_models = [
        DecisionTreeClassifier(
            max_depth=depth, class_weight="balanced", random_state=RAND
        )
        for depth in range(3, 15)
    ]

    # Fit and score in pool order: linear models first, then trees.
    options: T.List[Model] = []
    for estimator in linear_models + tree_models:
        estimator.fit(x, y)
        options.append(Model(estimator.score(vx, vy), estimator))

    # Pick the candidate with the highest validation score.
    best = max(options, key=lambda entry: entry.vali_score)
    # Bootstrap the winner's output on the validation split.
    graphs[name] = bootstrap_accuracy(best.m, vx, vy)
    # Record our progress: label, mean of the bootstrap values, chosen model.
    print("{:20}\t{:.3}\t{}".format(name, np.mean(graphs[name]), best.m))
def train_and_eval(name, x, y, vx, vy):
    """
    Fit four families of candidate classifiers and report on the best one.

    The pool is searched in this order: ``SGDClassifier`` (5 seeds),
    ``DecisionTreeClassifier`` (``max_depth`` 3..14), ``Perceptron``
    (3 seeds) and entropy-criterion ``RandomForestClassifier``
    (3 seeds x ``max_depth`` 4..8). Each candidate is fit on ``(x, y)`` and
    scored on ``(vx, vy)``. The highest validation scorer (the earliest on
    ties) is passed to ``bootstrap_accuracy``; the result is stored in the
    module-level ``graphs`` dict under ``name`` and a summary line is printed.

    Args:
        name: key used in ``graphs`` and label used in the printed summary.
        x: training features.
        y: training labels.
        vx: validation features.
        vy: validation labels.

    Returns:
        None. Results are recorded in ``graphs`` and written to stdout.
    """
    options: T.List[Model] = []

    def consider(clf):
        # Fit one candidate and remember it with its validation score.
        clf.fit(x, y)
        options.append(Model(clf.score(vx, vy), clf))

    for seed_shift in range(5):
        consider(SGDClassifier(class_weight="balanced", random_state=RAND + seed_shift))

    for depth in range(3, 15):
        consider(
            DecisionTreeClassifier(
                max_depth=depth, class_weight="balanced", random_state=RAND
            )
        )

    for seed in range(3):
        consider(Perceptron(random_state=seed, penalty=None, max_iter=1000))

    for seed in range(3):
        for depth in range(4, 9):
            consider(
                RandomForestClassifier(
                    criterion="entropy", max_depth=depth, random_state=seed
                )
            )

    # Pick the candidate with the highest validation score.
    best = max(options, key=lambda scored: scored.vali_score)
    # Bootstrap the winner's output on the validation split.
    graphs[name] = bootstrap_accuracy(best.m, vx, vy)
    # Record our progress: label, mean of the bootstrap values, chosen model.
    print("{:20}\t{:.3}\t{}".format(name, np.mean(graphs[name]), best.m))
# Compare the best candidate found for each model family.
# NOTE(review): `logit` and `perceptron` are assigned before this fragment;
# presumably by sibling consider_* helpers -- confirm against the full file.
dtree = consider_decision_trees()
rforest = consider_random_forest()
mlp = consider_neural_net()

# Print the selected result object for every family.
print("Best Logistic Regression", logit)
print("Best Perceptron", perceptron)
print("Best DTree", dtree)
print("Best RForest", rforest)
print("Best MLP", mlp)

#%% Plot Results

# Helper method to make a series of box-plots from a dictionary:
# each entry maps a display label to bootstrap_accuracy(...) of that family's
# `.model` on the validation split.
# NOTE(review): `save=` presumably writes the figure to that path -- confirm
# in simple_boxplot.
simple_boxplot(
    {
        "Logistic Regression": bootstrap_accuracy(logit.model, X_vali, y_vali),
        "Perceptron": bootstrap_accuracy(perceptron.model, X_vali, y_vali),
        "Decision Tree": bootstrap_accuracy(dtree.model, X_vali, y_vali),
        "RandomForest": bootstrap_accuracy(rforest.model, X_vali, y_vali),
        "MLP/NN": bootstrap_accuracy(mlp.model, X_vali, y_vali),
    },
    title="Validation Accuracy",
    xlabel="Model",
    ylabel="Accuracy",
    save="model-cmp.png",
)

# Exercise reminders for the reader of this script.
TODO("1. Understand consider_decision_trees; I have 'tuned' it.")
TODO("2. Find appropriate max_iter settings to stop warning messages.")
# NOTE(review): the call below is cut off at the end of this fragment; its
# closing parenthesis is not visible here.
TODO(
    "3. Pick a model: {perceptron, logistic regression, neural_network} and optimize it!"
# Bootstrap results per search strategy, keyed by display label.
accuracy_graphs = {}
auc_graphs = {}

# --- Search 1: randomly initialised linear models -------------------------
# NOTE(review): indentation was reconstructed; the progress print is assumed
# to sit inside the "new best" branch -- confirm against the full file.
best_score = 0.0
best_model = None
for i in range(1000):
    m = LinearModel.random(D)
    # Despite its name, train_score is measured on the validation split.
    train_score = m.score(X_vali, y_vali)
    if train_score > best_score or best_model is None:
        best_score = train_score
        best_model = m
        print("rand[{}] = {:.3}".format(i, train_score))

# Show the first component of each weight row of the best random model.
print(["{:1.3f}".format(x[0]) for x in best_model.weights.tolist()])
accuracy_graphs["Random"] = bootstrap_accuracy(best_model, X_vali, y_vali)
auc_graphs["Random"] = bootstrap_auc(best_model, X_vali, y_vali)

# --- Search 2: SGD-trained linear models ----------------------------------
# Reset the trackers so this search is judged on its own. Without the reset,
# an SGD run only replaces best_model if it beats the best random model, and
# the "SGD" entries below could be computed from a random LinearModel.
best_score = 0.0
best_model = None
for i in range(20):
    sgd = SGDClassifier(random_state=i + RANDOM_SEED)
    sgd.fit(X_train, y_train)
    # Validation score again, kept under the original variable name.
    train_score = sgd.score(X_vali, y_vali)
    if train_score > best_score or best_model is None:
        best_score = train_score
        best_model = sgd
        print("sgd[{}] = {:.3}".format(i, train_score))

accuracy_graphs["SGD"] = bootstrap_accuracy(best_model, X_vali, y_vali)
auc_graphs["SGD"] = bootstrap_auc(best_model, X_vali, y_vali)
# Use only the "numeric" feature view of the train/validation splits.
X_train, X_vali = Xd_train["numeric"], Xd_vali["numeric"]
N, D = X_train.shape

#%% Train up Forest models:

# Baseline 1: random forest with default settings; report validation score.
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
print("Forest.score = {:.3}".format(forest.score(X_vali, y_vali)))

# Baseline 2: logistic regression with default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Baseline 3: SGD-trained linear classifier with default settings.
sgd = SGDClassifier()
sgd.fit(X_train, y_train)

# Bootstrap results on the validation split, keyed by a short model label.
graphs = {
    "RF": bootstrap_accuracy(forest, X_vali, y_vali),
    "SGD": bootstrap_accuracy(sgd, X_vali, y_vali),
    "LR": bootstrap_accuracy(lr, X_vali, y_vali),
}

#%% SVM
from sklearn.svm import SVC as SVMClassifier

# Keyword-argument sets for the SVM variants to try.
configs = [
    {"kernel": "linear"},
    {"kernel": "poly", "degree": 2},
    {"kernel": "poly", "degree": 3},
    {"kernel": "rbf"},
    # {"kernel": "sigmoid"},  # just awful.
]
if hasattr(m, "decision_function"): scores = m.decision_function(X_vali) else: scores = m.predict_proba(X_vali)[:, 1] print("\tVali-AUC: {:.3}".format( roc_auc_score(y_score=scores, y_true=y_vali))) from sklearn.utils import resample from sklearn.metrics import accuracy_score import matplotlib.pyplot as plt # Is it randomness? Use simple_boxplot and bootstrap_auc/bootstrap_acc to see if the differences are meaningful! from shared import bootstrap_accuracy, bootstrap_auc f = DecisionTreeClassifier() f.fit(X_train, y_train) bootstrap_acc = bootstrap_accuracy(f=f, X=X_vali, y=y_vali) bootstrap_auc = bootstrap_auc(f=f, X=X_vali, y=y_vali) print(bootstrap_acc[:1]) print(bootstrap_auc[:1]) plt.boxplot([bootstrap_acc, bootstrap_auc]) plt.xticks(ticks=[1, 2], labels=["bootstrap_acc", "bootstrap_auc"]) plt.xlabel("DecisionTree bootstraps") plt.ylabel("Accuracy") plt.ylim([0.3, 1.0]) plt.show() # 2.D. Is it randomness? Control for random_state parameters! """ Results should be something like: