def make_model(size_ts=10000, size_ls=250, data_set=1, graph=False,
               depth=None, random_state=0):
    if data_set == 1:
        X_train, y_train, X_test, y_test = make_data1(
            size_ts, size_ls, 0, False, random_state=random_state)
    else:
        X_train, y_train, X_test, y_test = make_data2(
            size_ts, size_ls, 0, False, random_state=random_state)

    clf = DecisionTreeClassifier(max_depth=depth)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    if graph:
        # str(None) == "None", so this covers the unconstrained case as well.
        plot_boundary(fname="figures/data" + str(data_set) + "_depth" + str(depth),
                      fitted_estimator=clf, X=X_test, y=y_pred,
                      title="data set " + str(data_set) + " with depth = " + str(depth))
        if depth is None:
            # Also export the full (unpruned) tree as a graphviz figure.
            tree.plot_tree(clf)
            dot_data = tree.export_graphviz(clf, out_file=None)
            dot_graph = graphviz.Source(dot_data)  # distinct name: don't shadow the `graph` flag
            dot_graph.render("figures/tree_data" + str(data_set) + "_depthNone")

    return accuracy_score(y_test, y_pred)
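# Example usage (a minimal sketch; assumes make_data1/make_data2, plot_boundary
# and the imports used above are available in this module):
if __name__ == "__main__":
    for d in (1, 2, 4, 8, None):
        print("depth = {}: test accuracy = {:.4f}".format(d, make_model(depth=d)))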
def test_and_plot(title, X, y, n_neighbors):
    """Generate an image of our data and the predictions of the k-nearest
    neighbors classifier.

    Parameters
    ----------
    title : str
        The title we give to our image.
    X : array of shape [n_samples, 2]
        The input samples.
    y : array of shape [n_samples]
        The output values.
    n_neighbors : int > 0
        The number of neighbors of our model.
    """
    X_train = X[:150]
    y_train = y[:150]
    X_test = X[-1850:]
    y_test = y[-1850:]

    clf = KNeighborsClassifier(n_neighbors)
    clf.fit(X_train, y_train)
    plot_boundary(title, clf, X_test, y_test)
def compute_accuracy(nbPoints, nbGen, dataset="dataset1"):
    """Computes the test set accuracies over nbGen generations of the dataset,
    using a LinearDiscriminantAnalysis() as a classifier.

    Parameters
    ----------
    - nbPoints : number of samples.
    - nbGen : number of generations of the dataset.
    - dataset : which dataset to generate ("dataset1" or "dataset2").

    Returns
    -------
    accuracy : array of the test set accuracies over the nbGen generations.
    """
    accuracy = []
    for gen in range(nbGen):
        if dataset == "dataset2":
            X, y = make_dataset2(nbPoints, gen)
        else:
            X, y = make_dataset1(nbPoints, gen)
        X_ls, X_ts, y_ls, y_ts = train_test_split(X, y, train_size=0.8,
                                                  test_size=0.2)
        estimator = LinearDiscriminantAnalysis().fit(X_ls, y_ls)
        accuracy.append(estimator.score(X_ts, y_ts))
        if gen == 1:
            plot_boundary("LDA {}".format(dataset), estimator, X_ts, y_ts, 0.1)
    return np.array(accuracy)
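# Example usage (a minimal sketch; assumes make_dataset1/make_dataset2 and
# plot_boundary are importable here):
if __name__ == "__main__":
    accs = compute_accuracy(1500, 5, dataset="dataset1")
    print("LDA mean accuracy: {:.4f} (std {:.4f})".format(accs.mean(), accs.std()))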
def test_and_plot(title, X, y, m_depth=None):
    """Generate an image of our data and the predictions of the decision tree.

    Parameters
    ----------
    title : str
        The title we give to our image.
    X : array of shape [n_samples, 2]
        The input samples.
    y : array of shape [n_samples]
        The output values.
    m_depth : int > 0, optional (default = None)
        The maximum depth allowed for our decision tree.
    """
    X_train = X[:150]
    y_train = y[:150]
    X_test = X[-1850:]
    y_test = y[-1850:]

    clf = DecisionTreeClassifier(max_depth=m_depth)
    clf.fit(X_train, y_train)
    plot_boundary(title, clf, X_test, y_test)
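# Example usage of the decision-tree variant (a sketch; assumes a dataset
# generator such as make_dataset2 returning at least 2000 samples):
if __name__ == "__main__":
    X, y = make_dataset2(2000, 0)
    for depth in (1, 2, 4, None):
        test_and_plot("dt_depth_{}".format(depth), X, y, m_depth=depth)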
def q21(x, y):
    trainSample = (x[:1000, :], y[:1000])
    testSample = (x[1000:, :], y[1000:])
    for i in (1, 5, 50, 100, 500):
        knn = KNeighborsClassifier(n_neighbors=i)
        estimator = knn.fit(trainSample[0], trainSample[1])
        yPredicted = estimator.predict(testSample[0])
        print("Accuracy with {} neighbors is: {}.".format(
            i, accuracy_score(testSample[1], yPredicted)))
        name = "boundaryKNN" + str(i)
        title = "Distribution for n_neighbors = " + str(i)
        plot_boundary(name, estimator, testSample[0], testSample[1], title=title)
def predLR(n_iter, learning_rate, trainSample, testSample, plot=False):
    lr = LogisticRegressionClassifier(n_iter=n_iter, learning_rate=learning_rate)
    lr.fit(trainSample[0], trainSample[1])
    yPredicted = lr.predict(testSample[0])
    acc = accuracy_score(testSample[1], yPredicted)
    if plot:
        name = "boundaryLR"
        title = ("Distribution for " + str(n_iter) + " iterations and a "
                 "learning_rate of " + str(learning_rate) + ".")
        plot_boundary(name, lr, testSample[0], testSample[1], title=title)
    return acc
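# Example usage (a minimal sketch; assumes make_unbalanced_dataset and the
# project's own LogisticRegressionClassifier are in scope):
if __name__ == "__main__":
    X, y = make_unbalanced_dataset(3000)
    train, test = (X[:1000], y[:1000]), (X[1000:], y[1000:])
    print("LR accuracy: {:.4f}".format(predLR(100, 0.1, train, test, plot=True)))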
def make_model(size_ts=10000, size_ls=250, data_set=1, graph=False,
               n_neigh=1, cv=False):
    if data_set == 1:
        X_train, y_train, X_test, y_test = make_data1(size_ts, size_ls, 0, None)
    else:
        X_train, y_train, X_test, y_test = make_data2(size_ts, size_ls, 0, None)

    clf = KNeighborsClassifier(n_neighbors=n_neigh)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    if graph:
        plot_boundary(fname="figures/data_" + str(data_set) + "_neighbors" + str(n_neigh),
                      fitted_estimator=clf, X=X_test, y=y_pred,
                      title="data set " + str(data_set) + " with neighbors = " + str(n_neigh))

    return accuracy_score(y_test, y_pred)
def predPlotDT(trainSample, testSample, max_depth=None, plot=False):
    # Module-level counter identifying which dataset is being reported on.
    global DScount

    # Compute the accuracy here rather than via predDT, because we need the
    # fitted tree in order to plot its decision boundary.
    dt = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    estimator = dt.fit(trainSample[0], trainSample[1])
    yPredicted = estimator.predict(testSample[0])
    acc = accuracy_score(testSample[1], yPredicted)

    if max_depth is not None:
        print("The accuracy of dataset {} with max depth of {} is {}.".format(
            DScount, max_depth, acc))
    else:
        print("The accuracy of dataset {} without max depth is {}.".format(
            DScount, acc))

    if plot:
        print("Saving the file.")
        name = "boundaryDT" + str(max_depth)
        title = ("Distribution for max depth tree of " + str(max_depth)
                 if max_depth is not None
                 else "Distribution for tree without max depth")
        plot_boundary(name, estimator, testSample[0], testSample[1], title=title)

        name = "boundaryDTLS" + str(max_depth)
        title = ("Distribution for max depth tree of " + str(max_depth)
                 + " for the learning sample"
                 if max_depth is not None
                 else "Distribution for tree without max depth for the learning sample")
        plot_boundary(name, estimator, trainSample[0], trainSample[1], title=title)

        nameTree = "Tree" + str(max_depth) + ".dot"
        tree.export_graphviz(dt, out_file=nameTree)

    return acc
def tree(max_depth_input=None, fname=""):
    # Only max_depth is varied; everything else stays at the scikit-learn
    # defaults (the deprecated min_impurity_split/presort arguments are dropped).
    model = DecisionTreeClassifier(criterion='gini', splitter='best',
                                   max_depth=max_depth_input,
                                   min_samples_split=2, min_samples_leaf=1,
                                   min_weight_fraction_leaf=0.0,
                                   max_features=None, random_state=None,
                                   max_leaf_nodes=None,
                                   min_impurity_decrease=0.0,
                                   class_weight=None)
    model.fit(x_data_train, y_data_train)
    y_pred = model.predict(x_data_test)
    # (assumes the module-level X, y hold the full dataset used for the plot)
    plot_boundary(fname, model, X, y)
    return accuracy_score(y_data_test, y_pred)
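# Example usage (a minimal sketch; assumes the module-level splits
# x_data_train, y_data_train, x_data_test, y_data_test and the plotting
# globals X, y are already defined):
if __name__ == "__main__":
    for d in (1, 2, 4, 8, None):
        print("depth = {}: accuracy = {:.4f}".format(
            d, tree(d, "tree_depth_" + str(d))))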
def get_accuracy(n_neighbors, seed, which, dataset_size, trainingSet_size):
    """
    This function predicts with the KNN class and builds a graph based on the
    prediction; it also prints the accuracy corresponding to the graph.

    Arguments:
        n_neighbors: an array containing all the numbers of neighbors for
            which we should apply KNN
        seed: the seed used for the random operations
        which: which dataset should be used
        dataset_size: the number of samples in the dataset
        trainingSet_size: the number of samples in the training set

    Return:
        /
    """
    # Get the sets
    x_train_sample, x_test_sample, y_train_sample, y_test_sample = get_sets(
        dataset_size, trainingSet_size, seed, which)

    for i in range(len(n_neighbors)):
        # Fit a k-NN classifier for each number of neighbors
        knn = KNeighborsClassifier(n_neighbors=n_neighbors[i]).fit(
            x_train_sample, y_train_sample)

        # Predictions done from the training samples
        prediction = knn.predict(x_test_sample)

        # Compute the accuracy
        accuracy = accuracy_score(y_test_sample, prediction)

        # Plot
        fname = "KNN=" + str(n_neighbors[i]) + "_ds=" + str(which)
        title = "KNN of " + str(n_neighbors[i]) \
                + " neighbours and with an accuracy of %0.4f" % accuracy
        plot_boundary(fname, knn, x_test_sample, y_test_sample, 0.1, title)

        print("The accuracy for the dataset " + str(which)
              + " is: %0.4f" % accuracy)
def compute_accuracy(nb_gen, max_depth, nb_points):
    """Computes the test set accuracies over n generations of the dataset,
    using the DecisionTreeClassifier class from sklearn.tree with a particular
    max depth.

    Parameters
    ----------
    - nb_gen : number of generations of the dataset.
    - max_depth : maximum depth of the decision tree for the DT model.
    - nb_points : number of samples.

    Returns
    -------
    accuracy : a list of the test set accuracies of the different generations.
    """
    accuracy = []
    for generation in range(nb_gen):
        X, y = make_dataset2(nb_points, generation)
        X_ls, X_ts, y_ls, y_ts = train_test_split(X, y, train_size=.8,
                                                  test_size=.2)
        # max_depth is passed as the string "None" when the tree is unconstrained.
        if max_depth == "None":
            estimator = DecisionTreeClassifier().fit(X_ls, y_ls)
        else:
            estimator = DecisionTreeClassifier(max_depth=max_depth).fit(
                X_ls, y_ls)
        y_pred = estimator.predict(X_ts)
        accuracy.append(accuracy_score(y_ts, y_pred))
        if generation == 1:
            plot_boundary("DT maxdepth {}".format(max_depth), estimator,
                          X_ts, y_ts, 0.1)
    return np.array(accuracy)
def compute_accuracy(nb_gen, nb_neighbors, nb_points):
    """Computes the test set accuracies over n generations of the dataset,
    for the KNeighborsClassifier class from sklearn.neighbors with a
    particular number of nearest neighbors.

    Parameters
    ----------
    - nb_gen : number of generations of the dataset.
    - nb_neighbors : number of nearest neighbors for the KNN model.
    - nb_points : number of samples.

    Returns
    -------
    accuracy : a list of the test set accuracies of the different generations.
    """
    accuracy = []
    for generation in range(nb_gen):
        X, y = make_dataset2(nb_points, generation)
        X_ls, X_ts, y_ls, y_ts = train_test_split(X, y, train_size=.8,
                                                  test_size=.2)
        estimator = KNeighborsClassifier(n_neighbors=nb_neighbors).fit(
            X_ls, y_ls)
        y_pred = estimator.predict(X_ts)
        accuracy.append(accuracy_score(y_ts, y_pred))
        if generation == 1:
            plot_boundary("KNN neighbors {}".format(nb_neighbors), estimator,
                          X_ts, y_ts, 0.1)
    return np.array(accuracy)
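# Example usage of the KNN variant (a sketch; assumes make_dataset2 and
# plot_boundary are in scope):
if __name__ == "__main__":
    for k in (1, 5, 25, 125):
        accs = compute_accuracy(5, k, 1500)
        print("k = {}: mean accuracy {:.4f} (std {:.4f})".format(
            k, accs.mean(), accs.std()))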
def findNbIter(trainSample, testSample, plot=False):
    nbIter = [1, 10, 20, 50, 100, 200, 500, 1000]
    bestAcc = 0
    bestIter = 0
    for i in nbIter:
        start_time = time.time()
        lr = LogisticRegressionClassifier(n_iter=i)
        lr.fit(trainSample[0], trainSample[1])
        yPredicted = lr.predict(testSample[0])
        currentAcc = accuracy_score(testSample[1], yPredicted)
        if bestAcc < currentAcc:
            bestAcc = currentAcc
            bestIter = i
        print("Accuracy for {} iterations is {}. It took {} sec.".format(
            i, currentAcc, time.time() - start_time))
        if plot:
            name = "boundaryLR" + str(i)
            title = "Distribution for " + str(i) + " iterations."
            plot_boundary(name, lr, testSample[0], testSample[1], title=title)
    print("The optimal number of iterations is {}".format(bestIter))
    return bestIter
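# Example usage (a minimal sketch; assumes make_unbalanced_dataset, the
# project's LogisticRegressionClassifier, and an imported `time` module):
if __name__ == "__main__":
    X, y = make_unbalanced_dataset(3000)
    best = findNbIter((X[:1000], y[:1000]), (X[1000:], y[1000:]), plot=False)
    print("Selected n_iter = {}".format(best))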
            # (Tail of the Gaussian naive Bayes probability computation: Py is
            # the unnormalised posterior of one class, Z the normalisation sum.)
            Py *= (1 / factor_den) * exp
            p[h].append(Py)
            Z += Py
        # Normalise the class probabilities of sample h so they sum to one.
        for i in range(len(p[h])):
            p[h][i] /= Z
    p = np.matrix(p)
    return p


if __name__ == "__main__":
    dataset_size = 2000
    trainingSet_size = 150
    for i in range(2):
        x_train_sample, x_test_sample, y_train_sample, y_test_sample = get_sets(
            dataset_size, trainingSet_size, 1, i + 1)
        nb = GaussianNaiveBayes().fit(x_train_sample, y_train_sample)
        prediction = nb.predict(x_test_sample)
        accuracy = accuracy_score(y_test_sample, prediction)
        fname = "NB_ds=" + str(i + 1)
        title = "Naive Bayes classification with an accuracy of %0.4f" % accuracy
        plot_boundary(fname, nb, x_test_sample, y_test_sample, 0.1, title)
        # (Tail of the logistic-regression cost function.)
        ypr = Sigmoid(theta, X[i])
        error = (ypr - Y[i]) * xij
        Err += error
    J = Err / len(Y)
    return J


niter = [50]  # [10, 200, 1000]
learningrate = [0.01, 0.1, 1, 10]

# Main body
if __name__ == "__main__":
    for i in range(len(niter)):
        for j in range(len(learningrate)):
            cnf = np.zeros((2, 2))
            a = 0
            st = 0
            ni = niter[i]
            lr = learningrate[j]
            for k in range(5):  # five generations of the dataset
                b = make_unbalanced_dataset(3000)
                Xtr = np.array(b[0][0:1000, :])
                ytr = b[1][0:1000]
                Xte = np.array(b[0][1000:, :])
                yte = b[1][1000:]
                c = LogisticRegressionClassifier(n_iter=ni, learning_rate=lr)
                t = c.fit(Xtr, ytr)
                plot_boundary(fname="Logistic_regression_learn_rate_%s_n_iter_%s.png"
                              % (lr, ni),
                              fitted_estimator=t, X=Xte, y=yte)
                pr = t.predict(Xte)
                cnf += confusion_matrix(yte, pr)
                a += round(accuracy_score(yte, pr), 3)
                st += round(np.std(pr - yte), 2)
            print("Average accuracy with learning rate = %s & n_iter = %s: "
                  "True negative %s False negative %s True positive %s "
                  "False positive %s Accuracy score %s St dev %s"
                  % (lr, ni, cnf[0, 0] / 5., cnf[1, 0] / 5., cnf[1, 1] / 5.,
                     cnf[0, 1] / 5., a / 5., st / 5.))
# (Question 2)
if __name__ == "__main__":
    n_table = [1, 5, 25, 125, 300, 625, 1200]
    data = make_dataset2(1500, 1997)
    scores = {}
    mean = {}
    var = {}
    for n in n_table:
        # part 1
        estimator = KNeighborsClassifier(n_neighbors=n).fit(data[0], data[1])
        print("computing " + str(n))
        plot_boundary("knn_" + str(n), estimator, data[0], data[1])
        scores[n] = cross_val_score(estimator, data[0], data[1], cv=10).tolist()
        # Repeat 10-fold cross-validation with nine more shuffled splits.
        for i in range(9):
            cv = StratifiedKFold(n_splits=10, random_state=i, shuffle=True)
            scores[n].extend(cross_val_score(estimator, data[0], data[1],
                                             cv=cv).tolist())
        print(len(scores[n]))
        mean[n] = np.mean(scores[n])
        var[n] = np.var(scores[n])
    print("mean " + str(mean))
    print("var " + str(var))

    # part 2
    # desired accuracy
            # (Tail of the LDA probability computation.)
            numerator.append(num[k])
        numerator = np.asarray(numerator)
        p[i] = numerator / den
        i += 1
    return p


if __name__ == "__main__":
    # 1st dataset
    train_set = make_dataset1(1200, 565354)
    lda = LinearDiscriminantAnalysis()
    lda.fit(train_set[0], train_set[1])
    plot_boundary('lda_trainDataset1', lda, train_set[0], train_set[1])

    # 2nd dataset
    train_set = make_dataset2(1200, 565354)
    lda = LinearDiscriminantAnalysis()
    lda.fit(train_set[0], train_set[1])
    plot_boundary('lda_trainDataset2', lda, train_set[0], train_set[1])

    # Accuracy and std for five generations of different seeds
    accuracy1 = np.zeros(5)
    accuracy2 = np.zeros(5)
    seed = 10000  # Will change for each generation
    for i in range(5):
        (train_set1, test_set1) = (make_dataset1(1200, seed),
# (Question 1) dt.py: Decision tree

SAMPLE_NUMBER = 2000
TRAIN_SET_SAMPLE_NUM = 150

X, y = get_dataset(SAMPLE_NUMBER)
X_train, y_train = X[:TRAIN_SET_SAMPLE_NUM], y[:TRAIN_SET_SAMPLE_NUM]
X_test, y_test = X[TRAIN_SET_SAMPLE_NUM:], y[TRAIN_SET_SAMPLE_NUM:]

# 1.
decisionTreeClassifier = DecisionTreeClassifier(random_state=get_random_state())
decisionTreeClassifier.fit(X_train, y_train)
y_dtc = decisionTreeClassifier.predict(X_test)

# Plot
plot_boundary("1-1-Ground-Truth", decisionTreeClassifier, X_test, y_test,
              title="Ground Truth data")
plot_boundary("1-1-Prediction", decisionTreeClassifier, X_test, y_dtc,
              title="Prediction data")

# 2.
max_depths = list(range(1, 20))
training_scores = []
for max_depth in max_depths:
    decisionTreeClassifier = DecisionTreeClassifier(
        random_state=get_random_state(), max_depth=max_depth)
    decisionTreeClassifier.fit(X_train, y_train)
    y_dtc = decisionTreeClassifier.predict(X_test)

    # Plot
    plot_boundary("1-2-Max-Depth_%s" % str(max_depth), decisionTreeClassifier,
                  X_test, y_test,
                  title="Real data with max_depth = %s" % str(max_depth))
    training_scores.append(decisionTreeClassifier.score(X_train, y_train))
from plot import plot_boundary
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# make_dataset2 is assumed to be imported from the project's data module.

if __name__ == "__main__":
    train_set = make_dataset2(1200, 565354)
    test_set = make_dataset2(300, 156)
    seeds = [5, 36, 47, 9898]
    depth_test = [1, 2, 4, 8]
    scores = {}

    # create tree and figure of unconstrained depth
    estimator = DecisionTreeClassifier().fit(train_set[0], train_set[1])
    plot_boundary("inf_train_tree", estimator, train_set[0], train_set[1])
    plot_boundary("inf_test_tree", estimator, test_set[0], test_set[1])
    prediction = estimator.predict(test_set[0])
    scores[0] = []
    scores[0].append(accuracy_score(test_set[1], prediction))

    # part 2, test model against 5 test sets.
    for seed in seeds:
        test_set = make_dataset2(300, seed)
        prediction = estimator.predict(test_set[0])
        scores[0].append(accuracy_score(test_set[1], prediction))

    # create tree and figure for each depth
    for depth in depth_test:
        print("create tree " + str(depth))
        estimator = DecisionTreeClassifier(max_depth=depth).fit(
            train_set[0], train_set[1])
if __name__ == "__main__": print("Nearest Neighbour: Standard calculation") for i in range(len(d)): cnf = np.zeros((2, 2)) a = 0 st = 0 for k in range(5): b = make_unbalanced_dataset(3000) Xtr = np.array(b[0][0:1000, :]) ytr = b[1][0:1000] Xte = np.array(b[0][1000:, :]) yte = b[1][1000:] c = KNeighborsClassifier(n_neighbors=d[i]) t = KNeighborsClassifier.fit(c, Xtr, ytr) plot_boundary(fname="K_nearest_neighbors_%s.png" % (str(d[i])), fitted_estimator=t, X=Xte, y=yte) pr = t.predict(Xte) cnf += confusion_matrix(yte, pr) a += round(accuracy_score(yte, pr), 3) st += round(np.std(pr - yte), 2) print( "Average accuracy if N-value %s : True negative %s False negative %s True positive %s False positive %s Accuracy score %s St dev %s" % (d[i], cnf[0, 0] / 5., cnf[1, 0] / 5., cnf[1, 1] / 5., cnf[0, 1] / 5., a / 5., st / 5.)) print("Ten-fold cross validation") for i in range(len(d)): cnf = np.zeros((2, 2)) a = 0 st = 0 for k in range(5):
def knn_plot(n_neighbors_input=1, fname=""):
    model = KNeighborsClassifier(n_neighbors=n_neighbors_input)
    model.fit(x_data_train, y_data_train)
    y_pred = model.predict(x_data_test)
    # (assumes the module-level x, y hold the full dataset used for the plot)
    plot_boundary(fname, model, x, y)
    return accuracy_score(y_data_test, y_pred)
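# Example usage (a minimal sketch; assumes the module-level data splits and
# plotting globals mentioned above):
if __name__ == "__main__":
    for k in (1, 5, 25, 125):
        print("k = {}: accuracy = {:.4f}".format(k, knn_plot(k, "knn_" + str(k))))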
    # (Inside a loop over the dataset generators f.)
    # Get training and testing sets
    X, y = f(n_samples, random_state=0)  # seed fixed to 0
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=p_test,
                                                        shuffle=False)

    # K Neighbors algorithm
    for n in n_neighbors:
        estimator = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)

        # Save results
        fig_name = f.__name__ + '_knn_' + str(n)
        plot_boundary(fig_name, estimator, X_test[:n_show], y_test[:n_show])

##############
# Question 2 #
##############

# Variables
k, k_neighbors = 10, range(5, 150, 1)  # ten-fold cross validation
accuracies_mean = np.zeros((len(k_neighbors)))

# Get the second dataset
X, y = datasets[1](n_samples, random_state=0)  # seed fixed to 0

# Apply the algorithm with k-fold cross validation on second dataset
for i, n in enumerate(k_neighbors):
    neigh = KNeighborsClassifier(n_neighbors=n)
X_test, y_test = X[TRAIN_SET_SAMPLE_NUM:], y[TRAIN_SET_SAMPLE_NUM:]

# 1.
knc = KNeighborsClassifier(n_neighbors=1)
knc.fit(X_train, y_train)
y_predict = knc.predict(X_test)
n_errors = compare(y_test, y_predict)
print("[Q2-1] 1-NN - Error percentage : {}%".format(n_errors * 100 / len(X_test)))

# 2.
oneNN = sklearn.neighbors.KNeighborsClassifier(n_neighbors=1)
oneNN.fit(X_train, y_train)
y_predict = oneNN.predict(X_test)

plot_boundary("2-2-Ground-Truth", oneNN, X_test, y_test,
              title="Ground Truth data")
plot_boundary("2-2-Prediction", oneNN, X_test, y_predict,
              title="Prediction data")

n_errors = compare(y_test, y_predict)
print("[Q2-2] 1-NN - Error percentage : {}%".format(n_errors * 100 / len(X_test)))
plot_boundary("2-2-Training-set", oneNN, X_train, y_train,
              title="Training set boundaries")

# 3.
n_neighbors = [1, 2, 4, 7, 10, 30, 90, 150]
for n in n_neighbors:
    nearest_neighb_class = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n)
    nearest_neighb_class.fit(X_train, y_train)
    y_predict = nearest_neighb_class.predict(X_test)
    plot_boundary("2-3-Prediction-%s" % str(n), nearest_neighb_class, X_test,
                  y_predict, title="Prediction data")
np.random.seed(0)

if __name__ == "__main__":
    cnf = np.zeros((2, 2))
    a = 0
    st = 0
    for k in range(5):
        # Choose either the unbalanced or the balanced dataset and comment
        # out the other.
        b = make_unbalanced_dataset(3000)
        # b = make_balanced_dataset(3000)
        Xtr = np.array(b[0][0:1000, :])
        ytr = b[1][0:1000]
        Xte = np.array(b[0][1000:, :])
        yte = b[1][1000:]
        c = GaussianNB()
        t = c.fit(Xtr, ytr)
        if k == 0:
            plot_boundary(fname="Naive_Bayes_%s.png" % (k),
                          fitted_estimator=t, X=Xte, y=yte)
        pr = t.predict(Xte)
        cnf += confusion_matrix(yte, pr)
        a += round(accuracy_score(yte, pr), 2)
        st += round(np.std(pr - yte), 2)
    print("Accuracy: True negative %s False negative %s True positive %s "
          "False positive %s Accuracy score %s St dev %s"
          % (cnf[0, 0] / 5, cnf[1, 0] / 5, cnf[1, 1] / 5, cnf[0, 1] / 5,
             a / 5, st / 5))
            # Compute the accuracy
            accuracy = accuracy_score(y_test_sample, prediction)
            accuracies[k].append(accuracy)

            # Keep the most accurate model seen so far for plotting
            if accuracy > best_accuracy:
                to_plot = [decisionTree, x_test_sample, y_test_sample, accuracy]
                best_accuracy = accuracy

            if j == 4:
                fname = "DTC_depth=" + str(depth[k]) + "_ds=" + str(i + 1)
                title = "Decision Tree Classifier with a depth of " + str(depth[k]) \
                        + " with an accuracy of %0.4f" % to_plot[3]
                plot_boundary(fname, to_plot[0], to_plot[1], to_plot[2], 0.1, title)

    # Compute the average accuracies over 5 generations of the dataset
    for j in range(5):
        avg_accuracy = sum(accuracies[j]) / 5
        deviation = np.std(accuracies[j])
        print("From dataset %d:" % (i + 1))
        print("Depth = " + str(depth[j]))
        print("Average accuracy = %0.4f" % avg_accuracy)
        print("Deviation = %0.4f" % deviation)
        print()
# ...
d = [1, 2, 4, 6, 8, None]

if __name__ == "__main__":
    for i in range(len(d)):
        cnf = np.zeros((2, 2))
        a = 0
        st = 0
        for k in range(5):
            b = make_unbalanced_dataset(3000)
            Xtr = np.array(b[0][0:1000, :])
            ytr = b[1][0:1000]
            Xte = np.array(b[0][1000:, :])
            yte = b[1][1000:]
            c = DecisionTreeClassifier(max_depth=d[i])
            t = c.fit(Xtr, ytr)
            plot_boundary(fname="Decision_Tree_Depth_%s.png" % (str(d[i])),
                          fitted_estimator=t, X=Xte, y=yte)
            pr = t.predict(Xte)
            cnf += confusion_matrix(yte, pr)
            a += round(accuracy_score(yte, pr), 2)
            st += round(np.std(pr - yte), 2)
        print("Average accuracy with max depth %s: True negative %s "
              "False negative %s True positive %s False positive %s "
              "Accuracy score %s St dev %s"
              % (d[i], cnf[0, 0] / 5., cnf[1, 0] / 5., cnf[1, 1] / 5.,
                 cnf[0, 1] / 5., a / 5., st / 5.))