def Q3():  # AdaBoost
    val_error = []
    train_error = []
    T_range = list(range(1, 205, 5))
    for T in T_range:
        adaboost = aba.AdaBoost(DecisionStump, T)
        adaboost.train(X_train, y_train)
        train_error.append(adaboost.error(X_train, y_train))
        val_error.append(adaboost.error(X_val, y_val))
    plot(T_range, train_error)
    plot(T_range, val_error)
    xlabel("Iteration_num")
    ylabel("error")
    legend(["Training Error", "Validation Error"], loc=5)
    show()

    figure(1)
    ion()
    for index, T in enumerate([1, 5, 10, 50, 100, 200]):
        adaboost = aba.AdaBoost(DecisionStump, T)
        adaboost.train(X_train, y_train)
        subplot(2, 3, index + 1)
        decision_boundaries(adaboost, X_train, y_train, "Iteration: " + str(T))
    pause(8)

    # map the argmin index back to the actual T value (the range starts at 1
    # and steps by 5, so index * 5 was off by one)
    best_iteration = T_range[val_error.index(min(val_error))]
    print(best_iteration)
    ab = aba.AdaBoost(DecisionStump, best_iteration)
    ab.train(X_train, y_train)
    print(ab.error(X_test, y_test))
    return
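# The snippets in this file assume an AdaBoost class with the interface
# AdaBoost(WL, T), train(X, y) (returning the final sample weights), and
# error(X, y[, max_t]). A minimal sketch of such a class is given here for
# reference only; the actual aba/adb/adaboost modules may differ in details
# such as the weak-learner constructor signature WL(D, X, y).
import numpy as np

class AdaBoostSketch:
    """Binary AdaBoost over labels in {-1, +1}, boosting a weak learner WL for T rounds."""

    def __init__(self, WL, T):
        self.WL = WL
        self.T = T
        self.h = [None] * T      # weak hypotheses
        self.w = np.zeros(T)     # hypothesis weights (alphas)

    def train(self, X, y):
        m = len(y)
        D = np.full(m, 1.0 / m)  # uniform initial sample distribution
        for t in range(self.T):
            self.h[t] = self.WL(D, X, y)        # fit weak learner to weighted sample
            pred = self.h[t].predict(X)
            eps = np.sum(D[pred != y])          # weighted training error
            self.w[t] = 0.5 * np.log((1 - eps) / max(eps, 1e-12))
            D *= np.exp(-self.w[t] * y * pred)  # up-weight the mistakes
            D /= D.sum()
        return D                                # final weights, used by q_13-style plots

    def predict(self, X, max_t=None):
        T = self.T if max_t is None else max_t
        votes = sum(self.w[t] * self.h[t].predict(X) for t in range(T))
        return np.sign(votes)

    def error(self, X, y, max_t=None):
        return float(np.mean(self.predict(X, max_t) != y))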
def main():  # Driver
    mode, filename, modelFile, model = sys.argv[1:]
    if mode == "train":
        if model == "nearest" or model == "best":
            knn = KNN()
            knn.train(filename)
        elif model == "adaboost":
            ada = adaboost.AdaBoost()
            ada.train(filename)
            save_obj(ada, modelFile)
        elif model == "nnet":
            pass
        else:
            print("Incorrect model.........exiting")
            sys.exit(1)
    elif mode == "test":
        if model == "nearest" or model == "best":
            knn = KNN()
            knn.classify(filename, modelFile)
        elif model == "adaboost":
            ada = load_obj(modelFile)
            ada.classify(filename, modelFile)
        elif model == "nnet":
            pass
        else:
            print("Incorrect model....exiting")
            sys.exit(1)
    else:
        print("Incorrect mode......exiting")
        sys.exit(1)
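# save_obj and load_obj are referenced by the driver above but not defined in
# this snippet. A plausible pickle-based sketch (the helper names are taken
# from the call sites; the implementation is an assumption):
import pickle

def save_obj(obj, model_file):
    # serialize the trained model to disk
    with open(model_file, "wb") as f:
        pickle.dump(obj, f)

def load_obj(model_file):
    # restore a previously saved model
    with open(model_file, "rb") as f:
        return pickle.load(f)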
def testWineData(self):
    trn, tst = load_wine_data()
    trn_X, trn_y = trn
    tst_X, tst_y = tst
    td = training_utils.TrainingData(trn_X, trn_y)
    XX, yy = td.get_training_sample()
    classifier = adaboost.AdaBoost(rank1_metric.Rank1_Metric)
    classifier.set_training_sample(XX, yy)
    T = 20
    classifier.train(T, 1)
    f = metric_learning.boosted_dist(classifier)
    knn = knn_utils.KNN(3)
    knn.dist_func = f
    knn.set_training_data(trn_X, trn_y)
    # test on training
    knn.compute_distance(trn_X)
    predicted_y = numpy.array(knn.classify(True))
    accuracy_trn = compute_accuracy(predicted_y, trn_y)
    # test on testing
    knn.compute_distance(tst_X)
    predicted_y = numpy.array(knn.classify(False))
    accuracy_tst = compute_accuracy(predicted_y, tst_y)
    print(accuracy_trn, accuracy_tst)
    self.assertTrue(min(accuracy_trn, accuracy_tst) > .95)
def Cascade(trn, max_level):
    if len(trn) > 2:
        X = array([x for x, y in trn])
        Y = array([y for x, y in trn])
    else:
        X, Y = trn
    T = 512
    classifiers = []
    td = training_utils.TrainingData(X, Y)
    SD = distance_utils.calcDistanceMatrix2([X])
    trunk = td.trunk
    sensitivity = 0.99
    o = metric_learning.search_threshold(SD, trunk, sensitivity)
    thres = o[0]
    TP, FP = o[1]
    num_FN = (1 - sensitivity) * (len(TP) / sensitivity)
    pairs = TP, FP
    MAX_NUM_POS = 2000
    MAX_NUM_NEG = 4000
    num_pos = min(len(pairs[0]), MAX_NUM_POS)
    num_neg = min(len(pairs[1]), MAX_NUM_NEG)
    print("TP vs FP: %d vs %d" % (len(TP), len(FP)))
    trunk = o[2]
    classifiers.append((thres, (TP, FP)))
    FPvsFN = len(FP) / num_FN
    print("#FP/#FN = ", FPvsFN)
    # return classifiers
    for level in range(1, max_level + 1):
        print("level = %d" % level)
        random.shuffle(TP)
        random.shuffle(FP)
        pos = TP[-1:-num_pos - 1:-1]
        neg = FP[0:num_neg]
        pairs = pos, neg
        print(len(pairs[0]), len(pairs[1]))
        XX, yy = training_utils.create_training_sample(X, pairs)
        classifier = adaboost.AdaBoost(rank1_metric.Rank1_Metric)
        classifier.set_training_sample(XX, yy)
        classifier.train(T, 1)
        VD, vy = training_utils.create_training_sample(X, (TP, FP))  # for validation
        SD = -classifier.predict(VD)  # be careful about the sign!!!
        sensitivity = 0.99
        o = metric_learning.search_threshold(SD, trunk, sensitivity)
        thres = o[0]
        TP, FP = o[1]
        num_FN = (1 - sensitivity) * (len(TP) / sensitivity)
        print("TP vs FP: %d vs %d" % (len(TP), len(FP)))
        trunk = o[2]
        classifiers.append((classifier, thres, (TP, FP)))
        FPvsFN = len(FP) / num_FN
        print("#FP/#FN = ", FPvsFN)
        if len(TP) == 0 or len(FP) == 0 or FPvsFN < 1.2:
            break
    return classifiers
def _init_classifiers(self):
    # Initialize classifier objects
    self.fenc = FreemanEncoder()
    self.knn = KNN.KNN()
    self.HMM = HMM.HMM()
    self.NaiveBayes = NaiveBayes.NaiveBayes()
    self.RandomForest = RandomForest.RandomForests()
    self.SVM = svm.SVM_SVC()
    self.LogisticReg = LogisticReg.LogisticReg()
    self.AdaBoost = adaboost.AdaBoost()
    self.GBRT = gbrt.GBRT()

    # Train initially on the default data set, if no model saved already
    # Initialize KNN, no saved model for KNN
    self.knn.knn_train(CharRecognitionGUI_support.training_dataset, 1.0)
    # Initialize HMM
    self.HMM.training(CharRecognitionGUI_support.training_dataset)
    # Initialize Naive Bayes
    try:
        pickle.load(open("./Models/naivebayes_model.p", "rb"))
    except IOError:
        self.NaiveBayes.training(CharRecognitionGUI_support.training_dataset)
    # Initialize Random Forest
    try:
        pickle.load(open("./Models/random_forest.p", "rb"))
    except IOError:
        self.RandomForest.training(CharRecognitionGUI_support.training_dataset)
    # Initialize SVM
    try:
        pickle.load(open("./Models/svm.p", "rb"))
    except IOError:
        self.SVM.training(CharRecognitionGUI_support.training_dataset)
    # Initialize Logistic Regression
    try:
        pickle.load(open("./Models/logistic_model.p", "rb"))
    except IOError:
        self.LogisticReg.training(CharRecognitionGUI_support.training_dataset)
    # Initialize AdaBoost
    try:
        pickle.load(open("./Models/AdaBoostClassifier.p", "rb"))
    except IOError:
        self.AdaBoost.training(CharRecognitionGUI_support.training_dataset)
    # Initialize GBRT
    try:
        pickle.load(open("./Models/GradientBoostingClassifier.p", "rb"))
    except IOError:
        self.GBRT.training(CharRecognitionGUI_support.training_dataset)
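# Every try/except block above repeats one pattern: probe for a pickled model
# and retrain if it is missing. A loop-based refactor of that pattern (a
# sketch only; it would live as a method on the same GUI class, with the
# model/path pairs taken from the code above):
def _init_from_cache(self, dataset):
    cached = [
        (self.NaiveBayes, "./Models/naivebayes_model.p"),
        (self.RandomForest, "./Models/random_forest.p"),
        (self.SVM, "./Models/svm.p"),
        (self.LogisticReg, "./Models/logistic_model.p"),
        (self.AdaBoost, "./Models/AdaBoostClassifier.p"),
        (self.GBRT, "./Models/GradientBoostingClassifier.p"),
    ]
    for model, path in cached:
        try:
            with open(path, "rb") as f:
                pickle.load(f)  # a saved model exists and is readable
        except IOError:
            model.training(dataset)  # no saved model: train from scratch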
def q_11(noise):
    """
    This function operates the required in q_11 - plots the decisions of the
    learned classifier with T = [5, 10, 50, 100, 200, 500]
    :param noise: the noise to generate data with (the test data)
    """
    x_train, y_train = generate_data(NUM_SAMPLES_TRAIN, noise)
    x_test, y_test = generate_data(NUM_SAMPLES_TEST, noise)
    for i in range(len(T_LIST)):
        boost = adb.AdaBoost(DecisionStump, T_LIST[i])
        boost.train(x_train, y_train)
        plt.subplot(2, 3, i + 1)
        decision_boundaries(boost, x_test, y_test, T_LIST[i])
    plt.show()
def q_12(noise):
    """
    This function operates the required in q_12 - finds the T that minimizes
    the test error, and plots the decision boundaries of this classifier with
    the training data
    :param noise: the noise to generate data with (the test data)
    """
    x_train, y_train = generate_data(NUM_SAMPLES_TRAIN, noise)
    train_err, test_err = calc_train_and_test_err(x_train, y_train, noise)
    # test_err[i] holds the error of the ensemble truncated at T = i + 1,
    # so the argmin index must be shifted by one to get the T value itself
    T_hat = int(np.argmin(test_err)) + 1
    min_boost = adb.AdaBoost(DecisionStump, T_hat)
    min_boost.train(x_train, y_train)
    decision_boundaries(min_boost, x_train, y_train, T_hat)
    plt.show()
    print("The T_hat is: " + str(T_hat) + " and its test error is: " +
          str(test_err[T_hat - 1]))
def Q3(): # AdaBoost """ :return: """ syn_data = get_syn_data() X_test, X_train, X_val, y_test, y_train, y_val = syn_data[0], syn_data[1],\ syn_data[2], syn_data[3],\ syn_data[4], syn_data[5] # init T T = 22 * [0] T[0] = 1 for i in range(20): T[i + 1] = (i + 1) * 5 T[21] = 200 keep_T = [1, 5, 10, 50, 100, 200] learned_classifiers = [None] * len(keep_T) min_val_err = 1 min_val_err_classifier = None training_error, validation_error, test_error = [], [], [] for t in T: ada_boost = adaboost.AdaBoost(WL=tools.DecisionStump, T=t) ada_boost.train(X_train, y_train) if t in keep_T: learned_classifiers[keep_T.index(t)] = ada_boost if t != 1: training_error.append(ada_boost.error(X_train, y_train)) validation_error.append(ada_boost.error(X_val, y_val)) plot_decisions(keep_T, learned_classifiers, X_train, y_train, "adaBoost on SynData") plt.plot(T[1:], training_error, label='training error', color='magenta') plt.plot(T[1:], validation_error, label='validation error', color='deepskyblue') plt.title('adaBoost error on SynData as function of T') plt.legend(loc='best') plt.xlabel('T - number of base learners to learn') plt.ylabel('Error') plt.show()
def calc_train_and_test_err(x_train, y_train, noise):
    """
    This function calculates the train and test error vectors
    :param x_train: the samples of the train data to calculate its error
    :param y_train: the labels of the train data to calculate its error
    :param noise: the noise to generate data with (the test data)
    :return: the train and test error vectors
    """
    x_test, y_test = generate_data(NUM_SAMPLES_TEST, noise)
    boost = adb.AdaBoost(DecisionStump, T)
    boost.train(x_train, y_train)
    train_err = []
    test_err = []
    for t in range(T):
        # error of the ensemble truncated to its first t + 1 hypotheses
        train_err.append(boost.error(x_train, y_train, t + 1))
        test_err.append(boost.error(x_test, y_test, t + 1))
    return train_err, test_err
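# A usage sketch for calc_train_and_test_err (generate_data, NUM_SAMPLES_TRAIN,
# T and plt come from the surrounding module; the noise value here is an
# arbitrary example):
x_train, y_train = generate_data(NUM_SAMPLES_TRAIN, noise=0.0)
train_err, test_err = calc_train_and_test_err(x_train, y_train, noise=0.0)
plt.plot(range(1, T + 1), train_err, label="train error")
plt.plot(range(1, T + 1), test_err, label="test error")
plt.xlabel("T")
plt.ylabel("error")
plt.legend()
plt.show()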
def Q3():  # AdaBoost
    syn_data = getSynData()
    X_test, X_train, X_val = syn_data[0], syn_data[1], syn_data[2]
    Y_test, Y_train, Y_val = syn_data[3], syn_data[4], syn_data[5]
    T = [5 * (i + 1) for i in range(20)] + [200]
    test_err, validation_err = list(), list()
    for t in T:
        AdaB = adaboost.AdaBoost(ex4_tools.DecisionStump, t)
        AdaB.train(X_train, Y_train)
        test_err.append(AdaB.error(X_test, Y_test))
        validation_err.append(AdaB.error(X_val, Y_val))
    graph_plot("T", "error of adaBoost", T, test_err, "test err",
               validation_err, "validation err", "Q3")
    return
def q_13(noise):
    """
    This function operates the required in q_13 - plots the training set with
    point size proportional to its weight in D^T. (After seeing the results
    with the original D^T, the function normalizes D^T and then plots.)
    :param noise: the noise to generate data with (the test data)
    """
    x_train, y_train = generate_data(NUM_SAMPLES_TRAIN, noise)
    # using D_T without normalization - can't see any points:
    # boost = adb.AdaBoost(DecisionStump, T)
    # D_T = boost.train(x_train, y_train)
    # decision_boundaries(boost, x_train, y_train, T, D_T)
    # plt.show()

    # using D_T with normalization:
    boost = adb.AdaBoost(DecisionStump, T)
    D_T = boost.train(x_train, y_train)
    D_T = (D_T / np.max(D_T)) * 10
    decision_boundaries(boost, x_train, y_train, T, D_T)
    plt.show()
def main():
    k = 5  # Number of folds
    # Dataset retrieving and formatting
    dataset = dh.load_dataset("tic-tac-toe.data")
    dataset = dh.format_outputs(dataset)
    dataset = dh.fold_dataset(dataset, k)
    cv_accuracies = []
    cv_errors = []
    cv_model_errors = []
    # Execute k-fold cross-validation
    for i in range(k):
        print("Round", i + 1)
        testing_set = dh.separate_attributes(dataset[i])
        remaining_folds = np.concatenate(np.delete(dataset, i))
        training_set = dh.separate_attributes(remaining_folds)
        ada = ab.AdaBoost(training_set, testing_set)
        results = ada.boost(301)
        cv_accuracies.append(results[0])
        cv_errors.append(results[1])
        cv_model_errors.append(results[2])
    # Convert lists to numpy arrays for faster calculations
    cv_accuracies = np.asarray(cv_accuracies)
    cv_errors = np.asarray(cv_errors)
    cv_model_errors = np.asarray(cv_model_errors)
    # Calculate the mean of the accuracies and the errors
    cv_accuracies = np.divide(np.sum(cv_accuracies, axis=0), k)
    cv_errors = np.divide(np.sum(cv_errors, axis=0), k)
    cv_model_errors = np.divide(np.sum(cv_model_errors, axis=0), k)
    # Save the results to a CSV
    dh.save_results(cv_accuracies, "boosting_accuracy")
    dh.save_results(cv_errors, "boosting_error")
    dh.save_results(cv_model_errors, "model_error")
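# The sum-then-divide step above is equivalent to a single np.mean along the
# fold axis, which reads more directly (an equivalent rewrite, not the
# original author's code):
cv_accuracies = np.mean(cv_accuracies, axis=0)
cv_errors = np.mean(cv_errors, axis=0)
cv_model_errors = np.mean(cv_model_errors, axis=0)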
def Q5():  # spam data
    n_folds = 5
    # creating the data (load once; the last column holds the labels)
    data = np.loadtxt("SpamData/spam.data")
    X = data[:, :-1]
    y = data[:, -1].copy()
    y[y == 0] = -1
    sample_data_idx = np.random.permutation(np.arange(len(X)))
    X_test, y_test = X[sample_data_idx[:1536]], y[sample_data_idx[:1536]]
    X_train, y_train = X[sample_data_idx[1536:]], y[sample_data_idx[1536:]]
    T_values = [5, 50, 100, 200, 500, 1000]
    d_values = [5, 8, 10, 12, 15, 18]
    # splitting to folds
    cross_val_idx = np.random.permutation(np.arange(len(X_train)))
    folds = np.array([
        X_train[cross_val_idx[int(i * len(X_train) / n_folds):
                              int((i + 1) * len(X_train) / n_folds)]]
        for i in range(n_folds)
    ])
    folds_y = np.array([
        y_train[cross_val_idx[int(i * len(X_train) / n_folds):
                              int((i + 1) * len(X_train) / n_folds)]]
        for i in range(n_folds)
    ])
    AB_mean_errors = []
    DT_mean_errors = []
    AB_sd = []
    DT_sd = []
    # cross validation:
    for val in range(len(T_values)):  # T_values and d_values have the same length
        print("val" + str(val))
        ab_errors = []
        dt_errors = []
        for fold_idx in range(n_folds):
            print("fold idx " + str(fold_idx))
            fold_test = folds[fold_idx]
            fold_test_y = folds_y[fold_idx]
            other_fold_idx = [i for i in range(n_folds) if i != fold_idx]
            other_folds = np.concatenate(folds[other_fold_idx])
            other_folds_y = np.concatenate(folds_y[other_fold_idx])
            # adaboost training on current fold:
            ab = aba.AdaBoost(DecisionStump, T_values[val])
            ab.train(other_folds, other_folds_y)
            ab_errors.append(ab.error(fold_test, fold_test_y))
            # DT training on current fold:
            dt = dta.DecisionTree(d_values[val])
            dt.train(other_folds, other_folds_y)
            dt_errors.append(dt.error(fold_test, fold_test_y))
        # add the mean errors:
        AB_mean_errors.append(np.array(ab_errors).mean())
        AB_sd.append(np.std(ab_errors))
        DT_mean_errors.append(np.array(dt_errors).mean())
        DT_sd.append(np.std(dt_errors))
    # plotting the errors in order to view the best T and d parameters
    # (yerr must be a 1-D array of length len(x), not shape (N, 1)):
    errorbar(T_values, AB_mean_errors, np.array(AB_sd), ecolor="green")
    xlabel("parameter - T")
    ylabel("error rate")
    show()
    errorbar(d_values, DT_mean_errors, np.array(DT_sd), ecolor="green")
    xlabel("parameter max depth")
    ylabel("error rate")
    show()
    # best_b = T_values[AB_mean_errors.index(np.min(AB_mean_errors))]
    best_b = 200
    print("best parameter for adaboost:" + str(best_b))
    ab = aba.AdaBoost(DecisionStump, best_b)
    ab.train(X_train, y_train)
    print(ab.error(X_test, y_test))
    # best_b = d_values[DT_mean_errors.index(np.min(DT_mean_errors))]
    best_b = 10
    print("DT best parameter:" + str(best_b))
    dt = dta.DecisionTree(best_b)
    dt.train(X_train, y_train)
    print(dt.error(X_test, y_test))
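# The manual slice arithmetic above can be replaced with np.array_split, which
# also handles sample counts that are not divisible by the fold count (an
# equivalent sketch under the same variable names):
perm = np.random.permutation(len(X_train))
fold_idx = np.array_split(perm, n_folds)       # n_folds index arrays
folds = [X_train[idx] for idx in fold_idx]
folds_y = [y_train[idx] for idx in fold_idx]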
import adaboost
from sklearn.ensemble import AdaBoostClassifier

t = adaboost.AdaBoost()
# print detailed debugging information regarding the classifier selection
t.debug = 2
x = [[1, 2], [1, 4], [2.5, 5.5], [3.5, 6.5], [4, 5.4],
     [2, 1], [2, 4], [3.5, 3.5], [5, 2], [5, 5.5]]
y = [1, 1, 1, 1, 1, -1, -1, -1, -1, -1]
# train classifier; x is a matrix, y holds the actual classifications (-1 or 1)
t.train(x, y)
# classify a novel set of values; the sign of the return value is the
# predicted binary class
novel_y_prime = t.apply_to_matrix(x)
print(novel_y_prime)

# the same data through scikit-learn's AdaBoostClassifier for comparison
clf = AdaBoostClassifier(n_estimators=10)
clf.fit(x, y)
print(clf.feature_importances_)
print(clf.predict(x))
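# scikit-learn also exposes per-iteration predictions of the boosted ensemble
# via staged_predict, which turns the error-vs-T curves computed manually
# elsewhere in this file into a short loop (a sketch reusing clf, x, y above):
staged_errors = [
    sum(p != yi for p, yi in zip(stage_pred, y)) / len(y)
    for stage_pred in clf.staged_predict(x)
]
print(staged_errors)  # training error after 1, 2, ..., n_estimators rounds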
    print('Preprocessing Finished...')
    end_preprocessing = time.time()
    print("Preprocessing took " +
          str(float(end_preprocessing - start_preprocessing) / 60) + " min")
    print()
    print()
else:
    df_credit = pd.read_csv('Preprocessed_Credit.csv', delimiter=',',
                            header=None)

df_credit_train, df_credit_test = model_selection.train_test_split(
    df_credit, test_size=0.20)
start_training = time.time()
# dt = decision_tree.DecisionTree(df_credit.shape[1] - 1)
# dt.train(df_credit_train.iloc[:, df_credit.shape[1] - 1],
#          df_credit_train.iloc[:, :df_credit.shape[1] - 1])
dt = adaboost.AdaBoost(df_credit_train, decision_tree.DecisionTree, 20)
dt.train()
end_training = time.time()
print("Training took " + str(float(end_training - start_training) / 60) + " min")
print()
print()
results = []
for smpl in range(0, df_credit_train.shape[0]):
    results.append(
        dt.decide(df_credit_train.iloc[smpl, :df_credit.shape[1] - 1].tolist()))
tn, fp, fn, tp = metrics.confusion_matrix(
    df_credit_train.iloc[:, df_credit.shape[1] - 1].tolist(), results).ravel()
print('True positive = ' + str(tp))
print('True negative = ' + str(tn))
print('False positive = ' + str(fp))
def helper(question):
    """
    execution of questions 3, 4, bonus & 5, including training the classifiers
    using adaboost, decision trees and bagging, and calculating the error for
    each of the T, d or b values (training, validation & test)
    :return: training_errors, validation_errors, test_errors, classifiers,
        x_matrix_training, y_vector_training
    """
    with open("./SynData/X_train.txt", 'r') as training_x, \
            open("./SynData/y_train.txt", 'r') as training_y, \
            open("./SynData/X_val.txt", 'r') as validation_x, \
            open("./SynData/y_val.txt", 'r') as validation_y, \
            open("./SynData/y_test.txt", 'r') as test_y, \
            open("./SynData/X_test.txt", 'r') as test_x:
        # convert the files into vectors and matrices
        y_vector_training = text_lines_to_vector(training_y)
        spam_matrix = text_lines_to_matrix(training_x)
        y_vector_validation = text_lines_to_vector(validation_y)
        x_matrix_validation = text_lines_to_matrix(validation_x)
        y_vector_test = text_lines_to_vector(test_y)
        x_matrix_test = text_lines_to_matrix(test_x)
        # preset the ingredients for the training
        training_errors = []
        validation_errors = []
        test_errors = []
        # written for question 3
        if question == 3:
            wl = ex4_tools.DecisionStump
            classifiers = list()
            # adding the classifier for T = 1
            adb_h_1 = adb.AdaBoost(wl, 1)
            adb_h_1.train(spam_matrix, y_vector_training)
            classifiers.append(adb_h_1)
            t_values = [5, 10, 50, 100, 200]
            # train a classification hypothesis using adaboost for different values of T
            for t in range(run.T):
                adb_h_t = adb.AdaBoost(wl, t)
                adb_h_t.train(spam_matrix, y_vector_training)
                training_errors.append(adb_h_t.error(spam_matrix, y_vector_training))
                validation_errors.append(
                    adb_h_t.error(x_matrix_validation, y_vector_validation))
                test_errors.append(adb_h_t.error(x_matrix_test, y_vector_test))
                # collecting the trained classifiers for part 2 of question 3
                if t in t_values:
                    classifiers.append(adb_h_t)
            return training_errors, validation_errors, test_errors, classifiers, \
                spam_matrix, y_vector_training
        # written for question 4
        if question == 4:
            classifiers = list()
            D_VALUES = [3, 6, 8, 10, 12]
            for d in D_VALUES:
                tree_classifier = dt.DecisionTree(d)
                tree_classifier.train(spam_matrix, y_vector_training)
                training_errors.append(
                    tree_classifier.error(spam_matrix, y_vector_training))
                validation_errors.append(
                    tree_classifier.error(x_matrix_validation, y_vector_validation))
                test_errors.append(tree_classifier.error(x_matrix_test, y_vector_test))
            return training_errors, validation_errors, test_errors, classifiers, \
                spam_matrix, y_vector_training
        # written for question 4 - bonus
        if question == 'bonus':
            b_values = run.B
            # since we discovered at question 4 that 10 was the optimal depth
            # for this data
            max_depth = 10
            tree_classifier = dt.DecisionTree(max_depth)
            for b in b_values:
                print(b)
                bagging_classifier = bag.Bagging(tree_classifier, b)
                bagging_classifier.train(spam_matrix, y_vector_training)
                validation_errors.append(
                    bagging_classifier.error(x_matrix_validation, y_vector_validation))
                test_errors.append(
                    bagging_classifier.error(x_matrix_test, y_vector_test))
            return validation_errors, test_errors
        # written for question 5
        if question == '5':
            with open("./SpamData/spam.data") as spam:
                # convert the file into a matrix and a vector, and create a list
                # of partitioned parts for the cross validation
                spam_matrix = text_lines_to_matrix(spam)
                x_spam_matrix, y_spam_vector, x_vault_matrix, y_vault_vector = \
                    spam_modif(spam_matrix)
                subgroup_length = np.shape(x_spam_matrix)[0] // 5
                partitioned_x, partitioned_y = cross_val_partition(
                    x_spam_matrix, y_spam_vector, subgroup_length)
                # preparations for the adaboost training
                wl = ex4_tools.DecisionStump
                validation_errors_adb = []
                # train a classification hypothesis using adaboost for different
                # values of T, using each of the subsets as a validation group
                # (cross validation)
                for j, t in enumerate(T_VALUES):
                    for i in range(5):
                        segment_len = len(partitioned_x[0])
                        x, y = sub_matrix(x_spam_matrix, y_spam_vector, segment_len, i)
                        adb_h_t = adb.AdaBoost(wl, t)
                        adb_h_t.train(x, y)
                        if i == 0:
                            validation_errors_adb.append(
                                [adb_h_t.error(partitioned_x[i], partitioned_y[i])])
                        else:
                            validation_errors_adb[j].append(
                                adb_h_t.error(partitioned_x[i], partitioned_y[i]))
                print(validation_errors_adb)
                # preparations for the decision tree training
                validation_errors_dt = []
                # train a classification hypothesis using decision trees for
                # different depths, using each of the subsets as a validation
                # group (cross validation)
                for j, d in enumerate(D_VALUES):
                    for i in range(5):
                        # build the training portion the same way as in the
                        # adaboost loop above
                        segment_len = len(partitioned_x[0])
                        x, y = sub_matrix(x_spam_matrix, y_spam_vector, segment_len, i)
                        tree_classifier = dt.DecisionTree(d)
                        tree_classifier.train(x, y)
                        if i == 0:
                            validation_errors_dt.append(
                                [tree_classifier.error(partitioned_x[i], partitioned_y[i])])
                        else:
                            validation_errors_dt[j].append(
                                tree_classifier.error(partitioned_x[i], partitioned_y[i]))
                return validation_errors_adb, validation_errors_dt
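# cross_val_partition is referenced above but not shown. A plausible sketch
# (the name and return shape are assumed from the call site): split the sample
# matrix and label vector into five contiguous validation segments.
def cross_val_partition(x, y, subgroup_length):
    partitioned_x = [x[i * subgroup_length:(i + 1) * subgroup_length]
                     for i in range(5)]
    partitioned_y = [y[i * subgroup_length:(i + 1) * subgroup_length]
                     for i in range(5)]
    return partitioned_x, partitioned_y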
def Q5():  # spam data
    T = [5, 50, 100, 200, 500, 1000]
    D = [5, 8, 10, 12, 15, 18]
    # get spam data
    spam_data = np.loadtxt('SpamData/spam.data')
    # change labels of 0 to -1
    spam_data[:, -1][spam_data[:, -1] == 0] = -1
    # get vault data and train data
    np.random.shuffle(spam_data)
    vault_index = np.random.choice(len(spam_data), 1536, replace=False)
    train_index = np.array(
        [i for i in range(len(spam_data)) if i not in vault_index])
    train_data = spam_data[train_index]
    vault_data = spam_data[vault_index]
    # Use 5-fold cross validation to pick T and d
    data_size = len(train_data)
    split = int(data_size / 5)
    folds = np.split(train_data, [split, 2 * split, 3 * split, 4 * split])
    data_sets = split_data_to_folds(folds)
    DT_error = [0] * 6
    adaboost_error = [0] * 6
    best_dt_err, best_d = None, None
    best_ab_err, best_t = None, None
    for i in range(5):
        fold_size1 = data_sets[i][0].shape[1]
        arr1 = data_sets[i][0]
        arr2 = data_sets[i][1]
        X_train, y_train = arr1[:, 0:fold_size1 - 1], arr1[:, fold_size1 - 1:fold_size1]
        X_validation = arr2[:, 0:(fold_size1 - 1)]
        y_validation = arr2[:, (fold_size1 - 1):fold_size1]
        y_train = y_train.reshape((-1,))
        y_validation = y_validation.reshape((-1,))
        # accumulate errors per parameter value (index j), not per fold
        for j, t in enumerate(T):
            ada_boost = adaboost.AdaBoost(tools.DecisionStump, t)
            ada_boost.train(X_train, y_train)
            current_adaboost_error = ada_boost.error(X_validation, y_validation)
            adaboost_error[j] += current_adaboost_error
            # track the parameter value with the lowest validation error
            if best_ab_err is None or best_ab_err > current_adaboost_error:
                best_ab_err, best_t = current_adaboost_error, t
        for j, d in enumerate(D):
            dt = decision_tree.DecisionTree(d)
            dt.train(X_train, y_train)
            current_dt_error = dt.error(X_validation, y_validation)
            DT_error[j] += current_dt_error
            if best_dt_err is None or best_dt_err > current_dt_error:
                best_dt_err, best_d = current_dt_error, d
    # get mean error over the 5 folds
    adaboost_error = np.array([x / 5 for x in adaboost_error])
    DT_error = np.array([x / 5 for x in DT_error])
    plt.errorbar(T, adaboost_error, color='magenta', label='validation error')
    plt.title('validation error on SpamData for adaBoost as function of T')
    plt.legend(loc='best')
    plt.xlabel('T')
    plt.ylabel('Error')
    plt.show()
    plt.errorbar(D, DT_error, color='magenta', label='validation error')
    plt.title('validation error on SpamData for DT as function of max depth')
    plt.legend(loc='best')
    plt.xlabel('max depth')
    plt.ylabel('Error')
    plt.show()
    # Train classifiers using the chosen parameter values, using the complete
    # training set.
    X_train, y_train = train_data[:, 0:57], train_data[:, 57]
    X_vault, y_vault = vault_data[:, 0:57], vault_data[:, 57]
    ada_boost = adaboost.AdaBoost(tools.DecisionStump, best_t)
    ada_boost.train(X_train, y_train)
    vault_adaboost_error = ada_boost.error(X_vault, y_vault)
    dt = decision_tree.DecisionTree(best_d)
    dt.train(X_train, y_train)
    vault_dt_error = dt.error(X_vault, y_vault)
    print("vault_adaboost_error= " + str(vault_adaboost_error))
    print("vault_dt_error= " + str(vault_dt_error))
    df_adult_test = ap.binarize_test(binarizers_adult, binarizers_adult_columns,
                                     df_adult_test)
    df_adult_test = df_adult_test.reset_index(drop=True)
    df_adult_test.to_csv('Preprocessed_Adult_Test.csv', sep=',')
    end_preprocessing = time.time()
    print("Preprocessing on test data took " +
          str(float(end_preprocessing - start_preprocessing) / 60) + " min")
else:
    df_adult_train = pd.read_csv('Preprocessed_Adult_Train.csv', delimiter=',',
                                 header=None)
    df_adult_test = pd.read_csv('Preprocessed_Adult_Test.csv', delimiter=',',
                                header=None)

start_training = time.time()
# dt = decision_tree.DecisionTree(df_adult_train.shape[1] - 1)
# dt.train(df_adult_train.iloc[:, df_adult_train.shape[1] - 1],
#          df_adult_train.iloc[:, :df_adult_train.shape[1] - 1])
dt = adaboost.AdaBoost(df_adult_train, decision_tree.DecisionTree, 20)
dt.train()
end_training = time.time()
print("Training took " + str(float(end_training - start_training) / 60) + " min")
print()
print()
results = []
for smpl in range(0, df_adult_train.shape[0]):
    results.append(
        dt.decide(df_adult_train.iloc[smpl, :df_adult_train.shape[1] - 1].tolist()))
tn, fp, fn, tp = metrics.confusion_matrix(
    df_adult_train.iloc[:, df_adult_train.shape[1] - 1].tolist(), results).ravel()
print('True positive = ' + str(tp))
print('True negative = ' + str(tn))
print('False positive = ' + str(fp))
def Q5():  # spam data
    SpamData = getSpamData()
    # changing the labels from 0 to -1, because this is how the classifiers work
    SpamData[:, -1][SpamData[:, -1] == 0] = -1
    # Shuffling the data
    np.random.shuffle(SpamData)
    np.random.shuffle(SpamData)
    mSpam = len(SpamData)
    inds_vault = np.random.choice(mSpam, 1536, replace=False)
    inds_train = np.array([i for i in range(mSpam) if i not in inds_vault])
    data_train = SpamData[inds_train]
    data_vault = SpamData[inds_vault]
    m_train = len(data_train)
    # get the folds as tuples (train, validation, i)
    folds = np.split(data_train, [int(m_train / 5), 2 * int(m_train / 5),
                                  3 * int(m_train / 5), 4 * int(m_train / 5)])
    data_5_cv = split(folds)
    T = [5, 50, 100, 200, 500, 1000]
    D = [5, 8, 10, 12, 15, 18]
    err_DT, err_AdaB = [0] * 6, [0] * 6
    for i in range(5):
        print("*******i: ", i, " *********")
        s = data_5_cv[i][0].shape[1]
        X_train, y_train = data_5_cv[i][0][:, 0:s - 1], data_5_cv[i][0][:, s - 1:s]
        X_validation, y_validation = data_5_cv[i][1][:, 0:s - 1], data_5_cv[i][1][:, s - 1:s]
        y_train = y_train.reshape((-1,))
        y_validation = y_validation.reshape((-1,))
        # train decision tree; accumulate errors per depth value (index j)
        for j, d in enumerate(D):
            dt = DecisionTree(d)
            dt.train(X_train, y_train)
            e = dt.error(X_validation, y_validation)
            print("d:", str(d), " e: ", e)
            err_DT[j] += e
        # train AdaBoost; accumulate errors per T value (index j)
        for j, t in enumerate(T):
            adaB = adaboost.AdaBoost(ex4_tools.DecisionStump, t)
            adaB.train(X_train, y_train)
            e = adaB.error(X_validation, y_validation)
            err_AdaB[j] += e
            print("t:", str(t), " e: ", e)
    err_DT = np.array([i / 5 for i in err_DT])
    err_AdaB = np.array([i / 5 for i in err_AdaB])
    print("err_DT: ", err_DT)
    print("err_ADAB: ", err_AdaB)
    # evaluating the chosen values on the held-out data vault: train on the
    # training portion only, so the vault stays unseen
    X, y = data_train[:, 0:57], data_train[:, 57:58].reshape((len(data_train),))
    X_test, y_test = data_vault[:, 0:57], data_vault[:, 57:58].reshape((1536,))
    dt = DecisionTree(15)
    dt.train(X, y)
    e = dt.error(X_test, y_test)
    print("DT: e - ", e)
    adaB = adaboost.AdaBoost(ex4_tools.DecisionStump, 100)
    adaB.train(X, y)
    e = adaB.error(X_test, y_test)
    print("adaBoost: e = ", e)
    return
          str(float(end_preprocessing - start_preprocessing) / 60) + " min")
    print()
    print()
else:
    df_telco = pd.read_csv('Preprocessed_Telco.csv', delimiter=',', header=None)

df_telco_train, df_telco_test = model_selection.train_test_split(
    df_telco, test_size=0.20)
start_training = time.time()
# dt = decision_tree.DecisionTree(df_telco.shape[1] - 1)
# dt.train(df_telco_train.iloc[:, df_telco.shape[1] - 1],
#          df_telco_train.iloc[:, :df_telco.shape[1] - 1])
dt = adaboost.AdaBoost(df_telco_train, decision_tree.DecisionTree, 20)
dt.train()
end_training = time.time()
print("Training took " + str(float(end_training - start_training) / 60) + " min")
print()
print()
results = []
for smpl in range(0, df_telco_train.shape[0]):
    results.append(
        dt.decide(df_telco_train.iloc[smpl, :df_telco.shape[1] - 1].tolist()))
tn, fp, fn, tp = metrics.confusion_matrix(
    df_telco_train.iloc[:, df_telco.shape[1] - 1].tolist(), results).ravel()
import adaboost
import ann
import bool_ann
from pickle import load
from numpy import array

if __name__ == '__main__':
    print('...loading datas')
    x, y = load(open('datas.dat', 'rb'))
    rate = 0.8
    spInd = int(x.shape[0] * rate)  # integer split index, required for slicing
    test_s = x.shape[0] - spInd + 1
    print('training data: %d, testing data: %d' % (spInd - 1, test_s))
    print('the true rate: %lf' % ((y == True).sum() * 1.0 / y.shape[0]))
    # print('======Test with Bool_ANN=======')
    # classifier = bool_ann.bool_ann(x[:spInd], y[:spInd], {'disp': False, 'maxiter': 10})
    # correct = (classifier.predict(x[spInd:]) == y[spInd:]).sum()
    # print('correct = %d/%d = %lf' % (correct, test_s, correct * 1.0 / test_s))
    print('======Test with ANN========')
    classifier = ann.NeuralNetworkClassifier([50])
    classifier.fit(x[:spInd], y[:spInd])
    correct = (classifier.predict(x[spInd:]) == y[spInd:]).sum()
    print('correct = %d/%d = %lf' % (correct, test_s, correct * 1.0 / test_s))
    print('======Test with Adaboost=======')
    new_y = array([(v - 0.5) * 2 for v in y])  # map {0, 1} labels to {-1, +1}
    classifier = adaboost.AdaBoost(x[:spInd], new_y[:spInd], adaNum=200)
    pre = classifier.predict(x[spInd:])
    correct = (pre == y[spInd:]).sum()
    print('correct = %d/%d = %lf' % (correct, test_s, correct * 1.0 / test_s))