def CVF(hp, decaying=False, averaged=False):
    """5-fold cross-validation of the perceptron learning rate `hp`.

    Each fold is used once as the training set; the resulting model is
    scored on the other four folds and those accuracies are averaged.

    hp       -- learning rate forwarded to train()
    decaying -- train the decaying-learning-rate variant
    averaged -- train the averaged variant (ignored if decaying is set)

    Returns the mean of the five per-fold average accuracies.
    """
    # Forward at most one variant flag, mirroring the original if/elif
    # priority (decaying wins over averaged); collapses the three nearly
    # identical train() calls into one.
    variant = {}
    if decaying:
        variant["decaying"] = True
    elif averaged:
        variant["averaged"] = True
    folds = []
    for i in range(5):
        train_path = "./CVfolds/fold" + str(i + 1)
        cv_x_train, cv_y_train, cv_num_features = libsvm.read_libsvm(train_path)
        cv_w, cv_b, cv_hist, accs = train(cv_x_train.toarray(), cv_y_train,
                                          epochs=10, lr=hp, **variant)
        held_out = 0
        for j in range(5):
            if i != j:
                test_path = "./CVfolds/fold" + str(j + 1)
                x, y, _ = libsvm.read_libsvm(test_path, cv_num_features)
                held_out += accuracy(x.toarray(), y, cv_w, cv_b)
        folds.append(held_out / 4)  # average over the 4 held-out folds
    # Removed the unreachable `pass` that followed this return.
    return sum(folds) / len(folds)
def setup_data(data_set):
    """Return (X_train, y_train, num_features, X_test, y_test) for a dataset.

    "s" loads the Kaggle data splits, "m" the madelon splits; any other
    code yields None (matching the original fall-through behavior).
    """
    known = {
        "s": ("../Kaggle/data/data-splits/data.train",
              "../Kaggle/data/data-splits/data.test"),
        "m": ("data_madelon/madelon_data_train",
              "data_madelon/madelon_data_test"),
    }
    if data_set not in known:
        return None
    train_path, test_path = known[data_set]
    xtr, ytr, nf = libsvm.read_libsvm(train_path)
    xt, yt, _ = libsvm.read_libsvm(test_path, nf)
    return xtr, ytr, nf, xt, yt
def setup_data(data_set):
    """Load train/test splits for a named dataset.

    data_set -- "s" for the semeion handwriting data, "m" for madelon.

    Returns (X_train, y_train, num_features, X_test, y_test).

    Raises ValueError for an unknown dataset code; the original fell
    through and returned None, which only surfaced later as a confusing
    unpacking TypeError at the call site.
    """
    if data_set == "s":
        xtr, ytr, nf = libsvm.read_libsvm("data_semeion/hand_data_train")
        xt, yt, _ = libsvm.read_libsvm("data_semeion/hand_data_test", nf)
        return xtr, ytr, nf, xt, yt
    if data_set == "m":
        xtr, ytr, nf = libsvm.read_libsvm("data_madelon/madelon_data_train")
        xt, yt, _ = libsvm.read_libsvm("data_madelon/madelon_data_test", nf)
        return xtr, ytr, nf, xt, yt
    raise ValueError("unknown data_set: " + repr(data_set))
def crossValidateSVM():
    """Grid-search learning rate x regularization for the SVM with 5-fold CV.

    For every (rate, regularization) pair, trains on four folds and
    evaluates on the held-out fold; tracks the pair with the best mean
    held-out accuracy and prints the results.

    Relies on the module-level `numFeatures` loaded with the training data
    (assumed defined at file scope — TODO confirm).
    """
    # Load the five folds once; replaces five copy-pasted read_libsvm calls.
    allFoldInputArrays = []
    allFoldLabelArrays = []
    for k in range(1, 6):
        inputs, labels, _ = read_libsvm('data/data_semeion/folds/fold' + str(k))
        allFoldInputArrays.append(inputs.toarray())
        allFoldLabelArrays.append(labels)

    initLearningRates = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    regularizations = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    bestLearningRate = None
    bestRegularization = None
    bestAccuracy = 0
    everyAccuracy = []  # every single fold/parameter run, for a grand mean
    for rate in initLearningRates:
        for regularization in regularizations:
            allAccuracies = []
            for i in range(len(allFoldInputArrays)):
                # Concatenate the four folds other than i as training data.
                allTrainData = []
                allTrainLabels = []
                for j in range(len(allFoldInputArrays)):
                    if j != i:
                        allTrainData.extend(allFoldInputArrays[j])
                        allTrainLabels.extend(allFoldLabelArrays[j])
                print("Hyperparameters: Learning rate: " + str(rate) +
                      " Regularization: " + str(regularization))
                tempsvm = svm(numFeatures, rate, regularization, 100)
                tempsvm.train(allTrainData, allTrainLabels)
                # Renamed from `accuracy` to avoid shadowing a likely
                # module-level helper of the same name.
                foldAccuracy = tempsvm.evaluate(allFoldInputArrays[i],
                                                allFoldLabelArrays[i])
                allAccuracies.append(foldAccuracy)
                everyAccuracy.append(foldAccuracy)
            # Hoisted: the original computed statistics.mean() twice.
            meanAccuracy = statistics.mean(allAccuracies)
            if meanAccuracy > bestAccuracy:
                bestAccuracy = meanAccuracy
                bestLearningRate = rate
                bestRegularization = regularization
    avgAccuracy = statistics.mean(everyAccuracy)
    print("Best rate: " + str(bestLearningRate))
    print("Best reg: " + str(bestRegularization))
    print("Best accuracy: " + str(bestAccuracy))
    print("Average accuracy: " + str(avgAccuracy))
def CVF(hp):
    """5-fold cross-validation of learning rate `hp`.

    Trains on each fold in turn, scores the model on the other four folds,
    and returns the mean of the five per-fold average accuracies.
    """
    folds = []
    for i in range(5):
        train_path = "./data/cvf/fold" + str(i + 1)
        cv_x_train, cv_y_train, cv_num_features = libsvm.read_libsvm(train_path)
        cv_w, cv_b, cv_hist, accs = train(cv_x_train.toarray(), cv_y_train,
                                          epochs=10, lr=hp)
        held_out = 0
        for j in range(5):
            if i != j:
                test_path = "./data/cvf/fold" + str(j + 1)
                x, y, _ = libsvm.read_libsvm(test_path, cv_num_features)
                held_out += accuracy(x.toarray(), y, cv_w, cv_b)
        folds.append(held_out / 4)  # average over the 4 held-out folds
    # Removed the unreachable `pass` that followed this return.
    return sum(folds) / len(folds)
def cvf(data_set, lr):
    """5-fold cross-validation of learning rate `lr` on the chosen dataset.

    data_set -- "s" selects the semeion folds, anything else madelon.

    Trains on each fold in turn and averages accuracy over the other four
    folds; returns the mean of the five per-fold averages.
    """
    acc = []
    path = "data_semeion/folds" if data_set == "s" else "data_madelon/folds"
    # FIX: range(1, 6) visits all five folds. The original used
    # range(1, 5) in both loops (fold5 never trained on) and built the
    # held-out path as "fold" + str(j + 1) while comparing i != j, so the
    # skipped fold did not match the training fold and only three folds
    # were scored despite dividing by 4.
    for i in range(1, 6):
        train_file = path + "/fold" + str(i)
        cvxtrain, cvytrain, cvnum = libsvm.read_libsvm(train_file)
        w, a = train(cvxtrain.toarray(), cvytrain, lr)
        rate = 0
        for j in range(1, 6):
            if i != j:
                x, y, _ = libsvm.read_libsvm(path + "/fold" + str(j), cvnum)
                x = add_bias(x.toarray())
                rate += accuracy(x, y, w)
        acc.append(rate / 4)
    return sum(acc) / len(acc)
def train_test():
    """Train simple and averaged perceptrons on the training split and print
    training-set accuracy for each learning rate.

    The feature matrix is log1p-transformed before training. Printed output
    is unchanged from the original.
    """
    X_train, y_train, num_features = libsvm.read_libsvm(
        "C:\\Users\\Abhi\\Documents\\MyFiles\\AB\\GradSchool\\Fall19\\ML\\Project"
        "\\data\\data\\data-splits\\data.train")
    # log-transform the sparse features in place of the raw values
    X_train = X_train.log1p()

    # Removed: unused decay/pocket/margin Perceptron instances, the unused
    # `margins` list, and a large commented-out decay-perceptron experiment.
    simple_perceptron = Perceptron(num_features)
    average_perceptron = Perceptron(num_features)
    learning_rates = [0.01, 0.1, 1]

    print("Accuracies on training set for 20 epochs")
    print("--" * 50)
    print("Simple Perceptron")
    print("--" * 50)
    for r in learning_rates:
        w, b = simple_perceptron.simple_perceptron_train(X_train, y_train,
                                                         epochs=20, lr=r)
        print("Simple perceptron accuracy on training set for lr =", r, ":",
              simple_perceptron.accuracy(X_train, y_train, w, b) * 100)

    print()
    print("Averaged Perceptron")
    print("--" * 50)
    for r in learning_rates:
        w, b = average_perceptron.averaged_perceptron_train(X_train, y_train,
                                                            epochs=20, lr=r)
        print("Average perceptron accuracy on training set for lr =", r, ":",
              average_perceptron.accuracy(X_train, y_train, w, b) * 100)
        id_list.append(line.strip())
    # (tail of an id-list reader whose `def` lies above this chunk)
    return id_list


def discretize(data):
    """Binarize each row of `data` in place around that row's mean value."""
    for i in range(0, data.shape[0]):
        threshold = data[i].mean()
        # NOTE(review): a fresh Binarizer is fit on the whole matrix but
        # applied to a single row; Binarizer's fit appears stateless, so
        # this works, but per-row re-creation is redundant. Also, newer
        # scikit-learn makes `threshold` keyword-only, so the positional
        # argument may break — confirm the pinned sklearn version.
        binarizer = preprocessing.Binarizer(threshold).fit(data)
        data[i] = binarizer.transform(data[i])


if __name__ == "__main__":
    # NOTE(review): num_features is overwritten by each read; the test/eval
    # reads do not reuse the training feature count, so matrix widths could
    # differ across splits — confirm read_libsvm's behavior.
    X_train, y_train, num_features = libsvm.read_libsvm(
        "../data/data/data-splits/data.train")
    X_test, y_test, num_features = libsvm.read_libsvm(
        "../data/data/data-splits/data.test")
    X_anon, y_anon, num_features = libsvm.read_libsvm(
        "../data/data/data-splits/data.eval.anon")
    # Compress raw feature values before binarizing.
    X_train = X_train.log1p()
    X_test = X_test.log1p()
    X_anon = X_anon.log1p()
    print("Discretizing...")
    discretize(X_train)
    discretize(X_test)
    discretize(X_anon)
def read_data_to_array(path):
    """Read a libsvm-format file and return (dense feature array, labels)."""
    features, labels, _num_features = read_libsvm(path)
    return features.toarray(), labels
combineFoldsNames = [
    "fold1234", "fold1235", "fold1245", "fold1345", "fold2345"
]
singleFoldNames = ["fold5", "fold4", "fold3", "fold2", "fold1"]
# Grid over hyper-parameter indices and three forest sizes; for each CV
# split, build `size` trees on random feature subsets.
# NOTE(review): CINDEX and LRINDEX are unused in the visible span —
# presumably consumed further down in the file; confirm.
for CINDEX in range(6):
    for LRINDEX in range(6):
        for i in range(3):
            size = SizeOfTheForest[i]
            accuracies = []
            for j in range(5):
                combineFoldName = combineFoldsNames[j]
                singleFoldName = singleFoldNames[j]
                _indexCollection = []
                _root = []
                X_train, y_train, num_features = read_libsvm(combineFoldName)
                x = X_train.todense()
                for y in range(size):
                    # Sample 50 feature indices (with replacement) in [0, 359].
                    _index = []
                    for z in range(50):
                        _index.append(random.randint(0, 359))
                    # Write the sampled columns to 'fileFormated', reload as
                    # strings, and grow an ID3 tree on them (final argument 1
                    # is presumably a depth limit — confirm against id3Depth).
                    formatFile(x, y_train, _index, 6075)
                    trainData = np.loadtxt('fileFormated', delimiter=',',
                                           dtype=str)
                    trainData_obj = Data(data=trainData)
                    attributesSet = trainData_obj.attributes
                    root = id3Depth(attributesSet, trainData_obj, 1)
                    _root.append(root)
                    _indexCollection.append(_index)
                # (chunk ends here; evaluation of the forest presumably follows)
printEpochs = False
BestC = 10
##########################################################################################################
print("Cross Validation")
print()
# Sweep learning rates with 5-fold CV: train on each 4-fold file and score
# on the matching held-out fold file.
C = learningRates[0]
for i in range(6):
    # NOTE(review): range(6) assumes learningRates holds 6 entries — confirm.
    learningRate = learningRates[i]
    accuracies = []
    for j in range(5):
        combineFoldName = combineFoldsNames[j]
        singleFoldName = singleFoldNames[j]
        X_train, y_train, num_features = read_libsvm(combineFoldName)
        X_test, y_test, _ = read_libsvm(singleFoldName, num_features)
        w, b = SimplePerceptron(X_train.todense(), y_train, 10, learningRate, C)
        accuracies.append(accuracy(X_test.todense(), y_test, w, b))
    # FIX: the original accumulated into a local named `sum`, shadowing the
    # builtin; use the builtin directly.
    average = sum(accuracies) / 5
    print("Learning Rate:", learningRate, "Average: ", average, "C:", C)
    if average > bestAccuracy:
        bestLearningRate = learningRate
        BestC = C
        bestAccuracy = average
def setup():
    """Load 'data_train' / 'data_test' and return the standard 5-tuple
    (X_train, y_train, num_features, X_test, y_test)."""
    train_x, train_y, feature_count = libsvm.read_libsvm('data_train')
    test_x, test_y, _ = libsvm.read_libsvm('data_test', feature_count)
    return train_x, train_y, feature_count, test_x, test_y
########################################################################################################## bestLearningRate = 0 bestAccuracy = 0 learningRates = [1, 0.1, 0.01] combineFoldsNames = [ "fold1234", "fold1235", "fold1245", "fold1345", "fold2345" ] singleFoldNames = ["fold5", "fold4", "fold3", "fold2", "fold1"] DecayingTheLearningRateHyperParameter = 0 DecayingTheLearningRateEpoch = 0 printEpochs = False X_train, y_train, num_features = read_libsvm('data_train') X_test, y_test, _ = read_libsvm('data_test', num_features) ########################################################################################################## print("Cross Validation") print() ########################################################################################################## bestLearningRate = 0 bestAccuracy = 0 for i in range(3): learningRate = learningRates[i] accuracies = []
def get_id_list():
    """Return the eval example ids from eval.id, one per line, stripped."""
    id_list = []
    with open(r"../data/data/data-splits/eval.id") as file:
        lines = file.readlines()
        for line in lines:
            id_list.append(line.strip())
    return id_list


# Bin every feature into 4 uniform-width, one-hot-encoded bins.
enc = KBinsDiscretizer(n_bins=4, encode="onehot", strategy="uniform")
X_train, y_train, num_features = libsvm.read_libsvm(
    "../data/data/data-splits/data.train")
X_train_binned = enc.fit_transform(X_train.toarray())
X_test, y_test, num_features = libsvm.read_libsvm(
    "../data/data/data-splits/data.test")
# FIX: use transform (not fit_transform) so the test and eval splits reuse
# the bin edges learned from the training split; re-fitting gave each split
# its own, incompatible bins.
X_test_binned = enc.transform(X_test.toarray())
X_test_eval, y_test_eval, num_features = libsvm.read_libsvm(
    "../data/data/data-splits/data.eval.anon")
X_eval_binned = enc.transform(X_test_eval.toarray())
id_list = get_id_list()

print("*" * 50 + "DTREE" + "*" * 50)
depths = [2]
print("Accuracies for non binned data")
from libsvm import read_libsvm
from svm import *
import statistics

## Setup Data:
# Load the semeion handwriting train/test splits and densify the sparse
# matrices once up front.
trainingInputs, trainingLabels, numFeatures = read_libsvm(
    'data/data_semeion/hand_data_train')
testInputs, testLabels, _ = read_libsvm('data/data_semeion/hand_data_test',
                                        numFeatures)
trainingInputsArr = trainingInputs.toarray()
testInputsArr = testInputs.toarray()


def crossValidateSVM():
    """Grid-search SVM hyper-parameters with 5-fold cross-validation.

    (Only the fold loading and the parameter grids are visible in this
    chunk; the search loop continues beyond it.)
    """
    f1Inputs, f1Labels, _ = read_libsvm('data/data_semeion/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm('data/data_semeion/folds/fold2')
    f3Inputs, f3Labels, _ = read_libsvm('data/data_semeion/folds/fold3')
    f4Inputs, f4Labels, _ = read_libsvm('data/data_semeion/folds/fold4')
    f5Inputs, f5Labels, _ = read_libsvm('data/data_semeion/folds/fold5')
    allFoldInputArrays = [
        f1Inputs.toarray(), f2Inputs.toarray(), f3Inputs.toarray(),
        f4Inputs.toarray(), f5Inputs.toarray()
    ]
    allFoldLabelArrays = [f1Labels, f2Labels, f3Labels, f4Labels, f5Labels]
    initLearningRates = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    regularizations = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
from libsvm import read_libsvm
from svm import *
import statistics

## Setup Data:
# Load the madelon train/test splits and densify the sparse matrices once
# up front.
trainingInputs, trainingLabels, numFeatures = read_libsvm('data/data_madelon/madelon_data_train')
testInputs, testLabels, _ = read_libsvm('data/data_madelon/madelon_data_test', numFeatures)
trainingInputsArr = trainingInputs.toarray()
testInputsArr = testInputs.toarray()


def crossValidateSVM():
    """Grid-search SVM hyper-parameters with 5-fold cross-validation.

    (Only the fold loading, the parameter grids, and the best-so-far
    trackers are visible in this chunk; the search loop continues beyond it.)
    """
    f1Inputs, f1Labels, _ = read_libsvm('data/data_madelon/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm('data/data_madelon/folds/fold2')
    f3Inputs, f3Labels, _ = read_libsvm('data/data_madelon/folds/fold3')
    f4Inputs, f4Labels, _ = read_libsvm('data/data_madelon/folds/fold4')
    f5Inputs, f5Labels, _ = read_libsvm('data/data_madelon/folds/fold5')
    allFoldInputArrays = [f1Inputs.toarray(), f2Inputs.toarray(),
                          f3Inputs.toarray(), f4Inputs.toarray(),
                          f5Inputs.toarray()]
    allFoldLabelArrays = [f1Labels, f2Labels, f3Labels, f4Labels, f5Labels]
    initLearningRates = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    regularizations = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    bestLearningRate = None
    bestRegularization = None
    bestAccuracy = 0
    counter = 1
    # Fallback learning rate (tail of a hyper-parameter selection helper
    # whose `def` lies above this chunk).
    return 10**-4


if __name__ == '__main__':
    xtrain_s, ytrain_s, numfeat_s, xtest_s, ytest_s = setup_data("s")
    # folds = []
    # folds.append(cvf("s", 10**1))
    # folds.append(cvf("s", 10**0))
    # folds.append(cvf("s",10**-1))
    # folds.append(cvf("s",10**-2))
    # folds.append(cvf("s",10**-3))
    # folds.append(cvf("s",10**-4))
    # best = np.max(folds)
    # hp = best_hp(folds, best)
    xt, yt, _ = libsvm.read_libsvm("../Kaggle/data/data-splits/data.eval.anon")
    clf = AdaBoostClassifier(learning_rate=1, n_estimators=5000)
    clf.fit(xtrain_s, ytrain_s)
    score = clf.score(xtrain_s, ytrain_s)
    print(score)
    # NOTE(review): the classifier is re-fit on the *test* split here, so
    # test_score is the training accuracy of that second fit, and the eval
    # predictions below come from the test-split model, not the train-split
    # one — confirm this is intended.
    clf.fit(xtest_s, ytest_s)
    test_score = clf.score(xtest_s, ytest_s)
    print(test_score)
    labels = clf.predict(xt)
    ids = np.fromfile('../Kaggle/data/data-splits/eval.id', dtype=int, sep="\n")
    with open('adaboost.csv', mode='w+', newline='') as csv_file:
        fieldnames = ['example_id', 'label']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        # (chunk ends here; header/row writing presumably follows)
LambdaTerms = [2, 1.5, 1.0, 0.5]
combineFoldsNames = [
    "fold1234", "fold1235", "fold1245", "fold1345", "fold2345"
]
singleFoldNames = ["fold5", "fold4", "fold3", "fold2", "fold1"]
bestaccuracy = 0
bestLambda = 2
# Sweep the lambda term over the CV splits, counting per-feature occurrences
# of value 1 split by label (presumably Naive-Bayes-style count statistics —
# confirm against the rest of the file).
for i in range(4):
    lambdaTerm = LambdaTerms[i]
    accuracies = []
    for j in range(5):
        combineFoldName = combineFoldsNames[j]
        singleFoldName = singleFoldNames[j]
        X_train, y_train, num_features = read_libsvm(combineFoldName)
        # NOTE(review): X_train.todense() is re-materialized on every use,
        # including inside the nested loops below — very slow; hoist it.
        totalCol = len(X_train.todense()[1].flat)
        totalRows = len(X_train.todense())
        totalOnes = totalPositive(y_train)
        totalOnesProb = totalOnes / len(y_train)
        positiveOnes = []
        negativeOnes = []
        # NOTE(review): these loop variables shadow the outer `i` (lambda
        # index) and `j` (fold index). The outer loops still advance via
        # their own range iterators, but any code after this block that
        # reads i/j sees the inner values — confirm intent.
        for i in range(totalCol):
            OnesPos = 0
            OnesNeg = 0
            for j in range(totalRows):
                x = X_train.todense()[j].flat[i]
                if x == 1:
                    if y_train[j] == 1:
                        OnesPos = OnesPos + 1
                    else:
                        # (chunk ends mid-statement; the else-branch
                        # continues beyond this view)
def setup():
    """Load the train/test splits from ./data/data-splits and return the
    standard 5-tuple (X_train, y_train, num_features, X_test, y_test)."""
    prefix = './data/data-splits/'
    tr_x, tr_y, n_feats = libsvm.read_libsvm(prefix + 'data.train')
    te_x, te_y, _ = libsvm.read_libsvm(prefix + 'data.test', n_feats)
    return tr_x, tr_y, n_feats, te_x, te_y
import libsvm
import numpy as np
import random
from sklearn import preprocessing
from scipy import sparse
from sklearn.preprocessing import MaxAbsScaler

# Load the raw training split.
X_train, y_train, num_features = libsvm.read_libsvm(
    "C:\\Users\\Abhi\\Documents\\MyFiles\\AB\\GradSchool\\Fall19\\ML\\Project"
    "\\data\\data\\data-splits\\data.train")

# Scale each feature by its maximum absolute value (keeps zeros at zero, so
# sparsity is preserved). `scaler` stays fitted for any later transforms.
scaler = MaxAbsScaler()
X_minmax = scaler.fit_transform(X_train)