def __init__(self, numFeatures, forestSize, learningRate, regularization):
    self.numFeatures = numFeatures
    self.forestSize = forestSize
    self.learningRate = learningRate
    self.regularization = regularization
    self.forestObject = randomForest(numFeatures, forestSize)
    self.svmObject = svm(numFeatures, learningRate, regularization, 100)

def __init__(self):
    self.svm = svm()
    self.param = svm_parameter()
    self.prob = svm_problem()
    self.x_space = None
    self.cross_validation = False
    self.nr_fold = 0
    self.quiet = False

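# The wrapper above appears to target the classic libsvm Python bindings, where
# svm_problem and svm_parameter come from the svm/svmutil modules (in newer pip
# packages the module is libsvm.svmutil). A minimal, hedged usage sketch of that
# assumed API; the toy data and option values are illustrative only:
from svmutil import svm_problem, svm_parameter, svm_train, svm_predict

y = [1, -1, 1, -1]                               # labels
x = [{1: 0.9}, {1: -1.1}, {1: 1.2}, {1: -0.8}]   # sparse feature dicts (1-based indices)
prob = svm_problem(y, x)
param = svm_parameter('-t 0 -c 1 -q')            # linear kernel, C=1, quiet mode
model = svm_train(prob, param)
p_label, p_acc, p_val = svm_predict(y, x, model)
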
def crossValidateSVM():
    f1Inputs, f1Labels, _ = read_libsvm('data/data_semeion/folds/fold1')
    f2Inputs, f2Labels, _ = read_libsvm('data/data_semeion/folds/fold2')
    f3Inputs, f3Labels, _ = read_libsvm('data/data_semeion/folds/fold3')
    f4Inputs, f4Labels, _ = read_libsvm('data/data_semeion/folds/fold4')
    f5Inputs, f5Labels, _ = read_libsvm('data/data_semeion/folds/fold5')
    allFoldInputArrays = [
        f1Inputs.toarray(), f2Inputs.toarray(), f3Inputs.toarray(),
        f4Inputs.toarray(), f5Inputs.toarray()
    ]
    allFoldLabelArrays = [f1Labels, f2Labels, f3Labels, f4Labels, f5Labels]
    initLearningRates = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    regularizations = [10**1, 10**0, 10**-1, 10**-2, 10**-3, 10**-4]
    bestLearningRate = None
    bestRegularization = None
    bestAccuracy = 0
    everyAccuracy = []
    for rate in initLearningRates:
        for regularization in regularizations:
            allAccuracies = []
            for i in range(len(allFoldInputArrays)):
                allTrainData = []
                allTrainLabels = []
                for j in range(len(allFoldInputArrays)):
                    if j != i:
                        allTrainData.extend(allFoldInputArrays[j])
                        allTrainLabels.extend(allFoldLabelArrays[j])
                print("Hyperparameters: Learning rate: " + str(rate) +
                      " Regularization: " + str(regularization))
                # numFeatures is expected to be defined at module scope
                tempsvm = svm(numFeatures, rate, regularization, 100)
                tempsvm.train(allTrainData, allTrainLabels)
                accuracy = tempsvm.evaluate(allFoldInputArrays[i], allFoldLabelArrays[i])
                allAccuracies.append(accuracy)
                everyAccuracy.append(accuracy)
            if statistics.mean(allAccuracies) > bestAccuracy:
                bestAccuracy = statistics.mean(allAccuracies)
                bestLearningRate = rate
                bestRegularization = regularization
    avgAccuracy = statistics.mean(everyAccuracy)
    print("Best rate: " + str(bestLearningRate))
    print("Best reg: " + str(bestRegularization))
    print("Best accuracy: " + str(bestAccuracy))
    print("Average accuracy: " + str(avgAccuracy))

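# crossValidateSVM assumes an svm class constructed as
# svm(numFeatures, learningRate, regularization, epochs) with train/evaluate methods.
# A minimal, hedged sketch of that assumed interface; the hinge-loss SGD body below is
# illustrative only, not the original implementation, and "epochs" for the fourth
# constructor argument is a guess:
import numpy as np

class svm:
    def __init__(self, numFeatures, learningRate, regularization, epochs):
        self.w = np.zeros(numFeatures)
        self.b = 0.0
        self.learningRate = learningRate
        self.regularization = regularization
        self.epochs = epochs

    def train(self, inputs, labels):
        # sub-gradient descent on the L2-regularized hinge loss
        for _ in range(self.epochs):
            for x, y in zip(inputs, labels):
                x = np.asarray(x, dtype=float)
                if y * (self.w @ x + self.b) < 1:
                    self.w += self.learningRate * (y * x - 2 * self.regularization * self.w)
                    self.b += self.learningRate * y
                else:
                    self.w -= self.learningRate * 2 * self.regularization * self.w

    def evaluate(self, inputs, labels):
        # classification accuracy using a sign rule on w.x + b
        preds = [1 if self.w @ np.asarray(x, dtype=float) + self.b >= 0 else -1
                 for x in inputs]
        return float(np.mean([p == t for p, t in zip(preds, labels)]))
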
def runSVM_CV(dataCV, es):
    # record start time
    t_st = time.time()
    lrs = [10**0, 10**-1, 10**-2, 10**-3, 10**-4]        # initial learning rates
    Cs = [10**3, 10**2, 10**1, 10**0, 10**-1, 10**-2]    # initial tradeoffs
    hps = list(itertools.product(lrs, Cs))
    best_perf = pd.DataFrame(columns=['Ep', 'lr', 'C', 'acc', 'obj'])
    T = 10
    for f in dataCV:
        print('\n Fold -', f)
        dataVal = dataCV[f]['val'].to_numpy()
        data = dataCV[f]['trn']
        acc0 = 0  # reset accuracy
        for lr, C in hps:  # for each learning-rate / tradeoff combination
            # CV training
            w_best, _, lc, obj, losses = svm(data, lr, C, es, T)
            # CV validation
            X = dataVal[:, 1:]
            X = np.hstack((X, np.ones((X.shape[0], 1))))  # add bias column
            y = dataVal[:, 0]
            acc_Val = accuracy(X, y, w_best)  # accuracy(X, y, w)
            if acc_Val > acc0:
                best_perf.loc[f] = [len(lc), lr, C, acc_Val, obj[-1]]
                acc0 = acc_Val
    print('\n -- Best Performance over CV Folds -- ')
    print(best_perf)
    print('\nEarly stop:', es)
    t_en = time.time()
    t_run = np.round((t_en - t_st) / 60, 3)
    print('\nRuntime (m):', t_run)
    return best_perf, t_run

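# runSVM_CV relies on an accuracy(X, y, w) helper that is not shown here. A plausible
# sketch, assuming y holds +/-1 labels and X already carries the appended bias column:
import numpy as np

def accuracy(X, y, w):
    # fraction of rows whose sign(X @ w) matches the +/-1 label
    preds = np.where(X @ w >= 0, 1, -1)
    return float(np.mean(preds == y))
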
def runSVM_CV(dataCV):
    # record start time
    t_st = time.time()
    lrs = [10**0, 10**-1, 10**-2, 10**-3, 10**-4]        # initial learning rates
    Cs = [10**3, 10**2, 10**1, 10**0, 10**-1, 10**-2]    # initial tradeoffs
    hps = list(itertools.product(lrs, Cs))
    best_perf = pd.DataFrame(columns=['Ep', 'lr', 'C', 'acc', 'obj'])
    T = 50
    for f in dataCV:
        print('\n Fold -', f)
        data = dataCV[f]
        acc0 = 0  # reset accuracy
        for lr, C in hps:  # for each learning-rate / tradeoff combination
            tau = 0.01 * C  # early-stop threshold
            w_best, best_acc, lc, obj, up = svm(data, lr, C, tau, T)
            if best_acc > acc0:
                best_perf.loc[f] = [len(lc), lr, C, best_acc, obj[-1]]
                acc0 = best_acc
    print('\n -- Best Performance over CV Folds -- \n', best_perf)
    t_en = time.time()
    print('\nRuntime (m):', np.round((t_en - t_st) / 60, 3))
    return best_perf

def run():
    mod = svm(array([[gauss(0, 1)] for i in range(50)] +
                    [[gauss(8, 1)] for i in range(50)]).reshape([100, 1]))
    print "Total Loss: %s" % sum((mod.Y.reshape([len(mod.X), ]) -
                                  mod.cdf(mod.X.reshape([len(mod.X), ]))) ** 2)
    fig = plt.figure()
    start = -5.
    end = 12.
    X = arange(start, end, .25)
    #a = fig.add_subplot(2, 2, 1)
    #n, bins, patches = a.hist(mod.data, 20, normed=1, facecolor='green', alpha=0.5, label='empirical distribution')
    #a.plot(X, mod.Pr(X), 'r--', label="computed distribution")
    #a.set_title("Computed vs empirical PDF")
    c = fig.add_subplot(2, 2, 2)
    c.plot(numpy.sort(mod.X, 0), numpy.sort(mod.Y, 0), 'green')
    c.plot(X, mod.cdf(X), 'r--')
    c.plot(mod.X, (mod.Y.reshape([len(mod.X), ]) -
                   mod.cdf(mod.X.reshape([len(mod.X), ]))) ** 2, '+')
    c.set_title("Computed vs empirical CDF")

from svm import *
from mlp import load_test

if __name__ == "__main__":
    data = load_data()
    train = data[0]
    valid = data[1]
    test = load_test("4-9")
    print "Training phase"
    alpha, b = svm(data[0], data[1], tau=2**-5, C=2**-4)
    print "Testing phase"
    validate((data[0], data[1]), alpha, b, (test[0] / 255, test[1]), 2**-5)

from logit import *
from read_data import *
from svm import *
from hoeffding import *
from random_forest import *

if __name__ == "__main__":
    '''
    Run line by line to avoid confusion of output
    '''
    # Read data
    (train_x, train_y, test_x, test_y) = read_data()
    # Run SVM algorithm
    CI_SVM = svm(train_x, train_y, test_x, test_y)
    # Logistic Regression algorithm
    CI_LR = logit(train_x, train_y, test_x, test_y)
    # Random Forest
    CI_RF = random_forest(train_x, train_y, test_x, test_y)
    print("\n\nFinal Results")
    print("==================================================")
    print("\nHoeffding's Confidence interval for SVM is:")
    print(CI_SVM)
    print("\nHoeffding's Confidence interval for LR is:")
    print(CI_LR)

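# The hoeffding module is not shown. The intervals printed above are presumably built
# from Hoeffding's inequality: for an empirical accuracy over n i.i.d. test points,
# a two-sided 1 - delta interval has half-width sqrt(ln(2/delta) / (2n)).
# A sketch of such a helper; the name and signature are assumptions:
import math

def hoeffding_interval(emp_acc, n, delta=0.05):
    # P(|acc_hat - acc| >= eps) <= 2 * exp(-2 * n * eps**2), solved for eps at level delta
    eps = math.sqrt(math.log(2.0 / delta) / (2.0 * n))
    return (max(0.0, emp_acc - eps), min(1.0, emp_acc + eps))
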
def runSVM_trn(dataTrn, lr, C, tau, T):
    # train on dataTrn (the original referenced an undefined `data` and `acc0`)
    w_best, best_acc, lc, obj, losses = svm(dataTrn, lr, C, tau, T)
    return w_best, best_acc, lc, obj, losses

def main():
    # load data
    datasets = []
    data = pd.read_csv('adult.csv')
    print(data.shape)
    data.count()[1]
    # print(data.head())

    def cc(x):
        return sum(x == '?')
    # print(data.apply(cc))

    # drop rows with missing values (marked '?')
    df = data[data.occupation != '?']
    #print(df.shape)
    df = df[df.workclass != '?']
    #print(df.shape)
    df = df[df['native.country'] != '?']
    #print(df.shape)
    #print(df.groupby(by='education')['education.num'].mean())
    df.loc[df['native.country'] != 'United-States', 'native.country'] = 'non_usa'
    df.loc[df['income'] == '<=50K', 'income'] = -1
    df.loc[df['income'] == '>50K', 'income'] = 1
    features_categorical = [
        'workclass', 'education', 'marital.status', 'occupation',
        'relationship', 'race', 'sex', 'native.country'
    ]
    features_numerical = [
        'age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss',
        'hours.per.week'
    ]
    # convert the categorical features into one-hot encoding
    for feature in features_categorical:
        df1 = pd.get_dummies(df[feature], drop_first=False)
        df = df.drop([feature], axis=1)
        df = df.join(df1)
    print(df.shape)
    # normalize the numerical features by z-normalization
    for feature in features_numerical:
        df[feature] = (df[feature] - df[feature].mean()) / df[feature].std()
    #df['capital.change'] = (df['capital.gain'] > 0) | (df['capital.loss'] > 0)
    #df['capital.change'] = df['capital.change'].astype(int)
    print(df.columns)
    print(df.head())
    # first test on hours.per.week, education.num
    df1 = df.drop(['income'], axis=1)
    allX = df1.values
    allX = allX.astype(float)
    ally = df[['income']].values  # as_matrix() is removed in newer pandas
    print(allX.shape, ally.shape)
    X = allX[0:2000]
    y = ally[0:2000]
    myC = 10
    num_ensamble = 10
    classifiers = []
    for i in range(num_ensamble):
        classifier = svm(C=myC, kernel=linear_kernel, gamma=0.05, coef=1)
        classifiers.append(classifier)
    for i in range(num_ensamble):
        X_train, X_val, y_train, y_val = subsample(X, y, 1.0)
        lagr_mult = classifiers[i].fit(X_train, y_train)
        y_pred = classifiers[i].predict(X_val)
        accuracy = get_accuracy(y_val, y_pred)
        print("Out of bag Validation accuracy is {}".format(accuracy))
    # while testing, predict with each svm, take the majority vote, and measure accuracy
    X_test = allX[2001:4000]
    y_test = ally[2001:4000]
    predictions = []
    for i in range(num_ensamble):
        y_pred = classifiers[i].predict(X_test)
        predictions.append(y_pred)
    # majority-vote reduction
    predictions = np.array(predictions)
    pred_t = []
    for i in range(len(X_test)):
        myarray = predictions[:, i].reshape(-1)
        # print(myarray)
        u, indices = np.unique(myarray, return_inverse=True)
        pred_t.append(u[np.argmax(np.bincount(indices))])
    # calculate the accuracy
    accuracy = get_accuracy(pred_t, y_test)
    print("Testing Accuracy is ", accuracy)

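# The bagging loop above depends on subsample and get_accuracy helpers defined elsewhere.
# A hedged sketch of what they likely do: bootstrap sampling with replacement plus an
# out-of-bag split, and plain label-match accuracy (signatures inferred from usage):
import numpy as np

def subsample(X, y, ratio=1.0):
    # draw len(X) * ratio rows with replacement; rows never drawn form the out-of-bag set
    n = X.shape[0]
    idx = np.random.choice(n, size=int(n * ratio), replace=True)
    oob = np.setdiff1d(np.arange(n), idx)
    return X[idx], X[oob], y[idx], y[oob]

def get_accuracy(y_true, y_pred):
    # fraction of matching labels, tolerant of column-vector inputs
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    return float(np.mean(y_true == y_pred))
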
model = svm.SVC()  # there are various options associated with it; this is a simple one for classification. You can refer to the link for more detail.
# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)
# Predict Output
predicted = model.predict(x_test)

R code
library(e1071)
x <- cbind(x_train, y_train)
# Fitting model
fit <- svm(y_train ~ ., data = x)
summary(fit)
# Predict Output
predicted <- predict(fit, x_test)

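# For completeness, a self-contained toy run of the scikit-learn snippet above; the
# dataset and train/test split are illustrative assumptions, not part of the original:
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples=200, n_features=4, random_state=0)
X, x_test, y, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=0)

model = svm.SVC()
model.fit(X, y)
print(model.score(X, y))
predicted = model.predict(x_test)
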
" Regularization: " + str(regularization)) tempsvm = svm(numFeatures, rate, regularization, 100) tempsvm.train(allTrainData, allTrainLabels) accuracy = tempsvm.evaluate(allFoldInputArrays[i], allFoldLabelArrays[i]) allAccuracies.append(accuracy) everyAccuracy.append(accuracy) if statistics.mean(allAccuracies) > bestAccuracy: bestAccuracy = statistics.mean(allAccuracies) bestLearningRate = rate bestRegularization = regularization avgAccuracy = statistics.mean(everyAccuracy) print("Best rate: " + str(bestLearningRate)) print("Best reg: " + str(bestRegularization)) print("Best accuracy: " + str(bestAccuracy)) print("Average accuracy: " + str(avgAccuracy)) crossValidateSVM() ## SVM test: testSvm = svm(numFeatures, 0.001, 10, 100) testSvm.train(trainingInputsArr, trainingLabels) print("SVM training evaluation: ") print(testSvm.evaluate(trainingInputsArr, trainingLabels)) print("SVM test evaluation: ") print(testSvm.evaluate(testInputsArr, testLabels))
testes = [
    Teste("./dataset/cars/car.data", "car", 6, ','),
    Teste("./dataset/mushroom/agaricus-lepiota.data", "mushroom", 0, ","),
    Teste("./dataset/nursery/nursery2.data", "nursery", 8, ',')
]

# Variables
numFolds = 10
if len(sys.argv) < 2:
    tstAtl = 2
else:
    tstAtl = int(sys.argv[1])

# SVM-based classifier
predSvm, labelsSvm = svm(testes[tstAtl])
printAnalysis(predSvm, labelsSvm, "SVM")

# Naive Bayes-based classifier
classifier = NaiveBayesClassifier(testes[tstAtl].separador, testes[tstAtl].labelPosi)
# clean the outputs
classifier.cleanOutput()
# class that generates the fold file and processes the data
# parameters = (numFolds, output file name, input file)
dataMinipu = dataManip(numFolds, testes[tstAtl].nomeProb, testes[tstAtl].data,
                       testes[tstAtl].labelPosi, testes[tstAtl].separador)
# process the data

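# Teste, NaiveBayesClassifier, dataManip and printAnalysis come from project modules that
# are not shown. A hedged sketch of the Teste container, with field names inferred purely
# from the constructor calls and attribute accesses above:
class Teste:
    def __init__(self, data, nomeProb, labelPosi, separador):
        self.data = data            # path to the dataset file
        self.nomeProb = nomeProb    # problem name
        self.labelPosi = labelPosi  # column index of the class label
        self.separador = separador  # field separator character
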
discreteizeData(trainingInputsArr)
discreteizeData(testInputsArr)
discreteizeData(evalInputsArr)

# map 0/1 labels to -1/+1 for the SVM
for i in range(len(trainingLabels)):
    if trainingLabels[i] == 0:
        trainingLabels[i] = -1
for i in range(len(testLabels)):
    if testLabels[i] == 0:
        testLabels[i] = -1

## SVM test:
testSvm = svm(numFeatures, 1, 1000, 2000)
testSvm.train(trainingInputsArr, trainingLabels)
# print("SVM training evaluation: ")
# print(testSvm.evaluate(trainingInputsArr, trainingLabels))
# print("SVM test evaluation: ")
# print(testSvm.evaluate(testInputsArr, testLabels))
testSvm.evaluate(evalInputsArr, evalLabels)

## Naive Bayes Test:
# testBayes = naiveBayes(numFeatures, .5)
# testBayes.train(trainingInputsArr, trainingLabels)
# # print(testBayes.evaluate(trainingInputsArr, trainingLabels))
# # print(testBayes.evaluate(testInputsArr, testLabels))
# testBayes.evaluate(evalInputsArr, evalLabels)

## Random Forest test: