import data_preprocess
from sklearn import svm

# Load the iris data: four numeric features per row, class label last.
data = []
ans = []
with open('iris.data.txt', 'rt') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue
        array = line.split(',')
        data.append([float(v) for v in array[0:-1]])
        ans.append(array[-1])

# Data set splitting
# Shuffle array
[data, ans] = data_preprocess.shuffle(data, ans)

# Split to train/test, then split train into train/validation
[trainData, testData] = data_preprocess.split_data(data, 0.6)
[trainAns, testAns] = data_preprocess.split_data(ans, 0.6)
[trainDataSmall, validData] = data_preprocess.split_data(trainData, (2.0 / 3.0))
[trainAnsSmall, validAns] = data_preprocess.split_data(trainAns, (2.0 / 3.0))

# Scan candidate values of the SVM C parameter on the validation set
C_Param_Values = [1, 50, 200, 500, 1000]
accuracy_a = []
for param in C_Param_Values:
    clf = svm.SVC(C=param)
    clf.fit(trainDataSmall, trainAnsSmall)
    accuracy = clf.score(validData, validAns, sample_weight=None)
    accuracy_a.append(accuracy)
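The data_preprocess helpers used above are not shown here. Below is a minimal sketch of what this script appears to assume they do (a paired shuffle, and a single-list split at a given fraction); the signatures are inferred from the call sites, not taken from the actual module.

import random

def shuffle(data, ans):
    # Shuffle features and labels together so pairs stay aligned.
    paired = list(zip(data, ans))
    random.shuffle(paired)
    data, ans = zip(*paired)
    return [list(data), list(ans)]

def split_data(items, fraction):
    # Split one list at the given fraction: the first part gets
    # int(fraction * len) items, the rest go to the second part.
    cut = int(fraction * len(items))
    return [items[:cut], items[cut:]]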
def build_SVM(filename, option, svm_type=None, poly_degree=None):
    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    if svm_type is None:
        svm_type = 'linear'
    if poly_degree is None:
        poly_degree = 2
    #print('training polynomial SVM of degree', poly_degree)

    if option == 'default':
        print('Training SVM...')
        print('*-----------------------------*')
        print('Training on default parameters.')
        accuracies_default = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_SVM(x_train, y_train, x_valid, y_valid))
        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))
    elif option == 'train':
        print('*-----------------------------*')
        print('Searching for best parameters.')
        params = []
        accuracies = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_SVM(x_train, y_train, x_valid, y_valid,
                                 best_parameters)
            accuracies.append(accuracy)
        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')
        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])
    elif option == 'RFE':
        print('*-----------------------------*')
        print('Recursive feature elimination.')
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE
        ranking = perform_RFE(X, Y)
        print('*-----------------------------*')
        print('Ranking of descriptors.')
        print('*-----------------------------*')
        for d in range(len(qm_descriptors)):
            print(qm_descriptors[d], ranking[d])
    elif option == 'test':
        print('TESTING')
        print('*-----------------------------*')
        # alternative settings tried earlier:
        #kernels = 'rbf'
        #Cs = 1
        #gammas = 1
        #degrees = 3
        #weights = None
        kernels = 'rbf'
        Cs = 10
        gammas = 0.1
        degrees = 3
        weights = None
        params_dict = {'kernel': kernels, 'C': Cs, 'class_weight': weights,
                       'degree': degrees, 'gamma': gammas}
        acc_list = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            acc_list.append(
                train_SVM(x_train, y_train, x_valid, y_valid, params_dict))
        print('Summary of Results.')
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
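scan_parameters is defined elsewhere in the project. One plausible implementation, sketched here purely as an assumption, is a scikit-learn grid search over the same SVC parameters the 'test' branch hardcodes; the grid values below are illustrative.

from sklearn import svm
from sklearn.model_selection import GridSearchCV

def scan_parameters(x_train, y_train):
    # Cross-validated grid search; returns the best parameter dict,
    # matching how build_SVM passes the result on to train_SVM.
    grid = {'kernel': ['linear', 'rbf', 'poly'],
            'C': [0.1, 1, 10, 100],
            'gamma': [0.01, 0.1, 1]}
    search = GridSearchCV(svm.SVC(), grid, cv=3)
    search.fit(x_train, y_train)
    return search.best_params_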
handler.setLevel(level)
logger.addHandler(handler)
logger.setLevel(level)

# ======= loading stuff ==========
logger.info('Start')
config = load_yaml(args.config)
config['loader_config']['data_checkpoint'] = args.data_checkpoint
x, y, weights = load_from_config(config, args.force_resample)

# ======= split data =======
checkpoint_name = args.splits_checkpoint
cv_cfg = config['cross_validation']
cv_cfg['checkpoint'] = checkpoint_name
cv_splits, train_idx, test_idx, weights = split_data(y, weights, cv_cfg,
                                                     args.validation_mode,
                                                     args.force_resplit)
data_stuff = [x, y, weights]

kind = config['model_params'].get('kind', 'any')
is_multitask = 'multitask' in kind
if is_multitask:
    axis = 1
    N_outputs = len(y)
else:
    axis = 0
    N_outputs = -1
to_relax = config['model_params'].get('relax', False)

# ======= print info =======
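load_yaml is presumably a thin PyYAML wrapper; the sketch below, together with the config keys this script actually reads, is inferred from the code above rather than taken from the project's real loader.

import yaml

def load_yaml(path):
    # Parse the experiment configuration. The script above expects at least:
    #   loader_config:
    #     data_checkpoint: ...   # overwritten from the CLI
    #   cross_validation:
    #     ...                    # passed to split_data; 'checkpoint' is injected
    #   model_params:
    #     kind: ...              # e.g. a string containing 'multitask'
    #     relax: false
    with open(path) as f:
        return yaml.safe_load(f)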
def build_NN_classifier(filename, option, model_name=None):
    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    # IF DOWNSAMPLING:
    #print('>> Down sampling.')
    #smaller_x, smaller_y = data_preprocess.do_down_sampling(X, Y)

    if option == 'default':
        print('Training NN...')
        print('*-----------------------------*')
        print('Training on default parameters.')
        accuracies_default = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_NN(x_train, y_train, x_valid, y_valid))
        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))
    elif option == 'train':
        print('*-----------------------------*')
        print('Searching for best parameters.')
        params = []
        accuracies = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_NN(x_train, y_train, x_valid, y_valid,
                                best_parameters)
            accuracies.append(accuracy)
        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')
        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])
    elif option == 'test':
        print('TESTING')
        print('*-----------------------------*')
        hidden_layer_sizes = (100, 100)
        solver = 'adam'
        alpha = 0.001
        params_dict = {
            'hidden_layer_sizes': hidden_layer_sizes,
            'solver': solver,
            'alpha': alpha,
            'max_iter': 400  # scalar, not a list: the dict is passed straight to the classifier
        }
        print(params_dict)
        acc_list = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            acc_list.append(
                train_NN(x_train, y_train, x_valid, y_valid, params_dict))
        print('Summary of Results.')
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
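train_NN is not shown; given the keys in params_dict (hidden_layer_sizes, solver, alpha, max_iter), it most likely wraps scikit-learn's MLPClassifier. A hedged sketch under that assumption:

from sklearn.neural_network import MLPClassifier

def train_NN(x_train, y_train, x_valid, y_valid, params=None):
    # Fit a multilayer perceptron and report validation accuracy.
    clf = MLPClassifier(**params) if params else MLPClassifier()
    clf.fit(x_train, y_train)
    return clf.score(x_valid, y_valid)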
def build_logist(filename, option, model_name=None):
    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    if option == 'default':
        print('Training logistic regression...')
        print('*-----------------------------*')
        print('Training on default parameters.')
        accuracies_default = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_logist(x_train, y_train, x_valid, y_valid))
        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))
    elif option == 'train':
        print('*-----------------------------*')
        print('Searching for best parameters.')
        params = []
        accuracies = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_logist(x_train, y_train, x_valid, y_valid,
                                    best_parameters)
            accuracies.append(accuracy)
        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')
        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])
    elif option == 'RFE':
        print('*-----------------------------*')
        print('Recursive feature elimination.')
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE
        ranking = perform_RFE(X, Y)
        print('*-----------------------------*')
        print('Ranking of descriptors.')
        print('*-----------------------------*')
        for d in range(len(qm_descriptors)):
            print(qm_descriptors[d], ranking[d])
    elif option == 'test':
        print('TESTING')
        print('*-----------------------------*')
        # alternative settings tried earlier:
        #penalties = 'l2'
        #Cs = 0.001
        #weights = None
        penalties = 'l1'
        Cs = 10
        weights = None
        params_dict = {'C': Cs, 'class_weight': weights, 'penalty': penalties}
        print(params_dict)
        acc_list = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            acc_list.append(
                train_logist(x_train, y_train, x_valid, y_valid, params_dict))
        print('Summary of Results.')
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
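train_logist is likewise defined elsewhere; the parameter names (C, class_weight, penalty) match scikit-learn's LogisticRegression, so a plausible sketch is:

from sklearn.linear_model import LogisticRegression

def train_logist(x_train, y_train, x_valid, y_valid, params=None):
    # Fit logistic regression and report validation accuracy.
    # Note: in recent scikit-learn an L1 penalty requires a compatible
    # solver such as 'liblinear' or 'saga'.
    if params:
        clf = LogisticRegression(solver='liblinear', **params)
    else:
        clf = LogisticRegression()
    clf.fit(x_train, y_train)
    return clf.score(x_valid, y_valid)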
# test dataset volume
test_data_size = 100
# threshold for classification
threshold = 0.7
k = 2

# dataset taken from Kaggle
fileName = 'Admission_Predict.csv'
# normalized version of it
normalizedFile = "Regression_Admission.csv"
# test set files
class_test = "Classification_Test_Data.csv"
reg_test = "Regression_Test_Data.csv"
# train set files
reg_train = "Regression_Train.csv"
class_train = "Classification_Train.csv"

# preparing the data
classifier(threshold, fileName, normalizedFile)
split_data(test_data_size, reg_train, class_train, reg_test, class_test)

# Principal Component Analysis
PCA()

# kNN algorithm
kNN(k, reg_train, reg_test, test_data_size)

# SVM algorithm
SVM_machine(class_train, class_test)

# Random Forest
average_f1_score("RandomForest")
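classifier() is assumed here to turn the Kaggle admissions data into a binary classification set by thresholding the chance-of-admit column at 0.7; the pandas sketch below is a guess at that preprocessing step, with the column position and output format both assumptions.

import pandas as pd

def classifier(threshold, in_file, out_file):
    # Label each row 1 if its admission chance clears the threshold, else 0,
    # and write the result out for the downstream classifiers.
    df = pd.read_csv(in_file)
    df['Label'] = (df.iloc[:, -1] >= threshold).astype(int)
    df.to_csv(out_file, index=False)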
    b2 = update(b2, gradientB2, learning_rate)

    # Return the trained weights and biases
    return [weight1, b1, weight2, b2]


def predict(X, weight1, b1, weight2, b2):
    [a1, z2] = forward_propogate(X, weight1, weight2, b1, b2)
    return z2


learning_rate = 0.00001
k_output = 1  # Dimension of the output
hidden_nodes = 30

data = np.genfromtxt('winequality-red.csv', delimiter=';')
data = data[1:]  # drop the header row

# Train/test split - 80/20
[trainData, testData] = dp.split_data(data, 0.8)
trainX = trainData[:, 0:-1]
trainY = dp.reshapeCol(trainData[:, -1])
testX = testData[:, 0:-1]
testY = dp.reshapeCol(testData[:, -1])

[weight1, b1, weight2, b2] = NN(trainX, trainY, hidden_nodes, learning_rate,
                                k_output)
y_pred = predict(testX, weight1, b1, weight2, b2)
mse = dp.MSE(y_pred, testY)
print('Test MSE:', mse)
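forward_propogate (spelling as in the source) is defined elsewhere; from the call sites it takes the input plus both layers' weights and biases and returns the hidden activation and the output. A sketch of a standard one-hidden-layer forward pass under that assumption:

import numpy as np

def forward_propogate(X, weight1, weight2, b1, b2):
    # Hidden layer: affine transform followed by a sigmoid nonlinearity.
    a1 = 1.0 / (1.0 + np.exp(-(np.dot(X, weight1) + b1)))
    # Output layer: linear, since the network regresses wine quality.
    z2 = np.dot(a1, weight2) + b2
    return [a1, z2]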
import numpy
import leastSquaresSolution as ls
import gradient_descent as gd
import data_preprocess as dp

# Data splitting
data = numpy.loadtxt(open("winequality-red.csv", "rb"), delimiter=";",
                     skiprows=1)
[data_train, data_ans] = dp.stripLastColAsTest(data)
[train, test] = dp.split_data(data_train, 0.5)
[train_ans, test_ans] = dp.split_data(data_ans, 0.5)

# Closed-form least squares solution
opt_weight = ls.leastSquareSolve(train, train_ans)
opt_w = numpy.transpose(opt_weight)
test_t = numpy.transpose(test)
predict = numpy.dot(opt_w, test_t)
error = dp.L2(test_ans, predict)
print('Least Squares Solution L2 Error: ', error)

# Gradient descent solution
opt_weight = gd.getOptimalWeights(train, train_ans)
opt_w = numpy.transpose(opt_weight)
test_t = numpy.transpose(test)
predict = numpy.dot(opt_w, test_t)
error = dp.L2(test_ans, predict)
print('Gradient Descent L2 Error: ', error)
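leastSquareSolve is imported from a module not shown; the closed-form solution it presumably computes can be sketched as follows (an assumption about its internals, not the project's actual code).

import numpy

def leastSquareSolve(train, train_ans):
    # Solve min_w ||Xw - y||^2, i.e. the normal equations X^T X w = X^T y.
    # numpy.linalg.lstsq handles rank-deficient X more robustly than an
    # explicit matrix inverse would.
    w, residuals, rank, sv = numpy.linalg.lstsq(train, train_ans, rcond=None)
    return w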