from sklearn import svm

import data_preprocess

# Load the iris data set: four numeric features per row, class label last.
data = []
ans = []
with open('iris.data.txt', 'rt') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # skip the trailing blank line in the raw file
        fields = line.split(',')
        data.append([float(v) for v in fields[0:-1]])
        ans.append(fields[-1])

# Data set splitting

# Shuffle array
[data, ans] = data_preprocess.shuffle(data, ans)

# Split into train/test (60/40), then carve a validation set out of train
[trainData, testData] = data_preprocess.split_data(data, 0.6)
[trainAns, testAns] = data_preprocess.split_data(ans, 0.6)

[trainDataSmall,
 validData] = data_preprocess.split_data(trainData, (2.0 / 3.0))
[trainAnsSmall, validAns] = data_preprocess.split_data(trainAns, (2.0 / 3.0))

# Sweep candidate values of the SVM regularization parameter C
C_Param_Values = [1, 50, 200, 500, 1000]
accuracy_a = []

for param in C_Param_Values:
    clf = svm.SVC(C=param)
    clf.fit(trainDataSmall, trainAnsSmall)
    accuracy = clf.score(validData, validAns)
    accuracy_a.append(accuracy)
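# Not in the original snippet: a natural follow-up is to keep the C with the
# best validation accuracy, retrain on the full training split, and report
# accuracy on the held-out test set. A minimal sketch using the names above:
best_C = C_Param_Values[accuracy_a.index(max(accuracy_a))]
clf = svm.SVC(C=best_C)
clf.fit(trainData, trainAns)
print('Best C:', best_C, 'test accuracy:', clf.score(testData, testAns))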
Example #2
def build_SVM(filename, option, svm_type=None, poly_degree=None):

    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    # Default to a linear kernel and a degree-2 polynomial when unspecified.
    if svm_type is None:
        svm_type = 'linear'

    if poly_degree is None:
        poly_degree = 2
    if option == 'default':

        print('Training SVM...')
        print('*-----------------------------*')
        print('Training on default parameters.')

        accuracies_default = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_SVM(x_train, y_train, x_valid, y_valid))

        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))
        
    elif option == 'train':

        print('*-----------------------------*')
        print('Searching for best parameters.')

        params = []
        accuracies = []

        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_SVM(x_train, y_train, x_valid, y_valid,
                                 best_parameters)
            accuracies.append(accuracy)

        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')

        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])

    elif option == 'RFE':
        
        print('*-----------------------------*')
        print('Recursive feature elimination.')
        #http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE
        
        ranking = perform_RFE(X, Y)
            
        print('*-----------------------------*')
        print('Ranking of descriptors.')            
        print('*-----------------------------*')
        for d in range(len(qm_descriptors)):
            print(qm_descriptors[d], ranking[d])


    elif option == 'test':
        
        print('TESTING')
        print('*-----------------------------*')
        
        # A previously tried setting: kernel='rbf', C=1, gamma=1, degree=3.
        kernels = 'rbf'
        Cs = 10
        gammas = 0.1
        degrees = 3
        weights = None

        params_dict = {'kernel': kernels, 'C': Cs, 'class_weight': weights,
                       'degree': degrees, 'gamma': gammas}
        
        acc_list = []
        
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data (X, Y, partition=0.20)
            
            acc_list.append(train_SVM(x_train, y_train, x_valid, y_valid, params_dict))
        
        print('Summary of Results.')            
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
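# scan_parameters is not shown in this example. A plausible sketch, assuming it
# wraps sklearn's GridSearchCV around an SVC (the parameter grid below is
# hypothetical, not the author's confirmed choice):
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def scan_parameters(x_train, y_train):
    grid = {'kernel': ['linear', 'rbf', 'poly'],
            'C': [0.1, 1, 10, 100],
            'gamma': [0.01, 0.1, 1]}
    search = GridSearchCV(SVC(), grid, cv=3)
    search.fit(x_train, y_train)
    return search.best_params_  # a dict shaped like params_dict above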
Example #3
# Attach each configured handler to the logger (reconstructed loop; the
# `handlers` collection is assumed defined in the truncated setup above).
for handler in handlers:
    handler.setLevel(level)
    logger.addHandler(handler)
logger.setLevel(level)

#======= loading data ==========
logger.info('Start')
config = load_yaml(args.config)
config['loader_config']['data_checkpoint'] = args.data_checkpoint
x, y, weights = load_from_config(config, args.force_resample)

#======= split data =======
checkpoint_name = args.splits_checkpoint
cv_cfg = config['cross_validation']
cv_cfg['checkpoint'] = checkpoint_name
cv_splits, train_idx, test_idx, weights = split_data(y, weights, cv_cfg,
                                                     args.validation_mode,
                                                     args.force_resplit)
data_stuff = [x, y, weights]

kind = config['model_params'].get('kind', 'any')
is_multitask = 'multitask' in kind
if is_multitask:
    axis = 1
    N_outputs = len(y)
else:
    axis = 0
    N_outputs = -1

to_relax = config['model_params'].get('relax', False)

#======= print info =======
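# load_yaml is not defined in this fragment; assuming it is a thin PyYAML
# wrapper, a minimal sketch would be:
import yaml

def load_yaml(path):
    with open(path) as f:
        return yaml.safe_load(f)  # config as a nested dict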
Example #4
def build_NN_classifier(filename, option, model_name=None):

    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    # IF DOWNSAMPLING:
    #print('>> Down sampling.')
    #smaller_x, smaller_y = data_preprocess.do_down_sampling(X,Y)

    if option == 'default':

        print('Training NN...')
        print('*-----------------------------*')
        print('Training on default parameters.')

        accuracies_default = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_NN(x_train, y_train, x_valid, y_valid))

        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))

    elif option == 'train':

        print('*-----------------------------*')
        print('Searching for best parameters.')

        params = []
        accuracies = []

        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_NN(x_train, y_train, x_valid, y_valid,
                                best_parameters)
            accuracies.append(accuracy)

        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')

        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])

    elif option == 'test':

        print('TESTING')
        print('*-----------------------------*')

        hidden_layer_sizes = (100, 100)
        solver = 'adam'
        alpha = 0.001

        params_dict = {
            'hidden_layer_sizes': hidden_layer_sizes,
            'solver': solver,
            'alpha': alpha,
            'max_iter': 400
        }

        print(params_dict)

        acc_list = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            acc_list.append(
                train_NN(x_train, y_train, x_valid, y_valid, params_dict))

        print('Summary of Results.')
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
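# train_NN is not shown. A minimal sketch, assuming it fits sklearn's
# MLPClassifier with the given parameters and returns validation accuracy
# (hypothetical helper; the author's version may differ):
from sklearn.neural_network import MLPClassifier

def train_NN(x_train, y_train, x_valid, y_valid, params=None):
    clf = MLPClassifier(**(params or {}))
    clf.fit(x_train, y_train)
    return clf.score(x_valid, y_valid)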
Example #5
def build_logist(filename, option, model_name=None):

    # LOAD DATA
    descriptors = qm_descriptors
    X, Y = data_preprocess.load_data(filename, descriptors)

    if option == 'default':

        print('Training logistic regression...')
        print('*-----------------------------*')
        print('Training on default parameters.')

        accuracies_default = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            accuracies_default.append(
                train_logist(x_train, y_train, x_valid, y_valid))

        print('Average accuracy over 10 default runs: %.2f' %
              numpy.mean(accuracies_default))

    elif option == 'train':

        print('*-----------------------------*')
        print('Searching for best parameters.')

        params = []
        accuracies = []

        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            best_parameters = scan_parameters(x_train, y_train)
            params.append(best_parameters)
            accuracy = train_logist(x_train, y_train, x_valid, y_valid,
                                    best_parameters)
            accuracies.append(accuracy)

        print('*-----------------------------*')
        print('Summary of Results.')
        print('*-----------------------------*')

        for i in range(len(accuracies)):
            print('Run ' + str(i + 1) + ' ', params[i], ' : ', accuracies[i])

    elif option == 'RFE':

        print('*-----------------------------*')
        print('Recursive feature elimination.')
        #http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE

        ranking = perform_RFE(X, Y)

        print('*-----------------------------*')
        print('Ranking of descriptors.')
        print('*-----------------------------*')
        for d in range(len(qm_descriptors)):
            print(qm_descriptors[d], ranking[d])

    elif option == 'test':

        print('TESTING')
        print('*-----------------------------*')
        # A previously tried setting: penalty='l2', C=0.001.
        penalties = 'l1'
        Cs = 10
        weights = None

        params_dict = {'C': Cs, 'class_weight': weights, 'penalty': penalties}
        print(params_dict)

        acc_list = []
        for i in range(10):
            x_train, x_valid, y_train, y_valid = data_preprocess.split_data(
                X, Y, partition=0.20)
            acc_list.append(
                train_logist(x_train, y_train, x_valid, y_valid, params_dict))

        print('Summary of Results.')
        print('*-----------------------------*')
        print('Average accuracy over 10 runs: %.2f' % numpy.mean(acc_list))
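# perform_RFE is not shown; the snippet links to sklearn's RFE documentation.
# A sketch along those lines (the estimator choice is an assumption):
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def perform_RFE(X, Y):
    selector = RFE(LogisticRegression(), n_features_to_select=1)
    selector.fit(X, Y)
    return selector.ranking_  # rank 1 = most useful descriptor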
Example #6
# test set size
test_data_size = 100
# threshold for classifying admission chance as positive
threshold = 0.7
# number of neighbours for kNN
k = 2
# dataset taken from Kaggle
fileName = 'Admission_Predict.csv'
# normalized version of it
normalizedFile = "Regression_Admission.csv"
# test set files
class_test = "Classification_Test_Data.csv"
reg_test = "Regression_Test_Data.csv"
# train set files
reg_train = "Regression_Train.csv"
class_train = "Classification_Train.csv"

# prepare the data
classifier(threshold, fileName, normalizedFile)
split_data(test_data_size, reg_train, class_train, reg_test, class_test)

# Principal Component Analysis
PCA()

# kNN regression
kNN(k, reg_train, reg_test, test_data_size)

# SVM classification
SVM_machine(class_train, class_test)

# Random Forest
average_f1_score("RandomForest")
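# classifier() is not shown. A plausible sketch of the preprocessing it implies:
# binarize the admission-chance column at the given threshold (the assumption
# that the label sits in the last CSV column is mine, not the author's):
import csv

def classifier(threshold, in_path, out_path):
    with open(in_path, newline='') as src, open(out_path, 'w', newline='') as dst:
        reader, writer = csv.reader(src), csv.writer(dst)
        writer.writerow(next(reader))  # copy the header row
        for row in reader:
            row[-1] = '1' if float(row[-1]) >= threshold else '0'
            writer.writerow(row)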
Example #7
        b2 = update(b2, gradientB2, learning_rate)

    # Return the trained weights and biases for both layers
    return [weight1, b1, weight2, b2]


def predict(X, weight1, b1, weight2, b2):
    [a1, z2] = forward_propogate(X, weight1, weight2, b1, b2)
    return z2


learning_rate = 0.00001
k_output = 1  # Dimension of the output
hidden_nodes = 30

data = np.genfromtxt('winequality-red.csv', delimiter=';')
data = data[1:]  # drop the header row (parsed as NaN by genfromtxt)
# Train Test split - 80/20
[trainData, testData] = dp.split_data(data, 0.8)
trainX = trainData[:, 0:-1]
trainY = dp.reshapeCol(trainData[:, -1])

testX = testData[:, 0:-1]
testY = dp.reshapeCol(testData[:, -1])

[weight1, b1, weight2, b2] = NN(trainX, trainY, hidden_nodes, learning_rate,
                                k_output)
y_pred = predict(testX, weight1, b1, weight2, b2)
mse = dp.MSE(y_pred, testY)
print('Test MSE:', mse)
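# dp.MSE is not shown; under the standard definition it reduces to a one-liner
# (assumed implementation):
def MSE(y_pred, y_true):
    return np.mean((y_pred - y_true) ** 2)
Example #8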
import numpy
import leastSquaresSolution as ls
import gradient_descent as gd
import data_preprocess as dp

# Data splitting
data = numpy.loadtxt(open("winequality-red.csv", "rb"),
                     delimiter=";",
                     skiprows=1)
[data_train, data_ans] = dp.stripLastColAsTest(data)
[train, test] = dp.split_data(data_train, 0.5)
[train_ans, test_ans] = dp.split_data(data_ans, 0.5)

opt_weight = ls.leastSquareSolve(train, train_ans)

opt_w = numpy.transpose(opt_weight)
test_t = numpy.transpose(test)

predict = numpy.dot(opt_w, test_t)
error = dp.L2(test_ans, predict)

print('Least Squares Solution L2 Error:', error)

opt_weight = gd.getOptimalWeights(train, train_ans)

opt_w = numpy.transpose(opt_weight)
test_t = numpy.transpose(test)

predict = numpy.dot(opt_w, test_t)
error = dp.L2(test_ans, predict)

print('Gradient Descent L2 Error:', error)
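
# The dp helpers used above are not shown; plausible sketches matching how
# they are called (assumed implementations):
def stripLastColAsTest(data):
    # Separate the feature columns from the last-column target.
    return data[:, 0:-1], data[:, -1]

def L2(y_true, y_pred):
    # Euclidean norm of the residual vector.
    return numpy.linalg.norm(y_true - y_pred)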