def main():
    """Fit closed-form linear regression on the housing data and log the MSE.

    Loads the housing train/test files, normalizes both feature sets with
    ``Preprocess.shift_and_scale``, builds ``rm.LinearRegression`` on the
    training split, prints the train/test MSE and writes them (together with
    the learned theta) through ``util.write_result_to_file``.
    """
    # training parameters
    result_path = 'results/housingLiR_1.mse'
    model_name = 'housing_shiftAndScale'
    # alternative normalization: Preprocess.zero_mean_unit_var
    normalization = Preprocess.shift_and_scale
    # columns passed as the "do not normalize" argument; (0, 7, 12) was also tried
    cols_not_norm = []

    # load and preprocess training data
    training_data = loader.load_dataset('data/housing_train.txt')
    testing_data = loader.load_dataset('data/housing_test.txt')
    Preprocess.normalize_features_all(normalization, training_data[0],
                                      testing_data[0], cols_not_norm)

    # start training
    model = rm.LinearRegression()
    model.build(training_data[0], training_data[1])
    training_mse = model.test(training_data[0], training_data[1], util.mse)
    testing_mse = model.test(testing_data[0], testing_data[1], util.mse)

    # Single-argument print(...) behaves the same on Python 2 and Python 3,
    # unlike the bare print statement.
    print('Error for training data is:')
    print(training_mse)
    print('Error for testing data is:')
    print(testing_mse)

    result = {
        'TrainingMSE': str(training_mse),
        'TestingMSE': str(testing_mse),
        'Theta': str(model.theta),
    }

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result)
def main():
    """Run the radius-based kNN experiment on the digits pickles.

    For each radius in (0.15, 0.1) a fresh ``kNN.kNN`` with the cosine kernel
    is fitted on the training split, and the train/test accuracy is printed
    together with the elapsed time.
    """
    kernel = c.COSINE

    # training parameter
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel
    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'

    # load and preprocess training data
    train_set = loader.load_pickle_file(tr_data_path)
    test_set = loader.load_pickle_file(te_data_path)
    splits = (train_set, test_set)

    # transpose label (keeps the first row of the transposed label array)
    for split in splits:
        split[1] = np.transpose(split[1])[0]
    # each split is normalized on its own, train first
    for split in splits:
        Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, split[0])

    def accuracy(split, predicted):
        # matching labels divided by the number of feature rows
        return (split[1] == predicted).sum() / split[0].shape[0]

    # start training
    st = time.time()
    print('{:.2f} Start training.'.format(time.time() - st))
    for r in (0.15, 0.1):
        clf = kNN.kNN(kernel=kernel, dataset=c.DS_DIGITS)
        clf.fit(train_set[0], train_set[1])
        tr_pred = clf.predict(train_set[0], r=r)
        te_pred = clf.predict(test_set[0], r=r)
        tr_acc = accuracy(train_set, tr_pred)
        te_acc = accuracy(test_set, te_pred)
        print('{} Final results with kernel {} and r={}. Train acc: {}, Test acc: {}'.format(
            time.time() - st, kernel, r, tr_acc, te_acc))
def main():
    """Run the k-nearest-neighbour experiment on the digits pickles.

    For k in (1, 3, 7) a classifier is fitted on the training split and the
    train/test accuracy is printed with the elapsed time. With ``is_sklearn``
    set, ``KNeighborsClassifier`` is used instead of the project ``kNN.kNN``.
    """
    is_sklearn = False
    # other kernels tried: c.COSINE, c.GAUSSIAN
    kernel = c.POLY

    # training parameter
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel
    model_path = 'data/PB1_B_digits_sk_Gaussian_1.model'
    # alternative data files: 'data\\digits\\tr_f_l.pickle' / 'data\\digits\\te_f_l.pickle'
    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'

    # load and preprocess training data
    train_set = loader.load_pickle_file(tr_data_path)
    test_set = loader.load_pickle_file(te_data_path)
    splits = (train_set, test_set)

    # transpose label (keeps the first row of the transposed label array)
    for split in splits:
        split[1] = np.transpose(split[1])[0]
    # each split is normalized on its own, train first
    for split in splits:
        Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, split[0])

    def accuracy(split, predicted):
        # matching labels divided by the number of feature rows
        return (split[1] == predicted).sum() / split[0].shape[0]

    # start training
    models = []
    st = time.time()
    print('{:.2f} Start training.'.format(time.time() - st))
    for k in (1, 3, 7):
        if is_sklearn:
            clf = KNeighborsClassifier(n_neighbors=k, metric=cosine_distances)
            clf.fit(train_set[0], train_set[1])
            tr_pred = clf.predict(train_set[0])
            te_pred = clf.predict(test_set[0])
        else:
            clf = kNN.kNN(kernel=kernel)
            clf.fit(train_set[0], train_set[1])
            tr_pred = clf.predict(train_set[0], k=k)
            te_pred = clf.predict(test_set[0], k=k)

        tr_acc = accuracy(train_set, tr_pred)
        te_acc = accuracy(test_set, te_pred)
        models.append(clf)
        print('{} Final results with kernel {} and k={}. Train acc: {}, Test acc: {}'.format(
            time.time() - st, kernel, k, tr_acc, te_acc))
def main():
    """Run the radius-based kNN experiment on one fold of the spam data.

    The spam pickle is normalized, split into ``k`` folds, and for fold 1 a
    ``kNN.kNN`` with the Euclidean kernel is fitted and scored for each radius
    in (2.5, 2.7). Accuracies are printed with the time elapsed in that fold.
    """
    # training parameter
    k = 8  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # TODO convert labels from {0, 1} to {-1, 1}
    # (util.replace_zero_label_with_neg_one is currently not applied here)
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    def accuracy(split, predicted):
        # matching labels divided by the number of feature rows
        return (split[1] == predicted).sum() / split[0].shape[0]

    # start training
    training_accs = []
    testing_accs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    kernel = c.EUCLIDEAN
    sst = time.time()

    for i in (1,):
        st = time.time()
        fold_train, fold_test = Preprocess.get_i_fold(k_folds, i)

        print('{:.2f} Start training.'.format(time.time() - st))
        for r in (2.5, 2.7):
            clf = kNN.kNN(kernel=kernel)
            # fit and score on this fold's split, not on the full data set
            clf.fit(fold_train[0], fold_train[1])
            tr_pred = clf.predict(fold_train[0], r=r)
            te_pred = clf.predict(fold_test[0], r=r)
            tr_acc = accuracy(fold_train, tr_pred)
            te_acc = accuracy(fold_test, te_pred)
            testing_accs.append(te_acc)
            print('{} {}-fold results with kernel {}, r={}. Train acc: {}, Test acc: {}'.format(
                time.time() - st, i, kernel, r, tr_acc, te_acc))
def main():
    """Train a linear-kernel ``svm.SVC`` on one fold of the spam data.

    The spam pickle is loaded, zero labels are replaced through
    ``util.replace_zero_label_with_neg_one``, features are normalized, and the
    data is split into ``k`` folds. Only the first fold (index 0) is trained
    and scored; the train/test accuracy is printed with the elapsed time.
    """
    # training parameter
    k = 10  # fold
    data_path = "data/spam/data.pickle"
    # other kernels tried: 'poly', 'rbf'
    kernel = "linear"
    verbose = False
    tol = 0.01
    # Named penalty_c rather than ``c`` so it cannot shadow a ``Consts as c``
    # module alias.
    penalty_c = 0.1

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)
    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)
    # normalize
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    print("Preparing k fold data.")
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        # "{:.3f}" (three decimals) replaces "{:3f}", which only set a minimum
        # width of 3 and so had no visible effect.
        print("{:.3f} Start training. Kernel: {}".format(time.time() - st, kernel))
        clf = svm.SVC(C=penalty_c, kernel=kernel, tol=tol, verbose=verbose)
        # alternative: svm.NuSVC(kernel=kernel, tol=tol, verbose=verbose)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0])
        te_pred = clf.predict(te_data[0])

        # accuracy = matching labels / number of feature rows
        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
        print("{:.3f} Final results. Train acc: {}, Test acc: {}".format(
            time.time() - st, tr_acc, te_acc))
from perceptron_dual import PerceptronDual
import csv
import Utilities as util
import numpy as np
import Consts as c
import Preprocess

# Script: fit a dual perceptron with the Gaussian kernel on the two-spirals data.

data_file = 'data/twoSpirals.txt'

# load and preprocess data
# Each tab-separated row holds the feature values followed by the label in
# the last column.
features = []
labels = []
with open(data_file) as f:
    for row in csv.reader(f, delimiter='\t'):
        # the label is parsed through float first, then truncated to int
        row_label = int(float(row[-1]))
        sign = 1
        row_features = []
        for value in row[:-1]:
            row_features.append(sign * float(value))
        features.append(row_features)
        labels.append([row_label])

features = np.array(features)
# alternative normalization: Preprocess.shift_and_scale
Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, features)
# turn the column of one-element label rows into a flat label vector
label_matrix = np.array(labels)
labels = label_matrix.transpose()[0]

# create perceptron
# alternative kernel: c.LINEAR
kernel = c.GAUSSIAN
model = PerceptronDual(kernel_fun=kernel)
model.fit(features, labels)
# params
lamda = 0.5
tol = 0.92
normalize_method = prep.zero_mean_unit_var
term_method = util.acc_higher_than_ridge

# load and preprocess training data
# NOTE(review): train_data_path, test_data_path, model_path and st are
# expected to be defined before this point; they are not set in this section.
tr_data = loader.load_pickle_file(train_data_path)
te_data = loader.load_pickle_file(test_data_path)
print("{:.2f} Data loaded!".format(time.time() - st))

# the feature containers are converted to plain lists before normalization
for split in (tr_data, te_data):
    split[0] = split[0].tolist()

# normalize features (train and test are handed over in one call)
prep.normalize_features_all(normalize_method, tr_data[0], te_data[0])
print("{:.2f} Features normalized!".format(time.time() - st))

# load the model and start from its stored theta
saved_model = loader.load_pickle_file(model_path)
theta = saved_model.theta

is_batch = True
penalty = "l2"  # l2 for RIDGE
alpha = 0.05
model = gd.LogisticRegressionGD(theta, penalty, alpha)

# The test split is passed to build() as two extra trailing arguments;
# the shorter call without them was used previously.
model.build(
    tr_data[0], tr_data[1],
    lamda, term_method, tol, is_batch,
    te_data[0], te_data[1],
)

training_acc = model.test(tr_data[0], tr_data[1], util.acc)
testing_acc = model.test(te_data[0], te_data[1], util.acc)
print("{} Final results. Train acc: {}, Test acc: {}".format(
    time.time() - st, training_acc, testing_acc))
# Train gradient-descent linear regression on the housing data and report MSE.

# training parameter
result_path = 'results/housingLiRGD_1.mse'
model_name = 'housing'
lamda = 0.0001  # 0.000015 was also tried
is_batch = False
# alternative normalization: Preprocess.zero_mean_unit_var
normalization = Preprocess.shift_and_scale
term_fun = util.mse_less_than
term_thresh = 25
cols_not_norm = [0, 7]

# load and preprocess training data
training_data = loader.load_dataset('data/housing_train.txt')
testing_data = loader.load_dataset('data/housing_test.txt')
Preprocess.normalize_features_all(normalization, training_data[0], testing_data[0],
                                  not_norm=cols_not_norm)

# start training
model = gd.LinearRegressionGD()
try:
    # The build call has to be inside the try block: previously it ran before
    # a `try: pass`, so the KeyboardInterrupt handler could never fire and
    # Ctrl-C aborted the script without reporting any error figures.
    model.build(training_data[0], training_data[1], lamda, term_fun, term_thresh, is_batch)
except KeyboardInterrupt:
    print('Interrupted')

# Reached after a normal finish or a manual interrupt; any other exception
# from build() propagates instead of being masked by a failing evaluation.
training_mse = model.test(training_data[0], training_data[1], util.mse)
testing_mse = model.test(testing_data[0], testing_data[1], util.mse)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Error for training data is:')
print(training_mse)
print('Error for testing data is:')
print(testing_mse)
import numpy as np
import Utilities as util
import RegressionModel as rm
import Consts as c

# Script: k-fold evaluation of linear regression on the spambase data.

# training parameter
k = 50  # fold
result_path = "results/spamLiR_5.acc"
model_name = "spam_{}fold_zeroMean".format(k)
# alternative normalization: Preprocess.zero_mean_unit_var
normalization = Preprocess.shift_and_scale

# load and preprocess training data
training_data = loader.load_dataset("data/spambase.data")
Preprocess.normalize_features_all(normalization, training_data[0])

# start training
# accumulators for the per-fold results
training_accs, training_cms = [], []
testing_accs, testing_cms = [], []
roc = []
auc = 0.0

for i in range(k):
    # fold numbers handed to prepare_k_fold_data start at 1
    (tr_data, te_data) = Preprocess.prepare_k_fold_data(training_data, k, i + 1)

    model = rm.LinearRegression()
    model.build(tr_data[0], tr_data[1])

    training_test_res = model.test(
        tr_data[0], tr_data[1], util.compute_acc_confusion_matrix)
def main(config_path):
    """Build and test a classifier as described by a config file.

    The config is read with ``loader.load_config``. Training/testing data are
    loaded when their keys are present, optionally normalized, and a
    thresholds file is generated if one is configured but missing. Depending
    on ``config[c.VALID_METHOD]`` the model is evaluated either by k-fold
    validation or against the supplied testing data. The resulting errors are
    printed, added to ``config``, and ``str(config)`` is written to
    ``config[c.OUTPUT_PATH]``.

    :param config_path: path handed to ``loader.load_config``.
    """
    config = loader.load_config(config_path)
    training_data = None
    testing_data = None

    # load training and testing data from files, normalize if necessary
    if c.TRAINING_D in config:
        training_data = loader.load_dataset(config[c.TRAINING_D])
    if c.TESTING_D in config:
        testing_data = loader.load_dataset(config[c.TESTING_D])
    if c.NORM_METHOD in config:
        # method stays None when the configured name is not recognised
        method = None
        if config[c.NORM_METHOD] == c.SHIFT_SCALE:
            method = Preprocess.shift_and_scale
        elif config[c.NORM_METHOD] == c.ZERO_MEAN_UNIT_VAR:
            method = Preprocess.zero_mean_unit_var
        if c.TESTING_D in config:
            Preprocess.normalize_features_all(method, training_data[0], testing_data[0])
        else:
            Preprocess.normalize_features_all(method, training_data[0])

    # generate thresholds file if needed
    if c.THRESHS in config and not os.path.isfile(config[c.THRESHS]):
        Preprocess.generate_thresholds(training_data[0], config[c.THRESHS])

    # get path to store models and output results
    # (model_path is not used below; the lookup is kept so a missing
    # MODEL_PATH entry still fails here, as before)
    model_path = config[c.MODEL_PATH]
    output_path = config[c.OUTPUT_PATH]

    # use different validation method based on the config
    match = re.match(c.K_FOLD_RE, config[c.VALID_METHOD])
    if match:
        # perform k-fold validation
        k = int(match.group(c.K_GROUP))
        # looked up once instead of twice per fold
        test_method = Utilities.get_test_method(config)
        training_errs = []
        testing_errs = []
        for i in range(k):
            # fold numbers handed to prepare_k_fold_data start at 1
            (tr_data, te_data) = Preprocess.prepare_k_fold_data(training_data, k, i + 1)
            model = builder.build_model(tr_data, config)
            training_errs.append(model.test(tr_data[0], tr_data[1], test_method))
            testing_errs.append(model.test(te_data[0], te_data[1], test_method))
        mean_training_err = np.mean(training_errs)
        mean_testing_err = np.mean(testing_errs)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(str(k) + '-fold validation done. \nTraining errors are:')
        print(training_errs)
        print('Mean training error is:')
        print(mean_training_err)
        print('Testing errors are:')
        print(testing_errs)
        print('Mean testing error is:')
        print(mean_testing_err)

        config['TrainingErrs'] = str(training_errs)
        config['MeanTrainingErr'] = str(mean_training_err)
        config['TestingErrs'] = str(testing_errs)
        config['MeanTestingErr'] = str(mean_testing_err)
    elif config[c.VALID_METHOD] == c.HAS_TESTING_DATA:
        # perform testing with given testing dataset
        test_method = Utilities.get_test_method(config)
        model = builder.build_model(training_data, config)
        training_err = model.test(training_data[0], training_data[1], test_method)
        testing_err = model.test(testing_data[0], testing_data[1], test_method)

        print('Error for training data is:')
        print(training_err)
        print('Error for testing data is:')
        print(testing_err)

        config['TrainingErr'] = str(training_err)
        config['TestingErr'] = str(testing_err)

    # Log the err; the context manager closes the file even if write() raises
    with open(output_path, 'w+') as f:
        f.write(str(config))