def load_clean_csv(data_path, sub_sample=False, missing_val="ignore", normalized=True):
    """Load a clean csv. Specify data_path, sub_sample (True/False),
    missing_val (ignore, avg, median), normalized (True/False).
    Return yb, input_data, and ids."""
    yb, input_data, ids = load_csv_data(data_path, sub_sample)
    missing_ind = get_missing_index(input_data)
    incomplete_features = np.unique(np.where(input_data == -999.0)[1])
    if missing_val == "avg":
        mean = np.mean(input_data[~missing_ind], 0)
        for i in incomplete_features:
            np.place(input_data[:, i], input_data[:, i] == -999, mean[i])
    elif missing_val == "median":
        median = np.median(input_data[~missing_ind], 0)
        for i in incomplete_features:
            np.place(input_data[:, i], input_data[:, i] == -999, median[i])
    else:
        yb = yb[~missing_ind]
        input_data = input_data[~missing_ind]
        ids = ids[~missing_ind]
    if normalized:
        input_m = np.mean(input_data, 0)
        input_std = np.std(input_data, 0)
        input_data = (input_data - input_m) / input_std
    return yb, input_data, ids
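# Hedged sketch: get_missing_index is called above but not defined in this
# snippet. A minimal implementation consistent with its use (a boolean row
# mask selecting samples that contain at least one -999 placeholder) could
# look like this; the real helper may differ.
import numpy as np

def get_missing_index(input_data):
    """Return a boolean mask of rows containing at least one -999 value."""
    return (input_data == -999.0).any(axis=1)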
def process_data(path, inv_log=False):
    """Process the data before using it, doing some feature engineering.
    :param path: path of the dataset
    :param inv_log: apply log on the positive columns of the dataset
    :return: y, processed data, masks based on pri_jet_num, ids
    """
    y, X, ids = helpers.load_csv_data(path)
    dict_mask_jets_train = helpers_us.get_jet_masks(X)
    new_X = []
    for i in range(len(dict_mask_jets_train)):
        new_X.append(np.delete(X[dict_mask_jets_train[i]], [22, 29], axis=1))
    # Drop the columns that are entirely undefined (-999) within a jet group
    for i in range(len(dict_mask_jets_train)):
        undefined_columns = [j for j in range(len(new_X[i][0]))
                             if (new_X[i][:, j] < -900).all()]
        new_X[i] = np.delete(new_X[i], undefined_columns, axis=1)
    # Replace the remaining undefined entries by the column mean of valid values
    for i in range(len(dict_mask_jets_train)):
        for j in range(len(new_X[i][0])):
            col = new_X[i][:, j]
            m = np.mean(col[col >= -900])  # mean of the valid entries
            col[np.where(col < -900)] = m
            new_X[i][:, j] = col
    if inv_log:
        new_X = helpers_us.log_f(new_X)
    for i in range(1, len(dict_mask_jets_train)):
        new_X[i], x_mean, x_std = helpers_us.standardize(new_X[i])
    return y, new_X, dict_mask_jets_train, ids
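# Hedged sketch: helpers_us.get_jet_masks is assumed to group rows by the
# categorical PRI_jet_num feature (column 22), since the masks above are
# indexed 0..3 and column 22 is deleted afterwards. A minimal version under
# that assumption:
import numpy as np

def get_jet_masks(x, jet_col=22):
    """Return a dict {jet_num: boolean row mask} based on column jet_col."""
    return {int(v): x[:, jet_col] == v for v in np.unique(x[:, jet_col])}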
def main():
    """
    Tests the six mandatory implementations on the raw data sets.
    Splits the original training set into a new training set and a test set,
    with the ratio of the new training set to the old one being 0.8.
    Reports the percentage of correct predictions for each method.
    As a side note, standardization of the data helps algorithms that use
    gradient descent, hence standardized features are used in those iterative
    algorithms.
    """
    y, tx, _ = load_csv_data('train.csv')
    y_train, tx_train, y_test, tx_test = train_test_split(y, tx, 0.8)
    standardized_tx_train, mean_tx_train, std_tx_train = standardize(tx_train)
    standardized_tx_test, _, _ = standardize(tx_test, mean_tx_train, std_tx_train)
    test_least_squares_GD(y_train, standardized_tx_train, y_test, standardized_tx_test)
    test_least_squares_SGD(y_train, standardized_tx_train, y_test, standardized_tx_test)
    test_least_squares(y_train, tx_train, y_test, tx_test)
    test_ridge_regression(y_train, tx_train, y_test, tx_test)
    y_train = change_labels_logistic(y_train)
    y_test = change_labels_logistic(y_test)
    test_logistic_regression(y_train, standardized_tx_train, y_test, standardized_tx_test)
    test_reg_logistic_regression(y_train, standardized_tx_train, y_test, standardized_tx_test)
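# Hedged sketch: train_test_split is used above with a 0.8 train ratio but is
# defined elsewhere in the repo. A minimal version (random permutation, then
# split) matching the return order used above; the actual helper may differ:
import numpy as np

def train_test_split(y, tx, ratio, seed=1):
    """Split (y, tx) into train/test parts with the given train ratio."""
    np.random.seed(seed)
    indices = np.random.permutation(len(y))
    split = int(ratio * len(y))
    train_idx, test_idx = indices[:split], indices[split:]
    return y[train_idx], tx[train_idx], y[test_idx], tx[test_idx]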
def load_data_sets(y_train_jets, tx_train_jets, ids_train_jets,
                   y_test_jets, tx_test_jets, ids_test_jets):
    print('\nLoading the processed training and test set data for each jet number...')
    for jet_num in range(4):
        y_train, tx_train, ids_train = load_csv_data(training_files[jet_num])
        y_train_jets.append(y_train)
        tx_train_jets.append(tx_train)
        ids_train_jets.append(ids_train)
        y_test, tx_test, ids_test = load_csv_data(test_files[jet_num])
        y_test_jets.append(y_test)
        tx_test_jets.append(tx_test)
        ids_test_jets.append(ids_test)
        print('\nTraining and test set data for jet', str(jet_num), 'is loaded.')
    print('\n... done.')
def load(trainFile, testFile):
    """
    Builds various numpy arrays from the given .csv format training and test sets.

    Args:
        trainFile: file name/path for the input training set
        testFile: file name/path for the input test set

    Returns:
        y_train: labels in the training set as a numpy array
        tx_train: features in the training set as a numpy array
        ids_train: ids of the training data points as a numpy array
        y_test: labels in the test set as a numpy array
        tx_test: features in the test set as a numpy array
        ids_test: ids of the test data points as a numpy array
    """
    print('\nLoading the raw training and test set data...')
    y_train, tx_train, ids_train = load_csv_data(trainFile)
    y_test, tx_test, ids_test = load_csv_data(testFile)
    print('\n... finished.')
    return y_train, tx_train, ids_train, y_test, tx_test, ids_test
def load_data(change_labels=True):
    """
    Loads the training and testing data from disk.

    Args:
        change_labels: Convert the labels from -1/1 to 0/1 for logistic regression.
    """
    train_path = "../data/train.csv"
    test_path = "../data/test.csv"
    print('Reading from file {}'.format(train_path))
    y, tx, ids = load_csv_data(train_path, sub_sample=False)
    y = np.expand_dims(y, axis=1)
    if change_labels:
        y = np.where(y == -1, 0, y)
    print('Reading from file {}'.format(test_path))
    _, tx_submission, _ = load_csv_data(test_path, sub_sample=False)
    return tx, y, tx_submission
def main(param):
    # Load train set
    y, x, i = load_csv_data('data/train.csv', sub_sample=False)
    # Load test set
    y_test, x_test, i_test = load_csv_data('data/test.csv', sub_sample=False)
    # Reshape y
    y = y.reshape(y.shape[0], 1)
    # Preprocess x (remove features with many -999 values)
    x = remove_columns(x)
    # Number of subsets for cross-validation
    N_TOTAL_FOLDS = 1
    accuracies = []
    x, y = shuffle_data(x, y)
    for k in range(0, x.shape[0], x.shape[0] // N_TOTAL_FOLDS):
        accuracy, y_predictions, w = crossvalidation(y, x, k, N_TOTAL_FOLDS, param)
        accuracies.append(accuracy)
    print(accuracies)
    # plot_result(lambdas, accuracies)
    submission(x_test, w, i_test)
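# Hedged sketch: remove_columns is assumed to drop features dominated by the
# -999 placeholder, as the comment above suggests. One plausible version,
# dropping columns whose fraction of undefined entries exceeds a threshold
# (the threshold value is an assumption):
import numpy as np

def remove_columns(x, threshold=0.5):
    """Drop columns whose fraction of -999 entries exceeds threshold."""
    missing_fraction = (x == -999).mean(axis=0)
    return x[:, missing_fraction <= threshold]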
def eval_train(self):
    if self._tX is None:
        y, tX, _ = load_csv_data(self._DATA_TRAIN_PATH)
        self._y, self._tX = self.prepare_all_data(y, tX)
        self._tX_orig = self._tX.copy()
        self._y_orig = self._y.copy()
        self._y, self._tX = self._prepare_model_data(self._y, self._tX)
        self._orig_train = True
    y_pred = self._predict(self._tX)
    # performance of model 5 on train dataset
    acc = 1 - sum(abs(self._y_orig - y_pred) / 2) / self._y_orig.shape[0]
    print('Total accuracy: ' + str(acc))
    return acc
def predict_test(self, x=None, ids=None):
    if x is None or ids is None:
        if self._orig_test is False:
            _, _tX_test, self._ids_test = load_csv_data(self._DATA_TEST_PATH)
            _, self._tX_test = self.prepare_all_data(None, _tX_test)
            self._tX_orig = self._tX_test.copy()
            _, self._tX_test = self._prepare_model_data(None, self._tX_test)
            self._orig_test = True
    else:
        _, self._tX_test = self.prepare_all_data(None, x.copy())
        self._tX_orig = self._tX_test.copy()
        self._ids_test = ids.copy()
        _, self._tX_test = self._prepare_model_data(None, self._tX_test)
        self._orig_test = False
    y_test_pred = self._predict(self._tX_test)
    create_csv_submission(self._ids_test, y_test_pred, self._output_path)
def train(self, y=None, x=None):
    if y is None or x is None:
        if self._orig_train is False:
            self._x_mean, self._x_std = None, None
            y, tX, _ = load_csv_data(self._DATA_TRAIN_PATH)
            self._y, self._tX = self.prepare_all_data(y, tX)
            self._tX_orig = self._tX.copy()
            self._y_orig = self._y.copy()
            self._y, self._tX = self._prepare_model_data(self._y, self._tX)
            self._orig_train = True
    else:
        self._x_mean, self._x_std = None, None
        self._y, self._tX = self.prepare_all_data(y.copy(), x.copy())
        self._tX_orig = self._tX.copy()
        self._y_orig = self._y.copy()
        self._y, self._tX = self._prepare_model_data(self._y, self._tX)
        self._orig_train = False
    self._train_model()
    return self.eval_train()
fold_count = 1
seed = 2

# optimization
gd_func = gradient_descent.logistic_L2_gradient_descent
max_iters = 7000
gamma = 0.08

# lambdas (to find with grid search)
lambdas = np.linspace(10, 13, num=6)
lambda_best = 0

################################################################################
#                                  read data                                   #
################################################################################

(y, X, id) = proj1_helpers.load_csv_data(DATA_TRAIN_PATH, sub_sample=False)

# y is categorical, so we want integers (-1, 1) instead of floats (-1.0, 1.0).
# Modified here instead of in load_csv_data, because we don't know if we have
# the right to change the provided functions.
y = y.astype(int)

# The formulas used for the cost and gradients of the logistic function expect
# categories that are 0/1, so that some terms disappear in the equations.
y[np.where(y == -1)] = 0

################################################################################
#                                  clean data                                  #
################################################################################

# one-hot coding for "PRI_jet_num" (column 22)
(id, y, X) = clean_data.one_hot_PRI_jet_num(id, y, X)
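# Hedged sketch: clean_data.one_hot_PRI_jet_num is assumed to replace the
# categorical column 22 with one indicator column per jet value, keeping the
# (id, y, X) signature used above. A minimal version under that assumption
# (the real helper may order or place the columns differently):
import numpy as np

def one_hot_PRI_jet_num(id, y, X, jet_col=22):
    """Replace column jet_col of X by one-hot indicator columns."""
    values = np.unique(X[:, jet_col])
    indicators = np.column_stack([(X[:, jet_col] == v).astype(float) for v in values])
    X = np.delete(X, jet_col, axis=1)
    return id, y, np.hstack((X, indicators))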
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from proj1_helpers import load_csv_data
import numpy as np

ITERATIONS = 50000
SAMPLE_SIZE = 250000  # at most 250000; step = 250k / SAMPLE_SIZE

# this project's load_csv_data variant takes a step parameter and returns four values
y, X, _, _ = load_csv_data('all/train.csv', step=int(250000. / SAMPLE_SIZE))

# Clean the dataset by removing all features that admit undefined values.
undef_features = [i for i, feature in enumerate(X.T) if -999 in feature]
X = np.delete(X, undef_features, axis=1)

clf = LogisticRegression(solver='newton-cg', max_iter=ITERATIONS).fit(X, y)
print(cross_validate(clf, X, y, scoring=['accuracy', 'precision']))
import numpy as np
from implementations import ridge_regression
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from data_processing import process_data, build_poly

print("Loading data\n")
# Loading data from csv files
y_tr, tx_tr, ids_tr = load_csv_data("data/train.csv")
y_te, tx_te, ids_te = load_csv_data("data/test.csv")

# Hyper-parameter definitions
degree = 7
lambda_ = 0.00025

# Preprocessing data: cleaning, standardizing and adding a constant column
tx_tr, tx_te = process_data(tx_tr, tx_te, y_tr, y_te)

# Feature augmentation through polynomials
tx_tr = build_poly(tx_tr, degree)
tx_te = build_poly(tx_te, degree)

# Training with ridge regression
print("Training the model\n")
weights, _ = ridge_regression(y_tr, tx_tr, lambda_)

# Computing the prediction vector
y_pred = predict_labels(weights, tx_te)

# Creating file for submission
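# Hedged sketch: build_poly performs the polynomial feature augmentation
# mentioned above. The common course-lab implementation (powers 1..degree of
# each column, plus a constant column) looks like this; the project's version
# in data_processing may differ:
import numpy as np

def build_poly(x, degree):
    """Augment x with polynomial features up to the given degree."""
    poly = np.ones((x.shape[0], 1))  # constant column
    for deg in range(1, degree + 1):
        poly = np.c_[poly, x ** deg]
    return poly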
from feature_selection import compute_log, compute_theta, compute_physics
from correction_rate import cross_validation, print_score
from evaluation import predict_regression_labels
from costs import sigmoid

# edit if train.csv and test.csv are not in ../data/
dat_dir = '../data/'

############################
## Training
############################
print('training started')

# load the training set
print('loading the training dataset...')
y_train_pre, tx_train, ids_train = load_csv_data(dat_dir + "train.csv", sub_sample=False)
print('data loaded...')
y_train = y_train_pre.reshape(y_train_pre.shape[0], 1)

# construct the features using log()
index_log = [0, 1, 2, 4, 5, 6, 7, 9, 10, 12, 16, 21, 23, 24, 25, 26, 27, 28, 29]
tx_log, mean_log, std_log = compute_log(tx_train, index_log)

# construct the features using cosine()
index_theta = [14, 15, 17, 18, 20]
tx_theta, mean_theta, std_theta = compute_theta(tx_train, index_theta)

# construct the features with physics meanings:
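# Hedged sketch: compute_log (from feature_selection) is assumed to
# log-transform the listed strictly positive features and standardize them,
# returning the statistics so the test set can reuse them. The internals
# below (log1p of absolute values) are an assumption, not the project's code:
import numpy as np

def compute_log(tx, index_log):
    """Log-transform and standardize the columns listed in index_log."""
    tx_log = np.log1p(np.abs(tx[:, index_log]))  # log(1 + |x|) keeps all entries finite
    mean_log = tx_log.mean(axis=0)
    std_log = tx_log.std(axis=0)
    return (tx_log - mean_log) / std_log, mean_log, std_log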
def predict(w, x_test, small=-1, big=1):
    '''Returns the prediction for x_test with w, for two values small and big.
    The prediction is done by choosing the nearest value.'''
    y_pred = x_test @ w
    sep_val = (small + big) / 2
    y_pred[y_pred < sep_val] = small
    y_pred[y_pred >= sep_val] = big
    return y_pred


# ------------------------------- BEGINNING -------------------------------
print('Reading data')
yb_full, input_data_full, ids_full = load_csv_data('data/train.csv')
yb_test, input_data_test, ids_test = load_csv_data('data/test.csv')

# Shuffling the data a bit to get a subsample that is picked at random
np.random.seed(16)
per = np.random.permutation(250000)
# Picking subsamples
yb, input_data, ids = yb_full[per][::10], input_data_full[per][::10], ids_full[per][::10]
print('Data read')

print('Treating data')
# Separating each np.array into 4 sub-arrays by category (number of jets, i.e. column 22)
input_data_by_22, ids_by_22, yb_by_22 = separate_by_col22(input_data, ids, yb)
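# Hedged sketch: separate_by_col22 is assumed to split each array into four
# sub-arrays according to the jet-number category in column 22, matching the
# return order used above; the actual helper may differ:
import numpy as np

def separate_by_col22(input_data, ids, yb):
    """Split (input_data, ids, yb) into per-jet-number sub-arrays."""
    masks = [input_data[:, 22] == jet for jet in range(4)]
    input_by_22 = [input_data[m] for m in masks]
    ids_by_22 = [ids[m] for m in masks]
    yb_by_22 = [yb[m] for m in masks]
    return input_by_22, ids_by_22, yb_by_22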
import numpy as np
import proj1_helpers as helper
import data_preprocessing as preprocess
import multi_models_splitter as multi
import implementations as imp
from cross_validation import k_fold_cross_validation
import os

# Load training data
y_train, tx_train, ids_train = helper.load_csv_data('../all/train.csv')
# Load test data
_, tx_test, ids = helper.load_csv_data('../all/test.csv')

# Seed the random number generator with a fixed value for consistent results
np.random.seed(20181028)

# Parameters
degrees = [3, 5, 6, 8, 10, 12]
lambdas = np.logspace(-9, 0, 10)
k_cross_val = [5]

# Best results
best_pred_score = 0.0
best_weights = 0
best_tx = 0

# Best parameters
best_degree = 0
best_lambda = 0.0
best_k = 0
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
import numpy as np
from datetime import datetime
from created_helpers import *

print("loading data")
y_train, x_train, ids_train = load_csv_data("train.csv")
y_test, x_test, ids_test = load_csv_data("test.csv")


# same ridge_regression as in implementations.py, but returning only the
# weights (no loss)
def ridge_regression(y, tx, lambda_):
    """implement ridge regression."""
    aI = 2 * tx.shape[0] * lambda_ * np.identity(tx.shape[1])
    a = tx.T.dot(tx) + aI
    b = tx.T.dot(y)
    return np.linalg.solve(a, b)


# cross_validation code taken from lab
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """return the loss of ridge regression."""
    # get k'th subgroup in test, others in train
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    x_te = x[te_indice]
    x_tr = x[tr_indice]
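# Hedged sketch: the k_indices argument above is typically produced by the
# course-lab helper build_k_indices (shuffled row indices reshaped into
# k folds), which would pair with cross_validation like this:
import numpy as np

def build_k_indices(y, k_fold, seed):
    """Build k_fold groups of shuffled row indices for cross validation."""
    num_row = y.shape[0]
    interval = num_row // k_fold
    np.random.seed(seed)
    indices = np.random.permutation(num_row)
    return np.array([indices[k * interval:(k + 1) * interval] for k in range(k_fold)])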
COMBINED_DEGREES = params['COMBINED_DEGREES']
SIMPLE_DEGREES = params['SIMPLE_DEGREES']
TAN_HYP_DEGREES = params['TAN_HYP_DEGREES']
INVERSE_LOG_DEGREES = params['INVERSE_LOG_DEGREES']
ROOT_DEGREES = params['ROOT_DEGREES']
NUM_SETS = params['NUM_SETS']
DATA_TRAIN_PATH = params['DATA_TRAIN_PATH']
DATA_TEST_PATH = params['DATA_TEST_PATH']
OUTPUT_PATH = params['OUTPUT_PATH']
CACHE = params['CACHE']
LAMBDAS = params['lambdas']

#########
# Load CSV
#########
print("Loading CSV")
y, tX_train, _ = load_csv_data(DATA_TRAIN_PATH)
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

#########
# Preprocess
#########
print("Preprocessing")
(XS_TRAIN, MASKS_TRAIN) = preprocessing(tX_train)
(XS_TEST, MASKS_TEST) = preprocessing(tX_test)

# placeholder for submission
y_submission = np.zeros(tX_test.shape[0])

# compute for each subset of PRI_JET_NUM
for i in range(NUM_SETS):
import numpy as np
from proj1_helpers import (create_csv_submission, predict_labels, load_csv_data)
from implementations import (least_squares_GD, least_squares_SGD, least_squares,
                             ridge_regression, logistic_regression, normalize_data,
                             delete_missing_values, replace_data,
                             reg_logistic_regression)

# load data from train set
y, tX, ids = load_csv_data("train.csv")

# change [-1, 1] labels to [0, 1]
y = y / 2 + 0.5

N, d = tX.shape
# initial weights randomly generated
w0 = 10 * np.random.rand(d + 1, 1)

# replace -999 values with the mean of the other ones
tX = replace_data(tX)
# normalize data to std 1 and 0 mean
tX = normalize_data(tX)

w, L = reg_logistic_regression(y, tX,
                               lambda_=0.001,
                               initial_w=w0,
                               max_iters=10,
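# Hedged sketch: replace_data and normalize_data are imported from
# implementations in the snippet above. Minimal versions consistent with
# their comments (mean imputation of -999 values, then zero-mean/unit-std
# scaling); the project's own implementations may differ:
import numpy as np

def replace_data(tx):
    """Replace -999 entries in each column by the column mean of valid entries."""
    tx = tx.copy()
    for j in range(tx.shape[1]):
        col = tx[:, j]
        valid = col != -999
        if valid.any():
            col[~valid] = col[valid].mean()
    return tx

def normalize_data(tx):
    """Scale each feature to zero mean and unit standard deviation."""
    return (tx - tx.mean(axis=0)) / tx.std(axis=0)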
import proj1_helpers
import implementations
import numpy as np

print("Extracting dataset")
y_train, X_train, id1 = proj1_helpers.load_csv_data("train.csv", True)
y_test, X_test, id2 = proj1_helpers.load_csv_data("test.csv", True)
print(X_train.shape)

batch_size = 128
print("Splitting dataset into batches")
X_batch = np.array_split(X_train, int(X_train.shape[0] / batch_size))
y_batch = np.array_split(y_train, int(y_train.shape[0] / batch_size))
print(X_batch[0].shape)

# fit ridge regression on the first batch
w, _ = implementations.ridge_regression(y_batch[0], X_batch[0], 1)
print(w)

y_pred = proj1_helpers.predict_labels(w, X_test)
s = 0
tot = 0
for i, y in enumerate(y_pred):
    if y == y_test[i]:
        s += 1
    tot += 1
print(s / tot)
""" run.py is used to launch the application of weights on a test dataset and serialize the results. """ def load_npy(*npy_paths): """ Returns numpy arrays serialized at npy_paths. Args: npy_paths : a sequence of serialized np.arrays files paths. Returns: Deserialized numpy arrays """ return (np.load(p) for p in npy_paths) # Load the test dataset _, test_data, test_ids, _ = load_csv_data('all/test.csv') # Load the weights, feature masks and parameters (mean, std_dev) weights, clean_features, parameters = load_npy('all/weights.npy', 'all/clean_features.npy', 'all/parameters.npy') # Runs the weights against the test dataset pri_jet_num_idx = 22 polynomial_degree = 3 predictions = model_predictions(test_data, weights, pri_jet_num_idx, clean_features, parameters, polynomial_degree) create_csv_submission(test_ids, predictions, 'all/predictions.csv')
def get_data(use_preexisting=True,
             save_preprocessed=True,
             z_outlier=False,
             feature_expansion=False,
             correlation_analysis=False,
             class_equalizer=False,
             M=4,
             z_value=3.0):
    """
    Data supplying function.

    This function has the purpose of loading data and applying preprocessing.
    It includes many features such as downloading the data from the github
    repository, saving the data (for fast reuse), applying different
    preprocessing algorithms, etc.

    Args:
        use_preexisting (bool): if existent, enabling this parameter will allow the function to use previously preprocessed and saved data files
        save_preprocessed (bool): enabling this parameter will allow the function to save the preprocessed data
        z_outlier (Union[int, bool]): enabling this parameter will allow the function to perform z outlier detection
        feature_expansion (bool): enabling this parameter will allow the function to perform exponential feature expansion
        correlation_analysis (Union[int, bool]): enabling this parameter will allow the function to perform correlation analysis and remove highly correlated features
        class_equalizer (Union[int, bool]): enabling this parameter will allow the function to perform class balancing
        M (Union[int, list]): feature expansion parameter per group
        z_value (Union[float, list]): outlier detection threshold per group

    Returns:
        list: groups of training samples
        list: corresponding groups of training labels
        list: corresponding indexes of affiliated training rows
        list: groups of test samples
        list: corresponding groups of test labels
        list: corresponding indexes of affiliated test rows
        list: list of indexes of testing (for creating submissions)
    """
    if os.path.isdir(config.DATA_PATH) and os.path.isdir(
            config.PREPROCESSED_PATH) and use_preexisting:
        print("[*] Using previously preprocessed Data")
        groups_tr_X = np.load(config.PREPROCESSED_X_TR_GROUPS_NPY, allow_pickle=True)
        groups_tr_Y = np.load(config.PREPROCESSED_Y_TR_GROUPS_NPY, allow_pickle=True)
        indc_list_tr = np.load(config.PREPROCESSED_GROUP_INDEX_TR_NPY, allow_pickle=True)
        groups_te_X = np.load(config.PREPROCESSED_X_TE_GROUPS_NPY, allow_pickle=True)
        groups_te_Y = np.load(config.PREPROCESSED_Y_TE_GROUPS_NPY, allow_pickle=True)
        indc_list_te = np.load(config.PREPROCESSED_GROUP_INDEX_TE_NPY, allow_pickle=True)
        ids_te = np.load(config.PREPROCESSED_IDS_TE_GROUPS_NPY, allow_pickle=True)
    else:
        if not (os.path.isdir(config.DATA_PATH)
                and os.path.isfile(config.TRAIN_DATA_CSV_PATH)
                and os.path.isfile(config.TEST_DATA_CSV_PATH)):
            Path(config.DATA_PATH).mkdir(exist_ok=True)
            download_url(config.TRAIN_URL, config.TRAIN_DATA_CSV_PATH)
            download_url(config.TEST_URL, config.TEST_DATA_CSV_PATH)
        print("[*] Creating preprocessed Data")

        # load data from csv files
        Y_tr, X_tr, ids_tr = load_csv_data(config.TRAIN_DATA_CSV_PATH)
        Y_te, X_te, ids_te = load_csv_data(config.TEST_DATA_CSV_PATH)

        groups_tr_Y, groups_tr_X, indc_list_tr = split_groups(Y_tr, X_tr)
        groups_te_Y, groups_te_X, indc_list_te = split_groups(Y_te, X_te)
        nr_groups_tr = len(indc_list_tr)

        # make to lists
        z_outlier = make_to_list(z_outlier)
        class_equalizer = make_to_list(class_equalizer)
        correlation_analysis = make_to_list(correlation_analysis)
        M = make_to_list(M)

        for indx in range(nr_groups_tr):
            # perform z outlier detection
            if z_outlier[indx]:
                groups_tr_X[indx] = z_score_outlier_detection(
                    groups_tr_X[indx], thresh=z_value)
                groups_te_X[indx] = z_score_outlier_detection(
                    groups_te_X[indx], thresh=z_value)
            # perform correlation analysis
            if correlation_analysis[indx]:
                groups_tr_X[indx], columns_to_keep = corr_filter(
                    groups_tr_X[indx], threshold=0.95)
                groups_te_X[indx] = groups_te_X[indx][:, columns_to_keep]
            # perform class equalization
            if class_equalizer[indx]:
                groups_tr_X[indx], groups_tr_Y[indx] = class_imbalance_equalizer(
                    groups_tr_X[indx], groups_tr_Y[indx])
            # perform feature expansion
            if feature_expansion:
                groups_tr_X[indx] = augment_features_polynomial(
                    groups_tr_X[indx], M=M[indx])
                groups_te_X[indx] = augment_features_polynomial(
                    groups_te_X[indx], M=M[indx])
            # standardize features
            groups_tr_X[indx] = standardize(groups_tr_X[indx])
            groups_te_X[indx] = standardize(groups_te_X[indx])
            # add bias
            groups_tr_X[indx] = add_bias(groups_tr_X[indx])
            groups_te_X[indx] = add_bias(groups_te_X[indx])
            print(f"\t[+] Group {indx + 1} finished!")

        if save_preprocessed:
            Path(config.PREPROCESSED_PATH).mkdir(exist_ok=True)
            np.save(config.PREPROCESSED_X_TR_GROUPS_NPY, groups_tr_X, allow_pickle=True)
            np.save(config.PREPROCESSED_Y_TR_GROUPS_NPY, groups_tr_Y, allow_pickle=True)
            np.save(config.PREPROCESSED_X_TE_GROUPS_NPY, groups_te_X, allow_pickle=True)
            np.save(config.PREPROCESSED_Y_TE_GROUPS_NPY, groups_te_Y, allow_pickle=True)
            np.save(config.PREPROCESSED_GROUP_INDEX_TR_NPY, indc_list_tr, allow_pickle=True)
            np.save(config.PREPROCESSED_GROUP_INDEX_TE_NPY, indc_list_te, allow_pickle=True)
            np.save(config.PREPROCESSED_IDS_TE_GROUPS_NPY, ids_te, allow_pickle=True)
            print("[+] Saved Preprocessed Data")

    return groups_tr_X, groups_tr_Y, indc_list_tr, groups_te_X, groups_te_Y, indc_list_te, ids_te
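# Hedged sketch: make_to_list is assumed to broadcast a scalar option into a
# per-group list so the per-index loop in get_data can treat all options
# uniformly (the docstring types are Union[int, bool] / Union[int, list]).
# A minimal version for the four jet groups:
def make_to_list(param, n_groups=4):
    """Return param unchanged if it is already a list, else repeat it n_groups times."""
    return param if isinstance(param, list) else [param] * n_groups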
from data_utils import feature_transform, standardise, standardise_to_fixed
from implementation_variants import logistic_regression_mean

cwd = path.dirname(__file__)

SEED = 42
DATA_PATH = '../data/'

# Training hyperparameters (obtained through procedure in Run.ipynb)
MAX_ITERS = 50000
GAMMA = 0.01
THRESHOLD = 1e-7

if __name__ == "__main__":
    # Load train data
    y_train, x_train, _ = load_csv_data(path.join(DATA_PATH, 'train.csv'))

    # Apply feature transform
    fx_train = feature_transform(x_train)

    # Standardise to mean and s.d.
    fx_train, mu_train, sigma_train = standardise(fx_train)

    # Add offset term
    tx_train = np.c_[np.ones(len(y_train)), fx_train]

    # Initialise training
    w_initial = np.ones(tx_train.shape[1])

    # Run gradient descent
    w, loss = logistic_regression_mean(y_train,
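# Hedged sketch: standardise (imported from data_utils in the snippet above)
# returns the standardized matrix plus the mean and s.d. it used, so that
# standardise_to_fixed can apply the same statistics to the test set. A
# minimal version under that assumption:
import numpy as np

def standardise(x):
    """Standardize x column-wise; return (standardized x, mean, std)."""
    mu = x.mean(axis=0)
    sigma = x.std(axis=0)
    return (x - mu) / sigma, mu, sigma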
import numpy as np
import preprocessing as prep
import feature_engineering as f_e
import local_prediction as pred
import proj1_helpers as helpers
import params

if __name__ == '__main__':
    # Training set preprocessing and feature engineering
    print('Train set:')
    y, tX, ids = helpers.load_csv_data(params.DATA_TRAIN_PATH)
    y_preprocessed, tX_preprocessed, ids_preprocessed, masks, counts = prep.preprocess(
        y, tX, ids)
    tX_improved = f_e.feature_engineer(tX_preprocessed)

    # In case we want to test our model locally by splitting our data
    if params.LOCAL_PREDICTION:
        pred.locally_predict(tX_improved, y_preprocessed, counts)
    else:
        print('Test set:')
        y_test, tX_test, ids_test = helpers.load_csv_data(params.DATA_TEST_PATH)
        y_test_preprocessed, tX_test_preprocessed, ids_test_preprocessed, masks_test, counts_test = prep.preprocess(
            y_test, tX_test, ids_test)
        tX_test_improved = f_e.feature_engineer(tX_test_preprocessed)

        log_initial_ws = []
        for i in range(len(tX_test_improved)):
            log_initial_ws.append(np.repeat(0, tX_test_improved[i].shape[1]))

        optimal_ws = pred.find_optimal_ws_grouped(
            tX_improved, y_preprocessed, params.IMPLEMENTATION, log_initial_ws,
                accuracies.append(
                    pred.locally_predict(tX, y, counts[group_number],
                                         implementation=2,
                                         group=False,
                                         max_iter=max_iter,
                                         gamma=gamma,
                                         log_lambda=log_lambda))
    argmax = np.flip(np.argsort(accuracies), axis=0)[0]
    print(max_iter_range[argmax], gamma_range[argmax],
          log_lambda_range[argmax], accuracies[argmax])


if __name__ == "__main__":
    y, tX, ids = helpers.load_csv_data(params.DATA_TRAIN_PATH)
    for replace_unwanted_value in true_false:
        for std in true_false:
            print('\t\treplace_unwanted_value = {}'.format(replace_unwanted_value))
            print('\t\tstd = {}'.format(std))
            y_grouped_preprocessed, tX_grouped_preprocessed, ids_grouped_preprocessed, masks, counts = \
                prep.preprocess(y, tX, ids, std=std,
                                replace_unwanted_value=replace_unwanted_value)
            for ones_column in true_false:
                for feature_multiplication in true_false:
                    print('\t\tones_column = {}'.format(ones_column))
                    print('\t\tfeature_multiplication = {}'.format(feature_multiplication))
                    tX_improved = f_e.feature_engineer(
                        tX_grouped_preprocessed[0], group=False,
import numpy as np
from proj1_helpers import load_csv_data
from implementations import least_squares_SGD

yb, input_data, ids = load_csv_data('all/train.csv', step=50)
losses, w = least_squares_SGD(yb, input_data,
                              initial_w=np.zeros(30),
                              batch_size=1,
                              max_iters=30,
                              gamma=0.5)

with open('output.txt', 'w') as fp:
    print(yb, input_data, losses, w, file=fp)
# Simply a copy of the Test Set Prediction notebook.
# It takes nearly 20 seconds; be patient.
import numpy as np
from cross_validation import cross_validation
from polynomial import build_poly
from implementations import ridge_regression
from proj1_helpers import load_csv_data
from proj1_helpers import predict_labels
from proj1_helpers import create_csv_submission

USE_PRETRAINED_WEIGHTS = False

# Read the test set (this project's load_csv_data variant also returns the column names)
test_set = load_csv_data('../data/test.csv')
y_test, X_test, ids, columns = test_set

# Selected columns
selected_features = np.array([1, 3, 9, 10, 11, 13, 21, 22, 23])
selected_features = np.sort(np.append(selected_features, [0, 4, 5, 6, 12]))

# Log-transformed columns
log_transformed_columns = [3, 9, 10, 13, 21]
for i in log_transformed_columns:
    X_test[np.where(X_test[:, i] != -999), i] = np.log(X_test[np.where(X_test[:, i] != -999), i] + 1)
    X_test[np.where(X_test[:, i] == -999), i] = -999  # keep undefined entries as -999

# Select the correct features
[email protected]
"""
import os
import numpy as np
from proj1_helpers import load_csv_data
from cross_validation import optimize_model
#from tests.test_helpers import load_data
from helpers import _standardize

# Resets cpu core task affinity
os.system("taskset -p 0xff %d" % os.getpid())

# ## Load the training data into feature matrix, class labels, and event ids:
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

# ## Our code
# # split categorical variable (23)
# tX = np.vstack((tX.T, (tX[:, 22] == 0).astype(int))).T
# tX = np.vstack((tX.T, (tX[:, 22] == 1).astype(int))).T
# tX = np.vstack((tX.T, (tX[:, 22] == 2).astype(int))).T
# tX = np.vstack((tX.T, (tX[:, 22] == 3).astype(int))).T
# tX.shape
#
# model1_datapoints_no = tX.shape[0] - sum(tX[:, 4] == -999)
# tX1 = tX[tX[:, 4] != -999]
# tX1 = np.delete(tX1, 0, axis=1)
# y1 = y[tX[:, 4] != -999]
# (tX1.shape, y1.shape)
                          load_csv_data, predict_01_labels)
from implementations import (
    replace_data, normalize_data, remove_outliers, oversample, one_hot_encode,
    polynomial_expansion, least_squares_GD, least_squares_SGD, least_squares,
    ridge_regression, logistic_regression, reg_logistic_regression,
    cross_validation_OLS, cross_validation_SGD, cross_validation_RR,
    cross_validation_LR, cross_validation_RLR_gamma, cross_validation_RLR_lambda)

##########################################################################
#### Loading data
##########################################################################

# load data from train set
_y_train, _tX_train, ids_train = load_csv_data("train.csv")

# change [-1, 1] labels to [0, 1]
y_train = _y_train / 2 + 0.5

##########################################################################
#### Data pre-processing
##########################################################################

# replace -999 values with the mean of the other ones
tX_train = replace_data(_tX_train)

# Get the one-hot-encoded columns for later
one_hot_columns = one_hot_encode(tX_train, 22)

# normalize data to std 1 and 0 mean
import numpy as np
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission
from implementations import ridge_regression
from helpers import build_poly, build_k_indices, normalize

DATA_PATH = '../data/'
lambda_ = 1e-20
degree = 13
seed = 12
k_fold = 7

# We work with the training data in this notebook
y, x, ids = load_csv_data(DATA_PATH + 'train.csv')
x, col_mean, xmin, xmax = normalize(x)


def cross_validation(y, x, k_indices, k, lambda_, degree):
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te, y_tr = y[te_indice], y[tr_indice]
    x_te, x_tr = x[te_indice], x[tr_indice]
    tx_tr = build_poly(x_tr, degree)
    tx_te = build_poly(x_te, degree)
    w, _ = ridge_regression(y_tr, tx_tr, lambda_)
    y_tr_pred = predict_labels(w, tx_tr)