def logistic_trials(y, tx, tx_sub, degree_range, partitions=2):
    ## Split data into test and training sets
    ## If partitions > 2, use k-fold cross-validation
    glob_tx_tr, glob_tx_te, glob_y_tr, glob_y_te = split_data(tx, y, 0.8)
    ## Result accumulators: models, (test) losses, accuracies and predictions
    models = []
    losses = []
    accuracies = []
    predictions = []
    ## Loop over the range of degrees
    degrees = range(degree_range[0], degree_range[1])
    for degree in degrees:
        print("Trying degree", degree, ":")
        tx_tr, tx_te, tx_pred = expand(degree, glob_tx_tr, glob_tx_te, tx_sub)
        initial_w = np.ones(tx_tr.shape[1])
        w, loss = logistic_regression(glob_y_tr, tx_tr, initial_w, MAX_ITERS, GAMMA)
        print("\tTraining Loss = ", loss)
        y_test = predict_labels(w, tx_te)
        test_loss = compute_loss(glob_y_te, tx_te, w, func="logistic")
        accuracy = compute_accuracy((y_test + 1) / 2, glob_y_te)
        y_pred = predict_labels(w, tx_pred)
        print("\tTest Loss = ", test_loss, " Test Accuracy = ", accuracy)
        models.append(("logistic_SGD", degree, w))
        losses.append(test_loss)
        accuracies.append(accuracy)
        predictions.append(y_pred)
    return models, losses, accuracies, predictions
def cross_validation(y, x, k_indices, k, regression_method, **args):
    """Completes k-fold cross-validation using the regression method passed as argument."""
    # get k'th subgroup in test, others in train
    msk_test = k_indices[k]
    msk_train = np.delete(k_indices, (k), axis=0).ravel()
    x_train = x[msk_train, :]
    x_test = x[msk_test, :]
    y_train = y[msk_train]
    y_test = y[msk_test]
    # data pre-processing
    x_train, x_test = process_data(x_train, x_test, True)
    # compute weights using given method
    weights, loss = regression_method(y=y_train, tx=x_train, **args)
    # predict output for train and test data
    y_train_pred = predict_labels(weights, x_train)
    y_test_pred = predict_labels(weights, x_test)
    # compute accuracy for train and test data
    acc_train = compute_accuracy(y_train_pred, y_train)
    acc_test = compute_accuracy(y_test_pred, y_test)
    return acc_train, acc_test
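# Usage sketch (an addition, not part of the original code): drive the generic
# cross_validation above over all folds and average the test accuracies. It
# assumes build_k_indices (used by the *_OLS/_SGD/_RR helpers below) and
# ridge_regression are in scope; the fold count and lambda_ are illustrative.
import numpy as np

def mean_cv_accuracy(y, x, k_fold=4, seed=1, lambda_=1e-6):
    k_indices = build_k_indices(y, k_fold, seed)
    acc_tests = [cross_validation(y, x, k_indices, k, ridge_regression,
                                  lambda_=lambda_)[1]
                 for k in range(k_fold)]
    return np.mean(acc_tests)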
def generate_prediction(x_tr_0, y_tr_0, x_tr_1, y_tr_1, x_tr_2, y_tr_2,
                        x_tr_3, y_tr_3, x_te_0, x_te_1, x_te_2, x_te_3,
                        jet_num_te):
    """Generate a prediction for a test dataset already split according to
    jet_num by calculating weights using a training dataset also already split."""
    # compute the weights using predetermined polynomial degrees
    w_0, _ = least_squares(y_tr_0, build_poly(x_tr_0, 9))
    w_1, _ = least_squares(y_tr_1, build_poly(x_tr_1, 15))
    w_2, _ = least_squares(y_tr_2, build_poly(x_tr_2, 13))
    w_3, _ = least_squares(y_tr_3, build_poly(x_tr_3, 12))
    # compute the predictions using the weights
    y_te_0 = predict_labels(w_0, build_poly(x_te_0, 9))
    y_te_1 = predict_labels(w_1, build_poly(x_te_1, 15))
    y_te_2 = predict_labels(w_2, build_poly(x_te_2, 13))
    y_te_3 = predict_labels(w_3, build_poly(x_te_3, 12))
    # join the four predictions into a single one matching the original indices
    predicted_y_te = []
    i_0, i_1, i_2, i_3 = 0, 0, 0, 0
    for jet_num in jet_num_te:
        if jet_num == 0:
            predicted_y_te.append(y_te_0[i_0])
            i_0 += 1
        elif jet_num == 1:
            predicted_y_te.append(y_te_1[i_1])
            i_1 += 1
        elif jet_num == 2:
            predicted_y_te.append(y_te_2[i_2])
            i_2 += 1
        else:
            predicted_y_te.append(y_te_3[i_3])
            i_3 += 1
    return predicted_y_te
def cross_validation_lr(y, x, k_indices, k, gamma, lambda_, max_iters, degree):
    """
    Return the classification error of the logistic regression for each step
    of the k-fold cross-validation.

    @param y : raw output variable
    @param x : raw input variable; might be a polynomial basis obtained from the input x
    @param k_indices : the indices of the data that belong to each of the K groups of the cross-validation
    @param k : the index of the group that we are using for the testing
    @param gamma : the step size with which we're doing the cross-validation
    @param lambda_ : the penalization parameter we're working on
    @param max_iters : the maximum number of iterations of the logistic regression
    @param degree : the degree of the polynomial basis with which we're doing the cross-validation
    @return loss_tr : the classification error made on the training data
    @return loss_te : the classification error made on the testing data
    """
    # 1. WE DIVIDE THE DATA INTO SUBGROUPS
    # get k'th subgroup in test, others in train:
    x_test = np.array(x[k_indices[k - 1]])
    y_test = np.array(y[k_indices[k - 1]])
    x_train = np.empty((0, x.shape[1]))
    y_train = np.empty((0, 1))
    # this loop gathers the other groups
    for k_iter, validation_points in enumerate(k_indices):
        if k_iter != k - 1:
            x_train = np.append(x_train, x[validation_points], axis=0)
            y_train = np.append(y_train, y[validation_points])

    # 2. WE FORMAT THE DATA
    # we sanitize and standardize our training data here, and apply the same
    # median, mean and variance to the testing data
    x_train = count_NaN(x_train)
    x_test = count_NaN(x_test)
    x_train, median_train = sanitize_NaN(x_train)
    x_test, median_test = sanitize_NaN(x_test, median_train)
    x_train, mean_tr, std_tr = standardize(x_train)
    x_test, mean_te, std_te = standardize(x_test, mean_tr, std_tr)
    # form data with polynomial degree:
    x_train_poly = build_poly(x_train, degree)
    x_test_poly = build_poly(x_test, degree)
    # print('Shape of polynomial training data:', x_train_poly.shape)

    # 3. WE RUN THE MODEL AND COMPUTE THE ERROR
    # regularized logistic regression:
    w_rlr = regularized_logistic_regression(y_train, x_train_poly, gamma,
                                            lambda_, max_iters)
    # calculate the classification error for train and test data:
    loss_tr = sum(abs((2 * y_train - 1) - predict_labels(w_rlr, x_train_poly))) / (2 * len(y_train))
    loss_te = sum(abs((2 * y_test - 1) - predict_labels(w_rlr, x_test_poly))) / (2 * len(y_test))
    return loss_tr, loss_te
def cross_validation_ridge_regression(y, x, k_indices, k, lambdas, degrees):
    """
    Completes k-fold cross-validation using the ridge regression method.
    Here, we build polynomial features and create four subsets using the jet feature.
    """
    # get k'th subgroup in test, others in train
    msk_test = k_indices[k]
    msk_train = np.delete(k_indices, (k), axis=0).ravel()
    x_train_all_jets = x[msk_train, :]
    x_test_all_jets = x[msk_test, :]
    y_train_all_jets = y[msk_train]
    y_test_all_jets = y[msk_test]
    # split the training set into 4 subsets
    msk_jets_train = get_jet_masks(x_train_all_jets)
    msk_jets_test = get_jet_masks(x_test_all_jets)
    # initialize output vectors
    y_train_pred = np.zeros(len(y_train_all_jets))
    y_test_pred = np.zeros(len(y_test_all_jets))

    for idx in range(len(msk_jets_train)):
        x_train = x_train_all_jets[msk_jets_train[idx]]
        x_test = x_test_all_jets[msk_jets_test[idx]]
        y_train = y_train_all_jets[msk_jets_train[idx]]
        # data pre-processing
        x_train, x_test = process_data(x_train, x_test, False)
        phi_train = build_poly(x_train, degrees[idx])
        phi_test = build_poly(x_test, degrees[idx])
        phi_train = add_constant_column(phi_train)
        phi_test = add_constant_column(phi_test)
        # compute weights using given method
        weights, loss = ridge_regression(y=y_train, tx=phi_train, lambda_=lambdas[idx])
        y_train_pred[msk_jets_train[idx]] = predict_labels(weights, phi_train)
        y_test_pred[msk_jets_test[idx]] = predict_labels(weights, phi_test)

    # compute accuracy for train and test data
    acc_train = compute_accuracy(y_train_pred, y_train_all_jets)
    acc_test = compute_accuracy(y_test_pred, y_test_all_jets)
    return acc_train, acc_test
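# Hedged example (not from the original code) of evaluating one per-jet
# hyperparameter configuration with cross_validation_ridge_regression above.
# The four lambdas and degrees are placeholders, and build_k_indices is
# assumed available as elsewhere in this file.
import numpy as np

def evaluate_jet_config(y, x, lambdas, degrees, k_fold=4, seed=1):
    k_indices = build_k_indices(y, k_fold, seed)
    accs = [cross_validation_ridge_regression(y, x, k_indices, k,
                                              lambdas, degrees)[1]
            for k in range(k_fold)]
    return np.mean(accs)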
def calculate_f1(x, y, w):
    """Compute the F1 score of the found model."""
    y_sol = np.copy(y)
    y_pred = predict_labels(w, x)
    nP = 2  # number of predicted classes
    nS = 2  # number of solution (true) classes
    P = np.zeros((nP, nS))   # precision of prediction j w.r.t. class i
    R = np.zeros((nP, nS))   # recall of prediction j w.r.t. class i
    F1 = np.zeros((nP, nS))
    M = len(y_sol)
    F1_overall = 0
    # map labels from {-1, 1} to {0, 1}
    y_sol[np.where(y_sol == -1)] = 0
    y_pred[np.where(y_pred == -1)] = 0
    for i in range(nS):
        ci = sum(y_sol == i)  # number of samples truly in class i
        for j in range(nP):
            true_value = 0
            for m in range(M):
                if y_sol[m] == i and y_pred[m] == j:
                    true_value = true_value + 1
            kj = sum(y_pred == j)  # number of samples predicted as class j
            if kj != 0:
                P[j, i] = true_value / kj
            if ci != 0:
                R[j, i] = true_value / ci
            if R[j, i] + P[j, i] != 0:
                F1[j, i] = (2 * R[j, i] * P[j, i]) / (R[j, i] + P[j, i])
        # weight each class's best F1 by the class frequency
        F1_overall = F1_overall + ci / M * max(F1[:, i])
    return F1_overall * 100
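# Minimal sanity check for calculate_f1 (an addition, with synthetic data),
# assuming this project's predict_labels thresholds x @ w at zero and returns
# labels in {-1, 1}, consistent with the remapping inside the function.
import numpy as np

rng = np.random.default_rng(0)
x_demo = rng.standard_normal((100, 3))
w_demo = np.array([1.0, -0.5, 0.25])
y_demo = np.sign(x_demo @ w_demo)  # labels perfectly separable by w_demo
print(calculate_f1(x_demo, y_demo, w_demo))  # should print ~100.0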
def cross_validation_SGD(X, y, k_fold=4, seed=1):
    """Returns the mean accuracy based on k_fold cross-validation."""
    all_indices = build_k_indices(y, k_fold, seed)
    # Grid search over gamma (the learning rate)
    gamma = np.logspace(-6, -3, 10)
    accuracy = np.zeros((k_fold, len(gamma)))
    for k in range(k_fold):
        test_indices = all_indices[k]
        train_indices = np.setdiff1d(range(len(y)), test_indices)
        y_test = y[test_indices]
        X_test = X[test_indices]
        y_train = y[train_indices]
        X_train = X[train_indices]
        for j in range(len(gamma)):
            print(gamma[j])
            # max_iters = len(y_train) corresponds to one 'epoch' of SGD
            w, loss_tr = least_squares_SGD(y=y_train, tx=X_train,
                                           initial_w=np.random.random(size=X_train.shape[1]) * 0.01,
                                           max_iters=len(y_train),
                                           gamma=gamma[j], verbose=False)
            prediction = predict_labels(w, X_test, 0.0)
            accuracy[k, j] = len(np.where(y_test == prediction)[0]) / len(y_test) * 100
    return np.hstack((gamma.reshape(-1, 1), np.mean(accuracy, axis=0).reshape(-1, 1)))
def cross_validation_RLR(X, y, k_fold=4, seed=1):
    """Returns the mean accuracy based on k_fold cross-validation."""
    all_indices = build_k_indices(y, k_fold, seed)
    # Grid search over gamma (learning rate) and lambda_ (regularisation)
    gamma = np.logspace(-6, -3, 2)
    lambda_ = np.logspace(-6, -3, 5)
    accuracy = np.zeros((k_fold, len(gamma), len(lambda_)))
    for k in range(k_fold):
        test_indices = all_indices[k]
        train_indices = np.setdiff1d(range(len(y)), test_indices)
        y_test = y[test_indices]
        X_test = X[test_indices]
        y_train = y[train_indices]
        X_train = X[train_indices]
        for j in range(len(gamma)):
            print(gamma[j])
            for l in range(len(lambda_)):
                # max_iters = len(y_train) corresponds to one 'epoch'
                w, loss_tr = reg_logistic_regression(y=y_train, tx=X_train,
                                                     lambda_=lambda_[l],
                                                     initial_w=np.random.random(size=X_train.shape[1]) * 0.01,
                                                     max_iters=len(y_train),
                                                     gamma=gamma[j], verbose=False)
                prediction = predict_labels(w, X_test, 0.0)
                accuracy[k, j, l] = len(np.where(y_test == prediction)[0]) / len(y_test) * 100
    return gamma, lambda_, np.mean(accuracy, axis=0)
def cross_validation_RR(X, y, k_fold=4, seed=1):
    """Returns the mean accuracy based on k_fold cross-validation."""
    all_indices = build_k_indices(y, k_fold, seed)
    # Grid search over lambda_ (the regularisation parameter)
    lambda_ = np.logspace(-6, -3, 20)
    accuracy = np.zeros((k_fold, len(lambda_)))
    for k in range(k_fold):
        test_indices = all_indices[k]
        train_indices = np.setdiff1d(range(len(y)), test_indices)
        y_test = y[test_indices]
        X_test = X[test_indices]
        y_train = y[train_indices]
        X_train = X[train_indices]
        for j in range(len(lambda_)):
            print(lambda_[j])
            w, loss_tr = ridge_regression(y=y_train, tx=X_train, lambda_=lambda_[j])
            prediction = predict_labels(w, X_test, 0.0)
            accuracy[k, j] = len(np.where(y_test == prediction)[0]) / len(y_test) * 100
    return np.hstack((lambda_.reshape(-1, 1), np.mean(accuracy, axis=0).reshape(-1, 1)))
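# Reading off the best lambda from the (lambda, mean accuracy) table that
# cross_validation_RR returns. This usage example is an addition; X and y
# stand in for the caller's pre-processed data.
import numpy as np

results = cross_validation_RR(X, y, k_fold=4, seed=1)
best_lambda = results[np.argmax(results[:, 1]), 0]
print("best lambda:", best_lambda)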
def cross_validation(y, x, k_indices, k, lambda_, degree):
    # Dividing into subgroups
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te = y[te_indice]
    y_tr = y[tr_indice]
    tx_te = x[te_indice]
    tx_tr = x[tr_indice]
    # Preprocessing data: cleaning, standardizing and adding a constant column
    tx_tr, tx_te = process_data(tx_tr, tx_te, y_tr, y_te)
    # Feature augmentation through polynomials
    tx_tr = build_poly(tx_tr, degree)
    tx_te = build_poly(tx_te, degree)
    # Print the degree and lambda tested
    print("Test: d = ", degree, "; l = ", lambda_)
    # Training with ridge regression
    w, loss = ridge_regression(y_tr, tx_tr, lambda_)
    # Computing the prediction vector
    y_pred = predict_labels(w, tx_te)
    # Computing accuracy on the test set
    accuracy = compute_accuracy(y_te, y_pred)
    # Log information
    print("Accuracy = ", accuracy, "; loss = ", loss, "\n")
    return loss, accuracy
def pipeline(tx_train, y_train, tx_val, y_val, degrees, gamma, lambda_, epochs, verbose):
    """Run the model training and evaluation on the given parameters."""
    # Perform data cleaning (missing values, constant features, outliers, standardization)
    data_cleaner = DataCleaning()
    tx_train = data_cleaner.fit_transform(tx_train)
    tx_val = data_cleaner.transform(tx_val)
    # Perform feature engineering
    feature_generator = FeatureEngineering()
    x_train = feature_generator.fit_transform(tx=tx_train, degree=degrees)
    x_val = feature_generator.transform(tx=tx_val)
    # Initialize values
    initial_w = np.zeros(x_train.shape[1])
    # Train model
    w, _ = reg_logistic_regression(y_train, x_train, lambda_, initial_w, epochs, gamma, verbose)
    # Perform inference on validation
    pred = predict_labels(weights=w, data=x_val, logistic=True)
    evaluator = Evaluation(y_val, pred)
    return evaluator.get_f1(), evaluator.get_accuracy()
def run_stochastic_gradient_descent(tx_train, y_train, tx_val, y_val):
    """Performs training and evaluation of least squares with stochastic gradient descent."""
    print('\nTraining with Stochastic Gradient Descent')
    initial_w = np.zeros((tx_train.shape[1]))
    gamma = 0.005
    max_iter = 3000
    # Train the model
    w, _ = least_squares_SGD(y=y_train, tx=tx_train, initial_w=initial_w,
                             max_iters=max_iter, gamma=gamma, verbose=False)
    # Perform predictions
    y_pred = predict_labels(weights=w, data=tx_val, logistic=False)
    # Evaluate
    evaluation = Evaluation(y_actual=y_val, y_pred=y_pred)
    acc = evaluation.get_accuracy()
    f1 = evaluation.get_f1()
    print('Accuracy: {acc}, F1: {f1}'.format(acc=acc, f1=f1))
    return acc, f1
def run_regularized_logistic_regression(tx_train, y_train, tx_val, y_val):
    """Performs training and evaluation of regularized logistic regression."""
    print('\nTraining with regularized logistic regression')
    # Initialize parameters
    initial_w = np.zeros((tx_train.shape[1]))
    gamma = 1e-6
    max_iter = 1000
    lambda_ = 0.00001
    # Train the model
    w, _ = reg_logistic_regression(y=y_train, tx=tx_train, initial_w=initial_w,
                                   max_iters=max_iter, gamma=gamma, lambda_=lambda_)
    # Perform predictions
    y_pred = predict_labels(weights=w, data=tx_val, logistic=True)
    # Evaluate
    evaluation = Evaluation(y_actual=y_val, y_pred=y_pred)
    acc = evaluation.get_accuracy()
    f1 = evaluation.get_f1()
    print('Accuracy: {acc}, F1: {f1}'.format(acc=acc, f1=f1))
    return acc, f1
def compute_model_accuracy(x, y, w):
    """Compute the accuracy of the found model."""
    y_pred = predict_labels(w, x)
    size = y.shape[0]
    false_values = np.count_nonzero(y_pred.reshape(size, 1) - y.reshape(size, 1))
    diff = false_values / size
    accuracy = 1 - diff
    return 100 * accuracy
def cross_validation_ridge(y_train, x_train, num_folds, lambda_, seed=1):
    """Returns the per-fold validation accuracies of ridge regression."""
    np.random.seed(seed)
    scores = []
    for x_train_sub, x_val_sub, y_train_sub, y_val_sub in k_fold_splits(y_train, x_train, num_folds):
        w, _ = ridge_regression(y_train_sub, x_train_sub, lambda_)
        y_val_predict = predict_labels(w, x_val_sub)
        score = np.mean(y_val_predict == y_val_sub)
        scores.append(score)
    return np.array(scores)
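# Hedged lambda sweep built on cross_validation_ridge above (an addition);
# y_train and x_train are placeholders for the caller's data, and the grid
# bounds are illustrative only.
import numpy as np

def sweep_lambda_ridge(y_train, x_train, num_folds=4):
    lambdas = np.logspace(-6, -1, 10)
    mean_scores = [cross_validation_ridge(y_train, x_train, num_folds, l).mean()
                   for l in lambdas]
    return lambdas[int(np.argmax(mean_scores))]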
def multi_models_splitter_experimental(y_train, tx_train, tx_test,
                                       feature_column_index, k, fun_model,
                                       fun_model_args):
    """Creates a predictions vector by creating different models based on the
    value of a categorizing feature in the dataset.

    Args:
        y_train (N x 1 vector): Training labels vector.
        tx_train (N x D matrix): Training features matrix (already pre-processed).
        tx_test (N x D matrix): Test features matrix (already pre-processed).
        feature_column_index (int): Categorizing feature's column index.
        k (int): Number of folds used for cross-validation.
        fun_model (*function(...) return (weights, loss)): Function that computes a model.
        fun_model_args ([...]): Arguments list for fun_model (except y and tx).

    Returns:
        list: Test index arrays, one per category.
        list: Prediction vectors, one per category.
        list: Average prediction scores, one per category.
    """
    # feature_column_index must be non-negative
    if feature_column_index < 0:
        raise ValueError("Parameter feature_column_index must be non-negative")

    # Get range of categorization values
    categorization_values = np.unique(tx_train[:, feature_column_index])
    num_models = len(categorization_values)

    # Accumulators
    idx_array = []
    y_pred_array = []
    pred_scores_array = []

    for i in range(num_models):
        # Only consider datapoints of one category
        idx_categorized = np.where(tx_train[:, feature_column_index] == categorization_values[i])
        y_categorized = y_train[idx_categorized]
        tx_categorized = tx_train[idx_categorized]
        # Run cross-validation on the model
        weights, avg_pred_score = k_fold_cross_validation(
            y_categorized, tx_categorized, k, fun_model, fun_model_args)
        # Get predictions
        idx_categorized_test = np.where(tx_test[:, feature_column_index] == categorization_values[i])
        tx_categorized_test = tx_test[idx_categorized_test]
        y_pred_categorized = predict_labels(weights, tx_categorized_test)
        # Update accumulators
        idx_array.append(idx_categorized_test)
        y_pred_array.append(y_pred_categorized)
        pred_scores_array.append(avg_pred_score)

    return idx_array, y_pred_array, pred_scores_array
def generate_best(param_dict=None, log_param_dict_path="../data/logs/best.json"):
    """
    Generate submission for the best function-parameters combination.
    These parameters are either given manually through param_dict, or
    automatically fetched from the logs.

    Args:
        param_dict (dict): dictionary with function and its parameters
        log_param_dict_path (string): path to logs with best parameters

    Returns:
        None
    """
    # if no parameters are given manually, look for a log dictionary
    if not param_dict:
        try:
            with open(log_param_dict_path, "r") as f:
                log_dict = json.load(f)
            param_dict = transform_log_dict_to_param_dict(log_dict)
        except OSError:
            print(f"Could not open/read file: {log_param_dict_path}")
            sys.exit()

    M_list = [param_dict[str(group_indx)]["M"] for group_indx in range(1, 7)]
    class_equalizer_list = [param_dict[str(group_indx)]["class_eq"] for group_indx in range(1, 7)]
    z_outlier_list = [param_dict[str(group_indx)]["z_outlier"] for group_indx in range(1, 7)]
    corr_anal_list = [param_dict[str(group_indx)]["corr_anal"] for group_indx in range(1, 7)]

    # divide the dataset into the multiple groups and preprocess it
    # TODO change preexisting to False
    groups_tr_X, groups_tr_Y, indc_list_tr, groups_te_X, groups_te_Y, indc_list_te, ids_te = get_data(
        use_preexisting=False, save_preprocessed=False, z_outlier=z_outlier_list,
        feature_expansion=True, correlation_analysis=corr_anal_list,
        class_equalizer=class_equalizer_list, M=M_list)

    # numpy array for the submission
    Y_te = np.zeros(shape=(568238,))

    # for each group...
    for group_indx, (X_tr, Y_tr, X_te, Y_te_indx) in enumerate(
            zip(groups_tr_X, groups_tr_Y, groups_te_X, indc_list_te)):
        # get shape and create initial parameters
        N, D = X_tr.shape
        W_init = np.random.rand(D, )
        best_params_train = {
            "tx": X_tr,
            "y": Y_tr,
            "initial_w": W_init,
            "max_iters": param_dict[str(group_indx + 1)]["params"][0],
            "gamma": param_dict[str(group_indx + 1)]["params"][1],
            "lambda_": param_dict[str(group_indx + 1)]["params"][2]
        }
        # train it on all available training data
        W_best, _ = IMPLEMENTATIONS[param_dict[str(group_indx + 1)]["function_name"]]["function"](**best_params_train)
        # write into the corresponding indexes of this group
        Y_te[Y_te_indx] = predict_labels(W_best, X_te)

    generate_submission(ids_te, Y_te)
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Returns the train and test misclassification rates of ridge regression
    on the k'th fold, along with the learned weights."""
    te_indice = k_indices[k]
    tr_indice = k_indices[~(np.arange(k_indices.shape[0]) == k)]
    tr_indice = tr_indice.reshape(-1)
    y_te, y_tr = y[te_indice], y[tr_indice]
    x_te, x_tr = x[te_indice], x[tr_indice]
    tx_tr = build_poly(x_tr, degree)
    tx_te = build_poly(x_te, degree)
    w, _ = ridge_regression(y_tr, tx_tr, lambda_)
    y_tr_pred = predict_labels(w, tx_tr)
    y_te_pred = predict_labels(w, tx_te)
    loss_tr = sum(y_tr_pred != y_tr) / len(y_tr)
    loss_te = sum(y_te_pred != y_te) / len(y_te)
    return loss_tr, loss_te, w
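# Illustrative (degree, lambda_) grid search on top of the cross_validation
# variant just above (an addition), keeping the pair with the lowest mean
# test error. build_k_indices is assumed in scope, as elsewhere in this file.
import numpy as np

def grid_search_ridge(y, x, degrees, lambdas, k_fold=4, seed=1):
    k_indices = build_k_indices(y, k_fold, seed)
    best = (None, None, np.inf)  # (degree, lambda_, mean test error)
    for degree in degrees:
        for lambda_ in lambdas:
            err = np.mean([cross_validation(y, x, k_indices, k, lambda_, degree)[1]
                           for k in range(k_fold)])
            if err < best[2]:
                best = (degree, lambda_, err)
    return best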
def submission(x_test, w, i):
    x_test = remove_columns(x_test)
    x_test = replace_outliers_with_mean(x_test)
    x_test = standardize(x_test)
    # x_test = build_poly(x_test, 3)
    x_test = addones(x_test)
    y_predictions = predict_labels(w, x_test)
    y_predictions = predict_reverse(y_predictions)
    y_predictions = y_predictions.reshape(y_predictions.shape[0], )
    create_csv_submission(i, y_predictions, 'data/sample-submission.csv')
def compute_predictions_score(y_ref, weights, data):
    """Computes the prediction score obtained by a weights vector.

    Args:
        y_ref (N x 1 vector): Reference labels vector.
        weights (D x 1 vector): Weights vector.
        data (N x D matrix): Features matrix (already pre-processed).

    Returns:
        float: the proportion of correctly predicted labels (between 0 and 1)
    """
    y_pred = proj1_helpers.predict_labels(weights, data)
    return float(np.sum(y_pred == y_ref)) / float(y_ref.shape[0])
def ridge_trials(y, tx, tx_sub, degree_range, lambda_range, partitions=2):
    ## Split data into test and training sets
    ## If partitions > 2, use k-fold cross-validation
    glob_tx_tr, glob_tx_te, glob_y_tr, glob_y_te = split_data(tx, y, 0.8)
    ## Result accumulators: models, (test) losses, accuracies and predictions
    models = []
    losses = []
    accuracies = []
    predictions = []
    ## Loop over the range of degrees
    degrees = range(degree_range[0], degree_range[1])
    lambdas = np.logspace(lambda_range[0], lambda_range[1],
                          num=1 + (lambda_range[1] - lambda_range[0]))
    for degree in degrees:
        ## Loop over the range of lambdas
        for lambda_ in lambdas:
            print("Trying degree", degree, "with lambda =", lambda_, ":")
            tx_tr, tx_te, tx_pred = expand(degree, glob_tx_tr, glob_tx_te, tx_sub)
            w, loss = ridge_regression(glob_y_tr, tx_tr, lambda_)
            print("\tTraining Loss = ", loss)
            y_test = predict_labels(w, tx_te)
            test_loss = compute_loss(glob_y_te, tx_te, w)
            accuracy = compute_accuracy((y_test + 1) / 2, glob_y_te)
            y_pred = predict_labels(w, tx_pred)
            print("\tTest Loss = ", test_loss, " Test Accuracy = ", accuracy)
            models.append(("ridge_regression", degree, lambda_, w))
            losses.append(test_loss)
            accuracies.append(accuracy)
            predictions.append(y_pred)
    return models, losses, accuracies, predictions
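# Hedged example (an addition) of consuming ridge_trials' outputs: keep the
# model with the highest test accuracy. The y, tx, tx_sub inputs and the
# parameter ranges are placeholders for the caller's data.
import numpy as np

models, losses, accuracies, predictions = ridge_trials(
    y, tx, tx_sub, degree_range=(1, 6), lambda_range=(-6, 0))
best = int(np.argmax(accuracies))
best_model, best_prediction = models[best], predictions[best]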
def best_model_predictions(data_obj, jet, degrees):
    """
    Splits the data based on the jet value, trains the model and gets the
    predictions on the test dataset.

    :param data_obj: DataLoader obj
    :param jet: int, the jet value
    :param degrees: int, the polynomial degree
    :return:
        pred: np.array with the predicted labels
        ids: np.array with the row index
    """
    print('Training for Jet {jet}'.format(jet=jet))
    # Split data based on jet value for train and val datasets
    y, tx = get_jet_data_split(data_obj.y, data_obj.tx, jet)
    ids_test, tx_test = get_jet_data_split(data_obj.ids_test, data_obj.test, jet)
    # Perform data cleaning (missing values, constant features, outliers, standardization)
    data_cleaner = DataCleaning()
    tx = data_cleaner.fit_transform(tx)
    tx_test = data_cleaner.transform(tx_test)
    # Perform feature engineering
    feature_generator = FeatureEngineering()
    tx = feature_generator.fit_transform(tx, degrees)
    tx_test = feature_generator.transform(tx_test)
    # Initialize values
    initial_w = np.zeros((tx.shape[1]))
    lambda_ = 1e-06
    gamma = 1e-06
    max_iter = 1000
    # Train model
    w, loss = reg_logistic_regression(y, tx, lambda_, initial_w, max_iter, gamma, verbose=True)
    # Perform inference on the test set
    pred = predict_labels(w, tx_test, True)
    return ids_test, pred
def compute_score(y, tx, w):
    """
    Compute the percentage of well-predicted labels.

    INPUT:
        y  - Labels vector
        tx - Samples
        w  - Weights

    OUTPUT:
        score - Percentage obtained
    """
    # Predict labels
    y_pred = predict_labels(w, tx)
    # Calculate the percentage of correct predictions
    score = np.sum(y_pred == y) / len(y)
    return score
def run_least_squares(tx_train, y_train, tx_val, y_val):
    """Performs training and evaluation of least squares with normal equations."""
    print('\nTraining with least squares')
    # Train the model
    w, _ = least_squares(y=y_train, tx=tx_train)
    # Perform predictions
    y_pred = predict_labels(weights=w, data=tx_val, logistic=False)
    # Evaluate
    evaluation = Evaluation(y_actual=y_val, y_pred=y_pred)
    acc = evaluation.get_accuracy()
    f1 = evaluation.get_f1()
    print('Accuracy: {acc}, F1: {f1}'.format(acc=acc, f1=f1))
    return acc, f1
def crossvalidation(y, x, k, n, param):
    """
    Trains and evaluates the model for one fold of K-fold cross-validation.

    The data is divided into n parts; the model is validated on the segment
    starting at row k and trained on the rest. The intention of this function
    is for it to be called n times. Each call uses the rows x[k : k + N/n]
    (N being the total number of rows) as the validation set. For example, if
    the dataset has 1000 rows and this function is called with k=300, n=10,
    it will use x[300:400] (with y[300:400] as the corresponding labels) for
    validation and all remaining datapoints for training (i.e., the
    concatenation of x[0:300] and x[400:1000]).

    Positional parameters:
    y ------ All labels available for training.
    x ------ Input training data matrix; should be shuffled prior to passing
             to this function.
    k ------ Row at which the validation segment begins.
    n ------ Total number of folds in the cross-validation.
    param -- List of parameters used in the train() function. See the
             implementation of train() for more details.
    """
    x_validate = x[k:k + x.shape[0] // n]
    y_validate = y[k:k + y.shape[0] // n]
    # train on everything outside the validation segment
    x_train = np.concatenate((x[:k], x[k + x.shape[0] // n:]), axis=0)
    y_train = np.concatenate((y[:k], y[k + y.shape[0] // n:]), axis=0)
    x_validate = replace_outliers_with_mean(x_validate)
    x_train = replace_outliers_with_mean(x_train)
    x_train = standardize(x_train)
    x_validate = standardize(x_validate)
    # x_train = build_poly(x_train, 3)
    # x_validate = build_poly(x_validate, 3)
    x_train = addones(x_train)
    x_validate = addones(x_validate)
    print(x_train.shape)
    w = train(y_train, x_train, param)
    y_predictions = predict_labels(w, x_validate)
    accuracy = calculate_prediction_accuracy(y_predictions, y_validate)
    return accuracy, y_predictions, w
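# Hedged driver for the crossvalidation function above (an addition): call it
# once per fold, stepping k by the fold size, and average the accuracies.
# x is assumed pre-shuffled, and param is whatever the project's train()
# function expects.
import numpy as np

def run_crossvalidation(y, x, n, param):
    fold_size = x.shape[0] // n
    accuracies = [crossvalidation(y, x, k, n, param)[0]
                  for k in range(0, fold_size * n, fold_size)]
    return np.mean(accuracies)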
def run_ridge_regression(tx_train, y_train, tx_val, y_val):
    """Performs training and evaluation of ridge regression."""
    print('\nTraining with ridge regression')
    lambda_ = 1e-06
    # Train the model
    w, _ = ridge_regression(y=y_train, tx=tx_train, lambda_=lambda_)
    # Perform predictions
    y_pred = predict_labels(weights=w, data=tx_val, logistic=False)
    # Evaluate
    evaluation = Evaluation(y_actual=y_val, y_pred=y_pred)
    acc = evaluation.get_accuracy()
    f1 = evaluation.get_f1()
    print('Accuracy: {acc}, F1: {f1}'.format(acc=acc, f1=f1))
    return acc, f1
def learn(predictions, ids_predicted, y_train_jets, tx_train_jets, tx_test_jets,
          ids_test_jets, lambda_best_jets, degree_best_jets):
    print('\nLearning by ridge regression...')
    for jet_num in range(4):
        print('\nLearning from training set with jet number ', str(jet_num),
              ' using optimal hyperparameters...')
        y_train, tx_train = y_train_jets[jet_num], tx_train_jets[jet_num]
        tx_train = feature_engineering(tx_train, degree_best_jets[jet_num], jet_num > 1)
        w_best, _ = ridge_regression(y_train, tx_train, lambda_best_jets[jet_num])
        tx_test, ids_test = tx_test_jets[jet_num], ids_test_jets[jet_num]
        tx_test = feature_engineering(tx_test, degree_best_jets[jet_num], jet_num > 1)
        predictions.append(predict_labels(w_best, tx_test))
        ids_predicted.append(ids_test)
        print('\nReporting prediction accuracy for the training set... \n')
        report_prediction_accuracy(y_train, tx_train, w_best)
        print('\n... this gives a rough idea about the training success.')
        print('\n... predicted labels for test set with jet number ', str(jet_num))
def cross_validation(y, x, k_indices, k, lambda_):  # , degree):
    """Return the loss of ridge regression for one fold."""
    # get k'th subgroup in test, others in train
    other_indices = np.setdiff1d(range(len(y)), k_indices)
    y_test = y[k_indices]
    tx_test = x[k_indices]
    y_train = y[other_indices]
    tx_train = x[other_indices]
    w, loss_tr = ridge_regression(y_train, tx_train, lambda_)
    loss_te = 1. / (2 * len(y_test)) * np.sum((y_test - tx_test @ w)**2) + lambda_ * np.linalg.norm(w)**2
    prediction = predict_labels(w, tx_test, 0.0)
    accuracy = len(np.where(y_test - prediction == 0)[0]) / len(y_test) * 100
    return loss_tr, loss_te, accuracy
def predict(initial_y, tx, tx_test, indices_test_group, indices_train_group,
            best_weights, best_degrees, logistic):
    """Return the prediction labels for the testing dataset."""
    y_pred = initial_y
    for i, indice_test_group in enumerate(indices_test_group):
        # for standardizing the test subset, we need the data of both train and test subsets
        tx_subset = tx[indices_train_group[i]]
        tx_test_subset = tx_test[indice_test_group]
        # get the standardized test subset
        _, standardized_tx_test_subset = preprocess_data(tx_subset, tx_test_subset)
        # predict the labels
        y_pred_subset = predict_labels(
            best_weights[i],
            build_poly(standardized_tx_test_subset, best_degrees[i]),
            logistic)
        y_pred[indice_test_group] = y_pred_subset
    return y_pred
def cross_validation_OLS(X, y, k_fold=4, seed=1):
    """Returns the mean accuracy based on k_fold cross-validation."""
    all_indices = build_k_indices(y, k_fold, seed)
    accuracy = np.zeros(k_fold)
    for k in range(k_fold):
        test_indices = all_indices[k]
        train_indices = np.setdiff1d(range(len(y)), test_indices)
        y_test = y[test_indices]
        X_test = X[test_indices]
        y_train = y[train_indices]
        X_train = X[train_indices]
        w, loss_tr = least_squares(y_train, X_train)
        prediction = predict_labels(w, X_test, 0.0)
        accuracy[k] = len(np.where(y_test == prediction)[0]) / len(y_test) * 100
    return np.mean(accuracy)