def generate_prediction(x_tr_0, y_tr_0, x_tr_1, y_tr_1, x_tr_2, y_tr_2, x_tr_3,
                        y_tr_3, x_te_0, x_te_1, x_te_2, x_te_3, jet_num_te):
    """Predict labels for a test set that has been split by jet number.

    One least-squares model is fitted per jet-number group on the matching
    training split, each with its own fixed polynomial degree. The four
    per-group predictions are then merged into a single list that follows
    the order of ``jet_num_te``.

    Args:
        x_tr_0 .. x_tr_3: training features of groups 0 to 3.
        y_tr_0 .. y_tr_3: training labels of groups 0 to 3.
        x_te_0 .. x_te_3: test features of groups 0 to 3.
        jet_num_te: jet number of every test sample, in the original order.
            Any value other than 0, 1 or 2 is treated as group 3.

    Returns:
        list: one predicted label per entry of ``jet_num_te``.
    """
    # Predetermined polynomial degree of each jet-number group.
    degrees = (9, 15, 13, 12)
    training_splits = (
        (x_tr_0, y_tr_0),
        (x_tr_1, y_tr_1),
        (x_tr_2, y_tr_2),
        (x_tr_3, y_tr_3),
    )
    test_splits = (x_te_0, x_te_1, x_te_2, x_te_3)

    # Fit all four models first ...
    group_weights = []
    for (x_tr, y_tr), degree in zip(training_splits, degrees):
        fitted, _ = least_squares(y_tr, build_poly(x_tr, degree))
        group_weights.append(fitted)

    # ... then predict each test split with its own model.
    group_labels = []
    for fitted, x_te, degree in zip(group_weights, test_splits, degrees):
        group_labels.append(predict_labels(fitted, build_poly(x_te, degree)))

    # Interleave the per-group predictions back into the original order:
    # each group is consumed front to back through its own cursor.
    cursors = [0, 0, 0, 0]
    predicted_y_te = []
    for jet_num in jet_num_te:
        group = next((g for g in (0, 1, 2) if jet_num == g), 3)
        predicted_y_te.append(group_labels[group][cursors[group]])
        cursors[group] += 1
    return predicted_y_te
def __init__(self, model_name, w=None, learning_param=None, debug=True):
    """Store the weights and wire up the callables of the chosen model.

    Parameters
    ----------
    model_name : str
        One of 'logistic_regression', 'reg_logistic_regression',
        'least_squares_GD', 'ridge_regression' or 'least_squares'. Any
        other value leaves the model-specific attributes unset.
    w : optional
        Initial weights, stored as ``self.w``.
    learning_param : dict, optional
        Hyper-parameters read by key: 'max_iters' and 'gamma' for the
        iterative models, 'lambda_' for the regularised ones. It is not
        read for 'least_squares'.
    debug : bool
        When true, ``self.dbg`` is a ``debugger.Debugger(['loss', 'w'])``;
        otherwise it is ``None``.
    """
    self.w = w
    self.dbg = debugger.Debugger(['loss', 'w']) if debug else None

    # Each model selects its output function, its loss, its prediction
    # mapping and a ``learn`` callable with the uniform signature
    # (y, x, w, dbg).
    if model_name == 'logistic_regression':
        self.model_output = misc.lr_output
        self.compute_loss = cost.compute_loss_ce
        self.predict_output = misc.map_prediction
        n_iters = learning_param['max_iters']
        step = learning_param['gamma']
        self.learn = lambda y, x, w, dbg: impl.logistic_regression(
            y, x, w, n_iters, step, dbg)
    elif model_name == 'reg_logistic_regression':
        self.model_output = misc.lr_output
        self.compute_loss = cost.compute_loss_reg_ce
        self.predict_output = misc.map_prediction
        n_iters = learning_param['max_iters']
        step = learning_param['gamma']
        penalty = learning_param['lambda_']
        self.learn = lambda y, x, w, dbg: impl.reg_logistic_regression(
            y, x, penalty, w, n_iters, step, dbg)
    elif model_name == 'least_squares_GD':
        self.model_output = np.dot
        self.compute_loss = cost.compute_loss_ls
        self.predict_output = misc.predict_ls
        n_iters = learning_param['max_iters']
        step = learning_param['gamma']
        self.learn = lambda y, x, w, dbg: impl.least_squares_GD(
            y, x, w, n_iters, step, dbg)
    elif model_name == 'ridge_regression':
        self.model_output = np.dot
        self.compute_loss = cost.compute_loss_ls
        self.predict_output = misc.predict_ls
        penalty = learning_param['lambda_']
        # The closed-form solvers ignore the initial weights and debugger.
        self.learn = lambda y, x, w, dbg: impl.ridge_regression(y, x, penalty)
    elif model_name == 'least_squares':
        self.model_output = np.dot
        self.compute_loss = cost.compute_loss_ls
        self.predict_output = misc.predict_ls
        self.learn = lambda y, x, w, dbg: impl.least_squares(y, x)
def impute_lr(data):
    """Replace the ``-999`` placeholders of ``data`` by linear regression.

    Every column that contains at least one ``-999`` is regressed on the
    columns that contain none, using only the rows that are free of
    ``-999``. The fitted weights then predict the missing entries of that
    column from the clean columns of the same row.

    Args:
        data: 2-D numeric array in which ``-999`` marks a missing value.
            It is modified in place.

    Returns:
        The same ``data`` object, with the placeholders of the imputed
        columns overwritten.
    """
    # One boolean mask replaces the per-row / per-column membership scans,
    # which were quadratic in the number of rows.
    missing = data == -999
    dirty_col_mask = missing.any(axis=0)
    clear_cols = np.flatnonzero(~dirty_col_mask)
    dirty_cols = np.flatnonzero(dirty_col_mask)

    # Rows without any placeholder form the training set (boolean indexing
    # already yields a copy).
    clear_samples = data[~missing.any(axis=1)]

    for feature in dirty_cols:
        # Only the first element of the result is used as the weight
        # vector -- presumably (weights, loss); confirm against `imp`.
        wf = imp.least_squares(clear_samples[:, feature],
                               clear_samples[:, clear_cols])
        rows = np.flatnonzero(missing[:, feature])
        # Clean columns are never written to, so every missing entry of
        # this feature can be predicted in a single matrix product.
        predicted = np.dot(data[np.ix_(rows, clear_cols)], wf[0])
        data[rows, feature] = np.ravel(predicted)
    return data
def cross_validation(y, x, k_indices, k, degree, index_to_be_skewed):
    """Run one fold of cross validation with a least-squares fit.

    Fold ``k`` of ``k_indices`` is held out; all other folds form the
    training set. Both sets are min-max scaled with the training minimum
    and maximum, the columns in ``index_to_be_skewed`` are passed through
    ``log(x + 1)``, and the features are expanded to polynomial degree
    ``degree`` and normalised with the training mean and standard deviation.

    Returns:
        tuple: ``(loss_tr, loss_te, min_tr, max_tr)`` where the two losses
        are the negated accuracies on the training and held-out sets, and
        ``min_tr`` / ``max_tr`` are the per-column training minimum and
        maximum.
    """
    held_out = k_indices[k]
    kept = np.array([
        sample
        for fold_no, fold in enumerate(k_indices)
        if fold_no != k
        for sample in fold
    ])

    x_train, y_train = x[kept], y[kept]
    x_test, y_test = x[held_out], y[held_out]

    # Scaling statistics come from the training partition only.
    min_tr = np.min(x_train, axis=0)
    max_tr = np.max(x_train, axis=0)

    def _rescale_and_log(features):
        # Min-max scale with the training range, then log the skewed columns.
        features = min_max_transform(features, min_tr, max_tr)
        features[:, index_to_be_skewed] = np.log(
            features[:, index_to_be_skewed] + 1)
        return features

    x_train = _rescale_and_log(x_train)
    x_train_poly, mean_train, std_train = expand_and_normalize_X(
        x_train, degree)

    # The held-out fold reuses the training min, max, mean and std.
    x_test = _rescale_and_log(x_test)
    x_test_poly = build_poly(x_test, degree)
    x_test_poly[:, 1:] = (x_test_poly[:, 1:] - mean_train) / std_train

    w, loss = m.least_squares(y_train, x_train_poly)

    loss_tr = -accuracy(y_train, predict_labels(w, x_train_poly))
    loss_te = -accuracy(y_test, predict_labels(w, x_test_poly))
    return loss_tr, loss_te, min_tr, max_tr
def cross_validation(y, x, degree, k, k_indices, method, error,
                     feature_augmentation, hyperparams):
    """Train and evaluate one cross-validation fold with the chosen method.

    Fold ``k`` of ``k_indices`` is the test set and the remaining folds are
    the training set. Both are run through ``feature_processing``,
    optionally augmented, expanded with ``build_poly`` and standardised
    with the training mean and standard deviation.

    Args:
        y, x: labels and features of the whole data set.
        degree: polynomial degree passed to ``build_poly``.
        k: index of the held-out fold.
        k_indices: 2-D array with one row of sample indices per fold.
        method: 'rr', 'ls', 'lsGD', 'lsSGD', 'log' or 'rlog'.
        error: error type forwarded to ``compute_loss`` (used for every
            method other than 'log' and 'rlog').
        feature_augmentation: whether to call ``feat_augmentation``.
        hyperparams: indexable hyper-parameters; which positions are read
            depends on ``method``, and the last entry switches outlier
            suppression on the training set.

    Returns:
        tuple: ``(loss_tr, loss_te, w, acc)``.

    Raises:
        NotImplementedError: if ``method`` is not one of the names above.
    """
    from helpers_data import feature_processing, feat_augmentation, standardize, build_poly
    from implementations import ridge_regression, least_squares, least_squares_GD, least_squares_SGD, logistic_regression, reg_logistic_regression

    # Fold k is the test set; every other fold is flattened into training.
    fold_numbers = np.arange(k_indices.shape[0])
    test_rows = k_indices[k]
    train_rows = k_indices[fold_numbers != k].reshape(-1)

    y_te, y_tr = y[test_rows], y[train_rows]
    x_te, x_tr = x[test_rows], x[train_rows]

    # The median returned for the training set is reused on the test set.
    x_tr, y_tr, median = feature_processing(
        x_tr, y_tr, 'mean', replace_feature=True,
        suppr_outliers=hyperparams[-1], threshold=3, ref_median=[])
    x_te, y_te, _ = feature_processing(
        x_te, y_te, 'mean', replace_feature=True,
        suppr_outliers=False, threshold=3, ref_median=median)

    tx_tr_aug, tx_te_aug = [], []
    if feature_augmentation:
        tx_tr_aug, index = feat_augmentation(x_tr, 0.003)
        tx_te_aug, _ = feat_augmentation(x_te, 0.003, False, index)

    # Polynomial expansion followed by standardisation on training statistics.
    tx_tr = build_poly(x_tr, degree, feature_augmentation, tx_tr_aug)
    tx_te = build_poly(x_te, degree, feature_augmentation, tx_te_aug)
    tx_tr, mean, std = standardize(tx_tr)
    tx_te, _, _ = standardize(tx_te, mean, std)

    # Fit the requested model.
    if method == 'rr':  # ridge regression
        w, _ = ridge_regression(y_tr, tx_tr, hyperparams[0])
    elif method == 'ls':  # least squares
        w, _ = least_squares(y_tr, tx_tr)
    elif method == 'lsGD':  # gradient descent
        w, _ = least_squares_GD(y_tr, tx_tr, hyperparams[0], hyperparams[1],
                                hyperparams[2])
    elif method == 'lsSGD':  # stochastic gradient descent
        w, _ = least_squares_SGD(y_tr, tx_tr, hyperparams[0], hyperparams[1],
                                 hyperparams[2], hyperparams[3])
    elif method == 'log':  # logistic regression
        w, _ = logistic_regression(y_tr, tx_tr, hyperparams[0],
                                   hyperparams[1], hyperparams[2])
    elif method == 'rlog':  # regularised logistic regression
        w, _ = reg_logistic_regression(y_tr, tx_tr, hyperparams[3],
                                       np.zeros(tx_tr.shape[1]),
                                       hyperparams[1], hyperparams[2])
    else:
        raise NotImplementedError

    # Choose the loss that matches the model family.
    if method == 'log':
        def fold_loss(targets, features):
            return cal_loglike(targets, features, w)
    elif method == 'rlog':
        def fold_loss(targets, features):
            return cal_loglike_r(targets, features, w, hyperparams[3])
    else:
        def fold_loss(targets, features):
            return compute_loss(targets, features, w, error)

    loss_tr = fold_loss(y_tr, tx_tr)
    loss_te = fold_loss(y_te, tx_te)

    acc = accuracy(y_te, predict_labels(np.array(w).T, tx_te))
    return loss_tr, loss_te, w, acc
def cross_validation(y, tx, mlfunction, split_number=5, lambda_=1e-6, gamma=0.001):
    """Cross-validate one learning function on the training set.

    The samples are split into ``split_number`` folds with
    ``build_k_indices``. Each fold serves once as the test set while the
    model is trained on all remaining samples.

    Args:
        y: labels.
        tx: feature matrix, indexed by sample along the first axis.
        mlfunction: 'ridge_regression', 'least_squares',
            'logistic_regression', 'reg_logistic_regression',
            'least_squares_sgd' or 'least_squares_gd'.
        split_number: number of folds (5 by default).
        lambda_: regularisation strength, used by the regularised models.
        gamma: step size, used by the gradient-descent models.

    Returns:
        tuple: mean train MSE, mean test MSE, mean train accuracy and mean
        test accuracy over the folds, or ``None`` (after printing an error)
        when ``mlfunction`` is not recognised.
    """
    trainers = {
        'ridge_regression':
            lambda ty, ttx: impl.ridge_regression(ty, ttx, lambda_),
        'least_squares':
            lambda ty, ttx: impl.least_squares(ty, ttx),
        'logistic_regression':
            lambda ty, ttx: impl.logistic_regression(ty, ttx),
        'reg_logistic_regression':
            lambda ty, ttx: impl.reg_logistic_regression(ty, ttx, lambda_),
        'least_squares_sgd':
            lambda ty, ttx: impl.least_squares_SGD(ty, ttx, gamma),
        'least_squares_gd':
            lambda ty, ttx: impl.least_squares_GD(ty, ttx, gamma),
    }
    # Reject an unknown model before any splitting or training is done.
    train = trainers.get(mlfunction) if isinstance(mlfunction, str) else None
    if train is None:
        print('ERROR: ml_function not recognized')
        print(
            'least_squares, least_squares_gd, least_squares_sgd, logistic_regression, reg_logistic_regression'
        )
        return None

    # Per-fold metrics.
    train_loss_ = []
    test_loss_ = []
    train_accuracy_ = []
    test_accuracy_ = []

    k_indices = build_k_indices(len(y), split_number)
    for fold in k_indices:
        # The current fold is the test set; every other sample is training.
        test_idx = np.asarray(fold)
        train_idx = np.delete(np.arange(len(y)), test_idx)

        train_tX, train_y = tx[train_idx], y[train_idx]
        test_tX, test_y = tx[test_idx], y[test_idx]

        w, _ = train(train_y, train_tX)

        train_loss_.append(impl.compute_loss_mse(train_y, train_tX, w))
        test_loss_.append(impl.compute_loss_mse(test_y, test_tX, w))
        # Accumulate the accuracies: they used to be overwritten, so only
        # the last fold's accuracy reached the returned mean.
        train_accuracy_.append(impl.compute_accuracy(train_y, train_tX, w))
        test_accuracy_.append(impl.compute_accuracy(test_y, test_tX, w))

    return (np.mean(train_loss_), np.mean(test_loss_),
            np.mean(train_accuracy_), np.mean(test_accuracy_))
def get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size):
    """
    Train the requested model and return its learned weights.

    Only the arguments relevant to the selected model are forwarded to the
    underlying training function; the loss it returns is discarded.

    Parameters
    ----------
    model: string
        One of "MSE_GD", "MSE_SGD", "MSE_OPT", "MSE_OPT_REG", "LOG_GD",
        "LOG_REG_GD", "LOG_REG_L1" or "MSE_GD_L1"
    y: ndarray
        The labels
    tx: ndarray
        The feature matrix
    initial_w: ndarray
        The initial weights
    max_iters: integer
        The number of steps to run
    gamma: scalar
        The step size
    lambda_: scalar
        The regularization parameter
    batch_size: integer
        The batch size

    Returns
    -------
    The learned weights (the last weight vector)

    Raises
    ------
    UnknownModel
        If ``model`` is not one of the names listed above
    """
    if model == "MSE_GD":
        weights, _ = least_squares_GD(y, tx, initial_w, max_iters, gamma)
        return weights
    if model == "MSE_SGD":
        weights, _ = least_squares_SGD(y, tx, initial_w, batch_size,
                                       max_iters, gamma)
        return weights
    if model == "MSE_OPT":
        weights, _ = least_squares(y, tx)
        return weights
    if model == "MSE_OPT_REG":
        weights, _ = ridge_regression(y, tx, lambda_)
        return weights
    if model == "LOG_GD":
        weights, _ = logistic_regression(y, tx, initial_w, max_iters, gamma)
        return weights
    if model == "LOG_REG_GD":
        weights, _ = reg_logistic_regression(y, tx, lambda_, initial_w,
                                             max_iters, gamma)
        return weights
    if model == "LOG_REG_L1":
        weights, _ = reg_logistic_regression_L1(y, tx, lambda_, initial_w,
                                                max_iters, gamma)
        return weights
    if model == "MSE_GD_L1":
        weights, _ = least_squares_GD_L1(y, tx, lambda_, initial_w,
                                         max_iters, gamma)
        return weights
    raise UnknownModel
def cross_validation_ls(y, x, k_indices, k):
    """Fit least squares on all folds but ``k`` and score it on fold ``k``.

    Returns:
        tuple: ``(mse_te, opt_w)`` -- the error on the held-out fold as
        computed by ``imp.compute_mse``, and the fitted weights.
    """
    held_out = k_indices[k]

    # Training data: every row except those of the held-out fold.
    x_train = np.delete(x, [held_out], axis=0)
    y_train = np.delete(y, [held_out], axis=0)

    # The training error returned by the solver is not reported.
    opt_w, mse_tr = imp.least_squares(y_train, x_train)

    mse_te = imp.compute_mse(y[held_out], x[held_out], opt_w)
    return mse_te, opt_w
def test_least_squares(y_train, tx_train, y_test, tx_test):
    """
    Fit least_squares on the training split and report how well the
    resulting weights predict the test split.

    Args:
        y_train: training labels after the splitting
        tx_train: training features after the splitting
        y_test: test labels after the splitting
        tx_test: test features after the splitting
    """
    print('\nTesting least_squares...')

    # Only the weights are needed; the training loss is dropped.
    weights, _loss = least_squares(y_train, tx_train)
    report_prediction_accuracy(y_test, tx_test, weights)

    print('... testing completed.')
def find_optimal_w(tX, y, implementation, log_initial_w, log_max_iters,
                   log_gamma, decreasing_gamma, log_regulator, ridge_lambda):
    """
    Train on the data set with the selected implementation and return the
    resulting weights.

    Parameters
    ----------
    tX: array
        The feature matrices
    y: array
        The output
    implementation: integer
        0 for least squares, 1 for ridge regression, 2 for regularized
        logistic regression; any other value trains nothing
    log_initial_w: array
        initial weights for the regularized logistic regression
    log_max_iters: integer
        number of iterations for the regularized logistic regression
    log_gamma: float
        gamma parameter for the regularized logistic regression
    decreasing_gamma:
        forwarded unchanged to the regularized logistic regression
    log_regulator: float
        lambda for the regularized logistic regression
    ridge_lambda: float
        lambda for the ridge regression

    Return
    ------
    optimal_w = array
        Optimal weights, or None when ``implementation`` is not 0, 1 or 2.
    """
    if implementation == 0:
        optimal_w, _ = impl.least_squares(y, tX)
    elif implementation == 1:
        optimal_w, _ = impl.ridge_regression(y, tX, ridge_lambda)
    elif implementation == 2:
        optimal_w, _ = impl.reg_logistic_regression(
            y, tX, log_regulator, log_initial_w, log_max_iters, log_gamma,
            decreasing_gamma)
    else:
        optimal_w = None
    return optimal_w
def fill_missing_values(X_, deg=1, tresh=1, lambda_=1e-7):
    """Fill missing feature values by least squares on the clean features.

    ``find_bad_features`` supplies, for every feature that has missing
    values, the indices of the affected rows. A feature whose fraction of
    missing rows is below ``tresh`` is regressed on the features without
    missing values (using the rows where it is known) and its missing
    entries are replaced by the predictions. Features at or above the
    threshold are dropped from the result.

    Args:
        X_: 2-D feature matrix (samples x features). It is not modified.
        deg: currently unused.
        tresh: maximum (exclusive) fraction of missing rows for a feature
            to be filled rather than dropped.
        lambda_: currently unused.

    Returns:
        A copy of ``X_`` with the fillable features imputed and the
        remaining bad features removed.
    """
    X = X_.copy()
    # Maps feature index -> indices of the rows where it is missing.
    unknown_dict = find_bad_features(X)
    bad_features = list(unknown_dict.keys())

    n_samples = len(X)
    features_to_fill = [
        i for i in bad_features if (len(unknown_dict[i]) / n_samples) < tresh
    ]
    fillable = set(features_to_fill)
    features_to_ignore = [i for i in bad_features if i not in fillable]

    # Regressors: only the features that have no missing value at all.
    clean_features = np.delete(np.arange(len(X.T)), bad_features)
    clean_X = X[:, clean_features]

    for i in features_to_fill:
        bad_rows = np.asarray(unknown_dict[i])
        known_rows = np.delete(np.arange(n_samples), bad_rows)

        w, _ = impl.least_squares(X[known_rows, i], clean_X[known_rows])
        y_bad = np.dot(clean_X[bad_rows], w)

        # Row j of `bad_rows` receives prediction j. The previous code
        # indexed the predictions with the feature number `i` instead.
        X[bad_rows, i] = np.ravel(y_bad)

    feat_to_conserve = np.delete(np.arange(len(X.T)), features_to_ignore)
    return X.T[feat_to_conserve].T
# Cross validation over lambda avg_test_accuracy_RR = cross_validation_RR(X_train, y_train, k_fold=4, seed=1) # Cross validation over gamma avg_test_accuracy_LR = cross_validation_LR(X_train, y_train, k_fold=4, seed=1) # Cross validation over both gamma and lambda g, l, avg_test_accuracy_RLR = cross_validation_RLR(X_train, y_train, k_fold=4, seed=1) #%% Testing functions #np.random.seed(42) gamma = 0.2 lambda_ = 4E-5 w, loss = least_squares(y = y_train, tx = X_train) # w, loss = least_squares_SGD(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma) # w, loss = ridge_regression(y = y_train, tx = X_train, lambda_ = lambda_) # w, loss = logistic_regression(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*10, max_iters = 125000, gamma = gamma) # w, loss = reg_logistic_regression(y = y_train, tx = X_train, lambda_ = lambda_, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma) plt.plot(w) #%% Predictive step y_test = X_test @ w plt.hist(y_test, bins=200)
def cross_validation(x, y, k, mode, gamma=None, lambda_=None, max_iters=None, initial_w=None):
    """Run k-fold cross validation for one training method.

    The data is split into ``k`` consecutive chunks (no shuffling). Each
    chunk is used once as the validation set while the model is trained on
    the others.

    INPUT:
        @x : input data, dimensions (NxD)
        @y : target labels, (Nx1) array; a sample counts as positive when
             its label equals 1
        @k : number of folds
        @mode : 'linear_regression_eq', 'ridge_regression_eq',
                'linear_regression_GD', 'linear_regression_SGD',
                'logistic_regression' or 'reg_logistic_regression'
        @gamma, @lambda_, @max_iters, @initial_w : hyper-parameters
                forwarded to the training function that needs them
    OUTPUT:
        (acc, tpr, fpr, losses) : four lists with one entry per fold --
        accuracy, true-positive rate, false-positive rate and the loss
        returned by the training function.
    RAISES:
        ValueError if ``mode`` is not one of the names above.
    """
    linear_modes = ('linear_regression_eq', 'ridge_regression_eq',
                    'linear_regression_GD', 'linear_regression_SGD')
    logistic_modes = ('logistic_regression', 'reg_logistic_regression')
    if mode not in linear_modes + logistic_modes:
        # Previously an unknown mode only surfaced as an UnboundLocalError.
        raise ValueError('Unknown cross-validation mode: {!r}'.format(mode))

    x_split = np.array_split(x, k, axis=0)
    y_split = np.array_split(y, k, axis=0)

    acc = []
    tpr = []
    fpr = []
    losses = []

    for fold in range(k):
        # Train on every chunk except the current one.
        x_train = np.concatenate(
            [part for i, part in enumerate(x_split) if i != fold], axis=0)
        y_train = np.concatenate(
            [part for i, part in enumerate(y_split) if i != fold], axis=0)
        x_val = x_split[fold]
        y_val = y_split[fold]

        if mode == 'linear_regression_eq':
            update, loss = imp.least_squares(y_train, x_train)
        elif mode == 'ridge_regression_eq':
            update, loss = imp.ridge_regression(y_train, x_train, lambda_)
        elif mode == 'linear_regression_GD':
            update, loss = imp.least_squares_GD(y_train, x_train, initial_w,
                                                max_iters, gamma)
        elif mode == 'linear_regression_SGD':
            update, loss = imp.least_squares_SGD(y_train, x_train, initial_w,
                                                 max_iters, gamma)
        elif mode == 'logistic_regression':
            update, loss = imp.logistic_regression(y_train, x_train,
                                                   initial_w, max_iters, gamma)
        else:
            # NOTE(review): lambda_ is not forwarded here -- confirm against
            # the signature of imp.reg_logistic_regression.
            update, loss = imp.reg_logistic_regression(
                y_train, x_train, initial_w, max_iters, gamma)

        predictions = np.dot(x_val, update)
        if mode in logistic_modes:
            # Logistic models are thresholded on the predicted probability.
            # This decision used to be overwritten by the mean-based rule.
            pr_bool = H.sigmoid(predictions) > 0.5
        else:
            # Linear models: positive when at or above the mean prediction.
            pr_bool = predictions >= np.mean(predictions)

        losses.append(loss)

        y_bool = y_val == 1
        correct = pr_bool == y_bool
        tp = np.logical_and(correct, y_bool)
        fp = np.logical_and(np.logical_not(correct), pr_bool)

        acc.append(sum(correct) / float(len(y_val)))
        tpr.append(sum(tp) / float(sum(y_bool)))
        fpr.append(sum(fp) / float(sum(np.logical_not(y_bool))))

    return acc, tpr, fpr, losses
from sklearn.metrics import r2_score

# Linear regression: compare the closed-form, GD and SGD implementations
# on the same standardized data set using the R^2 score.
print("Linear Regession \n ---------------- \n")
# NOTE(review): load_boston was removed from scikit-learn in version 1.2;
# this line needs an older release or a replacement data set.
X, y = datasets.load_boston(return_X_y = True)
X, _, _ = implementations.standardize_numpy(X)
# Prepend a column of ones so the first weight acts as the intercept.
tx = np.c_[np.ones(X.shape[0]), X]

initial_w = np.zeros(tx.shape[1])
max_iters = 1000
gamma = 0.01

# r2_score expects (y_true, y_pred) and is not symmetric, so the observed
# targets go first in every call below.
w_lr, loss_lr = implementations.least_squares(y, tx)
y_pred_lr = tx @ w_lr
print(f"Linear regression eq: {r2_score(y, y_pred_lr)}")

w_lr_gd, loss_lr_gd = implementations.least_squares_GD(y, tx, initial_w, max_iters, gamma, verbose=False)
y_pred_lr_gd = tx @ w_lr_gd
print(f"Linear regression gd: {r2_score(y, y_pred_lr_gd)}")

w_lr_sgd, loss_lr_sgd = implementations.least_squares_SGD(y, tx, initial_w, max_iters, gamma, verbose=False)
y_pred_lr_sgd = tx @ w_lr_sgd
print(f"Linear regression sgd: {r2_score(y, y_pred_lr_sgd)}")

# scikit-learn reference fit; it receives X without the ones column.
reg = LinearRegression().fit(X, y)
y_pred_sk = reg.predict(X)
""" Load the datasets, train a model, and create a Kaggle submission for the first Machine Learning project Authors: Kirill IVANOV, Matthias RAMIREZ, Nicolas TALABOT """ ### Import modules and datasets from proj1_helpers import load_csv_data, predict_labels, create_csv_submission from implementations import least_squares from utilities import split_data, preprocess_data y_train, x_train, ids_train = load_csv_data("train.csv") y_test, x_test, ids_test = load_csv_data("test.csv") # Parameters seed = 3 degree = 11 ratio = 0.66 # Learn the model tx, x_mean, x_std = preprocess_data(x_train, degree) x_tr, y_tr, x_te, y_te = split_data(tx, y_train, ratio, seed) w, loss_tr = least_squares(y_tr, x_tr) # Create a Kaggle submission x_kaggle,_,_ = preprocess_data(x_test, degree, compute_mean_std=False, \ x_mean=x_mean, x_std=x_std) y_pred = predict_labels(w, x_kaggle) create_csv_submission(ids_test, y_pred, "run_submission.csv")
def cross_validation(y, tX, gamma, method='logistic_regression'):
    """Four-fold cross validation of one training method.

    @param y: labels
    @param tX: feature matrix of shape (N, D)
    @param gamma: learning rate for the iterative methods
    @param method: 'logistic_regression', 'reg_logistic_regression',
        'least_squares_GD', 'least_squares_SGD', 'least_squares' or
        'ridge_regression'
    @return : the average accuracy over the four fold validations, and the
        list of per-fold accuracies
    @raise Exception: if ``method`` is not one of the names above
    """
    N, D = tX.shape

    # Training parameters
    max_iters = 100

    # Cross validation parameters
    seed = 1
    k_fold = 4
    k_indices = build_k_indices(y, k_fold, seed)

    acc = []
    for k in range(k_fold):
        yTe = y[k_indices[k]]
        xTe = tX[k_indices[k]]

        # The empty float arrays keep the dtype promotion of the original
        # np.append-based accumulation.
        train_folds = [i for i in range(k_fold) if i != k]
        yTr = np.concatenate(
            [np.array([])] + [y[k_indices[i]] for i in train_folds], axis=0)
        xTr = np.concatenate(
            [np.zeros((0, D))] + [tX[k_indices[i]] for i in train_folds],
            axis=0)

        initial_w = np.zeros(tX.shape[1])
        if method == 'logistic_regression':
            # The logistic models start from column-vector weights.
            initial_w = np.zeros((tX.shape[1], 1))
            w, loss = logistic_regression(yTr, xTr, initial_w, max_iters, gamma)
            y_est = sigmoid(np.dot(xTe, w))
            y_label = [0 if i < 0.5 else 1 for i in y_est]
        elif method == 'reg_logistic_regression':
            initial_w = np.zeros((tX.shape[1], 1))
            lambda_ = 0.1
            w, loss = reg_logistic_regression(yTr, xTr, lambda_, initial_w,
                                              max_iters, gamma)
            y_est = sigmoid(np.dot(xTe, w))
            y_label = [0 if i < 0.5 else 1 for i in y_est]
        elif method == 'least_squares_GD':
            w, loss = least_squares_GD(yTr, xTr, initial_w, max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares_SGD':
            w, loss = least_squares_SGD(yTr, xTr, initial_w, max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares':
            w, loss = least_squares(yTr, xTr)
            y_label = predict_labels(w, xTe)
        elif method == 'ridge_regression':
            w, loss = ridge_regression(yTr, xTr, 0.1)
            y_label = predict_labels(w, xTe)
        else:
            raise Exception('Invalid method')

        n_correct = sum(
            1 for ind, label in enumerate(y_label) if label == yTe[ind])
        # Divide by the real size of the test fold: N / k_fold overstates
        # it whenever the folds do not cover N samples exactly.
        n_test = len(yTe)
        acc.append(n_correct / n_test if n_test else 0.0)

    return (sum(acc) / k_fold), acc
print("Starting cross validation for the tx0 dataset")
print("##################################")
# Column 0 of tX0_dropped_distribution is cast to int and used to select the
# matching labels from y (presumably the original row ids -- TODO confirm);
# the remaining columns are the features. The arguments 1 and 16 are passed
# on to cross_validation_demo (presumably the degree range -- TODO confirm).
min_degree0,min_loss0=cross_validation_demo(y[tX0_dropped_distribution[:,0].astype(int)], tX0_dropped_distribution[:,1:],1,16,index_to_be_skewed0)
# Record the min, max, mean, std of the data set resulting from the best weight found so they can be re-applied to the testing set later
min0= np.min(tX0_dropped_distribution[:,1:],axis=0)
max0=np.max(tX0_dropped_distribution[:,1:],axis=0)
# Same preprocessing chain as in cross validation: min-max scaling, then
# log(x + 1) on the selected columns, then expansion to the selected degree
# with normalization.
tx0=min_max_transform(tX0_dropped_distribution[:,1:],min0,max0)
tx0[:,index_to_be_skewed0]= np.log(tx0[:,index_to_be_skewed0]+1)
tx0_norm,mean0,std0=expand_and_normalize_X(tx0,min_degree0)
# Final least-squares fit on the whole tx0 subset
w0,loss0=m.least_squares(y[tX0_dropped_distribution[:,0].astype(int)],tx0_norm)
# Bare expression: has no effect when run as a script (it only displays the
# values when evaluated interactively).
min_degree0,min_loss0,loss0
print("Accuracy of best w found for tx0",accuracy(y[tX0_dropped_distribution[:,0].astype(int)],predict_labels(w0,tx0_norm)))
print("##################################")
print("Starting cross validation for the tx1 dataset")
print("##################################")
# Same selection step for the tx1 subset
min_degree1,min_loss1=cross_validation_demo(y[tX1_dropped_distribution[:,0].astype(int)], tX1_dropped_distribution[:,1:],1,16,index_to_be_skewed1)