Esempio n. 1
0
def cross_validation_ridge(y, x, k_indices, k, lambda_, degree):
    """Average the train/test ridge-regression loss over a k-fold split.

    Each of the first ``k`` rows of ``k_indices`` is used once as the test
    fold; every other index in ``range(len(y))`` forms the training set.
    Features are expanded with ``build_poly(..., degree)`` before fitting.

    Returns:
        (loss_tr, loss_te): the means, over the folds, of the values that
        ``compute_rmse_ridge`` reports on the training and the test data.
    """
    every_index = np.arange(len(y))
    per_fold = []

    for fold in range(k):
        # Held-out fold for testing, all remaining samples for training.
        test_idx = k_indices[fold]
        train_idx = np.setdiff1d(every_index, test_idx)
        targets_te, targets_tr = y[test_idx], y[train_idx]

        # Polynomial feature expansion of both parts.
        poly_te = build_poly(x[test_idx], degree)
        poly_tr = build_poly(x[train_idx], degree)

        # Fit on the training part only.
        w, _ = ridge_regression(targets_tr, poly_tr, lambda_)

        # Evaluate the same weights on both parts.
        fold_tr = compute_rmse_ridge(targets_tr, poly_tr, w, lambda_)
        fold_te = compute_rmse_ridge(targets_te, poly_te, w, lambda_)
        per_fold.append((fold_tr, fold_te))

    # Average the per-fold losses.
    loss_tr = np.mean([pair[0] for pair in per_fold])
    loss_te = np.mean([pair[1] for pair in per_fold])
    return loss_tr, loss_te
Esempio n. 2
0
def ridge_regression_demo(y, tx, lamb, degree):
    """Fit ridge regression on polynomially expanded features and print the loss.

    ``tx`` is expanded with ``im.build_poly`` up to ``degree`` and then
    passed to ``im.ridge_regression`` with regularisation ``lamb``. The
    second value returned by that call is printed under the label
    "Training RMSE".

    Returns:
        (weight, loss) exactly as produced by ``im.ridge_regression``.
    """
    expanded = im.build_poly(tx, degree)
    fit = im.ridge_regression(y, expanded, lamb)
    weight, loss = fit

    report = "Training RMSE={tr:.3f}".format(tr=loss)
    print(report)
    return weight, loss
Esempio n. 3
0
def lambda_cv(tX, y, plot=False):
    """Pick the ridge-regression lambda with the best held-out accuracy.

    The data is split 80/20 with ``split_data`` (fixed seed 1). For each of
    15 log-spaced lambdas in [1e-5, 1e5] a ridge model is fitted on the
    training part and its accuracy is measured on both parts.

    Args:
        tX: feature matrix passed to ``split_data``.
        y: labels passed to ``split_data``.
        plot: when true, plot train/test accuracy against lambda and mark
            the selected value.

    Returns:
        The lambda with the highest test accuracy (the first one in case of
        a tie).
    """
    lambdas = np.logspace(-5, 5, 15)

    tX_tr, y_tr, tX_te, y_te = split_data(tX, y, ratio=0.8, seed=1)

    accs_tr = []
    accs_te = []

    for lambda_ in lambdas:
        w, _ = implementations.ridge_regression(y_tr, tX_tr, lambda_)
        y_pr_tr = predict_labels(w, tX_tr)
        y_pr_te = predict_labels(w, tX_te)
        accs_tr.append(compute_accuracy(y_tr, y_pr_tr))
        accs_te.append(compute_accuracy(y_te, y_pr_te))

    # argmax returns the first index of the maximum. Unlike comparing the
    # Python list against a scalar, it works whether the accuracies are
    # plain floats or NumPy scalars.
    best_idx = int(np.argmax(accs_te))
    best_acc = accs_te[best_idx]
    best_lambda = lambdas[best_idx]

    if plot:
        plt.plot(lambdas, accs_tr, label="Train")
        plt.plot(lambdas, accs_te, label="Test")
        plt.plot(best_lambda, best_acc, "*", label="Best value")
        plt.xlabel("Lambda")
        plt.ylabel("Accuracy")
        plt.legend()
        plt.show()

    return best_lambda
Esempio n. 4
0
def cross_validation(y, x, k_fold, lambda_, degree):
    """
    Return the loss for ridge regression for this given lambda_ and given degree
    Arguments:
    - y: the column of ground truth results
    - x: the features matrix
    - k_fold: the number of fold to do cross validation
    - lambda_: penalizing parameter for ridge regression
    - degree: the degree of the polynomial data augmentation
    Returns:
    - the mean training loss and the mean test loss over the k_fold folds
    """
    k_indices = build_k_indices(y, k_fold, 1)
    Loss_tr = []
    Loss_te = []
    for k in range(k_fold):
        test_idx = k_indices[k]
        # Merge the remaining folds into one flat index vector. Deleting a
        # row from the fold-stacked data would leave the training samples
        # split along an extra leading axis instead of one sample axis.
        train_idx = np.delete(k_indices, k, axis=0).reshape(-1)

        x_test, y_test = x[test_idx], y[test_idx]
        x_train, y_train = x[train_idx], y[train_idx]

        phi_x_train = build_poly(x_train, degree)
        phi_x_test = build_poly(x_test, degree)

        # NOTE(review): the result is unpacked as (loss, weights); confirm
        # this matches the return order of implementations.ridge_regression.
        loss_tr, weights = implementations.ridge_regression(
            y_train, phi_x_train, lambda_)
        loss_te = implementations.compute_mse(y_test, phi_x_test, weights)
        Loss_tr.append(loss_tr)
        Loss_te.append(loss_te)
    Loss_tr = np.array(Loss_tr)
    Loss_te = np.array(Loss_te)

    return Loss_tr.mean(), Loss_te.mean()
Esempio n. 5
0
    def __init__(self, model_name, w=None, learning_param=None, debug=True):
        """Configure the model's output, loss, prediction and learning callables.

        Args:
            model_name: one of 'logistic_regression',
                'reg_logistic_regression', 'least_squares_GD',
                'ridge_regression' or 'least_squares'. Any other value
                leaves the four callables unset.
            w: initial weights, stored as ``self.w``.
            learning_param: mapping read for 'max_iters', 'gamma' and/or
                'lambda_' depending on the chosen model.
            debug: when true a ``debugger.Debugger(['loss', 'w'])`` is
                stored as ``self.dbg``; otherwise ``self.dbg`` is None.
        """
        self.w = w
        self.dbg = debugger.Debugger(['loss', 'w']) if debug else None

        # Depending on the chosen model, pick the appropriate output, loss,
        # prediction and learning functions.
        if model_name == 'logistic_regression':
            self.model_output = misc.lr_output
            self.compute_loss = cost.compute_loss_ce
            self.predict_output = misc.map_prediction

            n_iters = learning_param['max_iters']
            step = learning_param['gamma']

            self.learn = lambda y, x, w, dbg: impl.logistic_regression(
                y, x, w, n_iters, step, dbg)

        elif model_name == 'reg_logistic_regression':
            self.model_output = misc.lr_output
            self.compute_loss = cost.compute_loss_reg_ce
            self.predict_output = misc.map_prediction

            n_iters = learning_param['max_iters']
            step = learning_param['gamma']
            penalty = learning_param['lambda_']

            self.learn = lambda y, x, w, dbg: impl.reg_logistic_regression(
                y, x, penalty, w, n_iters, step, dbg)

        elif model_name == 'least_squares_GD':
            self.model_output = np.dot
            self.compute_loss = cost.compute_loss_ls
            self.predict_output = misc.predict_ls

            n_iters = learning_param['max_iters']
            step = learning_param['gamma']

            self.learn = lambda y, x, w, dbg: impl.least_squares_GD(
                y, x, w, n_iters, step, dbg)

        elif model_name == 'ridge_regression':
            self.model_output = np.dot
            self.compute_loss = cost.compute_loss_ls
            self.predict_output = misc.predict_ls

            penalty = learning_param['lambda_']

            # Closed-form solver: the initial weights and debugger are ignored.
            self.learn = lambda y, x, w, dbg: impl.ridge_regression(
                y, x, penalty)

        elif model_name == 'least_squares':
            self.model_output = np.dot
            self.compute_loss = cost.compute_loss_ls
            self.predict_output = misc.predict_ls

            # Closed-form solver: the initial weights and debugger are ignored.
            self.learn = lambda y, x, w, dbg: impl.least_squares(y, x)
Esempio n. 6
0
def cross_validation_ridge(y, x, k_indices, k, lambda_, degree):
    """Run one fold of k-fold cross validation with ridge regression.

        :param y: outputs/labels, numpy array
        :param x: data samples
        :param k_indices: array whose rows are the index groups of the folds
        :param k: index of the fold to hold out for testing
        :param lambda_: regularization factor (penalty factor)
        :param degree: maximum degree of the polynomial basis
        :return: train loss, test loss, test accuracy, weights
    """
    # Row k is the test fold, all other rows are flattened into the train set.
    fold_ids = np.arange(k_indices.shape[0])
    test_idx = k_indices[k]
    train_idx = k_indices[fold_ids != k].reshape(-1)

    y_test, y_train = y[test_idx], y[train_idx]
    X_test, X_train = x[test_idx], x[train_idx]

    # Polynomial feature expansion.
    poly_train = build_poly(X_train, degree)
    poly_test = build_poly(X_test, degree)

    # Fit on the training fold only.
    w, fit_loss = imp.ridge_regression(y_train, poly_train, lambda_)

    # Losses are passed through calculate_rmse; accuracy is on the test fold.
    loss_train = imp.calculate_rmse(fit_loss)
    test_loss_raw = imp.compute_loss(y_test, poly_test, w)
    loss_test = imp.calculate_rmse(test_loss_raw)
    test_predictions = predict_labels(w, poly_test)
    accuracy = calculate_accuracy(y_test, test_predictions)
    return loss_train, loss_test, accuracy, w
Esempio n. 7
0
def experiment_for_submitting():
    """Fit ridge models on the full training set and log the predicted class balance.

    Train and test features are loaded, stacked and preprocessed together,
    then split back. For every entry of ``preprocessing_options`` a lambda
    is chosen with ``lambda_cv``, a ridge model is fitted, and the number
    of test predictions per class is printed and appended to a results
    table that is finally written to "Submitting experiment.csv".
    """
    y_train, tX_train, ids = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

    np.random.seed(2019)
    results = pd.DataFrame(
        columns=["Preprocessing", "Class -1 count", "Class +1 count"])

    for preprocessing_param in preprocessing_options:
        tX_stacked = np.vstack((tX_train, tX_test))
        # NOTE(review): the loop variable is only printed below; the
        # preprocessing actually applied is this fixed configuration.
        # Confirm whether preprocessing_param was meant to be used here.
        prep_param = {
            "bias": True,
            "fill": True,
            "standardize": False,
            "degree": 11,
            "log": True,
            "root": True
        }
        tX_stacked_prep, _, desc_prep = preprocess_data(
            tX_stacked, None, prep_param)
        tX_train_prep, tX_test_prep = np.split(tX_stacked_prep,
                                               [len(tX_train)])

        lambda_ = lambda_cv(tX_train_prep, y_train)
        print(f"Best lambda: {lambda_}")
        w, _ = ridge_regression(y_train, tX_train_prep, lambda_)

        y_pred = predict_labels(w, tX_test_prep)
        # Count by label value instead of indexing np.unique's output, which
        # mislabels the counts (and raises IndexError) when only one class
        # is predicted. Assumes labels are -1 / +1, as the messages state.
        count_neg = np.sum(y_pred == -1)
        count_pos = np.sum(y_pred == 1)

        print(preprocessing_param,
              f"Class -1: {count_neg}, Class +1: {count_pos}")
        results.loc[len(results)] = (desc_prep, count_neg, count_pos)

    results.to_csv("Submitting experiment.csv", sep=";")
Esempio n. 8
0
def cross_validation(y, x, degree, k, k_indices, method, error, feature_augmentation, hyperparams):
    """Train and evaluate one cross-validation fold with the chosen method.

    Row ``k`` of ``k_indices`` is the test fold and the other rows form the
    training set. Both parts go through ``feature_processing``, optional
    ``feat_augmentation``, ``build_poly`` and ``standardize`` (the test part
    reuses the median, augmentation index, mean and std of the training
    part).

    ``method`` selects the learner: 'rr', 'ls', 'lsGD', 'lsSGD', 'log' or
    'rlog'; anything else raises NotImplementedError. ``hyperparams`` is
    indexed positionally and forwarded to the learner, and its last entry
    is passed as ``suppr_outliers`` for the training part.

    Returns:
        (loss_tr, loss_te, w, acc): train loss, test loss, learned weights
        and the accuracy of the predicted labels on the test fold.
    """
    from helpers_data import feature_processing, feat_augmentation, standardize, build_poly
    from implementations import ridge_regression, least_squares, least_squares_GD, least_squares_SGD, logistic_regression, reg_logistic_regression

    # Fold k is held out; every other fold is flattened into the train set.
    fold_ids = np.arange(k_indices.shape[0])
    test_idx = k_indices[k]
    train_idx = k_indices[fold_ids != k].reshape(-1)

    y_te = y[test_idx]
    y_tr = y[train_idx]
    x_te = x[test_idx]
    x_tr = x[train_idx]

    # The median found on the training part is reused for the test part.
    x_tr, y_tr, median = feature_processing(
        x_tr, y_tr, 'mean', replace_feature=True,
        suppr_outliers=hyperparams[-1], threshold=3, ref_median=[])
    x_te, y_te, _ = feature_processing(
        x_te, y_te, 'mean', replace_feature=True,
        suppr_outliers=False, threshold=3, ref_median=median)

    aug_tr = []
    aug_te = []
    if feature_augmentation:
        aug_tr, index = feat_augmentation(x_tr, 0.003)
        aug_te, _ = feat_augmentation(x_te, 0.003, False, index)

    # Polynomial expansion, then standardisation with training statistics.
    tx_tr = build_poly(x_tr, degree, feature_augmentation, aug_tr)
    tx_te = build_poly(x_te, degree, feature_augmentation, aug_te)
    tx_tr, mean, std = standardize(tx_tr)
    tx_te, _, _ = standardize(tx_te, mean, std)

    # Fit the requested model.
    if method == 'rr':
        # ridge regression
        w, _ = ridge_regression(y_tr, tx_tr, hyperparams[0])
    elif method == 'ls':
        # least squares
        w, _ = least_squares(y_tr, tx_tr)
    elif method == 'lsGD':
        # gradient descent
        w, _ = least_squares_GD(y_tr, tx_tr, hyperparams[0], hyperparams[1],
                                hyperparams[2])
    elif method == 'lsSGD':
        # stochastic gradient descent
        w, _ = least_squares_SGD(y_tr, tx_tr, hyperparams[0], hyperparams[1],
                                 hyperparams[2], hyperparams[3])
    elif method == 'log':
        # logistic regression
        w, _ = logistic_regression(y_tr, tx_tr, hyperparams[0],
                                   hyperparams[1], hyperparams[2])
    elif method == 'rlog':
        # regularised logistic regression, started from zero weights
        w, _ = reg_logistic_regression(y_tr, tx_tr, hyperparams[3],
                                       np.zeros(tx_tr.shape[1]),
                                       hyperparams[1], hyperparams[2])
    else:
        raise NotImplementedError

    # Loss on both parts, using the loss function that matches the learner.
    if method == 'log':
        loss_tr = cal_loglike(y_tr, tx_tr, w)
        loss_te = cal_loglike(y_te, tx_te, w)
    elif method == 'rlog':
        loss_tr = cal_loglike_r(y_tr, tx_tr, w, hyperparams[3])
        loss_te = cal_loglike_r(y_te, tx_te, w, hyperparams[3])
    else:
        loss_tr = compute_loss(y_tr, tx_tr, w, error)
        loss_te = compute_loss(y_te, tx_te, w, error)

    predicted = predict_labels(np.array(w).T, tx_te)
    acc = accuracy(y_te, predicted)

    return loss_tr, loss_te, w, acc
Esempio n. 9
0
def main():
    """Train four ridge models on the prepared subsets and write a submission.

    The training data is loaded and passed to ``prepareData``; its first
    four subsets each get their own ridge-regression model. The test data
    is prepared the same way, each subset is predicted with the matching
    model, and the concatenated predictions are written to OUTPUT_PATH.
    """
    # Model parameters
    degree = 13
    whis = 2.5
    lambda_ = 0.0001
    subsets = range(4)

    print("Loading the training Datas...")
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

    print("Clean and prepare the training datas...")
    y_train, tX_train, ids_train = prepareData(y, tX, ids, degree, whis)

    # One model per subset; only the weights are kept.
    print("Train the models...")
    weights = []
    for part in subsets:
        w_part, _ = ridge_regression(y_train[part], tX_train[part], lambda_)
        weights.append(w_part)

    print("Loading the testing Datas...")
    y_test, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

    # Same preparation as for the training set.
    print("Clean and prepare the testing datas...")
    y_test, tX_test, ids_test = prepareData(y_test, tX_test, ids_test, degree,
                                            whis)

    print("Predict the testing datas...")
    per_subset = [predict_labels(weights[part], tX_test[part])
                  for part in subsets]

    # Concatenate predictions and ids in the same subset order.
    y_pred = np.concatenate(per_subset)
    ids_test = np.concatenate([ids_test[part] for part in subsets])

    print("Writing the results...")
    create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

    print("DONE!, your predictions are available in ", OUTPUT_PATH)
def cross_validation(y,
                     tx,
                     mlfunction,
                     split_number=5,
                     lambda_=1e-6,
                     gamma=0.001):
    """Run ``mlfunction`` with cross validation over ``split_number`` folds.

    Args:
        y: labels.
        tx: feature matrix, indexed by sample along the first axis.
        mlfunction: name of the learner: 'ridge_regression',
            'least_squares', 'logistic_regression',
            'reg_logistic_regression', 'least_squares_sgd' or
            'least_squares_gd'.
        split_number: number of folds (5 by default).
        lambda_: regularisation parameter for the regularised learners.
        gamma: value forwarded to the gradient-based least-squares learners.

    Returns:
        (train_loss, test_loss, train_accuracy, test_accuracy), each
        averaged over all folds, or None (after printing an error) when
        ``mlfunction`` is not recognised.
    """
    # Per-fold train/test losses and accuracies.
    train_loss_ = []
    test_loss_ = []
    train_accuracy_ = []
    test_accuracy_ = []

    k_indices = build_k_indices(len(y), split_number)

    for fold in k_indices:

        # The current fold is the test set, every other sample is training.
        test_idx = np.asarray(fold)
        train_idx = np.delete(np.arange(len(y)), test_idx)

        train_tX = tx[train_idx]
        train_y = y[train_idx]

        test_tX = tx[test_idx]
        test_y = y[test_idx]

        if (mlfunction == 'ridge_regression'):
            w, loss = impl.ridge_regression(train_y, train_tX, lambda_)
        elif (mlfunction == 'least_squares'):
            w, loss = impl.least_squares(train_y, train_tX)
        elif (mlfunction == 'logistic_regression'):
            w, loss = impl.logistic_regression(train_y, train_tX)
        elif (mlfunction == 'reg_logistic_regression'):
            w, loss = impl.reg_logistic_regression(train_y, train_tX, lambda_)

        elif (mlfunction == 'least_squares_sgd'):
            w, loss = impl.least_squares_SGD(train_y, train_tX, gamma)
        elif (mlfunction == 'least_squares_gd'):
            w, loss = impl.least_squares_GD(train_y, train_tX, gamma)
        else:
            print('ERROR: ml_function not recognized')
            print(
                'least_squares, least_squares_gd, least_squares_sgd, logistic_regression, reg_logistic_regression'
            )
            return None

        # Losses and accuracies of this fold.
        train_loss_.append(impl.compute_loss_mse(train_y, train_tX, w))
        test_loss_.append(impl.compute_loss_mse(test_y, test_tX, w))

        # Append rather than overwrite, so the returned means cover every
        # fold and not just the last one.
        train_accuracy_.append(impl.compute_accuracy(train_y, train_tX, w))
        test_accuracy_.append(impl.compute_accuracy(test_y, test_tX, w))

    return np.mean(train_loss_), np.mean(test_loss_), np.mean(
        train_accuracy_), np.mean(test_accuracy_)
Esempio n. 11
0
def cross_validation_ridge(y_train, x_train, num_folds, lambda_, seed=1):
    """Return the validation accuracy of ridge regression for every fold.

    NumPy's global random state is seeded with ``seed`` before the folds
    are produced by ``k_fold_splits``. For each fold a ridge model is
    fitted on the training part, and the score is the fraction of
    validation labels that ``predict_labels`` reproduces.

    Returns:
        A NumPy array with one accuracy per fold.
    """
    np.random.seed(seed)

    def fold_score(x_fit, x_val, y_fit, y_val):
        # Fit on the fold's training part, score on its validation part.
        w, _ = ridge_regression(y_fit, x_fit, lambda_)
        guessed = predict_labels(w, x_val)
        return np.mean(guessed == y_val)

    folds = k_fold_splits(y_train, x_train, num_folds)
    return np.array([
        fold_score(x_fit, x_val, y_fit, y_val)
        for x_fit, x_val, y_fit, y_val in folds
    ])
Esempio n. 12
0
def get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size):
    """Train the requested model and return its learned weights.

    Parameters
    ----------
    model: string
        Model identifier: "MSE_GD", "MSE_SGD", "MSE_OPT", "MSE_OPT_REG",
        "LOG_GD", "LOG_REG_GD", "LOG_REG_L1" or "MSE_GD_L1"
    y: ndarray
        The labels
    tx: ndarray
        The feature matrix
    initial_w: ndarray
        The initial weights (iterative models only)
    max_iters: integer
        The number of steps to run (iterative models only)
    gamma: number
        The step size (iterative models only)
    lambda_: number
        The regularization parameter (regularized models only)
    batch_size: integer
        The batch size (only used by "MSE_SGD")

    Returns
    -------
    ndarray
        The learned weights; the loss reported by the trainer is discarded

    Raises
    ------
    UnknownModel
        If `model` is none of the identifiers above
    """
    if model == "MSE_GD":
        fitted = least_squares_GD(y, tx, initial_w, max_iters, gamma)
    elif model == "MSE_SGD":
        fitted = least_squares_SGD(y, tx, initial_w, batch_size, max_iters,
                                   gamma)
    elif model == "MSE_OPT":
        fitted = least_squares(y, tx)
    elif model == "MSE_OPT_REG":
        fitted = ridge_regression(y, tx, lambda_)
    elif model == "LOG_GD":
        fitted = logistic_regression(y, tx, initial_w, max_iters, gamma)
    elif model == "LOG_REG_GD":
        fitted = reg_logistic_regression(y, tx, lambda_, initial_w, max_iters,
                                         gamma)
    elif model == "LOG_REG_L1":
        fitted = reg_logistic_regression_L1(y, tx, lambda_, initial_w,
                                            max_iters, gamma)
    elif model == "MSE_GD_L1":
        fitted = least_squares_GD_L1(y, tx, lambda_, initial_w, max_iters,
                                     gamma)
    else:
        raise UnknownModel

    # Every trainer yields a (weights, loss) pair; only the weights are kept.
    w, _ = fitted
    return w
Esempio n. 13
0
def solve(tX, y):
    """Fit ridge regression (lambda = 1) on an 80/20 split and score it.

    The split comes from ``split_data`` with seed 2019.

    Returns:
        (acc_tr, acc_te): the values of ``compute_accuracy`` for the
        predicted labels on the training and on the test part.
    """
    tX_tr, y_tr, tX_te, y_te = split_data(tX, y, ratio=0.8, seed=2019)

    w, _ = ridge_regression(y_tr, tX_tr, 1)

    # Predict both parts first, then score them.
    pred_tr, pred_te = [predict_labels(w, part) for part in (tX_tr, tX_te)]
    acc_tr = compute_accuracy(y_tr, pred_tr)
    acc_te = compute_accuracy(y_te, pred_te)

    return acc_tr, acc_te
Esempio n. 14
0
    def run(self, data_y, data_x, data_ids, test_x, test_ids):
        """Preprocess the data, try every hyper-parameter set and predict.

        Depending on the instance flags the training features have their
        -999 columns dropped and/or -999 values replaced, are expanded with
        ``modifiers.build_poly`` to ``self.degree`` and optionally
        standardised. A ridge-regression fit (lambda 0.1) supplies the
        initial weights for ``self._run``, which is called once per
        hyper-parameter set from ``self._obtain_hyper_params()``.

        The result with the smallest error is used for
        ``self._make_predictions``. If even that error is NaN, nothing is
        predicted and the method returns None.
        """
        if self.do_drop_minus_999_features:
            print('Dropping features containing at least one -999 value...',
                  end=' ',
                  flush=True)
            data_x = modifiers.drop_minus_999_features(data_x)
            print('DONE')
        if self.do_eliminate_minus_999:
            print(
                'Eliminating -999 values by setting them to feature median...',
                end=' ',
                flush=True)
            data_x = modifiers.eliminate_minus_999(data_x)
            print('DONE')

        # Build polynomial
        data_x = modifiers.build_poly(data_x, self.degree, True)

        if self.do_std:
            print('Standardising...', end=' ', flush=True)
            data_x = modifiers.standardize(data_x)
            print('DONE')

        # Find a good initial w
        initial_w, _ = impl.ridge_regression(data_y, data_x, lambda_=0.1)

        w_err_hyper_tuples = []  # ((w, err, acc), hyper_params) accumulator
        for hyper_params in self._obtain_hyper_params():
            print('Running with hyper parameters:', end=' ')
            print_dict(hyper_params)
            print()

            result = self._run(data_y, data_x, data_ids, initial_w,
                               **hyper_params)
            w_err_hyper_tuples.append((result, hyper_params))

        def _selection_error(entry):
            # Rank diverged (NaN) runs last. Every comparison with NaN is
            # false, so an unguarded min() would keep a NaN entry whenever
            # it comes first, even if later runs converged.
            run_err = entry[0][1]
            return np.inf if np.isnan(run_err) else run_err

        # Find w that corresponds to minimum error and predict based on that
        (w, err, acc), hyper_params = min(w_err_hyper_tuples,
                                          key=_selection_error)
        print('Found optimal w with error={err}, accuracy={acc}'.format(
            err=err, acc=acc),
              'and hyper parameters:',
              end=' ')
        print_dict(hyper_params)
        print()

        # Only reached with NaN when every run diverged.
        if np.isnan(err):
            print('Error is infinite, computation has probably diverged.',
                  'Abandoning predictions!')
            return

        self._make_predictions(w, test_x, test_ids)
Esempio n. 15
0
def main():
    """Train one ridge model per data subset and write the test predictions.

    Train and test features are preprocessed together, split back, and
    divided into subsets by ``divide_data``. For each subset a lambda is
    chosen with ``lambda_cv`` and a ridge model is fitted. The training
    accuracy and the class balance of the test predictions are printed
    before the submission is written to OUTPUT_PATH.
    """
    y_train, tX_train, ids = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

    np.random.seed(2019)

    # Preprocess data together to have the same shifts while creating log or root features
    tX_stacked = np.vstack((tX_train, tX_test))
    prep_param = {
        "bias": True,
        "fill": True,
        "standardize": False,
        "degree": 8,
        "log": True,
        "root": True
    }
    tX_stacked_prep, *_ = preprocess_data(tX_stacked, None, prep_param)
    tX_train_prep, tX_test_prep = np.split(tX_stacked_prep, [len(tX_train)])

    # Split data according to PRI_jet_num value
    tX_tr_splitted, indices_tr = divide_data(tX_train_prep)
    tX_te_splitted, indices_te = divide_data(tX_test_prep)
    n_models = len(indices_tr)

    y_tr_splitted = [y_train[indices_tr[i]] for i in range(n_models)]

    # Train
    weights = []
    for i in range(n_models):
        lambda_ = lambda_cv(tX_tr_splitted[i], y_tr_splitted[i])
        print(f"Class {i}, lambda: {lambda_}")
        weights.append(
            ridge_regression(y_tr_splitted[i], tX_tr_splitted[i], lambda_)[0])

    # Predict
    y_pr_tr = np.zeros(tX_train.shape[0])
    y_pr_te = np.zeros(tX_test.shape[0])
    for i in range(n_models):
        y_pr_tr[indices_tr[i]] = predict_labels(weights[i], tX_tr_splitted[i])
        y_pr_te[indices_te[i]] = predict_labels(weights[i], tX_te_splitted[i])

    acc_tr = compute_accuracy(y_train, y_pr_tr)
    print(f"Total accuracy train: {acc_tr}")

    # Count by label value instead of indexing np.unique's output, which
    # raises IndexError (before the submission is written) when only one
    # label is predicted. Assumes labels are -1 / +1, as the message states.
    count_neg = np.sum(y_pr_te == -1)
    count_pos = np.sum(y_pr_te == 1)
    print(
        f"Distribution on test data class -1: {count_neg}, class +1: {count_pos}"
    )

    create_csv_submission(ids_test, y_pr_te, OUTPUT_PATH)
Esempio n. 16
0
def cross_validation_rr(y, x, k_indices, k, lambda_, degree):
    """Fit ridge regression on all folds but ``k`` and evaluate on fold ``k``.

    Returns:
        (loss_tr, loss_te): the loss reported by ``imp.ridge_regression``
        on the training data and ``imp.compute_mse`` on the held-out fold.
    """
    held_out = k_indices[k]

    # Held-out rows are the test set; deleting them leaves the training set.
    x_test = x[held_out]
    x_train = np.delete(x, [held_out], axis=0)
    y_test = y[held_out]
    y_train = np.delete(y, [held_out], axis=0)

    poly_train = helpers.build_poly(x_train, degree)
    poly_test = helpers.build_poly(x_test, degree)

    w, loss_tr = imp.ridge_regression(y_train, poly_train, lambda_)
    loss_te = imp.compute_mse(y_test, poly_test, w)

    return loss_tr, loss_te
Esempio n. 17
0
def test_ridge_regression(y_train, tx_train, y_test, tx_test):
    """
    Fits ridge_regression (lambda = 1e-08) on the training split and
    reports the prediction accuracy on the test split.
    Args:
        y_train: training labels after the splitting
        tx_train: training features after the splitting
        y_test: test labels after the splitting
        tx_test: test features after the splitting
    """
    print('\nTesting ridge_regression...')
    fitted = ridge_regression(y_train, tx_train, 1e-08)
    weights, _ = fitted
    report_prediction_accuracy(y_test, tx_test, weights)
    print('... testing completed.')
Esempio n. 18
0
def train_3models(tX, y):
    """Train one ridge model per data subset and print train/test accuracies.

    The data is preprocessed, split 80/20 (seed 2019) and divided into
    subsets by ``divide_data``. Each subset gets its own lambda (from
    ``lambda_cv``) and ridge model. Overall and per-subset accuracies are
    printed; nothing is returned.
    """
    prep_param = {
        "bias": True,
        "fill": True,
        "standardize": False,
        "degree": 8,
        "log": True,
        "root": True
    }
    tX_new, y_new, _ = preprocess_data(tX, y, prep_param)

    tX_tr, y_tr, tX_te, y_te = split_data(tX_new, y_new, ratio=0.8, seed=2019)

    # Split data according to PRI_jet_num value
    tX_tr_splitted, indices_tr = divide_data(tX_tr)
    tX_te_splitted, indices_te = divide_data(tX_te)
    n_models = len(tX_tr_splitted)

    # Labels of each training subset (the subset shapes are printed as well).
    y_tr_splitted = []
    for part in range(len(indices_tr)):
        y_tr_splitted.append(y_tr[indices_tr[part]])
        print(tX_tr_splitted[part].shape)

    # Train one model per subset, keeping only the weights.
    weights = []
    for part in range(n_models):
        part_x = tX_tr_splitted[part]
        part_y = y_tr_splitted[part]
        lambda_ = lambda_cv(part_x, part_y)
        print(f"Class {part}, lambda: {lambda_}")
        part_w = ridge_regression(part_y, part_x, lambda_)[0]
        weights.append(part_w)
        print(len(weights[-1]))

    # Write each subset's predictions back to its original positions.
    y_pr_tr = np.zeros(y_tr.shape)
    y_pr_te = np.zeros(y_te.shape)
    for part in range(n_models):
        y_pr_tr[indices_tr[part]] = predict_labels(weights[part],
                                                   tX_tr_splitted[part])
        y_pr_te[indices_te[part]] = predict_labels(weights[part],
                                                   tX_te_splitted[part])

    # Overall accuracy
    acc_tr = compute_accuracy(y_tr, y_pr_tr)
    acc_te = compute_accuracy(y_te, y_pr_te)
    print(f"Total accuracy tr: {acc_tr}, te: {acc_te}")

    # Per-subset accuracy
    for part in range(n_models):
        rows_tr = indices_tr[part]
        rows_te = indices_te[part]
        acc_tr = compute_accuracy(y_tr[rows_tr], y_pr_tr[rows_tr])
        acc_te = compute_accuracy(y_te[rows_te], y_pr_te[rows_te])
        print(f"Class {part}, Accuracy tr: {acc_tr}, te: {acc_te}")
Esempio n. 19
0
def cross_validation_ridge_regression(y, x, k_indices, k, lambdas, degrees):
    """
    Runs one fold of k-fold cross-validation with ridge regression.
    Fold k is the test set and the remaining folds are the training set.
    Both are divided into subsets with the masks from get_jet_masks; each
    subset gets its own polynomial degree (degrees[idx]) and
    regularisation (lambdas[idx]).
    Returns the accuracy on the training data and on the test data.
    """
    # Row k is the test fold; the other rows are flattened into the train set.
    test_rows = k_indices[k]
    train_rows = np.delete(k_indices, (k), axis=0).ravel()

    x_train_full = x[train_rows, :]
    x_test_full = x[test_rows, :]
    y_train_full = y[train_rows]
    y_test_full = y[test_rows]

    # One mask per subset, for the train and for the test part.
    train_masks = get_jet_masks(x_train_full)
    test_masks = get_jet_masks(x_test_full)

    # Predictions are written back into place subset by subset.
    y_train_pred = np.zeros(len(y_train_full))
    y_test_pred = np.zeros(len(y_test_full))

    for idx, train_mask in enumerate(train_masks):
        test_mask = test_masks[idx]
        x_train = x_train_full[train_mask]
        x_test = x_test_full[test_mask]
        y_train = y_train_full[train_mask]

        # Pre-processing, polynomial expansion, then a constant column.
        x_train, x_test = process_data(x_train, x_test, False)

        phi_train = build_poly(x_train, degrees[idx])
        phi_test = build_poly(x_test, degrees[idx])

        phi_train = add_constant_column(phi_train)
        phi_test = add_constant_column(phi_test)

        # Fit this subset's model.
        weights, loss = ridge_regression(y=y_train, tx=phi_train,
                                         lambda_=lambdas[idx])

        y_train_pred[train_mask] = predict_labels(weights, phi_train)
        y_test_pred[test_mask] = predict_labels(weights, phi_test)

    # Accuracy over all subsets together.
    acc_train = compute_accuracy(y_train_pred, y_train_full)
    acc_test = compute_accuracy(y_test_pred, y_test_full)

    return acc_train, acc_test
Esempio n. 20
0
def cross_validation_ridge(y, x, k_indices, k, lambda_):
    """Performs one iteration of the k-fold cross validation using ridge regression.

    Fold ``k`` is the validation set and the remaining folds are the
    training set; both go through ``prepare_for_training`` independently.

    Returns:
        (w, loss_tr, loss_val, acc): the weights, the training loss from
        ``ridge_regression``, the validation MSE plus the penalty term
        ``2 * lambda_ * ||w||^2``, and the validation accuracy.
    """
    fold_ids = np.arange(len(k_indices))
    val_idx = k_indices[k]
    train_idx = k_indices[fold_ids != k].reshape(-1)

    x_val, y_val = x[val_idx], y[val_idx]
    x_train, y_train = x[train_idx], y[train_idx]

    x_val, y_val = prepare_for_training(x_val, y_val, logistic=False)
    x_train, y_train = prepare_for_training(x_train, y_train, logistic=False)

    w, loss_tr = ridge_regression(y_train, x_train, lambda_)

    # Validation loss is the MSE plus the regularisation penalty.
    penalty = 2 * lambda_ * np.linalg.norm(w)**2
    loss_val = compute_mse_loss(y_val, x_val, w) + (penalty)

    acc = compute_accuracy(y_val, x_val, w)
    return w, loss_tr, loss_val, acc
Esempio n. 21
0
def ridge_regression_demo(y, x, degree, k_fold):
    """Select lambda by k-fold cross validation and report the final fit.

    Twenty log-spaced lambdas between 10**-1.1 and 10**-0.8 are scored with
    ``cross_validation_rr``. The train/test RMSE curves are passed to
    ``helpers.visualization``, and the lambda with the lowest test RMSE is
    used to refit on all the data. The chosen lambda, the loss returned by
    that fit and the training accuracy are printed; nothing is returned.
    """
    seed = 1
    lambdas = np.logspace(-1.1, -0.8, 20)

    # Fold assignment shared by every lambda.
    k_indices = helpers.build_k_indices(y, k_fold, seed)

    # One RMSE value per lambda, for the training and the test folds.
    rmse_tr = []
    rmse_te = []

    for candidate in lambdas:
        fold_errors = [
            cross_validation_rr(y, x, k_indices, fold, candidate, degree)
            for fold in range(k_fold)
        ]
        total_tr = sum(pair[0] for pair in fold_errors)
        total_te = sum(pair[1] for pair in fold_errors)
        rmse_tr.append(np.sqrt(2 * total_tr / k_fold))
        rmse_te.append(np.sqrt(2 * total_te / k_fold))
    helpers.visualization(lambdas, rmse_tr, rmse_te)

    # First index with the smallest test RMSE.
    best_index = min(range(len(rmse_te)), key=lambda pos: rmse_te[pos])
    lambda_opt = lambdas[best_index]

    # Refit on the full data set with the selected lambda.
    x_poly = helpers.build_poly(x, degree)
    w_opt, mse = imp.ridge_regression(y, x_poly, lambda_opt)

    print("   lambda={l:.3f}, mse={mse:.3f}".format(mse = mse, l = lambda_opt))

    # Training accuracy
    y_predicted = helpers.predict_labels(w_opt.T, x_poly)
    hits = list(y == y_predicted.flatten())
    accuracy = (hits.count(True))/len(y)
    print("   accuracy={acc:.3f}".format(acc = accuracy))
Esempio n. 22
0
def cross_validation(y, x, k_indices, k, lambda_, degree):
    """Run one cross-validation fold of ridge regression on polynomial features.

    Row ``k`` of ``k_indices`` is the test fold and all other rows form the
    training set.

    Returns:
        (loss_tr, loss_te, w): the fraction of misclassified training
        labels, the fraction of misclassified test labels, and the weights.
    """
    fold_ids = np.arange(k_indices.shape[0])
    test_idx = k_indices[k]
    train_idx = k_indices[fold_ids != k].reshape(-1)

    y_te = y[test_idx]
    y_tr = y[train_idx]
    x_te = x[test_idx]
    x_tr = x[train_idx]

    poly_tr = build_poly(x_tr, degree)
    poly_te = build_poly(x_te, degree)

    w, _ = ridge_regression(y_tr, poly_tr, lambda_)

    guessed_tr = predict_labels(w, poly_tr)
    guessed_te = predict_labels(w, poly_te)

    # The loss is the misclassification rate.
    wrong_tr = guessed_tr != y_tr
    wrong_te = guessed_te != y_te
    loss_tr = sum(wrong_tr) / len(y_tr)
    loss_te = sum(wrong_te) / len(y_te)

    return loss_tr, loss_te, w
def find_optimal_w(tX, y, implementation, log_initial_w, log_max_iters,
                   log_gamma, decreasing_gamma, log_regulator, ridge_lambda):
    """
    Train the selected implementation and return its weights

    Parameters
    ----------

    tX: array
        The feature matrices
    y: array
        The output
    implementation: integer
        0 for least squares, 1 for ridge regression, 2 for regularized
        logistic regression; any other value trains nothing
    log_initial_w: array
        initial weights for regularized logistic regression
    log_max_iters: integer
        number of iterations for regularized logistic regression
    log_gamma: float
        gamma parameter for regularized logistic regression
    decreasing_gamma:
        forwarded unchanged as the last argument of
        impl.reg_logistic_regression
    log_regulator: float
        lambda to perform logistic regression
    ridge_lambda: float
        lambda to perform ridge regression

    Return
    ------

    optimal_w = array
        Learned weights, or None for an unknown implementation.

    """
    if implementation == 0:
        fitted = impl.least_squares(y, tX)
    elif implementation == 1:
        fitted = impl.ridge_regression(y, tX, ridge_lambda)
    elif implementation == 2:
        fitted = impl.reg_logistic_regression(y, tX, log_regulator,
                                              log_initial_w, log_max_iters,
                                              log_gamma, decreasing_gamma)
    else:
        # Unknown selector: nothing is trained.
        return None

    # Each trainer yields a (weights, loss) pair; only the weights are kept.
    optimal_w, _ = fitted
    return optimal_w
Esempio n. 24
0
def learn(predictions, ids_predicted, y_train_jets, tx_train_jets,
          tx_test_jets, ids_test_jets, lambda_best_jets, degree_best_jets):
    """Train one ridge model per jet number and collect the test predictions.

    For each jet number 0..3 the training features are transformed with
    ``feature_engineering`` using that jet's degree, and a ridge model is
    fitted with that jet's lambda. The predicted labels for the matching
    test subset are appended to ``predictions`` and its ids to
    ``ids_predicted`` (both lists are modified in place). The training
    accuracy is reported along the way; nothing is returned.
    """
    print('\nLearning by ridge regression...')
    for jet_num in range(4):
        print('\nLearning from training set with jet number ', str(jet_num),
              ' using optimal hyperparameters...')
        degree = degree_best_jets[jet_num]
        # Third argument of feature_engineering: true only for jets 2 and 3.
        high_jet = jet_num > 1

        # Fit on this jet's training subset.
        y_train, tx_train = y_train_jets[jet_num], tx_train_jets[jet_num]
        tx_train = feature_engineering(tx_train, degree, high_jet)
        w_best, _ = ridge_regression(y_train, tx_train,
                                     lambda_best_jets[jet_num])

        # Predict this jet's test subset with the same feature pipeline.
        tx_test, ids_test = tx_test_jets[jet_num], ids_test_jets[jet_num]
        tx_test = feature_engineering(tx_test, degree, high_jet)
        predictions.append(predict_labels(w_best, tx_test))
        ids_predicted.append(ids_test)

        print('\nReporting prediction accuracy for the training set... \n')
        report_prediction_accuracy(y_train, tx_train, w_best)
        print('\n... this gives a rough idea about the training success.')
        print('\n... predicted labels for test set with jet number ',
              str(jet_num))
    print('\n... ,predicted labels for each test set.')
Esempio n. 25
0
def cross_validation(y,
                     augmented_tx,
                     k_indices,
                     k,
                     lambda_,
                     report_predictions=False):
    """
    Run a single cross-validation fold with ridge regression.

    The rows listed in ``k_indices[k]`` are held out; the model is fit on all
    remaining rows and then evaluated on the held-out ones.
    :param y: label data
    :param augmented_tx: augmented features
    :param k_indices: An array of k sub-indices that are randomly partitioned
    :param k: position in k_indices of the partition held out for testing
    :param lambda_: regularization parameter handed to ridge_regression
    :param report_predictions: accepted for interface compatibility; the body
        does not read it (False is always passed to the accuracy helper)
    :return: compute_rmse applied to the training loss, and the value returned
        by report_prediction_accuracy for the held-out rows
    """
    held_out = k_indices[k]

    # Everything not held out is used for fitting.
    fit_labels = np.delete(y, held_out)
    fit_features = np.delete(augmented_tx, held_out, axis=0)
    weights, fit_loss = ridge_regression(fit_labels, fit_features, lambda_)

    # Score the fitted weights on the held-out partition.
    held_out_score = report_prediction_accuracy(y[held_out],
                                                augmented_tx[held_out],
                                                weights, False)
    return compute_rmse(fit_loss), held_out_score
Esempio n. 26
0
def cross_validation(y, augmented_tx, k_indices, k, lambda_, report_predictions = False):
    """
    Evaluates ridge regression on one held-out partition of the data.
    Args:
        y: labels
        augmented_tx: augmented features
        k_indices: an array of k sub-indices that are randomly partitioned
        k: which entry of k_indices is used as the test set
        lambda_: regularization parameter for the ridge regression
        report_predictions: not read by the body; False is always forwarded
            to report_prediction_accuracy
    Returns:
        rmse_training: compute_rmse applied to the loss returned by
            ridge_regression on the training rows
        pred: value returned by report_prediction_accuracy for the test rows
    """
    test_rows = k_indices[k]

    # Fit on every row that is not part of the selected partition.
    w, loss_training = ridge_regression(
        np.delete(y, test_rows),
        np.delete(augmented_tx, test_rows, axis = 0),
        lambda_,
    )

    # The second return value is the helper's prediction score on the test
    # rows; a test RMSE is deliberately not computed here.
    pred = report_prediction_accuracy(y[test_rows], augmented_tx[test_rows], w, False)
    rmse_training = compute_rmse(loss_training)
    return rmse_training, pred
Esempio n. 27
0
def regress(x, y, lamb=0):
    """
    Return only the weights produced by ridge_regression.

    Note the argument order: this wrapper takes (x, y) while ridge_regression
    is called with (y, x, lamb). The second value returned by
    ridge_regression is discarded.
    """
    weights, _discarded = ridge_regression(y, x, lamb)
    return weights
Esempio n. 28
0
# 4-fold cross validation with a fixed seed. The helper is defined elsewhere;
# judging by the names this is the logistic-regression variant and the result
# is an average test accuracy -- TODO confirm against cross_validation_LR.
avg_test_accuracy_LR = cross_validation_LR(X_train, y_train, k_fold=4, seed=1)

# Cross validation over both gamma and lambda
# (g and l presumably hold the selected gamma and lambda -- verify in helper).
g, l, avg_test_accuracy_RLR = cross_validation_RLR(X_train, y_train, k_fold=4, seed=1)

#%% Testing functions
# The seed call below is commented out, so the random initial weights drawn
# in this cell are not reseeded here.
#np.random.seed(42)

# Hyperparameters shared by the iterative / regularized calls below.
gamma = 0.2
lambda_ = 4E-5

# Each call below rebinds `w` and `loss`; only the result of the last call
# (reg_logistic_regression) is still bound when the plotting and prediction
# code further down runs.
w, loss = least_squares(y = y_train, tx = X_train)
#
w, loss = least_squares_SGD(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma)
#
w, loss = ridge_regression(y = y_train, tx = X_train, lambda_ = lambda_)
#
w, loss = logistic_regression(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*10, max_iters = 125000, gamma = gamma)
#
w, loss = reg_logistic_regression(y = y_train, tx = X_train, lambda_ = lambda_, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma)

# Visual check of the final weight vector.
plt.plot(w)

#%% Predictive step
# Raw linear scores X_test @ w (no thresholding applied here).
y_test = X_test @ w

# Distribution of the raw scores.
plt.hist(y_test, bins=200)

# Labels derived from the same weights by the predict_labels helper.
y_pred = predict_labels(w, X_test)

#%% Create submission
Esempio n. 29
0
    # NOTE(review): this is the body of a loop whose header is not part of
    # this excerpt; `i`, `degree`, `lambda_`, `y_prediction_test` and the
    # accumulators are expected to be defined before this point.
    # Get train and test data
    # (row selections for group i, taken from the two sample lookups)
    train_index = jet_train_samples[i]
    test_index = jet_test_samples[i]

    x_tr, y_tr = x_train[train_index], y_train[train_index]
    x_te, y_te = x_test[test_index], y_test[test_index]

    # Clean train and test data
    x_tr, x_te = clean_data(x_tr, x_te)

    # Build polynomial data
    # (augment_data returns both features and labels, so y is rebound too)
    x_tr, y_tr = augment_data(x_tr, y_tr, degree)
    x_te, y_te = augment_data(x_te, y_te, degree)

    # Train model
    weights, loss = ridge_regression(y_tr, x_tr, lambda_)
    # Both metrics are evaluated on the training rows (y_tr, x_tr).
    accuracy = predict_accuracy(y_tr, x_tr, weights)
    f1_score = compute_f1_score(y_tr, x_tr, weights)
    # Write this group's predicted labels into the matching test positions.
    y_prediction_test[test_index] = predict_labels(weights, x_te)
    print("  Accuracy = {acc} \n  F1-score = {f1} \n".format(acc=accuracy,
                                                             f1=f1_score))
    # Accumulate the metrics weighted by the number of selected training rows.
    mean_accuracy += train_index.shape[0] * accuracy
    mean_f1_score += train_index.shape[0] * f1_score

# Turn the row-count-weighted sums accumulated above into averages by
# dividing by the total number of rows in x_train.
mean_accuracy /= x_train.shape[0]
mean_f1_score /= x_train.shape[0]
print("Final accuracy = {acc} \nFinal F1-score = {f1} \n".format(
    acc=mean_accuracy, f1=mean_f1_score))

# Save output for submission
OUTPUT_PATH = "../data/submission.csv"
Esempio n. 30
0
def cross_validation(x,
                     y,
                     k,
                     mode,
                     gamma=None,
                     lambda_=None,
                     max_iters=None,
                     initial_w=None):
    """
    Run k-fold cross validation for one of the supported training methods.

    The data is cut into k consecutive chunks (no shuffling). Each chunk is
    used once as validation set while the model is fit on the other chunks.

    INPUT:
    @x : input data, dimensions (NxD)
    @y : target labels, (Nx1) array; a sample is positive when its label == 1
    @k : number of folds
    @mode : one of 'linear_regression_eq', 'ridge_regression_eq',
            'linear_regression_GD', 'linear_regression_SGD',
            'logistic_regression', 'reg_logistic_regression'
    @gamma, @max_iters, @initial_w : forwarded to the iterative methods
    @lambda_ : forwarded to ridge regression
    OUTPUT:
    acc, tpr, fpr, losses : four lists with one entry per fold (validation
    accuracy, true-positive rate, false-positive rate, and the loss value
    returned by the training routine)
    RAISES:
    ValueError when mode is not one of the supported names
    """
    linear_modes = ('linear_regression_eq', 'ridge_regression_eq',
                    'linear_regression_GD', 'linear_regression_SGD')
    logistic_modes = ('logistic_regression', 'reg_logistic_regression')
    # Fail early with a clear message instead of an UnboundLocalError later.
    if mode not in linear_modes + logistic_modes:
        raise ValueError("unsupported mode: {!r}".format(mode))

    x_split = np.array_split(x, k, axis=0)
    y_split = np.array_split(y, k, axis=0)

    acc = []
    tpr = []
    fpr = []
    losses = []

    for fold in range(k):
        # Train on every chunk except the current one.
        x_train = np.concatenate(
            [part for i, part in enumerate(x_split) if i != fold], axis=0)
        y_train = np.concatenate(
            [part for i, part in enumerate(y_split) if i != fold], axis=0)
        x_val = x_split[fold]
        y_val = y_split[fold]

        if mode == 'linear_regression_eq':
            update, loss = imp.least_squares(y_train, x_train)
        elif mode == 'ridge_regression_eq':
            update, loss = imp.ridge_regression(y_train, x_train, lambda_)
        elif mode == 'linear_regression_GD':
            update, loss = imp.least_squares_GD(y_train, x_train, initial_w,
                                                max_iters, gamma)
        elif mode == 'linear_regression_SGD':
            update, loss = imp.least_squares_SGD(y_train, x_train, initial_w,
                                                 max_iters, gamma)
        elif mode == 'logistic_regression':
            update, loss = imp.logistic_regression(y_train, x_train, initial_w,
                                                   max_iters, gamma)
        else:
            # NOTE(review): lambda_ is not forwarded here; confirm against
            # the signature of imp.reg_logistic_regression.
            update, loss = imp.reg_logistic_regression(y_train, x_train,
                                                       initial_w, max_iters,
                                                       gamma)
        losses.append(loss)

        predictions = np.dot(x_val, update)
        if mode in logistic_modes:
            # Logistic models: decide on the predicted probability. This
            # decision used to be overwritten by the mean threshold below.
            pr_bool = H.sigmoid(predictions) > 0.5
        else:
            # Linear models: threshold at the mean of the validation scores.
            pr_bool = predictions >= np.mean(predictions)

        y_bool = y_val == 1
        correct = pr_bool == y_bool
        # True positive: correct and actually positive.
        tp = np.logical_and(correct, y_bool)
        # False positive: wrong and predicted positive.
        fp = np.logical_and(np.logical_not(correct), pr_bool)

        acc.append(sum(correct) / float(len(y_val)))
        tpr.append(sum(tp) / float(sum(y_bool)))
        fpr.append(sum(fp) / float(sum(np.logical_not(y_bool))))
    return acc, tpr, fpr, losses