Exemple #1
0
    def __init__(self, model_name, w=None, learning_param=None, debug=True):
        """Bind the output, loss, prediction and learning functions of a model.

        Parameters
        ----------
        model_name : str
            One of 'logistic_regression', 'reg_logistic_regression',
            'least_squares_GD', 'ridge_regression' or 'least_squares'.
        w : optional
            Initial weights; stored unchanged in ``self.w``.
        learning_param : dict, optional
            Hyper-parameters looked up by key: 'max_iters' and 'gamma' for
            the iterative models, 'lambda_' for the regularised ones.  It is
            not read for 'least_squares'.
        debug : bool
            When true, ``self.dbg`` is a ``debugger.Debugger`` created with
            the field names 'loss' and 'w'; otherwise it is ``None``.

        Raises
        ------
        ValueError
            If ``model_name`` is not one of the supported names.  Previously
            such an object was created without ``learn``/``compute_loss``
            and only failed later with an AttributeError.
        """
        # Set weights
        self.w = w

        # Set debug object
        self.dbg = debugger.Debugger(['loss', 'w']) if debug else None

        # Depending on the chosen model, pick the appropriate output, loss,
        # prediction and learning functions.  Every ``learn`` callable shares
        # the signature (y, x, w, dbg) so callers need not know the model.
        if model_name == 'logistic_regression':
            self.model_output = misc.lr_output
            self.compute_loss = cost.compute_loss_ce
            self.predict_output = misc.map_prediction

            max_iters = learning_param['max_iters']
            gamma = learning_param['gamma']

            self.learn = lambda y, x, w, dbg: impl.logistic_regression(y, x, w, max_iters, gamma, dbg)

        elif model_name == 'reg_logistic_regression':
            self.model_output = misc.lr_output
            self.compute_loss = cost.compute_loss_reg_ce
            self.predict_output = misc.map_prediction

            max_iters = learning_param['max_iters']
            gamma = learning_param['gamma']
            lambda_ = learning_param['lambda_']

            self.learn = lambda y, x, w, dbg: impl.reg_logistic_regression(y, x, lambda_, w, max_iters, gamma, dbg)

        elif model_name == 'least_squares_GD':
            self.model_output = np.dot
            self.compute_loss = cost.compute_loss_ls
            self.predict_output = misc.predict_ls

            max_iters = learning_param['max_iters']
            gamma = learning_param['gamma']

            self.learn = lambda y, x, w, dbg: impl.least_squares_GD(y, x, w, max_iters, gamma, dbg)

        elif model_name == 'ridge_regression':
            self.model_output = np.dot
            self.compute_loss = cost.compute_loss_ls
            self.predict_output = misc.predict_ls

            lambda_ = learning_param['lambda_']

            # Closed-form solver: the initial weights and debugger are ignored.
            self.learn = lambda y, x, w, dbg: impl.ridge_regression(y, x, lambda_)

        elif model_name == 'least_squares':
            self.model_output = np.dot
            self.compute_loss = cost.compute_loss_ls
            self.predict_output = misc.predict_ls

            # Closed-form solver: the initial weights and debugger are ignored.
            self.learn = lambda y, x, w, dbg: impl.least_squares(y, x)

        else:
            raise ValueError("unknown model_name: {!r}".format(model_name))
Exemple #2
0
def cross_validation(y, x, degree, k, k_indices,method, error, feature_augmentation, hyperparams):
    """Train on all folds but the k-th and evaluate on the k-th fold.

    Both subsets go through ``feature_processing``, optional
    ``feat_augmentation``, ``build_poly`` and ``standardize``; the held-out
    subset reuses the median, augmentation index, mean and std returned for
    the training subset.

    ``method`` picks the learner: 'rr', 'ls', 'lsGD', 'lsSGD', 'log' or
    'rlog'; any other value raises ``NotImplementedError`` (after the
    preprocessing has run).  ``hyperparams`` is read positionally by each
    learner, and its last entry is forwarded as ``suppr_outliers`` when
    processing the training subset.

    Returns the tuple ``(loss_tr, loss_te, w, acc)``.
    """
    from helpers_data import feature_processing, feat_augmentation, standardize, build_poly
    from implementations import ridge_regression, least_squares, least_squares_GD, least_squares_SGD, logistic_regression, reg_logistic_regression

    # Row indices of the held-out fold and of every other fold (flattened).
    held_out = k_indices[k]
    other_folds = np.arange(k_indices.shape[0]) != k
    kept = k_indices[other_folds].reshape(-1)

    y_te, y_tr = y[held_out], y[kept]
    x_te, x_tr = x[held_out], x[kept]

    x_tr, y_tr, median = feature_processing(
        x_tr, y_tr, 'mean', replace_feature=True,
        suppr_outliers=hyperparams[-1], threshold=3, ref_median=[])
    x_te, y_te, _ = feature_processing(
        x_te, y_te, 'mean', replace_feature=True,
        suppr_outliers=False, threshold=3, ref_median=median)

    aug_tr, aug_te = [], []
    if feature_augmentation:
        aug_tr, selected = feat_augmentation(x_tr, 0.003)
        aug_te, _ = feat_augmentation(x_te, 0.003, False, selected)

    # Polynomial expansion, then standardisation with the training statistics.
    tx_tr = build_poly(x_tr, degree, feature_augmentation, aug_tr)
    tx_te = build_poly(x_te, degree, feature_augmentation, aug_te)
    tx_tr, mean, std = standardize(tx_tr)
    tx_te, _, _ = standardize(tx_te, mean, std)

    if method == 'rr':
        # ridge regression
        w, _ = ridge_regression(y_tr, tx_tr, hyperparams[0])
    elif method == 'ls':
        # least squares
        w, _ = least_squares(y_tr, tx_tr)
    elif method == 'lsGD':
        # gradient descent
        w, _ = least_squares_GD(y_tr, tx_tr, hyperparams[0], hyperparams[1], hyperparams[2])
    elif method == 'lsSGD':
        # stochastic gradient descent
        w, _ = least_squares_SGD(y_tr, tx_tr, hyperparams[0], hyperparams[1], hyperparams[2], hyperparams[3])
    elif method == 'log':
        # logistic regression
        w, _ = logistic_regression(y_tr, tx_tr, hyperparams[0], hyperparams[1], hyperparams[2])
    elif method == 'rlog':
        # regularised logistic regression, always started from zero weights
        w, _ = reg_logistic_regression(y_tr, tx_tr, hyperparams[3], np.zeros(tx_tr.shape[1]), hyperparams[1], hyperparams[2])
    else:
        raise NotImplementedError

    # The logistic learners have their own loss helpers; everything else
    # goes through compute_loss with the requested error type.
    if method == 'log':
        loss_tr = cal_loglike(y_tr, tx_tr, w)
        loss_te = cal_loglike(y_te, tx_te, w)
    elif method == 'rlog':
        loss_tr = cal_loglike_r(y_tr, tx_tr, w, hyperparams[3])
        loss_te = cal_loglike_r(y_te, tx_te, w, hyperparams[3])
    else:
        loss_tr = compute_loss(y_tr, tx_tr, w, error)
        loss_te = compute_loss(y_te, tx_te, w, error)

    predicted = predict_labels(np.array(w).T, tx_te)
    acc = accuracy(y_te, predicted)

    return loss_tr, loss_te, w, acc
Exemple #3
0
def pipeline(tx_train, y_train, tx_val, y_val, degrees, gamma, lambda_, epochs,
             verbose):
    """Clean, expand, train and score one train/validation split.

    The cleaner and the feature generator are fitted on the training
    features only and then applied to the validation features.  A
    regularised logistic regression is trained from zero weights for
    ``epochs`` iterations with step ``gamma`` and penalty ``lambda_``.

    Returns ``(f1, accuracy)`` as reported by ``Evaluation`` on the
    validation labels and predictions.
    """
    # Data cleaning, fitted on the training split.
    cleaner = DataCleaning()
    cleaned_train = cleaner.fit_transform(tx_train)
    cleaned_val = cleaner.transform(tx_val)

    # Feature engineering, fitted on the training split.
    engineer = FeatureEngineering()
    x_train = engineer.fit_transform(tx=cleaned_train, degree=degrees)
    x_val = engineer.transform(tx=cleaned_val)

    # Training starts from an all-zero weight vector.
    start_w = np.zeros(x_train.shape[1])
    w, _ = reg_logistic_regression(y_train, x_train, lambda_, start_w, epochs,
                                   gamma, verbose)

    # Inference and scoring on the validation split.
    pred = predict_labels(weights=w, data=x_val, logistic=True)
    evaluator = Evaluation(y_val, pred)
    f1 = evaluator.get_f1()
    acc = evaluator.get_accuracy()
    return f1, acc
def cross_validation(y,
                     tx,
                     mlfunction,
                     split_number=5,
                     lambda_=1e-6,
                     gamma=0.001):
    """Cross-validate one of the ``impl`` learners over ``split_number`` folds.

    Parameters
    ----------
    y, tx :
        Labels and feature matrix; both are indexed with integer row arrays.
    mlfunction : str
        'ridge_regression', 'least_squares', 'logistic_regression',
        'reg_logistic_regression', 'least_squares_sgd' or 'least_squares_gd'.
    split_number : int
        Number of folds requested from ``build_k_indices``.
    lambda_ : float
        Penalty forwarded to the ridge / regularised logistic learners.
    gamma : float
        Step size forwarded to the (S)GD least-squares learners.

    Returns
    -------
    tuple or None
        ``(mean train loss, mean test loss, mean train accuracy,
        mean test accuracy)`` averaged over the folds, or ``None`` (after
        printing a message) when ``mlfunction`` is not recognised.

    Notes
    -----
    The accuracies used to be assigned, not appended, so only the last
    fold's accuracy was reported; they are now averaged like the losses.
    """
    # Per-fold metrics.
    train_loss_ = []
    test_loss_ = []
    train_accuracy_ = []
    test_accuracy_ = []

    k_indices = build_k_indices(len(y), split_number)

    for fold_rows in k_indices:
        # The current fold is the test set; every other row is training data.
        test_idx = np.asarray(fold_rows)
        train_idx = np.delete(np.arange(len(y)), test_idx)

        train_tX = tx[train_idx]
        train_y = y[train_idx]

        test_tX = tx[test_idx]
        test_y = y[test_idx]

        # NOTE(review): several learners are called with fewer arguments
        # than other callers pass (no initial_w / max_iters); this relies on
        # defaults in `impl` -- confirm against its signatures.
        if mlfunction == 'ridge_regression':
            w, loss = impl.ridge_regression(train_y, train_tX, lambda_)
        elif mlfunction == 'least_squares':
            w, loss = impl.least_squares(train_y, train_tX)
        elif mlfunction == 'logistic_regression':
            w, loss = impl.logistic_regression(train_y, train_tX)
        elif mlfunction == 'reg_logistic_regression':
            w, loss = impl.reg_logistic_regression(train_y, train_tX, lambda_)
        elif mlfunction == 'least_squares_sgd':
            w, loss = impl.least_squares_SGD(train_y, train_tX, gamma)
        elif mlfunction == 'least_squares_gd':
            w, loss = impl.least_squares_GD(train_y, train_tX, gamma)
        else:
            print('ERROR: ml_function not recognized')
            print(
                'least_squares, least_squares_gd, least_squares_sgd, logistic_regression, reg_logistic_regression'
            )
            return None

        # Losses are always measured with MSE, whatever the learner.
        train_loss_.append(impl.compute_loss_mse(train_y, train_tX, w))
        test_loss_.append(impl.compute_loss_mse(test_y, test_tX, w))

        train_accuracy_.append(impl.compute_accuracy(train_y, train_tX, w))
        test_accuracy_.append(impl.compute_accuracy(test_y, test_tX, w))

    return np.mean(train_loss_), np.mean(test_loss_), np.mean(
        train_accuracy_), np.mean(test_accuracy_)
def get_model(model, y, tx, initial_w, max_iters, gamma, lambda_, batch_size):
    """Train the learner named by ``model`` and return its weight vector.

    Parameters
    ----------
    model: string
        "MSE_GD", "MSE_SGD", "MSE_OPT", "MSE_OPT_REG", "LOG_GD",
        "LOG_REG_GD", "LOG_REG_L1" or "MSE_GD_L1"
    y: ndarray
        The labels
    tx: ndarray
        The feature matrix
    initial_w: ndarray
        Starting weights for the iterative learners
    max_iters: integer
        Number of iterations for the iterative learners
    gamma: float
        The step size
    lambda_: float
        The regularization parameter
    batch_size: integer
        The batch size (only used by "MSE_SGD")

    Returns
    -------
    ndarray
        The learned weights; the loss returned by the learner is discarded.

    Raises
    ------
    UnknownModel
        If ``model`` matches none of the names above.
    """
    if model == "MSE_GD":
        weights, _ = least_squares_GD(y, tx, initial_w, max_iters, gamma)
        return weights

    if model == "MSE_SGD":
        weights, _ = least_squares_SGD(y, tx, initial_w, batch_size, max_iters, gamma)
        return weights

    if model == "MSE_OPT":
        weights, _ = least_squares(y, tx)
        return weights

    if model == "MSE_OPT_REG":
        weights, _ = ridge_regression(y, tx, lambda_)
        return weights

    if model == "LOG_GD":
        weights, _ = logistic_regression(y, tx, initial_w, max_iters, gamma)
        return weights

    if model == "LOG_REG_GD":
        weights, _ = reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma)
        return weights

    if model == "LOG_REG_L1":
        weights, _ = reg_logistic_regression_L1(y, tx, lambda_, initial_w, max_iters, gamma)
        return weights

    if model == "MSE_GD_L1":
        weights, _ = least_squares_GD_L1(y, tx, lambda_, initial_w, max_iters, gamma)
        return weights

    raise UnknownModel
def cross_validation_lrr(y, x, k_indices, k, lambda_, gamma, max_iters, w_initial):
    """Fit regularized logistic regression on all folds but ``k``, score on fold ``k``.

    Returns ``(loss_te, opt_w)``: the loss computed by
    ``imp.compute_loss_lrr`` on the held-out fold and the trained weights.
    The training loss returned by the learner is not used.
    """
    held_out = k_indices[k]

    # Held-out fold.
    x_test, y_test = x[held_out], y[held_out]
    # Every remaining row forms the training set.
    x_train = np.delete(x, [held_out], axis=0)
    y_train = np.delete(y, [held_out], axis=0)

    opt_w, _ = imp.reg_logistic_regression(
        y_train, x_train, lambda_, w_initial, max_iters, gamma)
    loss_te = imp.compute_loss_lrr(y_test, x_test, opt_w)
    return loss_te, opt_w
Exemple #7
0
def test_reg_logistic_regression(y_train, tx_train, y_test, tx_test):
    """
    Train reg_logistic_regression on the training split and report the
    prediction accuracy on the test split.

    The hyper-parameters are fixed: lambda 0.1, 3000 iterations, step size
    1e-06, starting from an all-zero weight vector.

    Args:
        y_train: training labels after the splitting
        tx_train: training features after the splitting
        y_test: test labels after the splitting
        tx_test: test features after the splitting
    """
    print('\nTesting reg_logistic_regression...')

    lambda_, max_iters, gamma = 0.1, 3000, 1e-06
    start_w = np.zeros(tx_train.shape[1])
    w, _ = reg_logistic_regression(y_train, tx_train, lambda_, start_w,
                                   max_iters, gamma)

    report_prediction_accuracy_logistic(y_test, tx_test, w)
    print('... testing completed.')
Exemple #8
0
def best_model_predictions(data_obj, jet, degrees):
    """
    Train the final model for one jet value and predict its test rows.

    The train and test data held by ``data_obj`` are restricted to ``jet``,
    cleaned and feature-engineered (both fitted on the training rows), then
    a regularized logistic regression is trained with fixed
    hyper-parameters (lambda 1e-06, gamma 1e-06, 1000 iterations, zero
    initial weights).

    :param data_obj: DataLoader obj
    :param jet: int, the jet value
    :param degrees: int, the polynomial degree
    :return:
        ids_test: the test ids selected for this jet
        pred: the predicted labels for those rows
    """
    print('Training for Jet {jet}'.format(jet=jet))

    # Restrict train and test data to the requested jet value.
    y, tx = get_jet_data_split(data_obj.y, data_obj.tx, jet)
    ids_test, tx_test = get_jet_data_split(data_obj.ids_test, data_obj.test,
                                           jet)

    # Data cleaning, fitted on the training rows.
    cleaner = DataCleaning()
    tx = cleaner.fit_transform(tx)
    tx_test = cleaner.transform(tx_test)

    # Feature engineering, fitted on the training rows.
    engineer = FeatureEngineering()
    tx = engineer.fit_transform(tx, degrees)
    tx_test = engineer.transform(tx_test)

    # Fixed hyper-parameters of the final model.
    start_w = np.zeros(tx.shape[1])
    penalty = 1e-06
    step = 1e-06
    n_iter = 1000

    w, _ = reg_logistic_regression(y, tx, penalty, start_w, n_iter, step,
                                   verbose=True)

    # Inference on the test rows.
    pred = predict_labels(w, tx_test, True)
    return ids_test, pred
def find_optimal_w(tX, y, implementation, log_initial_w, log_max_iters,
                   log_gamma, decreasing_gamma, log_regulator, ridge_lambda):
    """
    Train the selected learner on the data set and return its weights.

    Parameters
    ----------

    tX: array
        The feature matrix
    y: array
        The output
    implementation: integer
        0 for least squares, 1 for ridge regression, 2 for regularized
        logistic regression
    log_initial_w: array
        initial weights for the regularized logistic regression
    log_max_iters: integer
        number of iterations for the regularized logistic regression
    log_gamma: float
        step size for the regularized logistic regression
    decreasing_gamma:
        forwarded as the last argument of impl.reg_logistic_regression
    log_regulator: float
        lambda for the regularized logistic regression
    ridge_lambda: float
        lambda for the ridge regression

    Return
    ------

    optimal_w = array
        The learned weights, or None when `implementation` is not 0, 1 or 2.

    """
    if implementation == 0:
        weights, _ = impl.least_squares(y, tX)
    elif implementation == 1:
        weights, _ = impl.ridge_regression(y, tX, ridge_lambda)
    elif implementation == 2:
        weights, _ = impl.reg_logistic_regression(
            y, tX, log_regulator, log_initial_w, log_max_iters, log_gamma,
            decreasing_gamma)
    else:
        weights = None
    return weights
Exemple #10
0
def cross_validation(x,
                     y,
                     k,
                     mode,
                     gamma=None,
                     lambda_=None,
                     max_iters=None,
                     initial_w=None):
    """
    K-fold cross-validation of one of the `imp` learners.

    The rows are cut into `k` consecutive folds with np.array_split (no
    shuffling); each fold is used once for validation while the model is
    trained on the others.

    INPUT:
    @x : input data, dimensions (NxD)
    @y : target labels, (Nx1) array; a sample is positive when its label == 1
    @k : number of folds
    @mode : 'linear_regression_eq', 'ridge_regression_eq',
            'linear_regression_GD', 'linear_regression_SGD',
            'logistic_regression' or 'reg_logistic_regression'
    @gamma, @max_iters, @initial_w : forwarded to the iterative learners
    @lambda_ : penalty for the ridge and regularized logistic learners
    OUTPUT:
    (acc, tpr, fpr, losses) : four lists with one entry per fold (accuracy,
    true-positive rate, false-positive rate, training loss of the learner)
    RAISES:
    ValueError if `mode` is not one of the names above.
    """
    logistic_modes = ('logistic_regression', 'reg_logistic_regression')

    x_split = np.array_split(x, k, axis=0)
    y_split = np.array_split(y, k, axis=0)

    acc = []
    tpr = []
    fpr = []
    losses = []

    for fold in range(k):
        # Train on every fold except the current one.
        x_train = np.concatenate(
            [x_split[i] for i in range(k) if i != fold], axis=0)
        y_train = np.concatenate(
            [y_split[i] for i in range(k) if i != fold], axis=0)
        x_val = x_split[fold]
        y_val = y_split[fold]

        if mode == 'linear_regression_eq':
            update, loss = imp.least_squares(y_train, x_train)
        elif mode == 'ridge_regression_eq':
            update, loss = imp.ridge_regression(y_train, x_train, lambda_)
        elif mode == 'linear_regression_GD':
            update, loss = imp.least_squares_GD(y_train, x_train, initial_w,
                                                max_iters, gamma)
        elif mode == 'linear_regression_SGD':
            update, loss = imp.least_squares_SGD(y_train, x_train, initial_w,
                                                 max_iters, gamma)
        elif mode == 'logistic_regression':
            update, loss = imp.logistic_regression(y_train, x_train, initial_w,
                                                   max_iters, gamma)
        elif mode == 'reg_logistic_regression':
            # lambda_ was previously omitted, which shifted initial_w into
            # the penalty slot.  NOTE(review): assumes the signature
            # (y, tx, lambda_, initial_w, max_iters, gamma) -- confirm.
            update, loss = imp.reg_logistic_regression(y_train, x_train,
                                                       lambda_, initial_w,
                                                       max_iters, gamma)
        else:
            raise ValueError('unknown mode: {!r}'.format(mode))

        losses.append(loss)

        predictions = np.dot(x_val, update)
        if mode in logistic_modes:
            # Logistic models: positive when the predicted probability
            # exceeds 0.5.  This threshold used to be overwritten by the
            # mean-based rule below.
            pr_bool = H.sigmoid(predictions) > 0.5
        else:
            # Linear models: positive when the score reaches the mean score.
            pr_bool = predictions >= np.mean(predictions)

        y_bool = y_val == 1
        correct = pr_bool == y_bool
        tp = np.logical_and(correct, y_bool)
        fp = np.logical_and(np.logical_not(correct), pr_bool)

        acc.append(sum(correct) / float(len(y_val)))
        tpr.append(sum(tp) / float(sum(y_bool)))
        fpr.append(sum(fp) / float(sum(np.logical_not(y_bool))))
    return acc, tpr, fpr, losses
def cross_validation(y, tX, gamma, method='logistic_regression'):
    """Four-fold cross validation of one learner.

    @param y: labels
    @param tX: feature matrix, one row per sample
    @param gamma: learning rate of the iterative learners
    @param method: 'logistic_regression', 'reg_logistic_regression',
        'least_squares_GD', 'least_squares_SGD', 'least_squares' or
        'ridge_regression'
    @return: (average accuracy over the four folds, list of fold accuracies)
    @raise ValueError: if `method` is not one of the names above

    Each fold's accuracy is the fraction of correct predictions among the
    rows actually held out.  It used to be divided by N / k_fold, which is
    only right when N is a multiple of k_fold.
    """
    D = tX.shape[1]

    # Iteration budget of the iterative learners.
    max_iters = 100

    # Cross validation parameters
    seed = 1
    k_fold = 4
    k_indices = build_k_indices(y, k_fold, seed)

    acc = []

    for k in range(k_fold):
        # Fold k is held out; the other folds are stacked as training data.
        yTr = np.array([])
        xTr = np.zeros((0, D))
        for i in range(k_fold):
            if i == k:
                yTe = y[k_indices[i]]
                xTe = tX[k_indices[i]]
            else:
                yTr = np.append(yTr, y[k_indices[i]], axis=0)
                xTr = np.append(xTr, tX[k_indices[i]], axis=0)

        initial_w = np.zeros(tX.shape[1])
        if method == 'logistic_regression':
            # The logistic learners are started from a column vector.
            initial_w = np.zeros((tX.shape[1], 1))
            w, loss = logistic_regression(yTr, xTr, initial_w, max_iters,
                                          gamma)
            y_est = sigmoid(np.dot(xTe, w))
            y_label = [0 if i < 0.5 else 1 for i in y_est]
        elif method == 'reg_logistic_regression':
            initial_w = np.zeros((tX.shape[1], 1))
            lambda_ = 0.1
            w, loss = reg_logistic_regression(yTr, xTr, lambda_, initial_w,
                                              max_iters, gamma)
            y_est = sigmoid(np.dot(xTe, w))
            y_label = [0 if i < 0.5 else 1 for i in y_est]
        elif method == 'least_squares_GD':
            w, loss = least_squares_GD(yTr, xTr, initial_w, max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares_SGD':
            w, loss = least_squares_SGD(yTr, xTr, initial_w, max_iters, gamma)
            y_label = predict_labels(w, xTe)
        elif method == 'least_squares':
            w, loss = least_squares(yTr, xTr)
            y_label = predict_labels(w, xTe)
        elif method == 'ridge_regression':
            w, loss = ridge_regression(yTr, xTr, 0.1)
            y_label = predict_labels(w, xTe)
        else:
            raise ValueError('Invalid method')

        corr = [bool(label == yTe[ind]) for ind, label in enumerate(y_label)]
        acc.append(sum(corr) / len(yTe))
    return (sum(acc) / k_fold), acc
            train_data_log_svm = preprocessing_pipeline(train_data_split,
                                                        degree=deg,
                                                        norm_first=False)
            train_set_folds = k_fold_cross_split_data(train_classes_split,
                                                      train_data_log_svm,
                                                      k_indices)

            for j, lambda_ in enumerate(POSSIBLE_LAMBDA_LOG):
                folds_train_accuracy = []
                folds_validation_accuracy = []

                # Train a Regularized Ridge Regression model on each fold
                for x_train, y_train, x_test, y_test in train_set_folds:
                    initial_w = np.zeros((x_train.shape[1], ))
                    try:
                        w, train_loss = reg_logistic_regression(
                            y_train, x_train, lambda_, initial_w, 350, 3e-1, 1)
                        folds_train_accuracy.append(
                            compute_accuracy(predict_labels(w, x_train),
                                             y_train))
                        folds_validation_accuracy.append(
                            compute_accuracy(predict_labels(w, x_test),
                                             y_test))
                    except Exception:
                        pass
                train_accuracy_matrix[jet_num, 1, i, j] = \
                    (np.mean(folds_train_accuracy), np.std(folds_train_accuracy))
                validation_accuracy_matrix[jet_num, 1, i, j] = \
                    (np.mean(folds_validation_accuracy), np.std(folds_validation_accuracy))

            for j, lambda_ in enumerate(POSSIBLE_LAMBDA_SVM):
                folds_train_accuracy = []
Exemple #13
0
# Sanity check: train a regularized logistic regression on the whole
# training set and report how well it fits that same set.

# change [-1, 1] labels to [0, 1]
y = y / 2 + 0.5

N, d = tX.shape
# initial weights randomly generated
# NOTE(review): w0 has d + 1 rows while tX has d columns at this point;
# presumably normalize_data adds a bias column -- confirm.
w0 = 10 * np.random.rand(d + 1, 1)

# replace -999 values with the mean of the other ones
tX = replace_data(tX)
# normalize data to std 1 and 0 mean
tX = normalize_data(tX)

w, L = reg_logistic_regression(y,
                               tX,
                               lambda_=0.001,
                               initial_w=w0,
                               max_iters=10,
                               gamma=5e-7)

y_pred = predict_labels(w, tX, 0.5)

N = y_pred.size

# accuracy test on train set for sanity: count the misclassified samples
n_err = 0

for i in range(N):
    if (y_pred[i] != y[i]):
        n_err = n_err + 1

# n_err / N is the error rate; the accuracy is its complement.
print("train accuracy :", 1 - n_err / N)
Exemple #14
0
        split_data_by_categorical_column(test_classes,
                                         test_data,
                                         test_ids,
                                         PRI_JET_NUM_INDEX)

    # We achieved our best results using Regularized Logistic Regression,
    # so we only load only those previously computed optimal params to generate the submission
    logistic_best_params = np.load("results/logistic_best_params.npy", allow_pickle=True)
    logistic_best_models = []

    for (lambda_, deg, gamma), train_classes_split, train_data_split in \
            zip(logistic_best_params, train_classes_jet_num_splits, train_data_jet_num_splits):
        data_split, columns_to_remove, mean, std = preprocessing_pipeline(train_data_split, degree=np.int(deg),
                                                                          cross_term=True, norm_first=False)
        initial_w = np.zeros((data_split.shape[1],))
        w, loss = reg_logistic_regression(train_classes_split, data_split, lambda_, initial_w, 500, gamma, 1)
        print(f'Loss: {loss:.3f} Accuracy : {compute_accuracy(predict_labels(w, data_split), train_classes_split)}')
        logistic_best_models.append((w, loss, columns_to_remove, mean, std))

    # Calculate the predictions for each of the 4 subsets using the weights and then combine them
    results = None
    for (w, _, col_to_rm, mean, std), (_, deg, _), test_classes_split, test_data_split, test_ids_split in \
            zip(logistic_best_models, logistic_best_params,
                test_classes_jet_num_splits, test_data_jet_num_splits, test_ids_jet_num_splits):
        test_data_split, _, _, _ = preprocessing_pipeline(test_data_split, degree=np.int(deg),
                                                          columns_to_remove=col_to_rm,
                                                          cross_term=True, norm_first=False, mean=mean, std=std)
        pred = predict_labels(w, test_data_split)
        out = np.stack((test_ids_split, pred), axis=-1)
        results = out if results is None else np.vstack((results, out))
Exemple #15
0
# Logistic regression
# Compares the project's (regularised) logistic regression against
# LogisticRegression on the breast-cancer data set.  `max_iters`, `gamma`
# and `lambda_` are defined outside this section.
print("Logistic Regression \n --------------")

X, y = datasets.load_breast_cancer(return_X_y = True)
X, _, _ = implementations.standardize_numpy(X)

# Prepend a column of ones to the features.
tx = np.c_[np.ones(X.shape[0]), X]

initial_w = np.zeros(tx.shape[1])

# Plain logistic regression trained by the project implementation.
w_log_gd, loss_log_gr = implementations.logistic_regression(y, tx, initial_w, max_iters,
                        gamma, verbose=False)
y_pred_log_gd = implementations.logistic_prediction(tx, w_log_gd)

# Regularised variant with early stopping.
# NOTE(review): the meaning of the literal 2 passed after lambda_ is not
# visible here (presumably the norm of the penalty) -- confirm.
w_log_gd_reg, loss_log_gd_reg = implementations.reg_logistic_regression(y, tx, lambda_, 2, initial_w,
                            max_iters, gamma, verbose=False,
                            early_stopping=True, tol = 0.0001,
                            patience = 5)
y_pred_log_gd_reg = implementations.logistic_prediction(tx, w_log_gd_reg)

# Accuracies are measured on the training data itself.
print(f"Logistic regression gd : {implementations.accuracy(y, y_pred_log_gd)}")
print(f"Logistic regression reg: {implementations.accuracy(y, y_pred_log_gd_reg)}")

# Reference models, fitted on X without the added column of ones.
y_pred_log_sk = LogisticRegression().fit(X, y).predict(X)
print(f"Sklearn logistic regression : {implementations.accuracy(y, y_pred_log_sk)}")

# C is set to the inverse of lambda_.
y_pred_log_reg = LogisticRegression(C=1/lambda_, max_iter = 1000).fit(X,y).predict(X)
print(f"Sklearn reg logistic regression: {implementations.accuracy(y, y_pred_log_reg)}")



Exemple #16
0
# Cross-validate the regularized logistic regression with 4 folds.
# NOTE(review): g and l are presumably the selected gamma and lambda --
# confirm against cross_validation_RLR.
g, l, avg_test_accuracy_RLR = cross_validation_RLR(X_train, y_train, k_fold=4, seed=1)

#%% Testing functions
#np.random.seed(42)

gamma = 0.2
lambda_ = 4E-5

# Each call below overwrites `w` and `loss`, so only the result of the last
# one (regularized logistic regression) reaches the plotting and prediction
# cells that follow.
w, loss = least_squares(y = y_train, tx = X_train)
#
w, loss = least_squares_SGD(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma)
#
w, loss = ridge_regression(y = y_train, tx = X_train, lambda_ = lambda_)
#
w, loss = logistic_regression(y = y_train, tx = X_train, initial_w = np.random.random(size=num_features)*10, max_iters = 125000, gamma = gamma)
#
w, loss = reg_logistic_regression(y = y_train, tx = X_train, lambda_ = lambda_, initial_w = np.random.random(size=num_features)*0.01, max_iters = 200000, gamma = gamma)

# Plot the learned weights.
plt.plot(w)

#%% Predictive step
# Raw linear scores of the test rows; the name y_test holds scores here,
# not labels.
y_test = X_test @ w

plt.hist(y_test, bins=200)

y_pred = predict_labels(w, X_test)

#%% Create submission

create_csv_submission(test.Id, y_pred, 'submission.csv')
Exemple #17
0
# Hyper-parameters of the active learner below; `w0`, `max_iters`,
# `y_train` and `tX_train` are defined outside this section.
gamma = 5e-3
lambda_ = 6e-7

# The commented-out calls are the alternative learners; only the
# regularized logistic regression is currently run.

#w, loss = least_squares(y_train, tX_train)

#w, loss = least_squares_GD(y_train, tX_train, initial_w = w0, max_iters = max_iters, gamma = gamma)

#w, loss = least_squares_SGD(y_train, tX_train, w0, max_iters = max_iters, gamma = gamma)

#w, loss = ridge_regression(y_train, tX_train, lambda_)

#w, loss = logistic_regression(y_train, tX_train, initial_w = w0, max_iters = max_iters, gamma = gamma)

w, loss = reg_logistic_regression(y_train,
                                  tX_train,
                                  lambda_=lambda_,
                                  initial_w=w0,
                                  max_iters=max_iters,
                                  gamma=gamma)

##########################################################################
#### Calculate the train accuracy
##########################################################################

N = y_train.size
# Accuracy on the training set, as a sanity check: n_err counts the samples
# whose predicted label (reshaped to the shape of y_train) differs from the
# true one.
n_err = len(
    np.where(
        y_train != predict_01_labels(w, tX_train, 0.5).reshape(y_train.shape))
    [0])

print("train accuracy :", 1 - n_err / N)
def train_test(data_list,
               test_interval,
               val_num,
               test_list,
               whitening=True,
               method='ls',
               name_list=None,
               max_iters=1000,
               gamma=0.01,
               lambda_=0.001,
               epsilon=1e-9,
               fan_out_list=None,
               out_dim=2,
               lr=0.001,
               lam=0.0005,
               batch_size=100,
               num_epoch=100):
    """
    Train the model and test the accuracy.
    One model is trained per entry of name_list (6 data types by default).
    Return (in this order):
        accuracy_list: list of accuracies for the models
        precision_list: collection of precision (only for neural net)
        recall_list: collection of recalls (only for neural net)
        loss_list: collection of final losses.
                    (for neural network, whatever inst.optimize() returns)

    Raises ValueError when method is not 'ls', 'log' or 'dl'.

    Side effects: the whitening parameters and the learned weights are
    saved with np.save under ./parameters/, suffixed with val_num.


    #######Parameters######
    data_list       collect different type of data in a list
    test_interval   per data type, the pair (i1, i2): rows i1:i2 are held out for
                    testing and all other rows are used for training
    val_num         the number of the data subset in k-fold, note 0 <= val_num <= k - 1
                    (only used to name the saved files)
    test_list       collect indices of redundant feature of all data types
    whitening       a boolean value for data whitening
    method          a string that is either 'log', 'ls' or 'dl'
                        - 'log': logistic regression
                        - 'ls': least squares (or ridge regression if lambda_ > 0)
                        - 'dl': deep learning method (neural network)
    name_list       collect all names of the data type
                    (None means ['A', 'B', 'AB', 'BC', 'ABC', 'D'])
    max_iters       maximum iterations for logistic regression
    gamma           step size for each iteration in logistic regression
    lambda_         parameter for l2 regularization in least squares and logistic regression
    epsilon         parameter for ZCA data whitening, should be a small positive
    fan_out_list    the list that collects the number of neurons in hidden layer of the neural network
                    (None means [25, 10]; it is indexed by data type)
    out_dim         output dimension at last layer in the neural network (before softmax)
    lr              learning rate when optimizing the neural network
    lam             parameter for weight decay (l2 regularization)
    batch_size      batch size for stochastic gradient descent in neural network
    num_epoch       number of epochs when optimizing the neural network
    """
    # Fresh lists on every call, so no list object is shared between calls.
    if name_list is None:
        name_list = ['A', 'B', 'AB', 'BC', 'ABC', 'D']
    if fan_out_list is None:
        fan_out_list = [25, 10]

    # collect all parameters such as data whitening and weights
    W_collection = []
    b_collection = []
    M_list = []
    mean_list = []
    accuracy_list = []

    w_list = []
    loss_list = []
    precision_list = []
    recall_list = []
    # iterate through all data types, train and test each model on a specific method
    for i, name in enumerate(name_list):
        print('Training for data ', name)
        x, y, dim = extract_train_data(test_list[i], data_list[i])

        # only use some part of data for training, rest is for testing
        i1 = test_interval[i][0]
        i2 = test_interval[i][1]
        index = list(range(0, i1)) + list(range(i2, len(y)))
        x_tr = x[index, :]
        y_tr = y[index]
        x_tst = x[i1:i2, :]
        y_tst = y[i1:i2]

        # The whitening transformation is fitted on all rows returned by
        # extract_train_data, then applied to both subsets.
        # NOTE(review): this includes the held-out rows i1:i2 -- confirm
        # that fitting on x (and not x_tr) is intended.
        if whitening:
            M, mean = data_whitening(x, epsilon)
            x_tr = np.dot(x_tr - mean, M)
            x_tst = np.dot(x_tst - mean, M)

            M_list.append(M)
            mean_list.append(mean)

        x_tr = build_poly(x_tr)
        x_tst = build_poly(x_tst)
        print('Length of data point: ', x_tr.shape[1])

        if method == 'ls':
            # least squares / ridge regression
            w, loss = imp.ridge_regression(y_tr, x_tr, lambda_)

            accuracy = imp.evaluate(w, x_tst, y_tst)
            accuracy_list.append(accuracy)
            w_list.append(w)

        elif method == 'log':
            # logistic regression, started from random weights
            initial_w = np.random.rand(dim + 1)
            w, loss = imp.reg_logistic_regression(y_tr, x_tr, lambda_,
                                                  initial_w, max_iters, gamma)
            accuracy = imp.evaluate(w, x_tst, y_tst)
            accuracy_list.append(accuracy)
            w_list.append(w)
        elif method == 'dl':
            # deep learning method; labels are recoded from -1/1 to 0/1
            fan_out = fan_out_list[i]
            y_tr = y_tr.astype(np.int8)
            y_tr[y_tr == -1] = 0
            y_tst = y_tst.astype(np.int8)
            y_tst[y_tst == -1] = 0
            # The first column produced by build_poly is dropped and the
            # samples are passed column-wise.
            inst = sim.SimNet(fan_out, x_tr[:, 1:].T, y_tr, out_dim, lr, lam,
                              batch_size, num_epoch)
            loss = inst.optimize()
            accuracy, precision, recall = inst.test(x_tst[:, 1:].T, y_tst)
            recall_list.append(recall)
            precision_list.append(precision)
            W_collection.append(inst.W_list)
            b_collection.append(inst.b_list)
            accuracy_list.append(accuracy)
        else:
            raise ValueError(
                "unknown method {!r}; expected 'ls', 'log' or 'dl'".format(
                    method))
        loss_list.append(loss)

        print('For data ', name, ', the average accuracy is: ',
              accuracy, '\n')

    # Save all parameters
    if whitening:
        np.save('./parameters/data_whitening/mean_list_val' + str(val_num),
                np.array(mean_list))
        np.save('./parameters/data_whitening/M_list_val' + str(val_num),
                np.array(M_list))
    if method == 'dl':
        np.save('./parameters/neural_net/W_collection_dl_val' + str(val_num),
                np.array(W_collection))
        np.save('./parameters/neural_net/b_collection_dl_val' + str(val_num),
                np.array(b_collection))
    elif method == 'ls':
        np.save('./parameters/ridge/w_' + method + '_val' + str(val_num),
                np.array(w_list))
    elif method == 'log':
        np.save('./parameters/logistic/w_' + method + '_val' + str(val_num),
                np.array(w_list))
    return accuracy_list, precision_list, recall_list, loss_list