Example #1
0
def ridge_regression_demo(y, tx, lamb, degree):
    """Fit ridge regression on a polynomial expansion of ``tx`` and print the loss.

    Args:
        y: Targets handed to ``im.ridge_regression``.
        tx: Raw feature matrix, expanded with ``im.build_poly``.
        lamb: Regularisation strength forwarded to ``im.ridge_regression``.
        degree: Polynomial degree forwarded to ``im.build_poly``.

    Returns:
        tuple: ``(weight, loss)`` exactly as produced by ``im.ridge_regression``.
    """
    expanded_features = im.build_poly(tx, degree)
    fit_result = im.ridge_regression(y, expanded_features, lamb)
    weight, loss = fit_result

    # The loss is printed under the label "Training RMSE"; whether it really
    # is an RMSE depends on im.ridge_regression -- TODO confirm.
    print("Training RMSE={tr:.3f}".format(tr=loss))
    return weight, loss
Example #2
0
def trial_high_dimension():
    """Search for a workable gamma on degree-8 polynomial features.

    Failed attempt to find a good gamma hyperparameter, kept because the
    attempt seemed interesting anyway. The train and test sets are reloaded
    from disk, completed and normalised with the ``implementations`` helpers
    and expanded to degree 8. A model is then refit with
    ``gamma = 5 / 10**i`` for ``i = 1, 2, ...`` for as long as both the F1
    score stays below 0.4 and the accuracy stays below 0.6 on the held-out
    fold.

    Returns:
        numpy.ndarray: predictions for the test set as returned by
        ``predict_labels`` (presumably one label per test row -- TODO confirm).
    """
    # Reload from disk so that any normalisation applied elsewhere is lost.
    y, tX, _ = load_csv_data(DATA_TRAIN_PATH)
    _, tX_test, _ = load_csv_data(DATA_TEST_PATH)

    tX_completed = implementations.Datas_completion_lacking_values_predicted(
        tX)
    tX_test_completed = implementations.Datas_completion_lacking_values_predicted(
        tX_test)
    tX_completed = implementations.normalize(tX_completed)
    # Fix: the normalised test matrix used to be stored in ``tX_test`` and
    # then ignored, so the polynomial expansion below (and the final
    # prediction) ran on un-normalised test features.
    tX_test_completed = implementations.normalize(tX_test_completed)

    y[y == -1] = 0
    tX_completed = implementations.build_poly(tX_completed, 8)
    tX_test_completed = implementations.build_poly(tX_test_completed, 8)

    indices = implementations.build_k_indices(len(y), 9)
    x_train, y_train, x_test, y_test = implementations.cross_validation_split(
        y, tX_completed, indices, 2)

    exponent = 0
    w = []
    accuracy = 0
    F1 = 0
    # NOTE(review): nothing bounds this loop; if neither threshold is ever
    # reached, gamma shrinks towards 0 and the search never terminates.
    while F1 < 0.4 and accuracy < 0.6:
        exponent += 1
        gamma = 5 / 10**exponent
        w, loss = implementations.get_w_loss(y_train, x_train, 1, 0, 6000,
                                             gamma)
        print("w={}, loss={} ".format(w, loss))
        y_pred = predict_labels(w, x_test)

        # NOTE(review): y was remapped to {0, 1} above; confirm that
        # predict_labels uses the same encoding, otherwise the comparison
        # below under-counts matches.
        n_matches = sum(1 for predicted, expected in zip(y_pred, y_test)
                        if predicted == expected)
        accuracy = n_matches / len(y_test)
        print(accuracy)
        F1 = implementations.f1_score(y_test, y_pred)
        print(F1)
    print("found model {}".format(w))
    return predict_labels(w, tX_test_completed)
Example #3
0
def Feature_Completion_Benchmark(k_fold=10):
    """Assess whether feature completion helps on one train/test split.

    Feature completion means putting a value on each -999 entry using least
    squares. Two models are fit with ``implementations.get_w_loss`` (method 1)
    on degree-4 polynomial expansions of the module-level ``tX`` (raw) and
    ``tX_completed`` (completed) matrices, using the same fold (index 0) of a
    ``k_fold`` split, and their accuracy and F1 on the held-out fold are
    printed.

    Args:
        k_fold: Number of folds passed to ``implementations.build_k_indices``.

    Returns:
        tuple: ``(F1, accuracy, F1_completed, accuracy_completed, w,
        w_completed)``.
    """

    def _match_ratio(predicted, expected):
        # Share of positions where prediction and reference label agree.
        hits = 0
        for guess, truth in zip(predicted, expected):
            if guess == truth:
                hits += 1
        return hits / len(expected)

    method = 1
    indices = implementations.build_k_indices(len(y), k_fold)
    enhanced_completed_tX = implementations.build_poly(tX_completed, 4)
    enhanced_tX = implementations.build_poly(tX, 4)

    print("1 : Train test split")
    x_train, y_train, x_test, y_test = implementations.cross_validation_split(
        y, enhanced_tX, indices, 0)
    # Same indices and fold, so only the feature matrices differ between the
    # two splits; the labels of the second split are not needed.
    x_train_completed, _, x_test_completed, _ = implementations.cross_validation_split(
        y, enhanced_completed_tX, indices, 0)
    # Held-out labels are compared against predict_labels output, so the 0
    # class is relabelled as -1 first.
    y_test[y_test == 0] = -1

    print("2 : Compute gradient descent")
    w, loss_tr = implementations.get_w_loss(
        y_train, x_train, method, gamma=0.05, max_iters=2000)
    w_completed, loss_tr_completed = implementations.get_w_loss(
        y_train, x_train_completed, method, gamma=0.05, max_iters=2000)
    y_pred = predict_labels(w, x_test)
    y_pred_completed = predict_labels(w_completed, x_test_completed)

    print("3 : Compute stats")
    accuracy = _match_ratio(y_pred, y_test)
    accuracy_completed = _match_ratio(y_pred_completed, y_test)
    F1 = implementations.f1_score(y_test, y_pred)
    F1_completed = implementations.f1_score(y_test, y_pred_completed)
    print(
        "no completion : accuracy = {}, F1 = {}, with completion : accuracy = {}, F1 = {}"
        .format(accuracy, F1, accuracy_completed, F1_completed))
    return F1, accuracy, F1_completed, accuracy_completed, w, w_completed
Example #4
0
def I_do_it_all_and_I_try_to_do_it_good_REG_LOG_REG(degree,
                                                    lambdas_,
                                                    k_fold=10):
    """Pick the best lambda for regularized logistic regression and submit it.

    The module-level ``tX`` is expanded to ``degree``, one fold (index 0) of
    a ``k_fold`` split is held out, and one model per value in ``lambdas_``
    is fit with ``implementations.get_w_loss`` (method 6). Models are ranked
    by ``2 * accuracy + F1`` on the held-out fold, and the best one is passed
    to ``implementations.submit``.

    Args:
        degree: Polynomial degree passed to ``implementations.build_poly``.
        lambdas_: Iterable of regularisation strengths to try.
        k_fold: Number of folds passed to ``implementations.build_k_indices``.

    Returns:
        tuple: ``(best_w, loss_tr_best, loss_te_best, best_lambda,
        best_accuracy, best_F1, best_TP, best_FP, best_FN)``. If no candidate
        scores above 0, ``best_w`` is ``[]`` and the other entries are 0.
    """
    method = 6
    indices = implementations.build_k_indices(len(y), k_fold)
    enhanced_tX = implementations.build_poly(tX, degree)

    # Every "best" value is initialised so the return statement is always
    # well defined, even when no candidate improves on the initial score.
    best_heuristique = 0
    best_w = []
    best_lambda = best_accuracy = best_F1 = 0
    best_TP = best_FP = best_FN = 0
    loss_tr_best = loss_te_best = 0

    print("1 : Train test split")
    x_train, y_train, x_test, y_test = implementations.cross_validation_split(
        y, enhanced_tX, indices, 0)
    # Fix: the bias column and the label remap are applied once, before the
    # loop. Previously a new column of ones was prepended on every
    # iteration, so x_test grew wider with each lambda.
    x_test = np.c_[np.ones((y_test.shape[0], 1)), x_test]
    # NOTE(review): the test loss below is computed on these {-1, 1} labels,
    # as before; confirm that this is what calculate_loss expects.
    y_test[y_test == 0] = -1

    print("2 : Compute regularized logistic regression")
    for lambda_ in lambdas_:
        # Fix: use the candidate lambda instead of a hard-coded 0.001.
        w, loss_tr = implementations.get_w_loss(y_train,
                                                x_train,
                                                method,
                                                gamma=0.00005,
                                                max_iters=100,
                                                lambda_=lambda_)
        print("3 : Predict using generated model with {}".format(lambda_))
        y_pred = predict_labels(w, x_test)
        print("4 : Compute stats with {}".format(lambda_))
        n_matches = sum(1 for predicted, expected in zip(y_pred, y_test)
                        if predicted == expected)
        accuracy = n_matches / len(y_test)
        F1 = implementations.f1_score(y_test, y_pred)
        # As the set seems not too unbalanced, accuracy is given more
        # importance than F1.
        heuristique = 2 * accuracy + F1
        if heuristique > best_heuristique:
            # Fix: remember the best score so that later, worse candidates
            # cannot overwrite the current best model.
            best_heuristique = heuristique
            best_w = w
            best_TP, best_FP, best_FN = implementations.stats(y_test, y_pred)
            best_lambda = lambda_
            best_F1 = F1
            best_accuracy = accuracy
            loss_te_best = implementations.calculate_loss(
                y_test, x_test, w, lambda_)
            loss_tr_best = loss_tr

    print("5 : Generate the submission")
    # ``_``, ``tX_test`` and ``ids_test`` are read from module scope, as in
    # the original call -- TODO confirm what submit expects as first argument.
    implementations.submit(_, tX_test, best_w, ids_test, method, degree)
    return (best_w, loss_tr_best, loss_te_best, best_lambda, best_accuracy,
            best_F1, best_TP, best_FP, best_FN)
Example #5
0
# Iterate over each subset and build a model
# The predictions of every single model are combined
# NOTE(review): only the preprocessing part of the loop body is visible in
# this fragment; the names bound here (D, N, CA_baseline, ...) are presumably
# consumed by code that follows.
for i in range(num_subsets):
    # Extract the train/test subsets (each entry unpacks to labels, features, ids)
    y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
    y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

    # Map the categorical output labels into [0, 1]
    y_train_subset = map_0_1(y_train_subset)
    # Standardize the data (train and test are passed to the helper together)
    X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
    print(
        f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}"
    )
    # Build the polynomial features and expand the data, using the degree
    # configured for this subset in max_degree
    X_train_subset, X_test_subset = build_poly(X_train_subset,
                                               max_degree[i]), build_poly(
                                                   X_test_subset,
                                                   max_degree[i])
    print(
        f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}"
    )

    # Set n_best_features to X_train_subset.shape[1] if you don't want feature selection
    # fs_perc[i] is multiplied by the number of expanded columns, so it is
    # presumably the fraction of features to keep -- TODO confirm
    n_best_features = round(fs_perc[i] * X_train_subset.shape[1])
    D = n_best_features
    # N is the number of training rows; the unpacking fixes the matrix as 2-D
    N, _ = X_train_subset.shape

    # Accuracy by predicting the majority class in the training dataset
    # (CA_one is the share of ones, assuming map_0_1 yields 0/1 labels)
    CA_one = y_train_subset.sum() / N
    CA_zero = 1 - CA_one
    CA_baseline = max(CA_zero, CA_one)
Example #6
0
    w, loss_tr = implementations.get_w_loss(y_train,
                                            x_train,
                                            method,
                                            gamma=0.05,
                                            max_iters=2000)
    w_completed, loss_tr_completed = implementations.get_w_loss(
        y_train, x_train_completed, method, gamma=0.05, max_iters=2000)
    y_pred = predict_labels(w, x_test)
    y_pred_completed = predict_labels(w_completed, x_test_completed)
    print("3 : Compute stats")
    matches = [i for i, j in zip(y_pred, y_test) if i == j]
    accuracy = len(matches) / len(y_test)
    matches = [i for i, j in zip(y_pred_completed, y_test) if i == j]
    accuracy_completed = len(matches) / len(y_test)
    F1 = implementations.f1_score(y_test, y_pred)
    F1_completed = implementations.f1_score(y_test, y_pred_completed)
    print(
        "no completion : accuracy = {}, F1 = {}, with completion : accuracy = {}, F1 = {}"
        .format(accuracy, F1, accuracy_completed, F1_completed))
    return F1, accuracy, F1_completed, accuracy_completed, w, w_completed


# Run the benchmark once with its default number of folds and keep both
# weight vectors.
(F1s, accuracies, F1s_completed, accuracies_completed, w,
 w_completed) = Feature_Completion_Benchmark()

# Row ids for the submission are generated as the fixed range
# 350000..918937 rather than read from the test file.
ids = np.arange(350000, 918938)
# Predict with the model trained on completed features, applying the same
# degree-4 expansion to the completed test matrix.
y_pred = predict_labels(
    w_completed, implementations.build_poly(tX_test_completed, 4))
print(y_pred.shape)
create_csv_submission(ids, y_pred, "../data/submission.csv")
def main():
    """Train one ridge model per PRI_jet_num category and write a submission.

    The train and test sets are split into 4 categories on the feature at
    index -8 (PRI_jet_num). For each category, train and test rows are
    stacked, columns with a single unique value are dropped, and missing
    values are filled. Both parts are then standardized and expanded to the
    category's polynomial degree, a ridge regression is fit on the training
    part, and labels are predicted for the test part. The per-category
    predictions are finally put back in the original test-row order and
    written to a CSV file.
    """
    # Load the training and testing datasets.
    DATA_TRAIN_PATH = '../data/train.csv'
    y, X, _ = load_csv_data(DATA_TRAIN_PATH)
    DATA_TEST_PATH = '../data/test.csv'
    _, X_t, ids_t = load_csv_data(DATA_TEST_PATH)

    # Separate training and testing sets into 4 different categories
    # depending on the PRI_jet_num feature with index -8.
    feature = -8
    X_cat = preproc.get_categories(X, feature=feature)
    X_t_cat = preproc.get_categories(X_t, feature=feature)

    # Best hyperparameters (degree and corresponding lambda) for each
    # category, found using cross-validation.
    degrees = [10, 10, 9, 9]
    lambdas = [0.00047508101621, 7.05480231072e-07, 0.000343046928631, 5.72236765935e-05]

    # One prediction array per category, concatenated after the loop.
    y_pred_all = []
    for v in range(4):
        # Extract the category (train rows, test rows and train labels).
        Xv = X[X_cat[v]]
        Xv_t = X_t[X_t_cat[v]]
        y_v = y[X_cat[v]]

        # Stack train and test so that column cleaning is applied to both.
        all_Xv = np.concatenate((Xv, Xv_t), axis=0)

        # Features with a single unique value carry no information here.
        bad_features = [
            idx for idx, column in enumerate(all_Xv.T)
            if len(np.unique(column)) == 1
        ]

        # Delete the bad features and fill the missing values.
        all_Xv_c = np.delete(all_Xv, bad_features, axis=1)
        all_Xv_filled = preproc.fill_missing_values(all_Xv_c, tresh=1)

        # Separate train and test again.
        n_train = len(Xv)
        Xv_f = all_Xv_filled[:n_train]
        Xv_t_f = all_Xv_filled[n_train:]

        # Standardize the dataset.
        # NOTE(review): the test part is standardized with its own
        # statistics rather than the training ones; behaviour kept as is.
        tXv, _, _ = preproc.standardize(Xv_f)
        tXv_t, _, _ = preproc.standardize(Xv_t_f)

        # Build the polynomial basis and fit the ridge regression.
        final_degree = degrees[v]
        best_lambda = lambdas[v]
        final_X = impl.build_poly(tXv, final_degree)
        final_Xt = impl.build_poly(tXv_t, final_degree)
        final_w, _ = impl.ridge_regression(y_v, final_X, best_lambda)

        # Generate the prediction for this category.
        y_pred_all.append(predict_labels(final_w, final_Xt))

    # Concatenate all predictions and sort them by original test-row index.
    Xt_cat_all = [idx for sublist in X_t_cat for idx in sublist]
    y_pred = [yi for sublist in y_pred_all for yi in sublist]
    final_ypred = np.asarray(y_pred)[np.argsort(Xt_cat_all)]

    # Create the submission file.
    OUTPUT_PATH = '../submissions/results__4categories_fillByCat_run.csv'

    create_csv_submission(ids_t, final_ypred, OUTPUT_PATH)
    print('Congratulations ........ Submission file created ::: ', OUTPUT_PATH)
# For ridge: for every model (one per data subset), test different lambdas
# and degrees.
# NOTE(review): the innermost loop body is cut off after the timing
# measurement in this fragment; the accumulators below are presumably filled
# by code that follows.

D = len(degrees)
L = len(lambdas)
# averages of the f1/accuracy over the k folds for each (degree, lambda) cell
metrics_tot = []

# higher level: we keep the k_metrics_train, k_metrics_test and optcutoffs in a similar table
save_metrics = []

for idx_subset, (x_train, y_train) in enumerate(clean_data_trains):
    print('##### START SUBSET {} #####'.format(idx_subset))
    save_metric = []

    for idx_deg, deg in enumerate(degrees):
        # The polynomial expansion depends only on the degree, so it is
        # built once per degree and reused for every lambda.
        x_poly = imp.build_poly(x_train, deg)
        temp1 = []
        print("{d}/{D} row".format(d=idx_deg, D=D))
        for idx_lambda, lambda_ in enumerate(lambdas):
            # Two-argument fit function with the current lambda baked in.
            # It reads lambda_ when called, which is fine as long as
            # k_fold_cv calls it before the next iteration -- TODO confirm.
            ridge = lambda y, x: imp.ridge_regression(y, x, lambda_)

            # Wall-clock timing of one cross-validation run.
            start = datetime.datetime.now()

            k_metrics_train, k_metrics_test, _ = imp.k_fold_cv(y_train,
                                                               x_poly,
                                                               KFOLD,
                                                               ridge,
                                                               METRIC,
                                                               verbose=False)

            end = datetime.datetime.now()
Example #9
0
# Split the test set by the value in column 22 and drop that column from each
# group. ``index0``, ``index1`` and ``tX0`` are defined before this fragment.
tX1 = tX_test[index1, :]
tX1 = np.delete(tX1, 22, 1)
index2 = tX_test[:, 22] == 2
tX2 = tX_test[index2, :]
tX2 = np.delete(tX2, 22, 1)
index3 = tX_test[:, 22] == 3
tX3 = tX_test[index3, :]
tX3 = np.delete(tX3, 22, 1)

# Apply the project's formatting step to every group.
tX0_final, index_final_0 = im.formating(tX0)
tX1_final, index_final_1 = im.formating(tX1)
tX2_final, index_final_2 = im.formating(tX2)
tX3_final, index_final_3 = im.formating(tX3)

# Building the polynomial basis for the test data and predicting results
tX0_final_test = im.build_poly(tX0_final, degree0)
ypred0 = predict_labels(weight0, tX0_final_test)
tX1_final_test = im.build_poly(tX1_final, degree1)
ypred1 = predict_labels(weight1, tX1_final_test)
tX2_final_test = im.build_poly(tX2_final, degree2)
ypred2 = predict_labels(weight2, tX2_final_test)
tX3_final_test = im.build_poly(tX3_final, degree3)
ypred3 = predict_labels(weight3, tX3_final_test)

# Assembling the predicted y
# Fix: y_pred is 1-D, so each group's predictions are flattened before the
# masked assignment. The previous (n, 1) reshape is rejected by NumPy when
# assigned through a boolean mask into a 1-D array.
y_pred = np.zeros((tX_test.shape[0]))
y_pred[index0] = ypred0.ravel()
y_pred[index1] = ypred1.ravel()
y_pred[index2] = ypred2.ravel()
y_pred[index3] = ypred3.ravel()
Example #10
0
# Clean copies of the loaded data; the result is indexed per jet (0..3) below.
xs, ys = clean_input_data(
    x_loaded.copy(),
    y_loaded.copy(),
    corr=1,
    dimension_expansion=5,
    bool_col=True,
)
# Relabel the -1 class as 0 in each of the four target arrays.
for jet in range(4):
    ys[jet][ys[jet] == -1] = 0

xs, mean_log, std_log = concatenate_log(xs.copy())
print("Train data cleaned")

# 3. Build the polynomials (one for each one of the 4 datasets)
degree = 2
txs = [build_poly(xs[jet], degree) for jet in range(4)]
print("The train polynomials have been built.")

# 4. Set the array of gammas for the logistic regression
gamma_constants = [1e-5, 1e-6]  # one for the degree 1 and one for the degree 2
gammas = []
for jet in range(4):
    ncolumns = xs[jet].shape[1]
    # Column vector: one leading entry using the first constant, followed by
    # ncolumns entries per degree, each group using that degree's constant.
    gammas.append(
        np.concatenate(
            [[gamma_constants[0]]]
            + [[g] * ncolumns for g in gamma_constants[:degree]]
        ).reshape(-1, 1)
    )


# 5. run the logistic regression on the four datasets
def logistic_regression_on_jet(jet):
    y = ys[jet]
    tx = txs[jet]