def test_k_fold_logistic():
    """Run 5-fold cross-validation of a Lasso-regularized logistic regression
    over a log-spaced lambda grid and save the train/test error curves."""
    np.set_printoptions(precision=4)
    start = datetime.datetime.now()
    print('Begining reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(
        s=(datetime.datetime.now() - start).total_seconds()))
    tX = remove_dimensions(tX)
    tX = standardize(tX)
    done = datetime.datetime.now()
    print("Finish data reading in {s} seconds".format(
        s=(done - start).total_seconds()))
    # Lambda space
    lambdas = np.logspace(-3, 1, 10)
    # standardize() returns a tuple; element 0 holds the standardized data.
    model = LogisticRegression((y, tX[0]), regularizer='Lasso',
                               regularizer_p=0.1)
    best_lambda, (tr_err, te_err) = model.cross_validation(
        5, lambdas, lambda_name='regularizer_p', max_iters=6000)
    print('best lambda {}'.format(best_lambda))
    save_path = get_plot_path(test_k_fold_logistic.__name__)
    # Persist both error curves for later plotting.
    for tag, errs in (("tr_err", tr_err), ("te_err", te_err)):
        np.save(save_path + tag, np.array(errs))
def test_complex():
    """Train a Lasso-regularized logistic regression on composed complex
    features (intercept + interactions + log + PCA) via cross-validation."""
    start = datetime.datetime.now()
    print('Begining reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(
        s=(datetime.datetime.now() - start).total_seconds()))
    tX, _, _ = standardize(tX, intercept=False)
    complex_tx, _, _ = compose_complex_features(
        tX, intercept=True, interaction=True, log=True, sqrt=False, pca=True)
    test_bias(y)
    model = LogisticRegression((y, complex_tx), regularizer="Lasso",
                               regularizer_p=0.5)
    result = model.cross_validation(
        4, [0.5], 'regularizer_p',
        lr=0.1, batch_size=32, max_iters=6000, early_stop=1000)
def test_pca_logistic():
    """
    According to the PCA first 3 component test, the selected index:
    3,8,5,9,7,10,2,1,6,0,4 0-10

    Builds a feature matrix of the first 10 standardized columns, 5 principal
    components, and standardized pairwise interactions, then cross-validates
    a logistic regression on it.
    :return:
    """
    start = datetime.datetime.now()
    print('Begining reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(
        s=(datetime.datetime.now() - start).total_seconds()))
    data, x_mean, x_std = standardize(tX)
    print("test bias")
    test_bias(y)
    nb_pc = 5
    print("test the PCA with {} elements".format(nb_pc))
    pcs, pc_data = pca_transform(data, nb_pc, concatenate=False)
    print("get interactions")
    interaction = interactions(data, range(0, 10))
    interaction, _, _ = standardize(interaction)
    print("select first 10 data entry with pc data")
    # First 10 raw columns, principal components, then interactions.
    data = np.c_[data[:, :10], pc_data, interaction]
    done = datetime.datetime.now()
    print("Finish data reading in {s} seconds".format(
        s=(done - start).total_seconds()))
    model = LogisticRegression((y, data), regularizer="Lasso",
                               regularizer_p=0.)
    result = model.cross_validation(
        4, [0.5], 'regularizer_p',
        lr=0.1, batch_size=32, max_iters=6000, early_stop=1000)
    print(result)
def test_pca_logistic2():
    """
    According to the PCA first 3 component test, the selected index:
    3,8,5,9,7,10,2,1,6,0,4 0-10

    Trains with cross-validation on PCA + interaction features, then applies
    the same feature pipeline to the test set, averages per-fold predictions,
    thresholds them to {-1, 1}, and writes a CSV submission.
    :return:
    """
    print('Submission added test full')
    start = datetime.datetime.now()
    print('Begining reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(
        s=(datetime.datetime.now() - start).total_seconds()))
    data, x_mean, x_std = standardize(tX)
    print("test bias")
    test_bias(y)
    nb_pc = 5
    print("test the PCA with {} elements".format(nb_pc))
    pcs, pc_data = pca_transform(data, nb_pc, concatenate=False)
    print("get interactions")
    interaction = interactions(data, range(0, 10))
    interaction, _, _ = standardize(interaction)
    print("select first 10 data entry with pc data")
    data = np.c_[data[:, :10], pc_data, interaction]
    done = datetime.datetime.now()
    print("Finish data reading in {s} seconds".format(
        s=(done - start).total_seconds()))
    model = LogisticRegression((y, data), regularizer="Lasso",
                               regularizer_p=0.5)
    result = model.cross_validation(
        4, [0.5], 'regularizer_p',
        lr=0.1, batch_size=32, max_iters=1000, early_stop=1000, skip=True)
    weights = result[0]
    _, test_x, test_ids = load_test_data(clean=False)
    # NOTE(review): the test set is standardized with its own statistics,
    # not the training x_mean/x_std — confirm against standardize()'s contract.
    test_data, x_mean, x_std = standardize(test_x)
    pcs, pc_data = pca_transform(test_data, nb_pc, concatenate=False)
    print("get interactions")
    interaction = interactions(test_data, range(0, 10))
    interaction, _, _ = standardize(interaction)
    print("select first 10 data entry with pc data")
    test_data = np.c_[test_data[:, :10], pc_data, interaction]
    # Average the per-fold prediction vectors, then threshold to {-1, 1}.
    fold_preds = [model(test_data, w) for w in weights]
    y_pred = np.average(fold_preds, axis=0)
    y_pred[np.where(y_pred <= 0.5)] = -1
    y_pred[np.where(y_pred > 0.5)] = 1
    output_path = get_dataset_dir() + \
        '/submission/pca_test{}.csv'.format(str(datetime.datetime.now()))
    create_csv_submission(test_ids, y_pred, output_path)
def test_final():
    """Train a logistic regression on PCA + interaction features via 4-fold
    cross-validation, then write an ensemble-averaged submission CSV.

    Bug fixed: predictions were accumulated with ``y_pred += _y_pred`` —
    which, on a Python list, *extends* it element-wise, flattening every
    fold's predictions into one long list — and then averaged with
    ``np.average(y_pred)`` (no ``axis``), collapsing everything to a 0-d
    scalar so the subsequent ``y_pred[np.where(...)] = -1`` assignment
    raises. Each fold's prediction vector is now appended and the average
    is taken across folds (``axis=0``), matching test_pca_logistic2.
    """
    b_time = datetime.datetime.now()
    print('Begining reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(
        s=(datetime.datetime.now() - b_time).total_seconds()))
    data, _, _ = standardize(tX)
    nb_pc = 5
    print("test the PCA with {} elements".format(nb_pc))
    pcs, pc_data = pca_transform(data, nb_pc, concatenate=False)
    print("get interactions")
    interaction = interactions(data, range(0, 10))
    interaction, _, _ = standardize(interaction)
    print("select first 10 data entry with pc data")
    data = np.c_[data[:, 0:10], pc_data]
    data = np.c_[data, interaction]
    e_time = datetime.datetime.now()
    print("Finish data reading in {s} seconds".format(
        s=(e_time - b_time).total_seconds()))
    logistic = LogisticRegression((y, data), regularizer='Lasso',
                                  regularizer_p=0.)
    result = logistic.cross_validation(
        4, [0.], 'regularizer_p',
        lr=0.1, batch_size=32, max_iters=1200, early_stop=400)
    weight = result[0]
    print("loading the test set")
    _, test_data, test_ids = load_test_data(clean=False)
    # Feature transform — must mirror the training pipeline above.
    # NOTE(review): the test set is standardized with its own statistics,
    # not the training mean/std — confirm against standardize()'s contract.
    data, _, _ = standardize(test_data)
    print("test the PCA with {} elements".format(nb_pc))
    pcs, pc_data = pca_transform(data, nb_pc, concatenate=False)
    print("get interactions")
    interaction = interactions(data, range(0, 10))
    interaction, _, _ = standardize(interaction)
    print("select first 10 data entry with pc data")
    data = np.c_[data[:, 0:10], pc_data]
    data = np.c_[data, interaction]
    e_time = datetime.datetime.now()
    print("Finish data reading in {s} seconds".format(
        s=(e_time - b_time).total_seconds()))
    # Fix: collect one prediction vector per fold weight, average across
    # folds, then threshold probabilities to the {-1, 1} label convention.
    y_pred = []
    for w in weight:
        y_pred.append(logistic(data, w))
    y_pred = np.average(y_pred, axis=0)
    y_pred[np.where(y_pred <= 0.5)] = -1
    y_pred[np.where(y_pred > 0.5)] = 1
    output_path = get_dataset_dir() + '/second_submission.csv'
    create_csv_submission(test_ids, y_pred, output_path)