def test_data_model():
    """Train three logistic-regression variants on interaction features.

    Runs Ridge(0.1), Lasso(0.1) and an unpenalized model on the
    interaction-composed feature set, writing one submission CSV per run.
    """
    title = 'complex_full_before_ddl_interactions_full'
    print("Base line testing for model " + title)
    b_time = datetime.datetime.now()
    print('Beginning reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(
        s=(datetime.datetime.now() - b_time).total_seconds()))
    _, test_x, test_ids = load_test_data(clean=False)

    # The identical feature transform must be applied to both splits so the
    # learned weights line up with the test design matrix.
    data = compose_interactions_for_transforms(tX)
    t_data = compose_interactions_for_transforms(test_x)

    def _run_experiment(suffix, train_kwargs, **model_kwargs):
        # One experiment: fit on (y, data), predict on the test split and
        # dump a submission file named after the model variant.
        logistic = LogisticRegression((y, data), **model_kwargs)
        weight = logistic.train(lr=0.01, decay=0.5, max_iters=2000,
                                early_stop=1000, **train_kwargs)
        pred_label = predict_labels(weight, t_data)
        create_csv_submission(
            test_ids, pred_label,
            get_dataset_dir() +
            '/submission/removed_outlier_{}.csv'.format(title + suffix))

    # Test 1: Ridge penalty, lambda = 0.1
    _run_experiment('-Ridge0.1', {'decay_intval': 100},
                    regularizer='Ridge', regularizer_p=0.1)
    # Test 2: Lasso penalty, lambda = 0.1
    _run_experiment('-Lasso0.1', {'decay_intval': 100},
                    regularizer='Lasso', regularizer_p=0.1)
    # Test 3: no regularization (the original run passed no decay_intval here,
    # so the helper's default of omitting it is preserved).
    _run_experiment('', {})
def test_draw():
    """Train a Lasso logistic regression on a balanced subsample.

    NOTE(review): kept for reference — per the original author's note, the
    balanced draw produced worse results than training on the full,
    unbalanced data.
    """
    b_time = datetime.datetime.now()
    print('Beginning reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(
        s=(datetime.datetime.now() - b_time).total_seconds()))

    # Full feature pipeline: intercept column, pairwise interactions,
    # log/sqrt/power transforms and PCA components.
    data, _, _ = compose_complex_features_further(tX, intercept=True,
                                                  interaction=True,
                                                  log=True,
                                                  sqrt=True,
                                                  power=True,
                                                  pca=True)
    # Draw a class-balanced train/validation split of 6000 training samples.
    train, valid = draw_balanced_subsample(y, data, trainsize=6000)

    logistic = LogisticRegression(train=train, validation=valid,
                                  regularizer='Lasso', regularizer_p=0.5)
    result = logistic.train(lr=0.01, decay=0.5, early_stop=400,
                            max_iters=2000)
    print(result)
def test_logistic():
    """Fit a Lasso-regularized logistic regression with mini-batch SGD."""
    b_time = datetime.datetime.now()
    print('Beginning reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(
        s=(datetime.datetime.now() - b_time).total_seconds()))

    # standardize() returns a (data, mean, std) tuple (see its other call
    # sites in this file); unpack explicitly instead of rebinding tX to the
    # tuple and indexing tX[0] later.
    data, _, _ = standardize(tX)

    e_time = datetime.datetime.now()
    print("Finish data reading in {s} seconds".format(
        s=(e_time - b_time).total_seconds()))

    logistic = LogisticRegression((y, data), regularizer="Lasso",
                                  regularizer_p=0.1)
    result = logistic.train(lr=0.05, batch_size=128, max_iters=1000)
    print(result)
def test_baseline():
    """Baseline run: train logistic regression, predict test set, submit.

    Writes predictions to ``<dataset_dir>/submission/logistic_baseline.csv``.
    """
    print("base line testing")
    b_time = datetime.datetime.now()
    print('Beginning reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(
        s=(datetime.datetime.now() - b_time).total_seconds()))

    data = baseline_logistic(tX)
    logistic = LogisticRegression((y, data))
    weight = logistic.train(lr=0.01, decay=1)

    # Apply the same baseline transform to the test split and write the
    # submission. (The original guarded this behind an always-true `plot`
    # flag and imported an unused plotting helper; both removed.)
    _, test_x, test_ids = load_test_data(clean=False)
    t_data = baseline_logistic(test_x)
    pred_label = predict_labels(weight, t_data)
    create_csv_submission(
        test_ids, pred_label,
        get_dataset_dir() + '/submission/logistic_baseline.csv')
# Label-name metadata; not referenced below — presumably used elsewhere.
labels = unpickle('../data/meta')

# Restrict both splits to the coarse classes of interest:
# 0 = aquatic mammals, 1 = fish.
interesting_coarse_labels = [0, 1]


def _select_coarse(split):
    """Return (rows, labels) arrays for samples with an interesting coarse label.

    Equivalent to the original nested loop: a sample is kept when its coarse
    label is in ``interesting_coarse_labels``, and the matched label itself
    is recorded as the target.
    """
    xs, ys = [], []
    for label, row in zip(split[b'coarse_labels'], split[b'data']):
        if label in interesting_coarse_labels:
            xs.append(row)
            ys.append(label)
    return np.array(xs), np.array(ys)


train, y = _select_coarse(train_data)
test, y_test = _select_coarse(test_data)

weight_matrix, losses = LogisticRegression.train(train, y, iteration=1,
                                                 learning_rate=0.1)
LogisticRegression.accuracy(weight_matrix, test, y_test)