Example #1
0
def test_data_model():
    """Train three logistic-regression variants (Ridge 0.1, Lasso 0.1,
    unregularized) on interaction features and write one submission CSV
    per variant.
    """
    title = 'complex_full_before_ddl_interactions_full'
    print("Base line testing for model " + title)
    b_time = datetime.datetime.now()
    print('Beginning reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(s=(datetime.datetime.now() -
                                                    b_time).total_seconds()))
    _, test_x, test_ids = load_test_data(clean=False)

    # Apply the identical interaction transform to train and test so the
    # learned weights line up with the test feature columns.
    data = compose_interactions_for_transforms(tX)
    t_data = compose_interactions_for_transforms(test_x)

    def _run(model, suffix, **extra_train_kw):
        # One experiment: fit the model, predict on the test features and
        # dump a submission CSV named after the experiment.
        weight = model.train(lr=0.01,
                             decay=0.5,
                             max_iters=2000,
                             early_stop=1000,
                             **extra_train_kw)
        pred_label = predict_labels(weight, t_data)
        create_csv_submission(
            test_ids, pred_label,
            get_dataset_dir() +
            '/submission/removed_outlier_{}.csv'.format(title + suffix))

    # Test 1: Ridge penalty, lambda = 0.1
    _run(LogisticRegression((y, data),
                            regularizer='Ridge',
                            regularizer_p=0.1),
         'Ridge01', decay_intval=100)

    # Test 2: Lasso penalty, lambda = 0.1
    _run(LogisticRegression((y, data),
                            regularizer='Lasso',
                            regularizer_p=0.1),
         '-Lasso0.1', decay_intval=100)

    # Test 3: no penalty (and no decay interval, matching the original runs)
    _run(LogisticRegression((y, data)), '')
Example #2
0
def test_draw():
    """Draw a balanced subsample and train a Lasso logistic regression.

    Kept for the record: the balanced sample produced worse results than
    training on the full (imbalanced) data.
    """
    b_time = datetime.datetime.now()
    print('Beginning reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(s=(datetime.datetime.now() -
                                                    b_time).total_seconds()))

    # Full feature pipeline: intercept, pairwise interactions, log/sqrt/power
    # transforms plus PCA components.
    data, _, _ = compose_complex_features_further(tX,
                                                  intercept=True,
                                                  interaction=True,
                                                  log=True,
                                                  sqrt=True,
                                                  power=True,
                                                  pca=True)
    # Balanced draw: equalizes the class counts in the training split.
    train, valid = draw_balanced_subsample(y, data, trainsize=6000)

    logistic = LogisticRegression(train=train,
                                  validation=valid,
                                  regularizer='Lasso',
                                  regularizer_p=0.5)
    result = logistic.train(lr=0.01, decay=0.5, early_stop=400, max_iters=2000)
    print(result)
Example #3
0
def test_logistic():
    """Smoke-test Lasso logistic regression on standardized raw features."""
    b_time = datetime.datetime.now()
    print('Beginning reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
    print("Finish loading in {s} seconds".format(s=(datetime.datetime.now() -
                                                    b_time).total_seconds()))
    # standardize returns a tuple whose first element is the scaled data;
    # unpack it instead of rebinding tX to the tuple and indexing tX[0].
    data, _, _ = standardize(tX)
    e_time = datetime.datetime.now()
    print("Finish data reading in {s} seconds".format(
        s=(e_time - b_time).total_seconds()))
    logistic = LogisticRegression((y, data),
                                  regularizer="Lasso",
                                  regularizer_p=0.1)
    result = logistic.train(lr=0.05, batch_size=128, max_iters=1000)
    print(result)
Example #4
0
def test_baseline():
    """Train an unregularized logistic regression on baseline features and
    write a submission CSV of predictions for the test set.
    """
    print("base line testing")
    b_time = datetime.datetime.now()
    print('Beginning reading data')
    DATA_TRAIN_PATH = get_filepath('train')
    y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

    print("Finish loading in {s} seconds".format(s=(datetime.datetime.now() -
                                                    b_time).total_seconds()))
    data = baseline_logistic(tX)

    logistic = LogisticRegression((y, data))
    weight = logistic.train(lr=0.01, decay=1)

    # Apply the identical baseline transform to the test features before
    # predicting, so the weight vector matches the test columns.
    _, test_x, test_ids = load_test_data(clean=False)
    t_data = baseline_logistic(test_x)
    pred_label = predict_labels(weight, t_data)
    create_csv_submission(
        test_ids, pred_label,
        get_dataset_dir() + '/submission/logistic_baseline.csv')
Example #5
0
labels = unpickle('../data/meta')
interesting_coarse_labels = [0, 1]  # Aquatic mammals and Fish


def _filter_by_coarse_label(batch, wanted):
    """Return (samples, labels) arrays for entries whose coarse label is in
    *wanted*.

    *batch* is a CIFAR-style dict with b'data' and b'coarse_labels' entries
    of equal length.
    """
    rows, kept = [], []
    for sample, label in zip(batch[b'data'], batch[b'coarse_labels']):
        # Membership test replaces the original inner loop + break: the
        # appended value is the label itself, identical to appending the
        # matching candidate j.
        if label in wanted:
            rows.append(sample)
            kept.append(label)
    return np.array(rows), np.array(kept)


train, y = _filter_by_coarse_label(train_data, interesting_coarse_labels)
test, y_test = _filter_by_coarse_label(test_data, interesting_coarse_labels)

weight_matrix, losses = LogisticRegression.train(train, y,
                                                 iteration=1, learning_rate=0.1)
LogisticRegression.accuracy(weight_matrix, test, y_test)