Beispiel #1
0
def test_pca_logistic2():
    """
    Cross-validate a Lasso logistic regression on PCA + interaction features
    and write a timestamped submission file.

    The same feature matrix is built for the training and the test data:
    the first 10 standardized columns, the ``nb_pc`` (= 5) columns returned
    by ``pca_transform`` and the standardized output of ``interactions``
    over column indices 0-9.

    Note kept from the original author: according to the PCA first 3
    component test, the selected index was 3,8,5,9,7,10,2,1,6,0,4 (0-10).

    :return: None, the predictions are handed to ``create_csv_submission``.
    """
    print('Submission added test full')

    start = datetime.datetime.now()
    print('Begining reading data')
    train_path = get_filepath('train')
    y, raw_train, _ = load_csv_data(train_path)

    load_seconds = (datetime.datetime.now() - start).total_seconds()
    print("Finish loading in {} seconds".format(load_seconds))

    train_std, _, _ = standardize(raw_train)
    print("test bias")
    test_bias(y)

    nb_pc = 5
    print("test the PCA with {} elements".format(nb_pc))
    _, train_pc = pca_transform(train_std, nb_pc, concatenate=False)

    print("get interactions")
    train_inter, _, _ = standardize(interactions(train_std, range(10)))

    print("select first 10 data entry with pc data")
    # Column layout: [first 10 standardized columns | PCA data | interactions]
    train_features = np.c_[train_std[:, :10], train_pc, train_inter]

    prepared = datetime.datetime.now()
    print("Finish data reading in {} seconds".format(
        (prepared - start).total_seconds()))

    logistic = LogisticRegression((y, train_features),
                                  regularizer="Lasso",
                                  regularizer_p=0.)
    cv_result = logistic.cross_validation(4, [0.5], 'regularizer_p',
                                          lr=0.1,
                                          batch_size=32,
                                          max_iters=1000,
                                          early_stop=1000,
                                          skip=True)
    # The first entry of the result is iterated below as a collection of
    # weight vectors (presumably one per fold -- confirm against
    # LogisticRegression.cross_validation).
    weights = cv_result[0]

    # NOTE(review): the test set is standardized with its own statistics and
    # pca_transform is run on it directly, i.e. nothing computed on the
    # training data is reused here -- confirm this is intended.
    _, raw_test, test_ids = load_test_data(clean=False)
    test_std, _, _ = standardize(raw_test)
    _, test_pc = pca_transform(test_std, nb_pc, concatenate=False)

    print("get interactions")
    test_inter, _, _ = standardize(interactions(test_std, range(10)))

    print("select first 10 data entry with pc data")
    test_features = np.c_[test_std[:, :10], test_pc, test_inter]

    # One prediction array per weight vector, averaged element-wise.
    per_weight = [logistic(test_features, w) for w in weights]
    y_pred = np.average(per_weight, axis=0)

    # Threshold the averaged predictions at 0.5 into the labels -1 / 1.
    y_pred[y_pred <= 0.5] = -1
    y_pred[y_pred > 0.5] = 1

    dataset_dir = get_dataset_dir()
    stamp = str(datetime.datetime.now())
    output_path = dataset_dir + '/submission/pca_test{}.csv'.format(stamp)
    create_csv_submission(test_ids, y_pred, output_path)
Beispiel #2
0
def _final_feature_matrix(raw_features, nb_pc):
    """Build the feature matrix used by ``test_final`` from raw features.

    Steps: standardize the raw features, obtain ``nb_pc`` PCA columns via
    ``pca_transform``, compute and standardize the ``interactions`` over
    column indices 0-9, then concatenate
    [first 10 standardized columns | PCA data | interactions].

    :param raw_features: 2-D array of raw features (one row per sample).
    :param nb_pc: number of principal components requested from
        ``pca_transform``.
    :return: the concatenated feature matrix.
    """
    standardized, _, _ = standardize(raw_features)

    print("test the PCA with {} elements".format(nb_pc))
    _, pc_data = pca_transform(standardized, nb_pc, concatenate=False)

    print("get interactions")
    interaction, _, _ = standardize(interactions(standardized, range(0, 10)))

    print("select first 10 data entry with pc data")
    return np.c_[standardized[:, 0:10], pc_data, interaction]


def test_final():
    """Train the final logistic regression and write the second submission.

    The training data is loaded, turned into the PCA + interaction feature
    matrix and used to cross-validate a Lasso-configured
    ``LogisticRegression`` (``regularizer_p`` candidates: ``[0.]``).  The
    test set goes through the same feature construction, every weight
    vector in ``result[0]`` produces one prediction array, and the
    element-wise average of those arrays is thresholded at 0.5 into the
    labels -1 / 1 before being written to ``second_submission.csv``.

    :return: None, the predictions are handed to ``create_csv_submission``.
    """
    b_time = datetime.datetime.now()
    print('Begining reading data')
    y, tX, _ = load_csv_data(get_filepath('train'))
    print("Finish loading in {s} seconds".format(
        s=(datetime.datetime.now() - b_time).total_seconds()))

    nb_pc = 5
    train_data = _final_feature_matrix(tX, nb_pc)

    print("Finish data reading in {s} seconds".format(
        s=(datetime.datetime.now() - b_time).total_seconds()))

    # Cross-validated logistic regression trained on the engineered features.
    logistic = LogisticRegression((y, train_data),
                                  regularizer='Lasso',
                                  regularizer_p=0.)
    result = logistic.cross_validation(4, [0.],
                                       'regularizer_p',
                                       lr=0.1,
                                       batch_size=32,
                                       max_iters=1200,
                                       early_stop=400)
    # Iterated below as a collection of weight vectors (presumably one per
    # fold -- confirm against LogisticRegression.cross_validation).
    weight = result[0]

    print("loading the test set")
    _, test_x, test_ids = load_test_data(clean=False)
    # NOTE(review): the test features are standardized / PCA-transformed on
    # their own, nothing computed on the training data is reused -- confirm
    # this is intended.
    test_data = _final_feature_matrix(test_x, nb_pc)

    print("Finish data reading in {s} seconds".format(
        s=(datetime.datetime.now() - b_time).total_seconds()))

    # Keep one prediction array per weight vector and average them
    # element-wise.  The previous code did ``y_pred += _y_pred`` on a list
    # (which flattens the predictions into it) followed by ``np.average``
    # without an axis, collapsing everything to a single scalar that could
    # not be thresholded per sample.
    predictions = [logistic(test_data, w) for w in weight]
    y_pred = np.average(predictions, axis=0)

    # Threshold the averaged predictions at 0.5 into the labels -1 / 1.
    y_pred[y_pred <= 0.5] = -1
    y_pred[y_pred > 0.5] = 1

    output_path = get_dataset_dir() + '/second_submission.csv'
    create_csv_submission(test_ids, y_pred, output_path)