def ensemble():
    """Train two Ridge-based tree ensembles on kmeans features, then stack them.

    Fits a RidgeExtraTrees and a RidgeRF estimator on the same kmeans feature
    matrix, concatenates their predictions column-wise, and cross-validates a
    plain Ridge stacker on those predictions.

    NOTE(review): the stacker is trained on in-sample predictions of the base
    models, which will overstate ensemble performance — confirm intended.
    """
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects to keep memory down before fitting
    del images
    gc.collect()

    wrapper1 = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper1.fit(train_x, train_y)

    wrapper2 = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper2.fit(train_x, train_y)

    pred1 = wrapper1.predict(train_x)
    pred2 = wrapper2.predict(train_x)
    wrapper3 = ModelWrapper(Ridge)
    # BUG FIX: was np.vstack, which stacks the two prediction matrices
    # row-wise into a (2n, k) array whose row count no longer matches
    # train_y.  hstack keeps one row per sample with both models'
    # predictions as features (matching ensemble_001's approach).
    wrapper3.cross_validation(np.hstack((pred1, pred2)), train_y)
def kmeans_006_submission():
    """Fit RidgeRF on cached kmeans features and write the final submission.

    Prior cross-validation (2014-03-28) completed in ~1487s with scores
    [-0.11018943, -0.10946863].
    """
    n_centroids = 3000
    s = 15
    crop = 150

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Raw images are no longer needed once features are built; free memory
    del images
    gc.collect()

    rf_wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                              {'alpha': 500, 'n_estimators': 500},
                              n_jobs=-1)
    rf_wrapper.fit(train_x, train_y)

    # Crop/scale the test set the same way as training
    test_transformer = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_transformer.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
        memmap=True)
    predictions = rf_wrapper.predict(test_x)
    classes.Submission(predictions).to_file('sub_kmeans_006.csv')
def kmeans_006_colwise_rmse():
    """Estimate per-column and overall RMSE of the kmeans + RidgeRF model.

    Trains on 20% of the data and evaluates on a disjoint 20% split, then
    logs both the column-wise and overall RMSE.

    Returns:
        tuple: (colwise, overall) RMSE values.
    """
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data

    train_x, test_x, train_y, test_y = train_test_split(train_x, train_y, train_size=0.2, test_size=0.2)

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500, 'verbose': 1}, n_jobs=-1)
    wrapper.fit(train_x, train_y)
    # About 11 minutes to train the ridge regression on an m2.4 xlarge with 50% of the train set
    # Took about a minute to train ridge on 0.1 of the train set, but the overall rmse was .114 compared to .106 on 50%, and .104 actual
    # 5 minutes to train ridge on 0.2 of the train set, with rmse of .111
    kmeans_preds = wrapper.predict(test_x)

    logger.info('Kmeans')
    colwise = classes.colwise_rmse(kmeans_preds, test_y)
    overall = classes.rmse(kmeans_preds, test_y)
    # FIX: the computed scores were previously discarded; log and return them
    logger.info('Column-wise RMSE: {}'.format(colwise))
    logger.info('Overall RMSE: {}'.format(overall))
    return colwise, overall

    """
def extra_trees_submission():
    """Fit RidgeExtraTrees on kmeans features and write a submission file.

    Somehow the submission on the leaderboard scores 0.22.
    """
    n_centroids = 3000
    s = 15
    crop = 150

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Drop the raw images now that features have been generated
    del images
    gc.collect()

    et_wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator,
                              {'alpha': 500, 'n_estimators': 500},
                              n_jobs=-1)
    et_wrapper.fit(train_x, train_y)

    # Apply the identical crop/scale preprocessing to the test set
    test_transformer = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_transformer.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
        memmap=True)
    predictions = et_wrapper.predict(test_x)
    classes.Submission(predictions).to_file('sub_kmeans_008.csv')
Beispiel #5
0
def kmeans_006_submission():
    """End-to-end pipeline for the kmeans_006 final submission.

    Crops/scales the training images, fits a kmeans feature generator on
    randomly sampled patches, trains a RidgeRF model on the resulting
    features, and writes test-set predictions to 'sub_kmeans_006.csv'.
    """
    # Final submission
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5
    logger.info("Training with n_centroids {}".format(n_centroids))

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                              n_iterations=20,
                                              n_jobs=-1,)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    # Patches are only needed for fitting the centroids; free them before
    # the memory-heavy feature transform below
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_006.csv')
Beispiel #6
0
def ensemble():
    """Stack RidgeExtraTrees and RidgeRF predictions under a Ridge model.

    Trains both base models on the same kmeans feature matrix, then
    cross-validates a plain Ridge stacker on their column-wise concatenated
    predictions.

    NOTE(review): the stacker sees in-sample base-model predictions, which
    will overstate ensemble performance — confirm intended.
    """
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper1 = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                            n_jobs=-1)
    wrapper1.fit(train_x, train_y)

    wrapper2 = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                            n_jobs=-1)
    wrapper2.fit(train_x, train_y)

    pred1 = wrapper1.predict(train_x)
    pred2 = wrapper2.predict(train_x)
    wrapper3 = ModelWrapper(Ridge)
    # BUG FIX: np.vstack stacked predictions row-wise into (2n, k), which
    # cannot align with train_y's n rows; hstack keeps one row per sample
    # (matching ensemble_001's stacking approach).
    wrapper3.cross_validation(np.hstack((pred1, pred2)), train_y)
Beispiel #7
0
def kmeans_006_submission():
    """Fit RidgeRF on cached kmeans features and write the final submission.

    Uses get_images/train_kmeans_generator helpers for the training side,
    then transforms the test images with the same generator and writes
    predictions to 'sub_kmeans_006.csv'.
    """
    # 2014-03-28 10:53:22 - Base - INFO - Cross validation completed in 1487.18687487.  Scores:
    # 2014-03-28 10:53:22 - Base - INFO - [-0.11018943 -0.10946863]

    # Final submission
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    wrapper.fit(train_x, train_y)

    # Apply the identical crop/scale preprocessing to the test set
    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.
        format(n_centroids),
        memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_006.csv')
Beispiel #8
0
def extra_trees_submission():
    """Fit RidgeExtraTrees on kmeans features and write a submission file.

    Same pipeline as kmeans_006_submission but with the ExtraTrees-based
    estimator; output goes to 'sub_kmeans_008.csv'.
    """
    # Somehow the submission on the leaderboard scores 0.22
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    wrapper.fit(train_x, train_y)

    # Apply the identical crop/scale preprocessing to the test set
    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.
        format(n_centroids),
        memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_008.csv')
Beispiel #9
0
def kmeans_006_colwise_rmse():
    """Estimate per-column and overall RMSE of the kmeans + RidgeRF model.

    Trains on 20% of the data and evaluates on a disjoint 20% split, then
    logs both the column-wise and overall RMSE.

    Returns:
        tuple: (colwise, overall) RMSE values.
    """
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data

    train_x, test_x, train_y, test_y = train_test_split(train_x,
                                                        train_y,
                                                        train_size=0.2,
                                                        test_size=0.2)

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500,
        'verbose': 1
    },
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)
    # About 11 minutes to train the ridge regression on an m2.4 xlarge with 50% of the train set
    # Took about a minute to train ridge on 0.1 of the train set, but the overall rmse was .114 compared to .106 on 50%, and .104 actual
    # 5 minutes to train ridge on 0.2 of the train set, with rmse of .111
    kmeans_preds = wrapper.predict(test_x)

    logger.info('Kmeans')
    colwise = classes.colwise_rmse(kmeans_preds, test_y)
    overall = classes.rmse(kmeans_preds, test_y)
    # FIX: the computed scores were previously discarded; log and return them
    logger.info('Column-wise RMSE: {}'.format(colwise))
    logger.info('Overall RMSE: {}'.format(overall))
    return colwise, overall
    """
Beispiel #10
0
def ensemble_001():
    """
    Ensemble of kmeans and random forest results
    Conducting some analysis of whether the errors from these two models for individual Ys are different

    Ensembled error is .1149.

    Kmeans is better on every class than RF.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_ensemble_001',
        n_iterations=20,
        n_jobs=-1,
    )

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    # Patches are only needed for centroid fitting; free before the
    # memory-heavy feature transform
    del patches
    gc.collect()

    # Kmeans features for the full training set (memmapped to disk)
    X = kmeans_generator.transform(images,
                                   save_to_file='data/data_ensemble_001.npy',
                                   memmap=True)
    Y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    # Get the input for the RF so that we can split together
    sampler = SampleTransformer(training=True,
                                steps=2,
                                step_size=20,
                                n_jobs=-1)
    pX = sampler.transform()

    # manual split of train and test
    train_x, test_x, ptrain_x, ptest_x, train_y, test_y = train_test_split(
        X, pX, Y, test_size=0.5)

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)
    kmeans_preds = wrapper.predict(test_x)

    # Pixel-sample random forest as the second base model
    pWrapper = ModelWrapper(RandomForestRegressor, {
        'n_estimators': 500,
        'verbose': 3
    },
                            n_jobs=-1)
    pWrapper.fit(ptrain_x, train_y)
    pixel_preds = pWrapper.predict(ptest_x)

    logger.info('Kmeans')
    classes.colwise_rmse(kmeans_preds, test_y)
    classes.rmse(kmeans_preds, test_y)
    logger.info('Pixel RF')
    classes.colwise_rmse(pixel_preds, test_y)
    classes.rmse(pixel_preds, test_y)

    logger.info("Ensembling predictions")
    # NOTE(review): etrain_x uses in-sample predictions of the base models,
    # which may overstate the stacker's training fit — confirm intended
    etrain_x = np.hstack(
        (wrapper.predict(train_x), pWrapper.predict(ptrain_x)))
    etest_x = np.hstack((kmeans_preds, pixel_preds))
    # Second-stage RF trained on the concatenated base-model predictions
    eWrapper = ModelWrapper(RandomForestRegressor, {
        'n_estimators': 500,
        'verbose': 3
    },
                            n_jobs=-1)
    eWrapper.fit(etrain_x, train_y)
    ensemble_preds = eWrapper.predict(etest_x)
    classes.colwise_rmse(ensemble_preds, test_y)
    classes.rmse(ensemble_preds, test_y)
Beispiel #11
0
def kmeans_006_submission():
    """End-to-end pipeline for the kmeans_006 final submission.

    Crops/scales the training images, fits a kmeans feature generator on
    randomly sampled patches, trains a RidgeRF model on the resulting
    features, and writes test-set predictions to 'sub_kmeans_006.csv'.
    """
    # Final submission
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5
    logger.info("Training with n_centroids {}".format(n_centroids))

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
        n_iterations=20,
        n_jobs=-1,
    )

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    # Patches are only needed for fitting the centroids; free them early
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(
            n_centroids),
        memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.
        format(n_centroids),
        memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_006.csv')
Beispiel #12
0
def kmeans_003():
    """
    Grid search for Ridge RF parameters
    Not sure whether to use spherical or minibatch, so maybe do one run with both

    .106 on the leaderboard.  So the difference in CV scores narrowed
    """

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_150_scale_15.npy',
        crop_size=150,
        scaled_size=15,
        n_jobs=-1,
        memmap=True)

    # spherical generator
    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=1600,
        rf_size=5,
        result_path='data/mdl_kmeans_002_new',
        n_iterations=20,
        n_jobs=-1,
    )

    # minibatch generator
    # kmeans_generator = models.KMeansFeatures.KMeansFeatureGenerator(n_centroids=1600,
    #                                                                 rf_size=5,
    #                                                                 result_path='data/mdl_kmeans_002_new_minibatch',
    #                                                                 method='minibatch',
    #                                                                 n_init=1,
    #                                                                 n_jobs=-1,)

    # Don't need to fit, as already cached
    # NOTE(review): fit is called with an empty string because the centroids
    # are loaded from the cache at result_path — confirm fit tolerates this
    patches = ''
    kmeans_generator.fit(patches)
    images = train_x_crop_scale.transform()

    # Problematic here - memory usage spikes to ~ 11GB when threads return
    # train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_002_new.npy', memmap=True)
    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_002_new.npy',
        memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()
    # mdl = models.Ridge.RidgeRFEstimator(alpha=14, n_estimators=250, n_jobs=-1)
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 14,
        'n_estimators': 500
    },
                           n_jobs=-1)
    params = {'alpha': [150, 250, 500, 750, 1000], 'n_estimators': [250]}

    # 500 trees and alpha 25 gives cv of .10972 on 2-fold CV, but 25 was on the upper range of the search space,
    # So need to re-run with larger range of alpha
    # Will hit 30GB of ram with 500 trees.
    wrapper.grid_search(train_x,
                        train_y,
                        params,
                        refit=False,
                        parallel_estimator=True)

    # [mean: -0.11024, std: 0.00018, params: {'n_estimators': 250, 'alpha': 20.0},
    # mean: -0.11000, std: 0.00019, params: {'n_estimators': 250, 'alpha': 25.0},
    # mean: -0.10969, std: 0.00018, params: {'n_estimators': 250, 'alpha': 35},
    # mean: -0.10934, std: 0.00019, params: {'n_estimators': 250, 'alpha': 50},
    # mean: -0.10892, std: 0.00025, params: {'n_estimators': 250, 'alpha': 75},
    # mean: -0.10860, std: 0.00025, params: {'n_estimators': 250, 'alpha': 100},
    # mean: -0.10828, std: 0.00019, params: {'n_estimators': 250, 'alpha': 150},
    # mean: -0.10789, std: 0.00016, params: {'n_estimators': 250, 'alpha': 250},
    # mean: -0.10775, std: 0.00024, params: {'n_estimators': 250, 'alpha': 500},
    # mean: -0.10779, std: 0.00022, params: {'n_estimators': 250, 'alpha': 750},
    # mean: -0.10784, std: 0.00023, params: {'n_estimators': 250, 'alpha': 1000}]

    # Fit the final model
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)
    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_150_scale_15.npy',
        crop_size=150,
        scaled_size=15,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_kmeans_test_features_003_new.npy',
        memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_003.csv')
Beispiel #13
0
# NOTE(review): script fragment — relies on train_x, train_y, crop, s,
# n_centroids and kmeans_generator being defined earlier; it trains a
# RidgeRF model and writes test predictions to 'sub_kmeans_006.csv'.
wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
    'alpha': 500,
    'n_estimators': 500
},
                       n_jobs=-1)
wrapper.fit(train_x, train_y)

test_x_crop_scale = CropScaleImageTransformer(
    training=False,
    result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
    crop_size=crop,
    scaled_size=s,
    n_jobs=-1,
    memmap=True)

# Crop and scale the test images
test_images = test_x_crop_scale.transform()

# Generate the test features
test_x = kmeans_generator.transform(
    test_images,
    save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(
        n_centroids),
    memmap=True)

# Predict on the test features
res = wrapper.predict(test_x)

# Generate a submission file
sub = classes.Submission(res)
sub.to_file('sub_kmeans_006.csv')
Beispiel #14
0
def ensemble_001():
    """
    Ensemble of kmeans and random forest results
    Conducting some analysis of whether the errors from these two models for individual Ys are different

    Ensembled error is .1149.

    Kmeans is better on every class than RF.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_ensemble_001',
                                              n_iterations=20,
                                              n_jobs=-1,)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()
    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    # Patches are only needed for centroid fitting; free before the
    # memory-heavy feature transform
    del patches
    gc.collect()

    # Kmeans features for the full training set (memmapped to disk)
    X = kmeans_generator.transform(images, save_to_file='data/data_ensemble_001.npy', memmap=True)
    Y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    # Get the input for the RF so that we can split together
    sampler = SampleTransformer(training=True, steps=2, step_size=20, n_jobs=-1)
    pX = sampler.transform()

    # manual split of train and test
    train_x, test_x, ptrain_x, ptest_x, train_y, test_y = train_test_split(X, pX, Y, test_size=0.5)

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.fit(train_x, train_y)
    kmeans_preds = wrapper.predict(test_x)

    # Pixel-sample random forest as the second base model
    pWrapper = ModelWrapper(RandomForestRegressor, {'n_estimators': 500, 'verbose': 3}, n_jobs=-1)
    pWrapper.fit(ptrain_x, train_y)
    pixel_preds = pWrapper.predict(ptest_x)

    logger.info('Kmeans')
    classes.colwise_rmse(kmeans_preds, test_y)
    classes.rmse(kmeans_preds, test_y)
    logger.info('Pixel RF')
    classes.colwise_rmse(pixel_preds, test_y)
    classes.rmse(pixel_preds, test_y)

    logger.info("Ensembling predictions")
    # NOTE(review): etrain_x uses in-sample predictions of the base models,
    # which may overstate the stacker's training fit — confirm intended
    etrain_x = np.hstack((wrapper.predict(train_x), pWrapper.predict(ptrain_x)))
    etest_x = np.hstack((kmeans_preds, pixel_preds))
    # Second-stage RF trained on the concatenated base-model predictions
    eWrapper = ModelWrapper(RandomForestRegressor, {'n_estimators': 500, 'verbose': 3}, n_jobs=-1)
    eWrapper.fit(etrain_x, train_y)
    ensemble_preds = eWrapper.predict(etest_x)
    classes.colwise_rmse(ensemble_preds, test_y)
    classes.rmse(ensemble_preds, test_y)
Beispiel #15
0
def kmeans_003():
    """
    Grid search for Ridge RF parameters
    Not sure whether to use spherical or minibatch, so maybe do one run with both

    .106 on the leaderboard.  So the difference in CV scores narrowed
    """

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_150_scale_15.npy',
                                                   crop_size=150,
                                                   scaled_size=15,
                                                   n_jobs=-1,
                                                   memmap=True)



    # spherical generator
    kmeans_generator = KMeansFeatureGenerator(n_centroids=1600,
                                              rf_size=5,
                                              result_path='data/mdl_kmeans_002_new',
                                              n_iterations=20,
                                              n_jobs=-1,)

    # minibatch generator
    # kmeans_generator = models.KMeansFeatures.KMeansFeatureGenerator(n_centroids=1600,
    #                                                                 rf_size=5,
    #                                                                 result_path='data/mdl_kmeans_002_new_minibatch',
    #                                                                 method='minibatch',
    #                                                                 n_init=1,
    #                                                                 n_jobs=-1,)


    # Don't need to fit, as already cached
    # NOTE(review): fit is called with an empty string because the centroids
    # are loaded from the cache at result_path — confirm fit tolerates this
    patches = ''
    kmeans_generator.fit(patches)
    images = train_x_crop_scale.transform()

    # Problematic here - memory usage spikes to ~ 11GB when threads return
    # train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_002_new.npy', memmap=True)
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_002_new.npy', memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()
    # mdl = models.Ridge.RidgeRFEstimator(alpha=14, n_estimators=250, n_jobs=-1)
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 14, 'n_estimators': 500}, n_jobs=-1)
    params = {
        'alpha': [150, 250, 500, 750, 1000],
        'n_estimators': [250]
    }

    # 500 trees and alpha 25 gives cv of .10972 on 2-fold CV, but 25 was on the upper range of the search space,
    # So need to re-run with larger range of alpha
    # Will hit 30GB of ram with 500 trees.
    wrapper.grid_search(train_x, train_y, params, refit=False, parallel_estimator=True)

    # [mean: -0.11024, std: 0.00018, params: {'n_estimators': 250, 'alpha': 20.0},
    # mean: -0.11000, std: 0.00019, params: {'n_estimators': 250, 'alpha': 25.0},
    # mean: -0.10969, std: 0.00018, params: {'n_estimators': 250, 'alpha': 35},
    # mean: -0.10934, std: 0.00019, params: {'n_estimators': 250, 'alpha': 50},
    # mean: -0.10892, std: 0.00025, params: {'n_estimators': 250, 'alpha': 75},
    # mean: -0.10860, std: 0.00025, params: {'n_estimators': 250, 'alpha': 100},
    # mean: -0.10828, std: 0.00019, params: {'n_estimators': 250, 'alpha': 150},
    # mean: -0.10789, std: 0.00016, params: {'n_estimators': 250, 'alpha': 250},
    # mean: -0.10775, std: 0.00024, params: {'n_estimators': 250, 'alpha': 500},
    # mean: -0.10779, std: 0.00022, params: {'n_estimators': 250, 'alpha': 750},
    # mean: -0.10784, std: 0.00023, params: {'n_estimators': 250, 'alpha': 1000}]

    # Fit the final model
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.fit(train_x, train_y)
    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_150_scale_15.npy',
                                                  crop_size=150,
                                                  scaled_size=15,
                                                  n_jobs=-1,
                                                  memmap=True)


    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(test_images, save_to_file='data/data_kmeans_test_features_003_new.npy', memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_003.csv')
# NOTE(review): script fragment — relies on images, train_x, train_y, crop,
# s, n_centroids and kmeans_generator being defined earlier; it trains a
# RidgeRF model and writes test predictions to 'sub_kmeans_006.csv'.
del images
gc.collect()

# ModelWrapper is a convenience class that we created to automate some of the typical tasks
# like logging, grid search and cross validation.
# For fit, it is basically equivalent to calling fit on the estimator

# The estimator takes the X and y and trains a ridge regression (sklearn.linear_model.Ridge),
# predicts using the ridge regressor, then uses the results of the prediction to train a random forest.
wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
wrapper.fit(train_x, train_y)

test_x_crop_scale = CropScaleImageTransformer(training=False,
                                              result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                              crop_size=crop,
                                              scaled_size=s,
                                              n_jobs=-1,
                                              memmap=True)

# Crop and scale the test images
test_images = test_x_crop_scale.transform()

# Generate the test features
test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)

# Predict on the test features
res = wrapper.predict(test_x)

# Generate a submission file
sub = classes.Submission(res)
sub.to_file('sub_kmeans_006.csv')