def kmeans_006_submission():
    """Fit the kmeans_006 pipeline on the training set and write a submission file.

    Cross-validation log recorded for this configuration:
    2014-03-28 10:53:22 - Base - INFO - Cross validation completed in 1487.18687487.  Scores:
    2014-03-28 10:53:22 - Base - INFO - [-0.11018943 -0.10946863]
    """
    # Final submission settings
    crop, s, n_centroids = 150, 15, 3000
    train_features_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_features_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_images_file = 'data/data_test_crop_{}_scale_{}.npy'.format(crop, s)

    training_images = get_images(crop=crop, s=s)
    feature_generator = train_kmeans_generator(training_images, n_centroids=n_centroids)

    train_x = feature_generator.transform(training_images,
                                          save_to_file=train_features_file,
                                          memmap=True)
    train_y = classes.train_solutions.data

    # The raw training images are not used again below, so release them
    del training_images
    gc.collect()

    estimator_params = {'alpha': 500, 'n_estimators': 500}
    model = ModelWrapper(models.Ridge.RidgeRFEstimator, estimator_params, n_jobs=-1)
    # Disabled CV call, kept for reference:
    # model.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    model.fit(train_x, train_y)

    test_transformer = CropScaleImageTransformer(training=False,
                                                 result_path=test_images_file,
                                                 crop_size=crop,
                                                 scaled_size=s,
                                                 n_jobs=-1,
                                                 memmap=True)
    test_images = test_transformer.transform()
    test_x = feature_generator.transform(test_images,
                                         save_to_file=test_features_file,
                                         memmap=True)

    predictions = model.predict(test_x)
    classes.Submission(predictions).to_file('sub_kmeans_006.csv')
def extra_trees_submission():
    """Same pipeline as the kmeans_006 submission, but with the Ridge + ExtraTrees estimator.

    Original note: somehow the submission on the leaderboard scores 0.22.
    """
    crop, s, n_centroids = 150, 15, 3000
    train_features_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_features_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_images_file = 'data/data_test_crop_{}_scale_{}.npy'.format(crop, s)

    training_images = get_images(crop=crop, s=s)
    feature_generator = train_kmeans_generator(training_images, n_centroids=n_centroids)

    train_x = feature_generator.transform(training_images,
                                          save_to_file=train_features_file,
                                          memmap=True)
    train_y = classes.train_solutions.data

    # The raw training images are not used again below, so release them
    del training_images
    gc.collect()

    estimator_params = {'alpha': 500, 'n_estimators': 500}
    model = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, estimator_params, n_jobs=-1)
    # Disabled CV call, kept for reference:
    # model.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    model.fit(train_x, train_y)

    test_transformer = CropScaleImageTransformer(training=False,
                                                 result_path=test_images_file,
                                                 crop_size=crop,
                                                 scaled_size=s,
                                                 n_jobs=-1,
                                                 memmap=True)
    test_images = test_transformer.transform()
    test_x = feature_generator.transform(test_images,
                                         save_to_file=test_features_file,
                                         memmap=True)

    predictions = model.predict(test_x)
    classes.Submission(predictions).to_file('sub_kmeans_008.csv')
def rbm_001():
    """Cross-validate the Ridge/RF estimator on features produced by a Bernoulli RBM.

    The cropped/scaled training images are flattened to one row per image,
    min-max scaled, used to fit a ``BernoulliRBM``, and the RBM's transformed
    output is cross-validated against ``classes.train_solutions.data`` on a
    50% sample.

    Returns nothing; results are whatever ``ModelWrapper.cross_validation``
    records or logs.
    """
    s = 15
    crop = 150

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    images = train_x_crop_scale.transform()
    # Flatten each image to a single row.  The row width is derived from the
    # scale setting instead of a hard-coded 15 * 15 * 3 so the two cannot
    # drift apart; 3 is the channel count the original code assumed.
    images = images.reshape((images.shape[0], s * s * 3))

    # rbm needs inputs to be between 0 and 1
    scaler = MinMaxScaler()
    images = scaler.fit_transform(images)

    # Training takes a long time, says 80 seconds per iteration, but seems like longer
    # And this is only with 256 components
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)

    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
Exemple #4
0
def kmeans_007():
    """
    Increasing crop/scale size, rf size, centroids, and patches all at once.

    2014-02-18 02:45:15 - Base - INFO - Cross validation completed in 5426.04788399.  Scores:
    2014-02-18 02:45:15 - Base - INFO - [-0.10834319 -0.10825868]

    Returns nothing; the cross-validation outcome is left on / logged by the
    ModelWrapper.
    """
    n_centroids = 5000
    s = 50
    crop = 200
    # Originally, 1600 centroids for 400,000 patches, or 250 patches per centroid
    # 800000 / 5000 = will give us 160 patches per centroid
    n_patches = 800000
    rf_size = 20
    # 31 x 31 = 961 patches per image, which is 10x more patches than the original settings
    # If we set stride 2, then it's 16 x 16 patches = 256, only twice as many patches
    stride = 2
    # NOTE(review): unlike the other experiments no result_path is passed here -- confirm
    # that the transformer's default is what is wanted.
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    images = train_x_crop_scale.transform()
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)

    # The path has no placeholder, so the former .format(n_centroids) was a no-op
    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_007',
        n_iterations=20,
        n_jobs=-1,
    )
    kmeans_generator.fit(patches)

    # Patches are only needed for fitting the centroids
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_007.npy',
        stride_size=stride,
        memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 250},
                           n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, parallel_estimator=True)
def get_images(crop=150, s=15):
    """Return the cropped and scaled training images.

    :param crop: value passed to the transformer as ``crop_size``
    :param s: value passed to the transformer as ``scaled_size``
    :return: whatever ``CropScaleImageTransformer.transform()`` produces
    """
    result_file = 'data/data_train_crop_{}_scale_{}.npy'.format(crop, s)
    transformer = CropScaleImageTransformer(training=True,
                                            result_path=result_file,
                                            crop_size=crop,
                                            scaled_size=s,
                                            n_jobs=-1,
                                            memmap=True)
    return transformer.transform()
Exemple #6
0
def get_images(crop=150, s=15):
    """Run the training-set crop/scale transformer and hand back its output.

    :param crop: forwarded as ``crop_size``
    :param s: forwarded as ``scaled_size``
    :return: the result of ``CropScaleImageTransformer.transform()``
    """
    transformer_kwargs = {
        'training': True,
        'result_path': 'data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        'crop_size': crop,
        'scaled_size': s,
        'n_jobs': -1,
        'memmap': True,
    }
    return CropScaleImageTransformer(**transformer_kwargs).transform()
Exemple #7
0
def kmeans_006_submission():
    """Final submission: sample patches, fit k-means, fit Ridge/RF, write the predictions.

    All steps are spelled out inline (no get_images / train_kmeans_generator helpers).
    """
    n_centroids, s, crop = 3000, 15, 150
    n_patches, rf_size = 400000, 5
    logger.info("Training with n_centroids {}".format(n_centroids))

    train_images_file = 'data/data_train_crop_{}_scale_{}.npy'.format(crop, s)
    test_images_file = 'data/data_test_crop_{}_scale_{}.npy'.format(crop, s)
    model_file = 'data/mdl_kmeans_006_centroids_{}'.format(n_centroids)
    train_features_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_features_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids)

    train_transformer = CropScaleImageTransformer(training=True,
                                                  result_path=train_images_file,
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)
    feature_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                               rf_size=rf_size,
                                               result_path=model_file,
                                               n_iterations=20,
                                               n_jobs=-1)
    sampler = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                 patch_size=rf_size,
                                                 n_jobs=-1)

    training_images = train_transformer.transform()
    sampled_patches = sampler.transform(training_images)
    feature_generator.fit(sampled_patches)

    # The patches are not used after fitting
    del sampled_patches
    gc.collect()

    train_x = feature_generator.transform(training_images,
                                          save_to_file=train_features_file,
                                          memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del training_images
    gc.collect()

    model = ModelWrapper(models.Ridge.RidgeRFEstimator,
                         {'alpha': 500, 'n_estimators': 500},
                         n_jobs=-1)
    model.fit(train_x, train_y)

    test_transformer = CropScaleImageTransformer(training=False,
                                                 result_path=test_images_file,
                                                 crop_size=crop,
                                                 scaled_size=s,
                                                 n_jobs=-1,
                                                 memmap=True)
    test_images = test_transformer.transform()
    test_x = feature_generator.transform(test_images,
                                         save_to_file=test_features_file,
                                         memmap=True)

    predictions = model.predict(test_x)
    classes.Submission(predictions).to_file('sub_kmeans_006.csv')
def rf_size_10():
    """Cross-validate the 3000-centroid pipeline with a receptive field (patch) size of 10.

    Original notes: pretty bad as well.
    2014-03-28 13:04:07 - Base - INFO - Cross validation completed in 1475.74401999.  Scores:
    2014-03-28 13:04:07 - Base - INFO - [-0.12217214 -0.12209735]

    Returns nothing; cross-validation runs on a 50% sample via ModelWrapper.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    # Was 5, which contradicted both the function name and the "*_rf10"
    # artefact paths below.
    # NOTE(review): confirm that the recorded scores above were produced with 10.
    rf_size = 10

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    # The paths below contain no placeholders, so the former .format(n_centroids)
    # calls on them were no-ops and have been dropped.
    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_008_rf10',
                                              n_iterations=20,
                                              n_jobs=-1,)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    # Patches are only needed for fitting the centroids
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_rf10.npy', memmap=True)
    train_y = classes.train_solutions.data

    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
def get_images(crop=150, s=15):
    """
    Crops and then scales the training images with CropScaleImageTransformer,
    using `crop` as the crop size and `s` as the scaled size.

    With the defaults (150 and 15) the original notes describe the result as an
    ndarray (possibly memmapped) of size (n_images, 15, 15, 3).
    """
    cache_file = 'data/data_train_crop_{}_scale_{}.npy'.format(crop, s)
    crop_scaler = CropScaleImageTransformer(training=True,
                                            result_path=cache_file,
                                            crop_size=crop,
                                            scaled_size=s,
                                            n_jobs=-1,
                                            memmap=True)
    return crop_scaler.transform()
Exemple #10
0
def get_images(crop=150, s=15):
    """
    Applies the training-set crop/scale transformer (crop size `crop`, scaled
    size `s`) and returns what it produces.

    For the default 150 / 15 settings the original notes give the result as an
    ndarray (possibly memmapped) of size (n_images, 15, 15, 3).
    """
    options = dict(training=True,
                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                   crop_size=crop,
                   scaled_size=s,
                   n_jobs=-1,
                   memmap=True)
    crop_scaler = CropScaleImageTransformer(**options)
    scaled_images = crop_scaler.transform()
    return scaled_images
Exemple #11
0
def kmeans_007():
    """
    Increasing crop/scale size, rf size, centroids, and patches all at once.

    2014-02-18 02:45:15 - Base - INFO - Cross validation completed in 5426.04788399.  Scores:
    2014-02-18 02:45:15 - Base - INFO - [-0.10834319 -0.10825868]

    Returns nothing; the cross-validation outcome is left on / logged by the
    ModelWrapper.
    """
    n_centroids = 5000
    s = 50
    crop = 200
    # Originally, 1600 centroids for 400,000 patches, or 250 patches per centroid
    # 800000 / 5000 = will give us 160 patches per centroid
    n_patches = 800000
    rf_size = 20
    # 31 x 31 = 961 patches per image, which is 10x more patches than the original settings
    # If we set stride 2, then it's 16 x 16 patches = 256, only twice as many patches
    stride = 2
    # NOTE(review): unlike the other experiments no result_path is passed here -- confirm
    # that the transformer's default is what is wanted.
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    images = train_x_crop_scale.transform()
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)

    # The path has no placeholder, so the former .format(n_centroids) was a no-op
    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_007',
                                              n_iterations=20,
                                              n_jobs=-1,)
    kmeans_generator.fit(patches)

    # Patches are only needed for fitting the centroids
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_007.npy', stride_size=stride, memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, parallel_estimator=True)
Exemple #12
0
def kmeans_006_submission():
    """Fit the kmeans_006 pipeline on the training set and write a submission file.

    Cross-validation log recorded for this configuration:
    2014-03-28 10:53:22 - Base - INFO - Cross validation completed in 1487.18687487.  Scores:
    2014-03-28 10:53:22 - Base - INFO - [-0.11018943 -0.10946863]
    """
    # Final submission settings
    crop, s, n_centroids = 150, 15, 3000
    train_features_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_features_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_images_file = 'data/data_test_crop_{}_scale_{}.npy'.format(crop, s)

    training_images = get_images(crop=crop, s=s)
    feature_generator = train_kmeans_generator(training_images, n_centroids=n_centroids)

    train_x = feature_generator.transform(training_images, save_to_file=train_features_file, memmap=True)
    train_y = classes.train_solutions.data

    # The raw training images are not used again below, so release them
    del training_images
    gc.collect()

    model = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    # Disabled CV call, kept for reference:
    # model.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    model.fit(train_x, train_y)

    test_transformer = CropScaleImageTransformer(training=False,
                                                 result_path=test_images_file,
                                                 crop_size=crop,
                                                 scaled_size=s,
                                                 n_jobs=-1,
                                                 memmap=True)
    test_images = test_transformer.transform()
    test_x = feature_generator.transform(test_images, save_to_file=test_features_file, memmap=True)

    predictions = model.predict(test_x)
    submission = classes.Submission(predictions)
    submission.to_file('sub_kmeans_006.csv')
Exemple #13
0
def extra_trees_submission():
    """Same pipeline as the kmeans_006 submission, but with the Ridge + ExtraTrees estimator.

    Original note: somehow the submission on the leaderboard scores 0.22.
    """
    crop, s, n_centroids = 150, 15, 3000
    train_features_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_features_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_images_file = 'data/data_test_crop_{}_scale_{}.npy'.format(crop, s)

    training_images = get_images(crop=crop, s=s)
    feature_generator = train_kmeans_generator(training_images, n_centroids=n_centroids)

    train_x = feature_generator.transform(training_images, save_to_file=train_features_file, memmap=True)
    train_y = classes.train_solutions.data

    # The raw training images are not used again below, so release them
    del training_images
    gc.collect()

    model = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    # Disabled CV call, kept for reference:
    # model.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    model.fit(train_x, train_y)

    test_transformer = CropScaleImageTransformer(training=False,
                                                 result_path=test_images_file,
                                                 crop_size=crop,
                                                 scaled_size=s,
                                                 n_jobs=-1,
                                                 memmap=True)
    test_images = test_transformer.transform()
    test_x = feature_generator.transform(test_images, save_to_file=test_features_file, memmap=True)

    predictions = model.predict(test_x)
    submission = classes.Submission(predictions)
    submission.to_file('sub_kmeans_008.csv')
Exemple #14
0
def rbm_001():
    """Cross-validate the Ridge/RF estimator on features produced by a Bernoulli RBM.

    The cropped/scaled training images are flattened to one row per image,
    min-max scaled, used to fit a ``BernoulliRBM``, and the RBM's transformed
    output is cross-validated against ``classes.train_solutions.data`` on a
    50% sample.

    Returns nothing; results are whatever ``ModelWrapper.cross_validation``
    records or logs.
    """
    s = 15
    crop = 150

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    images = train_x_crop_scale.transform()
    # Flatten each image to a single row.  The row width is derived from the
    # scale setting instead of a hard-coded 15 * 15 * 3 so the two cannot
    # drift apart; 3 is the channel count the original code assumed.
    images = images.reshape((images.shape[0], s * s * 3))

    # rbm needs inputs to be between 0 and 1
    scaler = MinMaxScaler()
    images = scaler.fit_transform(images)

    # Training takes a long time, says 80 seconds per iteration, but seems like longer
    # And this is only with 256 components
    rbm = BernoulliRBM(verbose=1)
    rbm.fit(images)

    train_x = rbm.transform(images)
    train_y = classes.train_solutions.data

    # 0.138 CV on 50% of the dataset
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Exemple #15
0
# NOTE: this is an excerpt of a longer script.  train_x, train_y, crop, s,
# n_centroids and kmeans_generator are expected to be defined by earlier code
# that is not shown here, and the sentence below starts mid-way.
# like logging, grid search and cross validation.
# For fit, it is basically equivalent to calling fit on the estimator

# The estimator takes the X and y and trains a ridge regression (sklearn.linear_model.Ridge),
# predicts using the ridge regressor, then uses the results of the prediction to train a random forest.
wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
    'alpha': 500,
    'n_estimators': 500
},
                       n_jobs=-1)
wrapper.fit(train_x, train_y)

# Transformer for the test images (training=False), using the same crop and
# scale settings as the training set
test_x_crop_scale = CropScaleImageTransformer(
    training=False,
    result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
    crop_size=crop,
    scaled_size=s,
    n_jobs=-1,
    memmap=True)

# Crop and scale the test images
test_images = test_x_crop_scale.transform()

# Generate the test features with the already-fitted k-means generator
test_x = kmeans_generator.transform(
    test_images,
    save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(
        n_centroids),
    memmap=True)

# Predict on the test features (the prediction call itself is not part of this excerpt)
Exemple #16
0
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]

    For each centroid count a k-means feature generator is fitted on sampled
    patches, the training images are transformed, and a Ridge/RF model is
    cross-validated with 2 folds.

    Returns the list of (n_centroids, cv_scores) tuples, one per centroid
    count, in the order tested.  (Previously the list was built and then
    discarded.)
    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    # Settings that do not change between runs
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5
    scores = []

    for n_centroids in n_centroids_vals:
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(
            training=True,
            result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)

        kmeans_generator = KMeansFeatureGenerator(
            n_centroids=n_centroids,
            rf_size=rf_size,
            result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
            n_iterations=20,
            n_jobs=-1,
        )

        patch_extractor = models.KMeansFeatures.PatchSampler(
            n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        # Patches are only needed for fitting the centroids
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(
            images,
            save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.
            format(n_centroids),
            memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                               {'alpha': 500, 'n_estimators': 250},
                               n_jobs=-1)
        wrapper.cross_validation(train_x,
                                 train_y,
                                 n_folds=2,
                                 parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        # Drop the wrapper before the next, larger run
        del wrapper
        gc.collect()

    return scores
# Scratch script: exercise the random-forest models on a 10% CV sample.
# Raise the project log stream to DEBUG first.
classes.logstream.setLevel(classes.logging.DEBUG)
# Cascade model: run its 'cv' step, then its 'train' step
a = models.RandomForest.RandomForestCascadeModel(cv_sample=0.1)
a.run('cv')
a.run('train')
# Plain random-forest model; n_estimators is set to 10 on its estimator before training
b = models.RandomForest.RandomForestModel(cv_sample=0.1)
b.estimator.set_params(n_estimators=10)
b.run('train')


import models
from models.Base import CropScaleImageTransformer
from models.KMeansFeatures import KMeansFeatureGenerator

# Scratch script: sample patches from a single channel of the first 100 training images.
# result_path is deliberately left commented out, so the transformer's default is used.
train_x_crop_scale = CropScaleImageTransformer(training=True,
                                               # result_path='data/data_train_crop_150_scale_15.npy',
                                               crop_size=150,
                                               scaled_size=15,
                                               n_jobs=-1,
                                               memmap=True)

raw_images = train_x_crop_scale.transform()

# Small sampler: 1000 patches with patch_size 5
patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=1000,
                                                     patch_size=5,
                                                     n_jobs=-1)

# First 100 images, index 0 of the last axis
# (presumably the red channel, going by the name -- TODO confirm channel order)
reds = raw_images[0:100, :, :, 0]
# NOTE: this rebinds the module-level name `a`
a = patch_extractor.transform(reds)

kmeans_generator = KMeansFeatureGenerator(n_centroids=10,
                                          rf_size=5,
                                          result_path='foo.npy',
def kmeans_006():
    """
    Testing number of centroids

    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]

    For each centroid count a k-means feature generator is fitted on sampled
    patches, the training images are transformed, and a Ridge/RF model is
    cross-validated with 2 folds.

    Returns the list of (n_centroids, cv_scores) tuples, one per centroid
    count, in the order tested.  (Previously the list was built and then
    discarded.)
    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    # Settings that do not change between runs
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5
    scores = []

    for n_centroids in n_centroids_vals:
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)

        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                                  n_iterations=20,
                                                  n_jobs=-1,)

        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        # Patches are only needed for fitting the centroids
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        # Drop the wrapper before the next, larger run
        del wrapper
        gc.collect()

    return scores
Exemple #19
0
def rf_size_10():
    """Cross-validate the 3000-centroid pipeline with a receptive field (patch) size of 10.

    Original notes: pretty bad as well.
    2014-03-28 13:04:07 - Base - INFO - Cross validation completed in 1475.74401999.  Scores:
    2014-03-28 13:04:07 - Base - INFO - [-0.12217214 -0.12209735]

    Returns nothing; cross-validation runs on a 50% sample via ModelWrapper.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    # Was 5, which contradicted both the function name and the "*_rf10"
    # artefact paths below.
    # NOTE(review): confirm that the recorded scores above were produced with 10.
    rf_size = 10

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    # The paths below contain no placeholders, so the former .format(n_centroids)
    # calls on them were no-ops and have been dropped.
    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_008_rf10',
        n_iterations=20,
        n_jobs=-1,
    )

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    # Patches are only needed for fitting the centroids
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_008_rf10.npy',
        memmap=True)
    train_y = classes.train_solutions.data

    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Exemple #20
0
def kmeans_004():
    """
    Tuning the scale/crop and RF size parameters

    First number is the scaling, cropped to 200, with rf size of 5.  75 scaling took forever to transform, so killed
    [(30, array([-0.11374265, -0.1134896 ]))
     (50, array([-0.11677854, -0.11696837]))]

    Trying again with larger RF size of 10.
    As a note, scale to 30 with rf 10 takes about 25 minutes to extract features on the train set
    Scale to 50 with rf 10 takes almost 90 minutes.
    [(30, array([-0.10828216, -0.1081058 ])),
    (50, array([-0.10840914, -0.10868195]))]
    Interesting that scale size of 50 does worse

    Crop is not 150, so this is not really an apples to apples comparison with kmeans_003

    It is possibly worth making a submission with scale 30 and rf size 10

    Returns the list of (scale, cv_scores) tuples, one per scale tested, in
    order.  (Previously the list was built and then discarded.)
    """
    scales = [30, 50]  # Scaling is probably the most important part here

    # Settings that do not change between runs
    crop = 200  # Should probably also try 250
    n_centroids = 1600
    n_patches = 400000
    # rf_size = int(round(s * .2))
    rf_size = 10

    scores = []
    for s in scales:
        logger.info("Training with crop {}, scale {}, patch size {}, patches {}, centroids {}".format(crop, s, rf_size, n_patches, n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)

        # spherical generator
        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_004_scale_{}_rf_{}'.format(s, rf_size),
                                                  n_iterations=20,
                                                  n_jobs=-1,)

        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()
        logger.info("Images ndarray shape: {}".format(images.shape))
        patches = patch_extractor.transform(images)
        logger.info("Patches ndarray shape: {}".format(patches.shape))

        kmeans_generator.fit(patches)

        # Patches are only needed for fitting the centroids
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_004_scale_{}_rf_{}.npy'.format(s, rf_size), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()
        logger.info("Train X ndarray shape: {}".format(train_x.shape))

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)
        scores.append((s, wrapper.cv_scores))
        # Drop the wrapper before the next, larger run
        del wrapper
        gc.collect()

    return scores
Exemple #21
0
def kmeans_003():
    """
    Grid-search the Ridge RF hyper-parameters, then fit a final model and write a submission.

    Not sure whether to use spherical or minibatch k-means, so maybe do one run with both.

    .106 on the leaderboard.  So the difference in CV scores narrowed
    """
    # Keyword arguments shared by the train and test crop/scale transformers
    crop_scale_kwargs = dict(crop_size=150, scaled_size=15, n_jobs=-1, memmap=True)

    train_transformer = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_150_scale_15.npy',
        **crop_scale_kwargs)

    # Spherical generator.
    # Minibatch alternative (not used here): models.KMeansFeatures.KMeansFeatureGenerator with
    # method='minibatch', n_init=1 and result_path='data/mdl_kmeans_002_new_minibatch'.
    feature_generator = KMeansFeatureGenerator(
        n_centroids=1600,
        rf_size=5,
        result_path='data/mdl_kmeans_002_new',
        n_iterations=20,
        n_jobs=-1)

    # Don't need a real fit, as the model is already cached - an empty placeholder is passed
    feature_generator.fit('')
    raw_train = train_transformer.transform()

    # Problematic here - memory usage spikes to ~ 11GB when threads return
    features = feature_generator.transform(
        raw_train,
        save_to_file='data/data_kmeans_features_002_new.npy',
        memmap=True)
    targets = classes.train_solutions.data

    # The raw images are no longer needed once the features exist
    del raw_train
    gc.collect()

    # The same name is rebound for the final model below so that the search wrapper
    # is not kept alive during the final fit
    model = ModelWrapper(
        models.Ridge.RidgeRFEstimator,
        {'alpha': 14, 'n_estimators': 500},
        n_jobs=-1)
    search_space = {
        'alpha': [150, 250, 500, 750, 1000],
        'n_estimators': [250],
    }

    # 500 trees and alpha 25 gives cv of .10972 on 2-fold CV, but 25 was on the upper range of the search space,
    # So need to re-run with larger range of alpha
    # Will hit 30GB of ram with 500 trees.
    model.grid_search(features, targets, search_space, refit=False, parallel_estimator=True)

    # Recorded grid search results:
    # [mean: -0.11024, std: 0.00018, params: {'n_estimators': 250, 'alpha': 20.0},
    # mean: -0.11000, std: 0.00019, params: {'n_estimators': 250, 'alpha': 25.0},
    # mean: -0.10969, std: 0.00018, params: {'n_estimators': 250, 'alpha': 35},
    # mean: -0.10934, std: 0.00019, params: {'n_estimators': 250, 'alpha': 50},
    # mean: -0.10892, std: 0.00025, params: {'n_estimators': 250, 'alpha': 75},
    # mean: -0.10860, std: 0.00025, params: {'n_estimators': 250, 'alpha': 100},
    # mean: -0.10828, std: 0.00019, params: {'n_estimators': 250, 'alpha': 150},
    # mean: -0.10789, std: 0.00016, params: {'n_estimators': 250, 'alpha': 250},
    # mean: -0.10775, std: 0.00024, params: {'n_estimators': 250, 'alpha': 500},
    # mean: -0.10779, std: 0.00022, params: {'n_estimators': 250, 'alpha': 750},
    # mean: -0.10784, std: 0.00023, params: {'n_estimators': 250, 'alpha': 1000}]

    # Fit the final model
    model = ModelWrapper(
        models.Ridge.RidgeRFEstimator,
        {'alpha': 500, 'n_estimators': 500},
        n_jobs=-1)
    model.fit(features, targets)

    test_transformer = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_150_scale_15.npy',
        **crop_scale_kwargs)

    raw_test = test_transformer.transform()
    test_features = feature_generator.transform(
        raw_test,
        save_to_file='data/data_kmeans_test_features_003_new.npy',
        memmap=True)

    # Predict on the test features and write the submission file
    predictions = model.predict(test_features)
    classes.Submission(predictions).to_file('sub_kmeans_003.csv')
Exemple #22
0
def kmeans_005():
    """
    Testing whether extracting patches from train and test images works better

    For every combination of patch count and "include test images" flag, a
    k-means feature generator is fitted on sampled patches, features are then
    generated for the training images only, and a Ridge RF estimator is scored
    with 2-fold cross validation.

    Results recorded from an earlier run:

    [(500000, False, array([-0.10799986, -0.10744586])),
    (500000, True, array([-0.10790803, -0.10733288])),
    (600000, False, array([-0.10812188, -0.10735988])),
    (600000, True, array([-0.10778652, -0.10752664]))]

    Returns:
        list of ``(n_patches, incl, cv_scores)`` tuples, one per combination,
        in the order the combinations were run.
    """
    n_patches_vals = [500000, 600000, 700000]
    include_test_images = [False, True]

    # Settings that do not change between runs
    s = 15
    crop = 150
    n_centroids = 1600
    rf_size = 5

    scores = []
    for n_patches in n_patches_vals:
        for incl in include_test_images:
            logger.info("Training with n_patches {}, with test images {}".format(n_patches, incl))

            train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                           result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                           crop_size=crop,
                                                           scaled_size=s,
                                                           n_jobs=-1,
                                                           memmap=True)
            test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                          result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                          crop_size=crop,
                                                          scaled_size=s,
                                                          n_jobs=-1,
                                                          memmap=True)

            # The result path encodes the run parameters so each combination gets its own model file
            kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                      rf_size=rf_size,
                                                      result_path='data/mdl_kmeans_005_patches_{}_test{}'.format(n_patches, incl),
                                                      n_iterations=20,
                                                      n_jobs=-1,)

            patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                                 patch_size=rf_size,
                                                                 n_jobs=-1)
            images = train_x_crop_scale.transform()
            if incl:
                test_images = test_x_crop_scale.transform()
                images = np.vstack([images, test_images])
                # Only the stacked array is used from here on, so drop the extra reference
                del test_images
            logger.info("Extracting patches from images ndarray shape: {}".format(images.shape))

            patches = patch_extractor.transform(images)
            logger.info("Patches ndarray shape: {}".format(patches.shape))

            kmeans_generator.fit(patches)

            del patches
            gc.collect()

            # Reload the original images: features are generated for the training set only,
            # even when the test images contributed patches
            images = train_x_crop_scale.transform()
            logger.info("Generating features on images ndarray shape: {}".format(images.shape))
            train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_005_patches_{}_test_{}.npy'.format(n_patches, incl), memmap=True)
            train_y = classes.train_solutions.data
            # Unload some objects
            del images
            gc.collect()

            wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
            wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

            score = (n_patches, incl, wrapper.cv_scores)
            logger.info("Score: {}".format(score))
            scores.append(score)

            del wrapper
            gc.collect()

    # Previously the collected scores were dropped when the function ended
    return scores
Exemple #23
0
def ensemble_001():
    """
    Ensemble of kmeans and random forest results
    Conducting some analysis of whether the errors from these two models for individual Ys are different

    The data is split in half; a Ridge RF model on k-means features and a random
    forest on the sampled pixel input are each fitted on one half and evaluated
    on the other, then a second random forest is fitted on their stacked predictions.

    Ensembled error is .1149.

    Kmeans is better on every class than RF.
    """
    crop, s = 150, 15
    rf_size = 5
    n_patches = 400000
    n_centroids = 3000

    crop_scaler = CropScaleImageTransformer(
        training=True, crop_size=crop, scaled_size=s, n_jobs=-1, memmap=True)

    feature_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_ensemble_001',
        n_iterations=20,
        n_jobs=-1)

    patch_sampler = models.KMeansFeatures.PatchSampler(
        n_patches=n_patches, patch_size=rf_size, n_jobs=-1)

    raw_images = crop_scaler.transform()
    sampled_patches = patch_sampler.transform(raw_images)
    feature_generator.fit(sampled_patches)

    # The patches are only needed for fitting the generator
    del sampled_patches
    gc.collect()

    X = feature_generator.transform(
        raw_images, save_to_file='data/data_ensemble_001.npy', memmap=True)
    Y = classes.train_solutions.data

    # Unload some objects
    del raw_images
    gc.collect()

    # Get the input for the RF so that both feature sets can be split together
    pX = SampleTransformer(training=True, steps=2, step_size=20, n_jobs=-1).transform()

    # Manual split of train and test; splitting all three arrays in one call keeps the rows aligned
    split = train_test_split(X, pX, Y, test_size=0.5)
    train_x, test_x, ptrain_x, ptest_x, train_y, test_y = split

    kmeans_model = ModelWrapper(
        models.Ridge.RidgeRFEstimator,
        {'alpha': 500, 'n_estimators': 500},
        n_jobs=-1)
    kmeans_model.fit(train_x, train_y)
    kmeans_preds = kmeans_model.predict(test_x)

    pixel_model = ModelWrapper(
        RandomForestRegressor,
        {'n_estimators': 500, 'verbose': 3},
        n_jobs=-1)
    pixel_model.fit(ptrain_x, train_y)
    pixel_preds = pixel_model.predict(ptest_x)

    # Report column-wise and overall RMSE for each base model
    for label, preds in (('Kmeans', kmeans_preds), ('Pixel RF', pixel_preds)):
        logger.info(label)
        classes.colwise_rmse(preds, test_y)
        classes.rmse(preds, test_y)

    logger.info("Ensembling predictions")
    # The ensemble input is the two models' predictions stacked side by side
    etrain_x = np.hstack(
        [kmeans_model.predict(train_x), pixel_model.predict(ptrain_x)])
    etest_x = np.hstack([kmeans_preds, pixel_preds])

    ensemble_model = ModelWrapper(
        RandomForestRegressor,
        {'n_estimators': 500, 'verbose': 3},
        n_jobs=-1)
    ensemble_model.fit(etrain_x, train_y)
    ensemble_preds = ensemble_model.predict(etest_x)
    classes.colwise_rmse(ensemble_preds, test_y)
    classes.rmse(ensemble_preds, test_y)
Exemple #24
0
def kmeans_005():
    """
    Testing whether extracting patches from train and test images works better

    Each (n_patches, include-test-images) combination fits a k-means feature
    generator on sampled patches, generates features for the training images
    only, and scores a Ridge RF estimator with 2-fold cross validation.

    Results recorded from an earlier run:

    [(500000, False, array([-0.10799986, -0.10744586])),
    (500000, True, array([-0.10790803, -0.10733288])),
    (600000, False, array([-0.10812188, -0.10735988])),
    (600000, True, array([-0.10778652, -0.10752664]))]

    Returns:
        list of ``(n_patches, incl, cv_scores)`` tuples, one per combination,
        in the order the combinations were run.
    """
    n_patches_vals = [500000, 600000, 700000]
    include_test_images = [False, True]

    # Settings that stay fixed for every run
    s = 15
    crop = 150
    n_centroids = 1600
    rf_size = 5

    scores = []
    for n_patches in n_patches_vals:
        for incl in include_test_images:
            logger.info(
                "Training with n_patches {}, with test images {}".format(
                    n_patches, incl))

            train_x_crop_scale = CropScaleImageTransformer(
                training=True,
                result_path='data/data_train_crop_{}_scale_{}.npy'.format(
                    crop, s),
                crop_size=crop,
                scaled_size=s,
                n_jobs=-1,
                memmap=True)
            test_x_crop_scale = CropScaleImageTransformer(
                training=False,
                result_path='data/data_test_crop_{}_scale_{}.npy'.format(
                    crop, s),
                crop_size=crop,
                scaled_size=s,
                n_jobs=-1,
                memmap=True)

            # The result path encodes the run parameters, so every
            # combination gets its own model file
            kmeans_generator = KMeansFeatureGenerator(
                n_centroids=n_centroids,
                rf_size=rf_size,
                result_path='data/mdl_kmeans_005_patches_{}_test{}'.format(
                    n_patches, incl),
                n_iterations=20,
                n_jobs=-1,
            )

            patch_extractor = models.KMeansFeatures.PatchSampler(
                n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
            images = train_x_crop_scale.transform()
            if incl:
                test_images = test_x_crop_scale.transform()
                images = np.vstack([images, test_images])
                # Only the stacked array is used below; release the extra
                # reference to the test images
                del test_images
            logger.info(
                "Extracting patches from images ndarray shape: {}".format(
                    images.shape))

            patches = patch_extractor.transform(images)
            logger.info("Patches ndarray shape: {}".format(patches.shape))

            kmeans_generator.fit(patches)

            del patches
            gc.collect()

            # Reload the original images: features are generated for the
            # training set only, even when test images contributed patches
            images = train_x_crop_scale.transform()
            logger.info(
                "Generating features on images ndarray shape: {}".format(
                    images.shape))
            train_x = kmeans_generator.transform(
                images,
                save_to_file=
                'data/data_kmeans_features_005_patches_{}_test_{}.npy'.format(
                    n_patches, incl),
                memmap=True)
            train_y = classes.train_solutions.data
            # Unload some objects
            del images
            gc.collect()

            wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
                'alpha': 500,
                'n_estimators': 250
            },
                                   n_jobs=-1)
            wrapper.cross_validation(train_x,
                                     train_y,
                                     n_folds=2,
                                     parallel_estimator=True)

            score = (n_patches, incl, wrapper.cv_scores)
            logger.info("Score: {}".format(score))
            scores.append(score)

            del wrapper
            gc.collect()

    # Previously the collected scores were dropped when the function ended
    return scores
Exemple #25
0
def kmeans_006_submission():
    """
    Final submission: k-means features (3000 centroids) fed into a Ridge RF estimator.

    Fits the feature generator on patches sampled from the training images,
    trains the estimator on the resulting features and writes the predictions
    for the test images to 'sub_kmeans_006.csv'.
    """
    crop, s = 150, 15
    rf_size = 5
    n_patches = 400000
    n_centroids = 3000
    logger.info("Training with n_centroids {}".format(n_centroids))

    # Keyword arguments shared by the train and test crop/scale transformers
    crop_scale_kwargs = dict(crop_size=crop, scaled_size=s, n_jobs=-1, memmap=True)

    train_transformer = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        **crop_scale_kwargs)

    feature_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
        n_iterations=20,
        n_jobs=-1)

    patch_sampler = models.KMeansFeatures.PatchSampler(
        n_patches=n_patches, patch_size=rf_size, n_jobs=-1)

    raw_train = train_transformer.transform()
    sampled_patches = patch_sampler.transform(raw_train)
    feature_generator.fit(sampled_patches)

    # The patches are only needed for fitting the generator
    del sampled_patches
    gc.collect()

    features = feature_generator.transform(
        raw_train,
        save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
        memmap=True)
    targets = classes.train_solutions.data

    # Unload the raw images before fitting the estimator
    del raw_train
    gc.collect()

    model = ModelWrapper(
        models.Ridge.RidgeRFEstimator,
        {'alpha': 500, 'n_estimators': 500},
        n_jobs=-1)
    model.fit(features, targets)

    test_transformer = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        **crop_scale_kwargs)

    raw_test = test_transformer.transform()
    test_features = feature_generator.transform(
        raw_test,
        save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
        memmap=True)

    # Predict on the test features and write the submission file
    predictions = model.predict(test_features)
    classes.Submission(predictions).to_file('sub_kmeans_006.csv')
Exemple #26
0
def kmeans_004():
    """
    Tuning the scale/crop and RF size parameters

    First number is the scaling, cropped to 200, with rf size of 5.  75 scaling took forever to transform, so killed
    [(30, array([-0.11374265, -0.1134896 ]))
     (50, array([-0.11677854, -0.11696837]))]

    Trying again with larger RF size of 10.
    As a note, scale to 30 with rf 10 takes about 25 minutes to extract features on the train set
    Scale to 50 with rf 10 takes almost 90 minutes.
    [(30, array([-0.10828216, -0.1081058 ])),
    (50, array([-0.10840914, -0.10868195]))]
    Interesting that scale size of 50 does worse

    Crop is not 150, so this is not really an apples to apples comparison with kmeans_003

    It is possibly worth making a submission with scale 30 and rf size 10

    Returns:
        list of ``(scale, cv_scores)`` tuples, one per entry in ``scales``,
        in the order the scales were run.
    """
    scales = [30, 50]  # Scaling is probably the most important part here

    # Held constant for every scale.  Only a crop of 200 is evaluated;
    # 250 should probably be added as well.
    crop = 200
    n_centroids = 1600
    n_patches = 400000
    # Fixed receptive field size.  Alternative not in use: int(round(s * .2)),
    # which would scale with s.
    rf_size = 10

    scores = []
    for s in scales:
        logger.info(
            "Training with crop {}, scale {}, patch size {}, patches {}, centroids {}"
            .format(crop, s, rf_size, n_patches, n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(
            training=True,
            result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)

        # spherical generator
        kmeans_generator = KMeansFeatureGenerator(
            n_centroids=n_centroids,
            rf_size=rf_size,
            result_path='data/mdl_kmeans_004_scale_{}_rf_{}'.format(
                s, rf_size),
            n_iterations=20,
            n_jobs=-1,
        )

        patch_extractor = models.KMeansFeatures.PatchSampler(
            n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
        images = train_x_crop_scale.transform()
        logger.info("Images ndarray shape: {}".format(images.shape))
        patches = patch_extractor.transform(images)
        logger.info("Patches ndarray shape: {}".format(patches.shape))

        kmeans_generator.fit(patches)

        # The patches are only needed for fitting the generator
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(
            images,
            save_to_file='data/data_kmeans_features_004_scale_{}_rf_{}.npy'.
            format(s, rf_size),
            memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()
        logger.info("Train X ndarray shape: {}".format(train_x.shape))

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
            'alpha': 500,
            'n_estimators': 250
        },
                               n_jobs=-1)
        wrapper.cross_validation(train_x,
                                 train_y,
                                 n_folds=2,
                                 parallel_estimator=True)
        scores.append((s, wrapper.cv_scores))
        del wrapper
        gc.collect()

    # Previously the collected scores were dropped when the function ended
    return scores
Exemple #27
0
def kmeans_003():
    """
    Grid-search the Ridge RF hyper-parameters, then fit a final model and write a submission.

    Not sure whether to use spherical or minibatch k-means, so maybe do one run with both.

    .106 on the leaderboard.  So the difference in CV scores narrowed
    """
    crop_size, scaled_size = 150, 15

    train_transformer = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_150_scale_15.npy',
        crop_size=crop_size,
        scaled_size=scaled_size,
        n_jobs=-1,
        memmap=True)

    # Spherical generator.
    # Minibatch alternative (not used here): models.KMeansFeatures.KMeansFeatureGenerator with
    # method='minibatch', n_init=1 and result_path='data/mdl_kmeans_002_new_minibatch'.
    feature_generator = KMeansFeatureGenerator(
        n_centroids=1600,
        rf_size=5,
        result_path='data/mdl_kmeans_002_new',
        n_iterations=20,
        n_jobs=-1)

    # Don't need a real fit, as the model is already cached - an empty placeholder is passed
    feature_generator.fit('')
    raw_train = train_transformer.transform()

    # Problematic here - memory usage spikes to ~ 11GB when threads return
    features = feature_generator.transform(
        raw_train,
        save_to_file='data/data_kmeans_features_002_new.npy',
        memmap=True)
    targets = classes.train_solutions.data

    # The raw images are no longer needed once the features exist
    del raw_train
    gc.collect()

    # The same name is rebound for the final model below so that the search wrapper
    # is not kept alive during the final fit
    estimator = ModelWrapper(
        models.Ridge.RidgeRFEstimator,
        {'alpha': 14, 'n_estimators': 500},
        n_jobs=-1)
    search_space = {
        'alpha': [150, 250, 500, 750, 1000],
        'n_estimators': [250],
    }

    # 500 trees and alpha 25 gives cv of .10972 on 2-fold CV, but 25 was on the upper range of the search space,
    # So need to re-run with larger range of alpha
    # Will hit 30GB of ram with 500 trees.
    estimator.grid_search(
        features, targets, search_space, refit=False, parallel_estimator=True)

    # Recorded grid search results:
    # [mean: -0.11024, std: 0.00018, params: {'n_estimators': 250, 'alpha': 20.0},
    # mean: -0.11000, std: 0.00019, params: {'n_estimators': 250, 'alpha': 25.0},
    # mean: -0.10969, std: 0.00018, params: {'n_estimators': 250, 'alpha': 35},
    # mean: -0.10934, std: 0.00019, params: {'n_estimators': 250, 'alpha': 50},
    # mean: -0.10892, std: 0.00025, params: {'n_estimators': 250, 'alpha': 75},
    # mean: -0.10860, std: 0.00025, params: {'n_estimators': 250, 'alpha': 100},
    # mean: -0.10828, std: 0.00019, params: {'n_estimators': 250, 'alpha': 150},
    # mean: -0.10789, std: 0.00016, params: {'n_estimators': 250, 'alpha': 250},
    # mean: -0.10775, std: 0.00024, params: {'n_estimators': 250, 'alpha': 500},
    # mean: -0.10779, std: 0.00022, params: {'n_estimators': 250, 'alpha': 750},
    # mean: -0.10784, std: 0.00023, params: {'n_estimators': 250, 'alpha': 1000}]

    # Fit the final model
    estimator = ModelWrapper(
        models.Ridge.RidgeRFEstimator,
        {'alpha': 500, 'n_estimators': 500},
        n_jobs=-1)
    estimator.fit(features, targets)

    test_transformer = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_150_scale_15.npy',
        crop_size=crop_size,
        scaled_size=scaled_size,
        n_jobs=-1,
        memmap=True)

    raw_test = test_transformer.transform()
    test_features = feature_generator.transform(
        raw_test,
        save_to_file='data/data_kmeans_test_features_003_new.npy',
        memmap=True)

    # Predict on the test features and write the submission file
    submission = classes.Submission(estimator.predict(test_features))
    submission.to_file('sub_kmeans_003.csv')
Exemple #28
0
def ensemble_001():
    """
    Ensemble of kmeans and random forest results
    Conducting some analysis of whether the errors from these two models for individual Ys are different

    Half of the data trains a Ridge RF model (k-means features) and a random
    forest (sampled pixel input); the other half evaluates them.  A second
    random forest is then fitted on the two models' stacked predictions.

    Ensembled error is .1149.

    Kmeans is better on every class than RF.
    """
    n_centroids, n_patches = 3000, 400000
    crop, s, rf_size = 150, 15, 5

    image_transformer = CropScaleImageTransformer(
        training=True,
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True,
    )
    centroid_features = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_ensemble_001',
        n_iterations=20,
        n_jobs=-1,
    )
    sampler_for_patches = models.KMeansFeatures.PatchSampler(
        n_patches=n_patches,
        patch_size=rf_size,
        n_jobs=-1,
    )

    all_images = image_transformer.transform()
    fit_patches = sampler_for_patches.transform(all_images)
    centroid_features.fit(fit_patches)

    # The patches are only needed for fitting the generator
    del fit_patches
    gc.collect()

    X = centroid_features.transform(
        all_images,
        save_to_file='data/data_ensemble_001.npy',
        memmap=True,
    )
    Y = classes.train_solutions.data

    # Unload some objects
    del all_images
    gc.collect()

    # Get the input for the RF so that we can split together
    pixel_sampler = SampleTransformer(training=True, steps=2, step_size=20, n_jobs=-1)
    pX = pixel_sampler.transform()

    # Manual split of train and test; one call keeps the rows of all three arrays aligned
    (train_x, test_x,
     ptrain_x, ptest_x,
     train_y, test_y) = train_test_split(X, pX, Y, test_size=0.5)

    ridge_rf = ModelWrapper(
        models.Ridge.RidgeRFEstimator,
        {'alpha': 500, 'n_estimators': 500},
        n_jobs=-1,
    )
    ridge_rf.fit(train_x, train_y)
    kmeans_preds = ridge_rf.predict(test_x)

    pixel_rf = ModelWrapper(
        RandomForestRegressor,
        {'n_estimators': 500, 'verbose': 3},
        n_jobs=-1,
    )
    pixel_rf.fit(ptrain_x, train_y)
    pixel_preds = pixel_rf.predict(ptest_x)

    # Report column-wise and overall RMSE for each base model
    base_results = [('Kmeans', kmeans_preds), ('Pixel RF', pixel_preds)]
    for title, held_out_preds in base_results:
        logger.info(title)
        classes.colwise_rmse(held_out_preds, test_y)
        classes.rmse(held_out_preds, test_y)

    logger.info("Ensembling predictions")
    # The ensemble input is the two models' predictions stacked side by side
    etrain_x = np.hstack([ridge_rf.predict(train_x), pixel_rf.predict(ptrain_x)])
    etest_x = np.hstack([kmeans_preds, pixel_preds])

    stacked_rf = ModelWrapper(
        RandomForestRegressor,
        {'n_estimators': 500, 'verbose': 3},
        n_jobs=-1,
    )
    stacked_rf.fit(etrain_x, train_y)
    ensemble_preds = stacked_rf.predict(etest_x)
    classes.colwise_rmse(ensemble_preds, test_y)
    classes.rmse(ensemble_preds, test_y)
# Script fragment: fit the final estimator and write the submission file.
# NOTE(review): `images`, `train_x`, `train_y`, `kmeans_generator`, `crop`, `s` and
# `n_centroids` are not defined in this fragment; they are presumably set up by
# earlier script statements - confirm before running this section on its own.

# Unload some objects for memory
del images
gc.collect()

# ModelWrapper is a convenience class that we created to automate some of the typical tasks
# like logging, grid search and cross validation.
# For fit, it is basically equivalent to calling fit on the estimator

# The estimator takes the X and y and trains a ridge regression (sklearn.linear_model.Ridge),
# predicts using the ridge regressor, then uses the results of the prediction to train a random forest.
wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
wrapper.fit(train_x, train_y)

# Transformer for the test images, configured with the same crop and scale values
# (`crop`, `s`) that appear in its result path
test_x_crop_scale = CropScaleImageTransformer(training=False,
                                              result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                              crop_size=crop,
                                              scaled_size=s,
                                              n_jobs=-1,
                                              memmap=True)

# Crop and scale the test images
test_images = test_x_crop_scale.transform()

# Generate the test features with the same generator object used earlier in the script
test_x = kmeans_generator.transform(test_images, save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)

# Predict on the test features
res = wrapper.predict(test_x)

# Generate a submission file
sub = classes.Submission(res)
sub.to_file('sub_kmeans_006.csv')
Exemple #30
0
# Scratch script: exercise the random forest models with debug logging enabled.
# NOTE(review): `classes` and `models` are used here although `import models` only
# appears further down; presumably both were imported earlier - confirm.
classes.logstream.setLevel(classes.logging.DEBUG)

# Cascade model: run its 'cv' step, then its 'train' step
a = models.RandomForest.RandomForestCascadeModel(cv_sample=0.1)
a.run('cv')
a.run('train')

# Plain random forest model with the estimator limited to 10 trees before training
b = models.RandomForest.RandomForestModel(cv_sample=0.1)
b.estimator.set_params(n_estimators=10)
b.run('train')

import models
from models.Base import CropScaleImageTransformer
from models.KMeansFeatures import KMeansFeatureGenerator

# Scratch script: crop/scale the training images and sample patches from one channel.
# No result_path is passed to the transformer here (the argument is commented out).
train_x_crop_scale = CropScaleImageTransformer(
    training=True,
    # result_path='data/data_train_crop_150_scale_15.npy',
    crop_size=150,
    scaled_size=15,
    n_jobs=-1,
    memmap=True)

raw_images = train_x_crop_scale.transform()

# Sampler configured for 1000 patches with patch_size=5
patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=1000,
                                                     patch_size=5,
                                                     n_jobs=-1)

# Index 0 of the last axis for the first 100 images
# (presumably the red channel, going by the variable name - confirm)
reds = raw_images[0:100, :, :, 0]
# NOTE: rebinds `a`, which was bound to a model object earlier in this script
a = patch_extractor.transform(reds)

kmeans_generator = KMeansFeatureGenerator(n_centroids=10,
                                          rf_size=5,