Exemple #1
0
def train_kmeans_generator(images,
                           n_centroids=3000,
                           n_patches=400000,
                           rf_size=5):
    """
    Fit a k-means feature generator on patches sampled from ``images`` and return it.

    Patch sampling walks the image ndarray in order and picks one random patch per visit, so
    with 1000 images the 1st and the 1001st patch both come from the first image.

    The feature generator normalizes and ZCA-whitens the patches before running spherical
    k-means to find the centroids.
    """
    centroid_path = 'data/mdl_kmeans_006_centroids_{}'.format(n_centroids)
    generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                       rf_size=rf_size,
                                       result_path=centroid_path,
                                       n_iterations=20,
                                       n_jobs=-1)

    # The sampled patch size must match the generator's receptive field size.
    sampler = models.KMeansFeatures.PatchSampler(n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
    generator.fit(sampler.transform(images))
    return generator
Exemple #2
0
def kmeans_007():
    """
    Increasing crop/scale size, rf size, centroids, and patches all at once.

    Steps: crop/scale the training images, sample patches, fit the k-means feature generator,
    extract strided features, then cross-validate a RidgeRFEstimator on them.  Returns None.

    Recorded result:
    2014-02-18 02:45:15 - Base - INFO - Cross validation completed in 5426.04788399.  Scores:
    2014-02-18 02:45:15 - Base - INFO - [-0.10834319 -0.10825868]
    """
    n_centroids = 5000
    s = 50
    crop = 200
    # Originally, 1600 centroids for 400,000 patches, or 250 patches per centroid
    # 800000 / 5000 = will give us 160 patches per centroid
    n_patches = 800000
    rf_size = 20
    # 31 x 31 = 961 patches per image, which is 10x more patches than the original settings
    # If we set stride 2, then it's 16 x 16 patches = 256, only twice as many patches
    stride = 2
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    images = train_x_crop_scale.transform()
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)

    # The path is a fixed literal: the previous `.format(n_centroids)` had no placeholder to
    # fill, so it never affected the resulting string and has been dropped.
    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_007',
        n_iterations=20,
        n_jobs=-1,
    )
    kmeans_generator.fit(patches)

    # The patches are only needed for fitting; release them before feature extraction.
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_007.npy',
        stride_size=stride,
        memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 250},
                           n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, parallel_estimator=True)
    """
Exemple #3
0
def kmeans_006_submission():
    """
    Final submission for the kmeans_006 settings.

    Fits the k-means feature generator on patches from the training images, trains a
    RidgeRFEstimator on the resulting features, predicts on the test images and writes the
    predictions to 'sub_kmeans_006.csv'.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5
    logger.info("Training with n_centroids {}".format(n_centroids))

    # Settings shared by the train and test crop/scale transformers.
    crop_scale_kwargs = dict(crop_size=crop, scaled_size=s, n_jobs=-1, memmap=True)

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        **crop_scale_kwargs)

    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
        n_iterations=20,
        n_jobs=-1)

    patch_extractor = models.KMeansFeatures.PatchSampler(
        n_patches=n_patches, patch_size=rf_size, n_jobs=-1)

    images = train_x_crop_scale.transform()
    patches = patch_extractor.transform(images)
    kmeans_generator.fit(patches)

    # Patches are only needed for fitting the centroids.
    del patches
    gc.collect()

    train_features_path = 'data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    train_x = kmeans_generator.transform(images, save_to_file=train_features_path, memmap=True)
    train_y = classes.train_solutions.data

    # Unload the images before training the estimator.
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        **crop_scale_kwargs)
    test_images = test_x_crop_scale.transform()

    test_features_path = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_x = kmeans_generator.transform(test_images, save_to_file=test_features_path, memmap=True)

    predictions = wrapper.predict(test_x)
    classes.Submission(predictions).to_file('sub_kmeans_006.csv')
def rf_size_10():
    """
    Cross-validate the k-means feature pipeline for the "008_rf10" experiment.

    Fits the k-means feature generator on patches from the training images, extracts features
    and cross-validates a RidgeRFEstimator on a 0.5 sample.  Returns None.

    Recorded result (pretty bad as well):
    2014-03-28 13:04:07 - Base - INFO - Cross validation completed in 1475.74401999.  Scores:
    2014-03-28 13:04:07 - Base - INFO - [-0.12217214 -0.12209735]
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    # NOTE(review): the function name and the 'rf10' file names suggest a receptive field of
    # 10, but the value used here is 5 -- confirm which is intended before re-running.
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    # NOTE(review): the test transformer is never used below; kept only in case constructing
    # it has side effects -- confirm and remove.
    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)

    # Fixed literal path: the former `.format(n_centroids)` had no placeholder to fill.
    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_008_rf10',
                                              n_iterations=20,
                                              n_jobs=-1,)

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    # Patches are only needed for fitting; release them before feature extraction.
    del patches
    gc.collect()

    # Fixed literal path here as well (no placeholder, so no `.format`).
    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_008_rf10.npy', memmap=True)
    train_y = classes.train_solutions.data

    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 500}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
def train_kmeans_generator(images, n_centroids=3000, n_patches=400000, rf_size=5):
    """
    Sample ``n_patches`` patches from ``images`` and fit a k-means feature generator on them.

    The generator is created with ``n_centroids`` centroids and receptive field ``rf_size``;
    the patch sampler uses the same ``rf_size`` as its patch size.  Returns the fitted
    generator.
    """
    model_path = 'data/mdl_kmeans_006_centroids_{}'.format(n_centroids)
    generator_kwargs = dict(n_centroids=n_centroids,
                            rf_size=rf_size,
                            result_path=model_path,
                            n_iterations=20,
                            n_jobs=-1)
    fitted = KMeansFeatureGenerator(**generator_kwargs)

    sampler = models.KMeansFeatures.PatchSampler(
        n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
    sampled = sampler.transform(images)

    fitted.fit(sampled)
    return fitted
Exemple #6
0
def kmeans_007():
    """
    Increasing crop/scale size, rf size, centroids, and patches all at once.

    Steps: crop/scale the training images, sample patches, fit the k-means feature generator,
    extract strided features, then cross-validate a RidgeRFEstimator on them.  Returns None.

    Recorded result:
    2014-02-18 02:45:15 - Base - INFO - Cross validation completed in 5426.04788399.  Scores:
    2014-02-18 02:45:15 - Base - INFO - [-0.10834319 -0.10825868]
    """
    n_centroids = 5000
    s = 50
    crop = 200
    # Originally, 1600 centroids for 400,000 patches, or 250 patches per centroid
    # 800000 / 5000 = will give us 160 patches per centroid
    n_patches = 800000
    rf_size = 20
    # 31 x 31 = 961 patches per image, which is 10x more patches than the original settings
    # If we set stride 2, then it's 16 x 16 patches = 256, only twice as many patches
    stride = 2
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)
    images = train_x_crop_scale.transform()
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    patches = patch_extractor.transform(images)

    # The path is a fixed literal: the previous `.format(n_centroids)` had no placeholder to
    # fill, so it never affected the resulting string and has been dropped.
    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path='data/mdl_kmeans_007',
                                              n_iterations=20,
                                              n_jobs=-1,)
    kmeans_generator.fit(patches)

    # The patches are only needed for fitting; release them before feature extraction.
    del patches
    gc.collect()

    train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_007.npy', stride_size=stride, memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
    wrapper.cross_validation(train_x, train_y, parallel_estimator=True)

    """
Exemple #7
0
def train_kmeans_generator(images,
                           n_centroids=3000,
                           n_patches=400000,
                           rf_size=5):
    """
    Build a KMeansFeatureGenerator, fit it on patches sampled from ``images``, and return it.

    ``rf_size`` is used both as the generator's receptive field size and as the sampler's
    patch size; ``n_patches`` patches are requested from the sampler.
    """
    result_path = 'data/mdl_kmeans_006_centroids_{}'.format(n_centroids)
    feature_generator = KMeansFeatureGenerator(n_centroids=n_centroids, rf_size=rf_size,
                                               result_path=result_path,
                                               n_iterations=20, n_jobs=-1)

    patch_sampler = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                       patch_size=rf_size,
                                                       n_jobs=-1)
    feature_generator.fit(patch_sampler.transform(images))
    return feature_generator
def train_kmeans_generator(images, n_centroids=3000, n_patches=400000, rf_size=5):
    """
    Extract patches from the image ndarray and train the k-means feature generator on them.

    Images are visited sequentially and one random patch is taken from each, so with 1000
    images the 1st and the 1001st patch both come from the first image.

    Before spherical k-means finds the centroids, the feature generator applies normalization
    and ZCA whitening.  The fitted generator is returned.
    """
    generator_settings = {
        'n_centroids': n_centroids,
        'rf_size': rf_size,
        'result_path': 'data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
        'n_iterations': 20,
        'n_jobs': -1,
    }
    generator = KMeansFeatureGenerator(**generator_settings)

    sampler = models.KMeansFeatures.PatchSampler(
        n_patches=n_patches,
        patch_size=rf_size,
        n_jobs=-1,
    )
    training_patches = sampler.transform(images)
    generator.fit(training_patches)
    return generator
Exemple #9
0
def kmeans_006():
    """
    Testing number of centroids

    For each candidate centroid count, fits the k-means feature generator on patches from the
    training images, extracts features and runs 2-fold cross validation of a RidgeRFEstimator.

    Recorded results:
    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]

    Returns:
        list of ``(n_centroids, wrapper.cv_scores)`` tuples, one per centroid count tried.
        (Previously the list was built and then discarded.)
    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    # These settings are the same for every run, so they are defined once outside the loop.
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    for n_centroids in n_centroids_vals:
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(
            training=True,
            result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)
        # NOTE(review): the test transformer is never used in this function; kept only in
        # case constructing it has side effects -- confirm and remove.
        test_x_crop_scale = CropScaleImageTransformer(
            training=False,
            result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)

        kmeans_generator = KMeansFeatureGenerator(
            n_centroids=n_centroids,
            rf_size=rf_size,
            result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
            n_iterations=20,
            n_jobs=-1,
        )

        patch_extractor = models.KMeansFeatures.PatchSampler(
            n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        # Patches are only needed for fitting; release them before feature extraction.
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(
            images,
            save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids),
            memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                               {'alpha': 500, 'n_estimators': 250},
                               n_jobs=-1)
        wrapper.cross_validation(train_x,
                                 train_y,
                                 n_folds=2,
                                 parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        # Drop the wrapper before the next (larger) run.
        del wrapper
        gc.collect()

    return scores
def kmeans_006():
    """
    Testing number of centroids

    For each candidate centroid count, fits the k-means feature generator on patches from the
    training images, extracts features and runs 2-fold cross validation of a RidgeRFEstimator.

    Recorded results:
    [(1000, array([-0.10926318, -0.10853047])),
     (2000, array([-0.10727502, -0.10710292])),
     (2500, array([-0.107019  , -0.10696262])),
     (3000, array([-0.10713973, -0.1066932 ]))]

    Returns:
        list of ``(n_centroids, wrapper.cv_scores)`` tuples, one per centroid count tried.
        (Previously the list was built and then discarded.)
    """
    n_centroids_vals = [1000, 2000, 2500, 3000]
    scores = []

    # These settings are the same for every run, so they are defined once outside the loop.
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    for n_centroids in n_centroids_vals:
        logger.info("Training with n_centroids {}".format(n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)
        # NOTE(review): the test transformer is never used in this function; kept only in
        # case constructing it has side effects -- confirm and remove.
        test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                      result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
                                                      crop_size=crop,
                                                      scaled_size=s,
                                                      n_jobs=-1,
                                                      memmap=True)

        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
                                                  n_iterations=20,
                                                  n_jobs=-1,)

        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()

        patches = patch_extractor.transform(images)

        kmeans_generator.fit(patches)

        # Patches are only needed for fitting; release them before feature extraction.
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

        score = (n_centroids, wrapper.cv_scores)
        logger.info("Scores: {}".format(score))
        scores.append(score)

        # Drop the wrapper before the next (larger) run.
        del wrapper
        gc.collect()

    return scores
Exemple #11
0
def rf_size_10():
    """
    Cross-validate the k-means feature pipeline for the "008_rf10" experiment.

    Fits the k-means feature generator on patches from the training images, extracts features
    and cross-validates a RidgeRFEstimator on a 0.5 sample.  Returns None.

    Recorded result (pretty bad as well):
    2014-03-28 13:04:07 - Base - INFO - Cross validation completed in 1475.74401999.  Scores:
    2014-03-28 13:04:07 - Base - INFO - [-0.12217214 -0.12209735]
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    # NOTE(review): the function name and the 'rf10' file names suggest a receptive field of
    # 10, but the value used here is 5 -- confirm which is intended before re-running.
    rf_size = 5

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)
    # NOTE(review): the test transformer is never used below; kept only in case constructing
    # it has side effects -- confirm and remove.
    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    # Fixed literal path: the former `.format(n_centroids)` had no placeholder to fill.
    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_008_rf10',
        n_iterations=20,
        n_jobs=-1,
    )

    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)
    images = train_x_crop_scale.transform()

    patches = patch_extractor.transform(images)

    kmeans_generator.fit(patches)

    # Patches are only needed for fitting; release them before feature extraction.
    del patches
    gc.collect()

    # Fixed literal path here as well (no placeholder, so no `.format`).
    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_008_rf10.npy',
        memmap=True)
    train_y = classes.train_solutions.data

    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.cross_validation(train_x,
                             train_y,
                             sample=0.5,
                             parallel_estimator=True)
Exemple #12
0
def ensemble_001():
    """
    Ensemble of the k-means model and a random forest on sampled pixels.

    Analyses whether the errors of the two models differ for individual Ys: both models are
    trained on one half of the data, scored on the other half, and a second-level random
    forest is then fitted on their stacked predictions.

    Recorded findings: ensembled error is .1149; kmeans is better on every class than RF.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5

    cropper = CropScaleImageTransformer(training=True, crop_size=crop, scaled_size=s,
                                        n_jobs=-1, memmap=True)
    feature_generator = KMeansFeatureGenerator(n_centroids=n_centroids, rf_size=rf_size,
                                               result_path='data/mdl_ensemble_001',
                                               n_iterations=20, n_jobs=-1)
    patch_sampler = models.KMeansFeatures.PatchSampler(n_patches=n_patches, patch_size=rf_size,
                                                       n_jobs=-1)

    images = cropper.transform()
    patches = patch_sampler.transform(images)
    feature_generator.fit(patches)

    # Patches are only needed for fitting the centroids.
    del patches
    gc.collect()

    kmeans_features = feature_generator.transform(images,
                                                  save_to_file='data/data_ensemble_001.npy',
                                                  memmap=True)
    targets = classes.train_solutions.data

    # Unload the images now that the features have been extracted.
    del images
    gc.collect()

    # Pixel-sample input for the RF, fetched up front so both inputs are split together.
    pixel_sampler = SampleTransformer(training=True, steps=2, step_size=20, n_jobs=-1)
    pixel_features = pixel_sampler.transform()

    # Manual 50/50 train/test split, applied consistently to both feature sets and targets.
    split = train_test_split(kmeans_features, pixel_features, targets, test_size=0.5)
    train_x, test_x, ptrain_x, ptest_x, train_y, test_y = split

    kmeans_model = ModelWrapper(models.Ridge.RidgeRFEstimator,
                                {'alpha': 500, 'n_estimators': 500},
                                n_jobs=-1)
    kmeans_model.fit(train_x, train_y)
    kmeans_preds = kmeans_model.predict(test_x)

    pixel_model = ModelWrapper(RandomForestRegressor,
                               {'n_estimators': 500, 'verbose': 3},
                               n_jobs=-1)
    pixel_model.fit(ptrain_x, train_y)
    pixel_preds = pixel_model.predict(ptest_x)

    # Score each first-level model, column-wise and overall.
    for label, preds in (('Kmeans', kmeans_preds), ('Pixel RF', pixel_preds)):
        logger.info(label)
        classes.colwise_rmse(preds, test_y)
        classes.rmse(preds, test_y)

    logger.info("Ensembling predictions")
    # Second-level inputs: the two models' predictions side by side.
    kmeans_train_preds = kmeans_model.predict(train_x)
    pixel_train_preds = pixel_model.predict(ptrain_x)
    etrain_x = np.hstack((kmeans_train_preds, pixel_train_preds))
    etest_x = np.hstack((kmeans_preds, pixel_preds))

    ensemble_model = ModelWrapper(RandomForestRegressor,
                                  {'n_estimators': 500, 'verbose': 3},
                                  n_jobs=-1)
    ensemble_model.fit(etrain_x, train_y)
    ensemble_preds = ensemble_model.predict(etest_x)
    classes.colwise_rmse(ensemble_preds, test_y)
    classes.rmse(ensemble_preds, test_y)
Exemple #13
0
def kmeans_003():
    """
    Grid search for Ridge RF parameters, followed by a final fit and submission.

    Not sure whether to use spherical or minibatch, so maybe do one run with both.

    .106 on the leaderboard.  So the difference in CV scores narrowed.
    """
    crop_scale_kwargs = dict(crop_size=150, scaled_size=15, n_jobs=-1, memmap=True)
    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_150_scale_15.npy',
        **crop_scale_kwargs)

    # Spherical generator.  (A minibatch variant -- method='minibatch', n_init=1, result path
    # 'data/mdl_kmeans_002_new_minibatch' -- was the alternative considered here.)
    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=1600,
        rf_size=5,
        result_path='data/mdl_kmeans_002_new',
        n_iterations=20,
        n_jobs=-1)

    # Don't need to fit on real patches, as the model is already cached; an empty string is
    # passed as a placeholder.
    kmeans_generator.fit('')
    images = train_x_crop_scale.transform()

    # Problematic here - memory usage spikes to ~ 11GB when threads return
    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_002_new.npy',
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 14, 'n_estimators': 500},
                           n_jobs=-1)
    # 500 trees and alpha 25 gives cv of .10972 on 2-fold CV, but 25 was on the upper range of the search space,
    # So need to re-run with larger range of alpha
    # Will hit 30GB of ram with 500 trees.
    param_grid = {
        'alpha': [150, 250, 500, 750, 1000],
        'n_estimators': [250],
    }
    wrapper.grid_search(train_x, train_y, param_grid, refit=False, parallel_estimator=True)

    # Recorded grid search results:
    # [mean: -0.11024, std: 0.00018, params: {'n_estimators': 250, 'alpha': 20.0},
    # mean: -0.11000, std: 0.00019, params: {'n_estimators': 250, 'alpha': 25.0},
    # mean: -0.10969, std: 0.00018, params: {'n_estimators': 250, 'alpha': 35},
    # mean: -0.10934, std: 0.00019, params: {'n_estimators': 250, 'alpha': 50},
    # mean: -0.10892, std: 0.00025, params: {'n_estimators': 250, 'alpha': 75},
    # mean: -0.10860, std: 0.00025, params: {'n_estimators': 250, 'alpha': 100},
    # mean: -0.10828, std: 0.00019, params: {'n_estimators': 250, 'alpha': 150},
    # mean: -0.10789, std: 0.00016, params: {'n_estimators': 250, 'alpha': 250},
    # mean: -0.10775, std: 0.00024, params: {'n_estimators': 250, 'alpha': 500},
    # mean: -0.10779, std: 0.00022, params: {'n_estimators': 250, 'alpha': 750},
    # mean: -0.10784, std: 0.00023, params: {'n_estimators': 250, 'alpha': 1000}]

    # Fit the final model (the name is rebound so the search wrapper is not kept alive).
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_150_scale_15.npy',
        **crop_scale_kwargs)
    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_kmeans_test_features_003_new.npy',
        memmap=True)

    predictions = wrapper.predict(test_x)
    classes.Submission(predictions).to_file('sub_kmeans_003.csv')
Exemple #14
0
def kmeans_006_submission():
    """
    Final submission for the kmeans_006 settings.

    Fits the k-means feature generator on patches from the training images, trains a
    RidgeRFEstimator on the resulting features, predicts on the test images and writes the
    predictions to 'sub_kmeans_006.csv'.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5
    logger.info("Training with n_centroids {}".format(n_centroids))

    train_images_path = 'data/data_train_crop_{}_scale_{}.npy'.format(crop, s)
    train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                   result_path=train_images_path,
                                                   crop_size=crop,
                                                   scaled_size=s,
                                                   n_jobs=-1,
                                                   memmap=True)

    centroids_path = 'data/mdl_kmeans_006_centroids_{}'.format(n_centroids)
    kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                              rf_size=rf_size,
                                              result_path=centroids_path,
                                              n_iterations=20,
                                              n_jobs=-1)

    patch_extractor = models.KMeansFeatures.PatchSampler(
        n_patches=n_patches, patch_size=rf_size, n_jobs=-1)

    images = train_x_crop_scale.transform()
    patches = patch_extractor.transform(images)
    kmeans_generator.fit(patches)

    # Patches are only needed for fitting the centroids.
    del patches
    gc.collect()

    train_features_path = 'data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    train_x = kmeans_generator.transform(images, save_to_file=train_features_path, memmap=True)
    train_y = classes.train_solutions.data

    # Unload the images before training the estimator.
    del images
    gc.collect()

    estimator_params = {'alpha': 500, 'n_estimators': 500}
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, estimator_params, n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_images_path = 'data/data_test_crop_{}_scale_{}.npy'.format(crop, s)
    test_x_crop_scale = CropScaleImageTransformer(training=False,
                                                  result_path=test_images_path,
                                                  crop_size=crop,
                                                  scaled_size=s,
                                                  n_jobs=-1,
                                                  memmap=True)
    test_images = test_x_crop_scale.transform()

    test_features_path = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_x = kmeans_generator.transform(test_images, save_to_file=test_features_path, memmap=True)

    submission = classes.Submission(wrapper.predict(test_x))
    submission.to_file('sub_kmeans_006.csv')
Exemple #15
0
def kmeans_004():
    """
    Tuning the scale/crop and RF size parameters

    For each scale, loads the cropped and scaled training images, samples patches from them, fits the k-means
    feature generator on those patches, generates features for the training images and scores a RidgeRF model
    with 2-fold cross validation.  Each (scale, cv_scores) tuple is logged as soon as it is available, and the
    complete list is logged once every scale has run.  Returns nothing.

    First number is the scaling, cropped to 200, with rf size of 5.  75 scaling took forever to transform, so killed
    [(30, array([-0.11374265, -0.1134896 ]))
     (50, array([-0.11677854, -0.11696837]))]

    Trying again with larger RF size of 10.
    As a note, scale to 30 with rf 10 takes about 25 minutes to extract features on the train set
    Scale to 50 with rf 10 takes almost 90 minutes.
    [(30, array([-0.10828216, -0.1081058 ])),
    (50, array([-0.10840914, -0.10868195]))]
    Interesting that scale size of 50 does worse

    Crop is not 150, so this is not really an apples to apples comparison with kmeans_003

    It is possibly worth making a submission with scale 30 and rf size 10
    """
    # Only the scale varies in this experiment; everything else is held fixed for every run.
    crop = 200  # Should probably also try 250
    scales = [30, 50]  # Scaling is probably the most important part here
    n_centroids = 1600
    n_patches = 400000
    # The alternative considered was to scale the RF with the image: int(round(s * .2))
    rf_size = 10

    scores = []
    for s in scales:
        logger.info("Training with crop {}, scale {}, patch size {}, patches {}, centroids {}".format(crop, s, rf_size, n_patches, n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(training=True,
                                                       result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
                                                       crop_size=crop,
                                                       scaled_size=s,
                                                       n_jobs=-1,
                                                       memmap=True)

        # spherical generator
        kmeans_generator = KMeansFeatureGenerator(n_centroids=n_centroids,
                                                  rf_size=rf_size,
                                                  result_path='data/mdl_kmeans_004_scale_{}_rf_{}'.format(s, rf_size),
                                                  n_iterations=20,
                                                  n_jobs=-1,)

        patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                             patch_size=rf_size,
                                                             n_jobs=-1)
        images = train_x_crop_scale.transform()
        logger.info("Images ndarray shape: {}".format(images.shape))
        patches = patch_extractor.transform(images)
        logger.info("Patches ndarray shape: {}".format(patches.shape))

        kmeans_generator.fit(patches)

        # The patches are only needed for fitting, so release them before generating features
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_004_scale_{}_rf_{}.npy'.format(s, rf_size), memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()
        logger.info("Train X ndarray shape: {}".format(train_x.shape))

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {'alpha': 500, 'n_estimators': 250}, n_jobs=-1)
        wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

        score = (s, wrapper.cv_scores)
        # Log right away, so a long run that gets killed part-way still leaves its completed results in the log
        logger.info("Score: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()

    logger.info("Scores: {}".format(scores))
Exemple #16
0
def kmeans_004():
    """
    Tuning the scale/crop and RF size parameters

    For each scale, loads the cropped and scaled training images, samples patches from them, fits the k-means
    feature generator on those patches, generates features for the training images and scores a RidgeRF model
    with 2-fold cross validation.  Each (scale, cv_scores) tuple is logged as soon as it is available, and the
    complete list is logged once every scale has run.  Returns nothing.

    First number is the scaling, cropped to 200, with rf size of 5.  75 scaling took forever to transform, so killed
    [(30, array([-0.11374265, -0.1134896 ]))
     (50, array([-0.11677854, -0.11696837]))]

    Trying again with larger RF size of 10.
    As a note, scale to 30 with rf 10 takes about 25 minutes to extract features on the train set
    Scale to 50 with rf 10 takes almost 90 minutes.
    [(30, array([-0.10828216, -0.1081058 ])),
    (50, array([-0.10840914, -0.10868195]))]
    Interesting that scale size of 50 does worse

    Crop is not 150, so this is not really an apples to apples comparison with kmeans_003

    It is possibly worth making a submission with scale 30 and rf size 10
    """
    # Only the scale varies in this experiment; everything else is held fixed for every run.
    crop = 200  # Should probably also try 250
    scales = [30, 50]  # Scaling is probably the most important part here
    n_centroids = 1600
    n_patches = 400000
    # The alternative considered was to scale the RF with the image: int(round(s * .2))
    rf_size = 10

    scores = []
    for s in scales:
        logger.info(
            "Training with crop {}, scale {}, patch size {}, patches {}, centroids {}"
            .format(crop, s, rf_size, n_patches, n_centroids))

        train_x_crop_scale = CropScaleImageTransformer(
            training=True,
            result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
            crop_size=crop,
            scaled_size=s,
            n_jobs=-1,
            memmap=True)

        # spherical generator
        kmeans_generator = KMeansFeatureGenerator(
            n_centroids=n_centroids,
            rf_size=rf_size,
            result_path='data/mdl_kmeans_004_scale_{}_rf_{}'.format(
                s, rf_size),
            n_iterations=20,
            n_jobs=-1,
        )

        patch_extractor = models.KMeansFeatures.PatchSampler(
            n_patches=n_patches, patch_size=rf_size, n_jobs=-1)
        images = train_x_crop_scale.transform()
        logger.info("Images ndarray shape: {}".format(images.shape))
        patches = patch_extractor.transform(images)
        logger.info("Patches ndarray shape: {}".format(patches.shape))

        kmeans_generator.fit(patches)

        # The patches are only needed for fitting, so release them before
        # generating features
        del patches
        gc.collect()

        train_x = kmeans_generator.transform(
            images,
            save_to_file='data/data_kmeans_features_004_scale_{}_rf_{}.npy'.
            format(s, rf_size),
            memmap=True)
        train_y = classes.train_solutions.data
        # Unload some objects
        del images
        gc.collect()
        logger.info("Train X ndarray shape: {}".format(train_x.shape))

        wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
            'alpha': 500,
            'n_estimators': 250
        },
                               n_jobs=-1)
        wrapper.cross_validation(train_x,
                                 train_y,
                                 n_folds=2,
                                 parallel_estimator=True)

        score = (s, wrapper.cv_scores)
        # Log right away, so a long run that gets killed part-way still
        # leaves its completed results in the log
        logger.info("Score: {}".format(score))
        scores.append(score)

        del wrapper
        gc.collect()

    logger.info("Scores: {}".format(scores))
Exemple #17
0
def kmeans_005():
    """
    Check whether sampling patches from train + test images beats train only.

    Every combination of patch count and "include test images" flag is run:
    patches are sampled (optionally from the stacked train and test images),
    the k-means feature generator is fitted on them, features are generated
    for the training images only, and a RidgeRF model is scored with 2-fold
    cross validation.  Each score tuple is logged; nothing is returned.

    Recorded results, as (n_patches, include test images, cv scores):
    [(500000, False, array([-0.10799986, -0.10744586])),
    (500000, True, array([-0.10790803, -0.10733288])),
    (600000, False, array([-0.10812188, -0.10735988])),
    (600000, True, array([-0.10778652, -0.10752664]))]
    """
    # Settings shared by every run of the grid
    scaled_size = 15
    crop_size = 150
    centroid_count = 1600
    patch_size = 5
    train_cache = 'data/data_train_crop_{}_scale_{}.npy'.format(
        crop_size, scaled_size)
    test_cache = 'data/data_test_crop_{}_scale_{}.npy'.format(
        crop_size, scaled_size)

    results = []
    for patch_count in (500000, 600000, 700000):
        for with_test in (False, True):
            logger.info(
                "Training with n_patches {}, with test images {}".format(
                    patch_count, with_test))

            train_loader = CropScaleImageTransformer(training=True,
                                                     result_path=train_cache,
                                                     crop_size=crop_size,
                                                     scaled_size=scaled_size,
                                                     n_jobs=-1,
                                                     memmap=True)
            test_loader = CropScaleImageTransformer(training=False,
                                                    result_path=test_cache,
                                                    crop_size=crop_size,
                                                    scaled_size=scaled_size,
                                                    n_jobs=-1,
                                                    memmap=True)

            model_path = 'data/mdl_kmeans_005_patches_{}_test{}'.format(
                patch_count, with_test)
            generator = KMeansFeatureGenerator(n_centroids=centroid_count,
                                               rf_size=patch_size,
                                               result_path=model_path,
                                               n_iterations=20,
                                               n_jobs=-1)

            sampler = models.KMeansFeatures.PatchSampler(
                n_patches=patch_count, patch_size=patch_size, n_jobs=-1)

            # Patch source: the train images, plus the test images if asked
            patch_source = train_loader.transform()
            if with_test:
                test_images = test_loader.transform()
                patch_source = np.vstack([patch_source, test_images])
            logger.info(
                "Extracting patches from images ndarray shape: {}".format(
                    patch_source.shape))

            sampled = sampler.transform(patch_source)
            logger.info("Patches ndarray shape: {}".format(sampled.shape))

            generator.fit(sampled)

            # The patches are only needed for fitting
            del sampled
            gc.collect()

            # Features are always generated from the train images alone, so
            # load them again rather than reuse the (maybe stacked) array
            patch_source = train_loader.transform()
            logger.info(
                "Generating features on images ndarray shape: {}".format(
                    patch_source.shape))
            features_path = (
                'data/data_kmeans_features_005_patches_{}_test_{}.npy'.format(
                    patch_count, with_test))
            train_x = generator.transform(patch_source,
                                          save_to_file=features_path,
                                          memmap=True)
            train_y = classes.train_solutions.data
            # Drop the image array before the memory hungry CV step
            del patch_source
            gc.collect()

            estimator_params = dict(alpha=500, n_estimators=250)
            cv_wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                                      estimator_params,
                                      n_jobs=-1)
            cv_wrapper.cross_validation(train_x,
                                        train_y,
                                        n_folds=2,
                                        parallel_estimator=True)

            outcome = (patch_count, with_test, cv_wrapper.cv_scores)
            logger.info("Score: {}".format(outcome))
            results.append(outcome)

            del cv_wrapper
            gc.collect()
Exemple #18
0
def kmeans_003():
    """
    Grid search over the Ridge RF parameters, then fit and submit.

    Uses the k-means model stored at 'data/mdl_kmeans_002_new' to generate
    features for the cropped/scaled training images, grid searches alpha for
    the RidgeRF estimator, then fits a final model with alpha 500 and 500
    estimators and writes the test predictions to 'sub_kmeans_003.csv'.

    Not sure whether to use spherical or minibatch, so maybe do one run with both

    .106 on the leaderboard.  So the difference in CV scores narrowed
    """
    # Both image transformers share everything except training/result_path
    crop_scale_kwargs = dict(crop_size=150,
                             scaled_size=15,
                             n_jobs=-1,
                             memmap=True)

    train_loader = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_150_scale_15.npy',
        **crop_scale_kwargs)

    # Spherical generator.  The minibatch alternative would be built with
    # result_path='data/mdl_kmeans_002_new_minibatch', method='minibatch'
    # and n_init=1 in place of n_iterations.
    generator = KMeansFeatureGenerator(n_centroids=1600,
                                       rf_size=5,
                                       result_path='data/mdl_kmeans_002_new',
                                       n_iterations=20,
                                       n_jobs=-1)

    # Don't need real patches, as the fitted model is already cached
    generator.fit('')
    train_images = train_loader.transform()

    # Problematic here - memory usage spikes to ~ 11GB when threads return
    train_x = generator.transform(
        train_images,
        save_to_file='data/data_kmeans_features_002_new.npy',
        memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del train_images
    gc.collect()

    model = ModelWrapper(models.Ridge.RidgeRFEstimator,
                         dict(alpha=14, n_estimators=500),
                         n_jobs=-1)
    search_space = {
        'alpha': [150, 250, 500, 750, 1000],
        'n_estimators': [250],
    }

    # 500 trees and alpha 25 gives cv of .10972 on 2-fold CV, but 25 was on the upper range of the search space,
    # So need to re-run with larger range of alpha
    # Will hit 30GB of ram with 500 trees.
    model.grid_search(train_x,
                      train_y,
                      search_space,
                      refit=False,
                      parallel_estimator=True)

    # Recorded grid search results:
    # [mean: -0.11024, std: 0.00018, params: {'n_estimators': 250, 'alpha': 20.0},
    # mean: -0.11000, std: 0.00019, params: {'n_estimators': 250, 'alpha': 25.0},
    # mean: -0.10969, std: 0.00018, params: {'n_estimators': 250, 'alpha': 35},
    # mean: -0.10934, std: 0.00019, params: {'n_estimators': 250, 'alpha': 50},
    # mean: -0.10892, std: 0.00025, params: {'n_estimators': 250, 'alpha': 75},
    # mean: -0.10860, std: 0.00025, params: {'n_estimators': 250, 'alpha': 100},
    # mean: -0.10828, std: 0.00019, params: {'n_estimators': 250, 'alpha': 150},
    # mean: -0.10789, std: 0.00016, params: {'n_estimators': 250, 'alpha': 250},
    # mean: -0.10775, std: 0.00024, params: {'n_estimators': 250, 'alpha': 500},
    # mean: -0.10779, std: 0.00022, params: {'n_estimators': 250, 'alpha': 750},
    # mean: -0.10784, std: 0.00023, params: {'n_estimators': 250, 'alpha': 1000}]

    # Fit the final model (rebinding the name lets go of the search wrapper)
    model = ModelWrapper(models.Ridge.RidgeRFEstimator,
                         dict(alpha=500, n_estimators=500),
                         n_jobs=-1)
    model.fit(train_x, train_y)

    test_loader = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_150_scale_15.npy',
        **crop_scale_kwargs)
    test_images = test_loader.transform()
    test_x = generator.transform(
        test_images,
        save_to_file='data/data_kmeans_test_features_003_new.npy',
        memmap=True)

    predictions = model.predict(test_x)
    submission = classes.Submission(predictions)
    submission.to_file('sub_kmeans_003.csv')
# Scratch script: fits a tiny k-means feature generator on a single slice of
# the cropped/scaled training images and transforms that slice with a stride.
# NOTE(review): `b` is not defined anywhere in this snippet; presumably it was
# created earlier in the session - confirm before running.
b.run('train')


import models
from models.Base import CropScaleImageTransformer
from models.KMeansFeatures import KMeansFeatureGenerator

# No result_path is passed here (it is commented out below).
train_x_crop_scale = CropScaleImageTransformer(training=True,
                                               # result_path='data/data_train_crop_150_scale_15.npy',
                                               crop_size=150,
                                               scaled_size=15,
                                               n_jobs=-1,
                                               memmap=True)

raw_images = train_x_crop_scale.transform()

# Sampler configured for 1000 patches of size 5
patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=1000,
                                                     patch_size=5,
                                                     n_jobs=-1)

# First 100 entries along axis 0, index 0 of the last axis.
# Presumably the red channel of the first 100 images, going by the name - TODO confirm
reds = raw_images[0:100, :, :, 0]
a = patch_extractor.transform(reds)

# Deliberately small model: 10 centroids, single job, throwaway result path
kmeans_generator = KMeansFeatureGenerator(n_centroids=10,
                                          rf_size=5,
                                          result_path='foo.npy',
                                          n_iterations=20,
                                          n_jobs=1)

# Fit on the sampled patches, then generate features for the same slice
kmeans_generator.fit(a)
train_reds = kmeans_generator.transform(reds, stride_size=2)
Exemple #20
0
# Scratch script: fits a tiny k-means feature generator on a single slice of
# the cropped/scaled training images and transforms that slice with a stride.
# NOTE(review): `b` is not defined anywhere in this snippet; presumably it was
# created earlier in the session - confirm before running.
b.run('train')

import models
from models.Base import CropScaleImageTransformer
from models.KMeansFeatures import KMeansFeatureGenerator

# No result_path is passed here (it is commented out below).
train_x_crop_scale = CropScaleImageTransformer(
    training=True,
    # result_path='data/data_train_crop_150_scale_15.npy',
    crop_size=150,
    scaled_size=15,
    n_jobs=-1,
    memmap=True)

raw_images = train_x_crop_scale.transform()

# Sampler configured for 1000 patches of size 5
patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=1000,
                                                     patch_size=5,
                                                     n_jobs=-1)

# First 100 entries along axis 0, index 0 of the last axis.
# Presumably the red channel of the first 100 images, going by the name - TODO confirm
reds = raw_images[0:100, :, :, 0]
a = patch_extractor.transform(reds)

# Deliberately small model: 10 centroids, single job, throwaway result path
kmeans_generator = KMeansFeatureGenerator(n_centroids=10,
                                          rf_size=5,
                                          result_path='foo.npy',
                                          n_iterations=20,
                                          n_jobs=1)

# Fit on the sampled patches, then generate features for the same slice
kmeans_generator.fit(a)
train_reds = kmeans_generator.transform(reds, stride_size=2)
Exemple #21
0
def ensemble_001():
    """
    Ensemble of the k-means (RidgeRF) model and a random forest on sampled pixels.

    Both feature sets are split 50/50 together with the targets.  Each base model is fitted on the first half
    and scored on the second, then a random forest is fitted on the two models' stacked predictions and scored
    the same way.  Used to analyse whether the errors of the two models on the individual Ys differ.

    Ensembled error is .1149.

    Kmeans is better on every class than RF.
    """
    centroid_count = 3000
    scaled_size = 15
    crop_size = 150
    patch_count = 400000
    patch_size = 5

    image_loader = CropScaleImageTransformer(training=True,
                                             crop_size=crop_size,
                                             scaled_size=scaled_size,
                                             n_jobs=-1,
                                             memmap=True)
    generator = KMeansFeatureGenerator(n_centroids=centroid_count,
                                       rf_size=patch_size,
                                       result_path='data/mdl_ensemble_001',
                                       n_iterations=20,
                                       n_jobs=-1)
    patch_sampler = models.KMeansFeatures.PatchSampler(n_patches=patch_count,
                                                       patch_size=patch_size,
                                                       n_jobs=-1)

    train_images = image_loader.transform()
    sampled = patch_sampler.transform(train_images)
    generator.fit(sampled)

    # The patches are only needed for fitting
    del sampled
    gc.collect()

    kmeans_features = generator.transform(train_images, save_to_file='data/data_ensemble_001.npy', memmap=True)
    targets = classes.train_solutions.data

    # Unload some objects
    del train_images
    gc.collect()

    # Get the input for the RF so that we can split together
    pixel_sampler = SampleTransformer(training=True, steps=2, step_size=20, n_jobs=-1)
    pixel_features = pixel_sampler.transform()

    # manual split of train and test
    split = train_test_split(kmeans_features, pixel_features, targets, test_size=0.5)
    train_x, test_x, ptrain_x, ptest_x, train_y, test_y = split

    def report(predictions):
        # Column-wise and overall RMSE of the predictions against the held-out half
        classes.colwise_rmse(predictions, test_y)
        classes.rmse(predictions, test_y)

    kmeans_model = ModelWrapper(models.Ridge.RidgeRFEstimator, dict(alpha=500, n_estimators=500), n_jobs=-1)
    kmeans_model.fit(train_x, train_y)
    kmeans_preds = kmeans_model.predict(test_x)

    pixel_model = ModelWrapper(RandomForestRegressor, dict(n_estimators=500, verbose=3), n_jobs=-1)
    pixel_model.fit(ptrain_x, train_y)
    pixel_preds = pixel_model.predict(ptest_x)

    logger.info('Kmeans')
    report(kmeans_preds)
    logger.info('Pixel RF')
    report(pixel_preds)

    logger.info("Ensembling predictions")
    # Second-level inputs are the two base models' predictions placed side by side
    kmeans_train_preds = kmeans_model.predict(train_x)
    pixel_train_preds = pixel_model.predict(ptrain_x)
    stacked_train = np.hstack((kmeans_train_preds, pixel_train_preds))
    stacked_test = np.hstack((kmeans_preds, pixel_preds))

    stacker = ModelWrapper(RandomForestRegressor, dict(n_estimators=500, verbose=3), n_jobs=-1)
    stacker.fit(stacked_train, train_y)
    report(stacker.predict(stacked_test))
Exemple #22
0
def kmeans_005():
    """
    Check whether sampling patches from train + test images beats train only.

    Every combination of patch count and "include test images" flag is run: patches are sampled (optionally
    from the stacked train and test images), the k-means feature generator is fitted on them, features are
    generated for the training images only, and a RidgeRF model is scored with 2-fold cross validation.
    Each score tuple is logged; nothing is returned.

    Recorded results, as (n_patches, include test images, cv scores):
    [(500000, False, array([-0.10799986, -0.10744586])),
    (500000, True, array([-0.10790803, -0.10733288])),
    (600000, False, array([-0.10812188, -0.10735988])),
    (600000, True, array([-0.10778652, -0.10752664]))]
    """
    # Settings shared by every run of the grid
    scaled_size = 15
    crop_size = 150
    centroid_count = 1600
    patch_size = 5
    train_cache = 'data/data_train_crop_{}_scale_{}.npy'.format(crop_size, scaled_size)
    test_cache = 'data/data_test_crop_{}_scale_{}.npy'.format(crop_size, scaled_size)

    results = []
    for patch_count in (500000, 600000, 700000):
        for with_test in (False, True):
            logger.info("Training with n_patches {}, with test images {}".format(patch_count, with_test))

            train_loader = CropScaleImageTransformer(training=True, result_path=train_cache, crop_size=crop_size,
                                                     scaled_size=scaled_size, n_jobs=-1, memmap=True)
            test_loader = CropScaleImageTransformer(training=False, result_path=test_cache, crop_size=crop_size,
                                                    scaled_size=scaled_size, n_jobs=-1, memmap=True)

            model_path = 'data/mdl_kmeans_005_patches_{}_test{}'.format(patch_count, with_test)
            generator = KMeansFeatureGenerator(n_centroids=centroid_count, rf_size=patch_size,
                                               result_path=model_path, n_iterations=20, n_jobs=-1)

            sampler = models.KMeansFeatures.PatchSampler(n_patches=patch_count, patch_size=patch_size, n_jobs=-1)

            # Patch source: the train images, plus the test images if asked
            patch_source = train_loader.transform()
            if with_test:
                test_images = test_loader.transform()
                patch_source = np.vstack([patch_source, test_images])
            logger.info("Extracting patches from images ndarray shape: {}".format(patch_source.shape))

            sampled = sampler.transform(patch_source)
            logger.info("Patches ndarray shape: {}".format(sampled.shape))

            generator.fit(sampled)

            # The patches are only needed for fitting
            del sampled
            gc.collect()

            # Features are always generated from the train images alone, so load them again rather than
            # reuse the (maybe stacked) array
            patch_source = train_loader.transform()
            logger.info("Generating features on images ndarray shape: {}".format(patch_source.shape))
            features_path = 'data/data_kmeans_features_005_patches_{}_test_{}.npy'.format(patch_count, with_test)
            train_x = generator.transform(patch_source, save_to_file=features_path, memmap=True)
            train_y = classes.train_solutions.data
            # Drop the image array before the memory hungry CV step
            del patch_source
            gc.collect()

            cv_wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, dict(alpha=500, n_estimators=250), n_jobs=-1)
            cv_wrapper.cross_validation(train_x, train_y, n_folds=2, parallel_estimator=True)

            outcome = (patch_count, with_test, cv_wrapper.cv_scores)
            logger.info("Score: {}".format(outcome))
            results.append(outcome)

            del cv_wrapper
            gc.collect()