def ridge_rf_001(outfile='sub_ridge_rf_001.csv'):
    """Run the legacy Ridge/RF model end to end and write a submission file.

    First drives the older RidgeRFModel through its 'cv', 'train' and
    'predict' stages and writes the resulting predictions to ``outfile``.
    Then exercises the newer ModelWrapper API on precomputed Ridge/RF
    feature files: a small grid search over alpha / n_estimators on a 10%
    sample, followed by a full fit and a prediction on the test features
    (the prediction is computed but not written anywhere).
    """
    legacy_model = models.Ridge.RidgeRFModel(cv_sample=0.5, cv_folds=2)
    for stage in ('cv', 'train', 'predict'):
        legacy_model.run(stage)
    classes.Submission(legacy_model.test_y).to_file(outfile)

    # Testing this with new models
    train_predictors_file = 'data/data_ridge_rf_train_001.npy'
    test_predictors_file = 'data/data_ridge_rf_test_001.npy'
    features = np.load(train_predictors_file)
    targets = classes.train_solutions.data

    estimator_params = {
        'alpha': 14,
        'n_estimators': 10,
        'verbose': 3,
        'oob_score': True,
    }
    wrapper = models.Base.ModelWrapper(models.Ridge.RidgeRFEstimator,
                                       estimator_params, n_jobs=-1)
    # wrapper.cross_validation(features, targets, sample=0.5, n_folds=3)
    wrapper.grid_search(features, targets,
                        {'alpha': [1, 2], 'n_estimators': [5]},
                        sample=0.1)

    test_features = np.load(test_predictors_file)
    wrapper.fit(features, targets)
    # NOTE(review): result is intentionally left unused, matching the original
    pred = wrapper.predict(test_features)
def random_forest_001(outfile="sub_random_forest_001.csv", n_jobs=1):
    """Train the central-pixel random forest and write its predictions.

    Uses a sample of central pixels in RGB space as model inputs.
    3-fold CV using half the training set previously reported an RMSE
    of roughly .126.

    :param outfile: path of the submission CSV to write
    :param n_jobs: parallelism passed through to the random forest
    """
    rf = models.RandomForest.RandomForestModel(n_jobs=n_jobs)
    rf.run('train')
    preds = rf.run('predict')
    classes.Submission(preds).to_file(outfile)
def central_pixel_benchmark(outfile="sub_central_pixel_001.csv"):
    """Replicate the central-pixel benchmark.

    The benchmark clusters training galaxies according to the color in the
    center of the image, then assigns the associated probability values to
    like-colored images in the test set.

    :param outfile: path of the submission CSV to write
    """
    averages = models.Benchmarks.CentralPixelBenchmark().execute()
    # Write the per-cluster averages out as the submission
    classes.Submission(averages).to_file(outfile)
def kmeans_006_submission():
    """Build the kmeans_006 final submission from cached KMeans features.

    NOTE(review): a second function with this exact name is defined later in
    this module; at import time that later definition shadows this one.

    Recorded 2-fold CV scores: [-0.11018943, -0.10946863]
    (cross validation completed in ~1487s).
    """
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    features_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    train_x = kmeans_generator.transform(images, save_to_file=features_file, memmap=True)
    train_y = classes.train_solutions.data

    # Free the raw images before fitting — features are all we need now
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)
    test_images = test_x_crop_scale.transform()

    test_features_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_x = kmeans_generator.transform(test_images,
                                        save_to_file=test_features_file,
                                        memmap=True)

    classes.Submission(wrapper.predict(test_x)).to_file('sub_kmeans_006.csv')
def extra_trees_submission():
    """Submission using Ridge + ExtraTrees on the kmeans_006 features.

    Somehow the submission on the leaderboard scores 0.22.

    NOTE(review): the save_to_file paths reuse the kmeans_006 feature cache
    files ('..._006_centroids_...'), so this run shares (or overwrites) that
    cache — confirm this is intentional.
    """
    crop = 150
    s = 15
    n_centroids = 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    features_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    train_x = kmeans_generator.transform(images, save_to_file=features_file, memmap=True)
    train_y = classes.train_solutions.data

    # Drop the raw images once features have been generated
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)
    test_images = test_x_crop_scale.transform()

    test_features_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_x = kmeans_generator.transform(test_images,
                                        save_to_file=test_features_file,
                                        memmap=True)

    classes.Submission(wrapper.predict(test_x)).to_file('sub_kmeans_008.csv')
def train_set_average_benchmark(outfile="sub_average_benchmark_000.csv"):
    """
    What should be the actual baseline.  Takes the training set solutions,
    averages them, and uses that as the submission for every row in the
    test set.

    :param outfile: path of the submission CSV to write
    """
    start_time = time.time()
    training_data = classes.TrainSolutions().data
    # Column-wise mean of all training solutions: one constant prediction row
    solutions = np.mean(training_data, axis=0)

    # Calculate an RMSE of the constant prediction against the training set
    train_solution = np.tile(solutions, (N_TRAIN, 1))
    rmse = classes.rmse(train_solution, training_data)
    # FIX: the RMSE was previously computed but discarded — report it
    logger.info("Training RMSE of constant-mean benchmark: {}".format(rmse))

    # Repeat the same averaged row for every test example and write it out
    solution = classes.Submission(np.tile(solutions, (N_TEST, 1)))
    solution.to_file(outfile)

    end_time = time.time()
    logger.info("Model completed in {}".format(end_time - start_time))
def kmeans_006_submission():
    """Final kmeans_006 submission, training the feature encoder from scratch.

    Pipeline: crop/scale training images -> sample patches -> fit KMeans
    feature encoder -> encode train/test images -> fit Ridge/RF -> predict
    and write 'sub_kmeans_006.csv'.

    NOTE(review): this redefines a kmeans_006_submission declared earlier in
    the module; at import time this later definition wins.
    """
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5
    logger.info("Training with n_centroids {}".format(n_centroids))

    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)
    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
        n_iterations=20,
        n_jobs=-1,
    )
    patch_extractor = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                         patch_size=rf_size,
                                                         n_jobs=-1)

    # Learn the centroids from randomly sampled patches
    images = train_x_crop_scale.transform()
    patches = patch_extractor.transform(images)
    kmeans_generator.fit(patches)
    del patches
    gc.collect()

    features_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    train_x = kmeans_generator.transform(images, save_to_file=features_file, memmap=True)
    train_y = classes.train_solutions.data

    # The raw images are no longer needed once features exist
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)
    test_images = test_x_crop_scale.transform()

    test_features_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(n_centroids)
    test_x = kmeans_generator.transform(test_images,
                                        save_to_file=test_features_file,
                                        memmap=True)

    classes.Submission(wrapper.predict(test_x)).to_file('sub_kmeans_006.csv')
def kmeans_003():
    """Grid search for Ridge RF parameters on the kmeans_002 features.

    Not sure whether to use spherical or minibatch KMeans, so maybe do one
    run with each.  Scored .106 on the leaderboard, so the difference in CV
    scores narrowed.
    """
    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_150_scale_15.npy',
        crop_size=150,
        scaled_size=15,
        n_jobs=-1,
        memmap=True)

    # spherical generator
    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=1600,
        rf_size=5,
        result_path='data/mdl_kmeans_002_new',
        n_iterations=20,
        n_jobs=-1,
    )
    # minibatch generator
    # kmeans_generator = models.KMeansFeatures.KMeansFeatureGenerator(n_centroids=1600,
    #                                                                 rf_size=5,
    #                                                                 result_path='data/mdl_kmeans_002_new_minibatch',
    #                                                                 method='minibatch',
    #                                                                 n_init=1,
    #                                                                 n_jobs=-1,)

    # No real fitting needed — centroids are already cached on disk, so an
    # empty patch set is passed in just to trigger the load
    patches = ''
    kmeans_generator.fit(patches)

    images = train_x_crop_scale.transform()
    # Problematic here - memory usage spikes to ~ 11GB when threads return
    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_002_new.npy',
        memmap=True)
    train_y = classes.train_solutions.data

    # Unload some objects
    del images
    gc.collect()

    # mdl = models.Ridge.RidgeRFEstimator(alpha=14, n_estimators=250, n_jobs=-1)
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 14, 'n_estimators': 500},
                           n_jobs=-1)

    # 500 trees and alpha 25 gave CV of .10972 on 2-fold CV, but 25 was on the
    # upper edge of that search space, so re-run with a larger range of alpha.
    # Will hit 30GB of ram with 500 trees.
    params = {'alpha': [150, 250, 500, 750, 1000], 'n_estimators': [250]}
    wrapper.grid_search(train_x, train_y, params, refit=False, parallel_estimator=True)
    # [mean: -0.11024, std: 0.00018, params: {'n_estimators': 250, 'alpha': 20.0},
    #  mean: -0.11000, std: 0.00019, params: {'n_estimators': 250, 'alpha': 25.0},
    #  mean: -0.10969, std: 0.00018, params: {'n_estimators': 250, 'alpha': 35},
    #  mean: -0.10934, std: 0.00019, params: {'n_estimators': 250, 'alpha': 50},
    #  mean: -0.10892, std: 0.00025, params: {'n_estimators': 250, 'alpha': 75},
    #  mean: -0.10860, std: 0.00025, params: {'n_estimators': 250, 'alpha': 100},
    #  mean: -0.10828, std: 0.00019, params: {'n_estimators': 250, 'alpha': 150},
    #  mean: -0.10789, std: 0.00016, params: {'n_estimators': 250, 'alpha': 250},
    #  mean: -0.10775, std: 0.00024, params: {'n_estimators': 250, 'alpha': 500},
    #  mean: -0.10779, std: 0.00022, params: {'n_estimators': 250, 'alpha': 750},
    #  mean: -0.10784, std: 0.00023, params: {'n_estimators': 250, 'alpha': 1000}]

    # Fit the final model with the best alpha from the search above
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)

    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_150_scale_15.npy',
        crop_size=150,
        scaled_size=15,
        n_jobs=-1,
        memmap=True)
    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_kmeans_test_features_003_new.npy',
        memmap=True)

    classes.Submission(wrapper.predict(test_x)).to_file('sub_kmeans_003.csv')
def kmeans_002():
    """KMeans feature learning on rescaled images.

    Images are first rescaled down, then patches are extracted, so each patch
    carries more variation.  Rescaling to 15 x 15 then taking out patches of
    5 x 5.  The centroids don't look like anything (splotches of color against
    mostly gray), but the CV score on 10000 samples and 20 trees was .128,
    which is quite promising.  Training the kmeans then using RidgeRFEstimator
    got us to .107 on the leaderboard.

    Broadly speaking, the pipe looks like this:

    Encoder: CropScaleImageTransformer -> PatchExtractorTransformer -> KMeansFeatureGenerator.fit
    Model:   CropSCaleImageTransformer -> KMeansFeatureGenerator.transform -> RidgeRFEstimator
    """
    train_mmap_path = 'data/train_cropped_150_scale_15.memmap'
    test_mmap_path = 'data/test_cropped_150_scale_15.memmap'

    # Make sure the cropped (but unscaled) memmaps exist first
    if not os.path.exists('data/train_cropped_150.memmap'):
        classes.crop_to_memmap(150, training=True)
    if not os.path.exists('data/test_cropped_150.memmap'):
        classes.crop_to_memmap(150, training=False)

    # Build (or reopen) the 15x15 training memmap
    if not os.path.exists(train_mmap_path):
        logger.info("Prepping training images")
        pre_scale = np.memmap('data/train_cropped_150.memmap',
                              mode='r',
                              shape=(N_TRAIN, 150, 150, 3))
        trainX = classes.rescale_memmap(15, pre_scale, train_mmap_path)
        del pre_scale
    else:
        trainX = np.memmap(train_mmap_path, mode='r', shape=(N_TRAIN, 15, 15, 3))

    # Build (or reopen) the 15x15 test memmap
    if not os.path.exists(test_mmap_path):
        logger.info("Prepping testing images")
        pre_scale = np.memmap('data/test_cropped_150.memmap',
                              mode='r',
                              shape=(N_TEST, 150, 150, 3))
        testX = classes.rescale_memmap(15, pre_scale, test_mmap_path)
        del pre_scale
    else:
        testX = np.memmap(test_mmap_path, mode='r', shape=(N_TEST, 15, 15, 3))

    n_jobs = multiprocessing.cpu_count()

    # Train the feature encoder only when no cached centroids exist
    if not os.path.exists('data/mdl_kmeans_002_centroids.npy'):
        logger.info("Pretraining KMeans feature encoder")
        km = models.KMeansFeatures.KMeansFeatures(rf_size=5,
                                                  num_centroids=1600,
                                                  num_patches=400000)
        km.fit(trainX)
        km.save_to_file('mdl_kmeans_002')
    else:
        logger.info("Loading KMeans feature encoder from file")
        km = models.KMeansFeatures.KMeansFeatures.load_from_file('mdl_kmeans_002',
                                                                 rf_size=5)

    # Takes waaaay too long to finish.  At least an hour per tree.  Clearly
    # too many dimensions.  Instead ran with ridge rf manually.
    mdl = models.RandomForest.KMeansRandomForest(km, trainX, testX,
                                                 n_jobs=n_jobs,
                                                 cv_sample=0.5)
    # mdl.run('cv')
    mdl.run('train')
    res = mdl.run('predict')

    np.save('submissions/sub_kmeans_rf_002.npy', res)
    classes.Submission(res).to_file('sub_kmeans_rf_002.csv')
# NOTE(review): this block appears to be the tail of a function whose `def` line is
# not visible in this view — crop, s, n_centroids, kmeans_generator, train_x and
# train_y must be defined earlier in that (unseen) function body.  Verify against
# the full file; left byte-identical apart from formatting and comments.
wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
    'alpha': 500,
    'n_estimators': 500
}, n_jobs=-1)
wrapper.fit(train_x, train_y)
test_x_crop_scale = CropScaleImageTransformer(
    training=False,
    result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
    crop_size=crop,
    scaled_size=s,
    n_jobs=-1,
    memmap=True)
# Crop and scale the test images
test_images = test_x_crop_scale.transform()
# Generate the test features
test_x = kmeans_generator.transform(
    test_images,
    save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(
        n_centroids),
    memmap=True)
# Predict on the test features
res = wrapper.predict(test_x)
# Generate a submission file
sub = classes.Submission(res)
sub.to_file('sub_kmeans_006.csv')