Example #1
0
File: run.py  Project: lixunlove/galaxy-zoo
def ridge_rf_001(outfile='sub_ridge_rf_001.csv'):
    """
    Ridge + random forest pipeline.

    Runs the legacy RidgeRFModel end to end and writes its submission to
    `outfile`, then trial-runs the newer ModelWrapper API (grid search + fit)
    on precomputed predictor files.
    """
    # Legacy model API: cross-validate, train, then predict, in that order.
    legacy_model = models.Ridge.RidgeRFModel(cv_sample=0.5, cv_folds=2)
    for stage in ('cv', 'train', 'predict'):
        legacy_model.run(stage)
    classes.Submission(legacy_model.test_y).to_file(outfile)

    # Testing this with new models
    train_x = np.load('data/data_ridge_rf_train_001.npy')
    train_y = classes.train_solutions.data
    estimator_params = {
        'alpha': 14,
        'n_estimators': 10,
        'verbose': 3,
        'oob_score': True
    }
    wrapper = models.Base.ModelWrapper(models.Ridge.RidgeRFEstimator,
                                       estimator_params,
                                       n_jobs=-1)
    # wrapper.cross_validation(train_x, train_y, sample=0.5, n_folds=3)
    search_space = {'alpha': [1, 2], 'n_estimators': [5]}
    wrapper.grid_search(train_x, train_y, search_space, sample=0.1)

    test_x = np.load('data/data_ridge_rf_test_001.npy')
    wrapper.fit(train_x, train_y)
    # NOTE(review): predictions are computed but never written out here.
    pred = wrapper.predict(test_x)
Example #2
0
File: run.py  Project: lixunlove/galaxy-zoo
def random_forest_001(outfile="sub_random_forest_001.csv", n_jobs=1):
    """
    Random forest over a sample of central pixels in RGB space.

    # 3-fold CV using half the training set reports RMSE of .126 or so
    """
    rf_model = models.RandomForest.RandomForestModel(n_jobs=n_jobs)
    rf_model.run('train')
    # Predict on the test set and write the submission file.
    classes.Submission(rf_model.run('predict')).to_file(outfile)
Example #3
0
File: run.py  Project: lixunlove/galaxy-zoo
def central_pixel_benchmark(outfile="sub_central_pixel_001.csv"):
    """
    Reproduce the central pixel benchmark: cluster training galaxies by the
    color at the center of the image, then assign each like-colored test
    image the associated average probability values.
    """
    benchmark_averages = models.Benchmarks.CentralPixelBenchmark().execute()
    submission = classes.Submission(benchmark_averages)
    # Write to file
    submission.to_file(outfile)
Example #4
0
def kmeans_006_submission():
    """Final kmeans_006 run: kmeans features -> Ridge/RF -> sub_kmeans_006.csv."""
    # 2014-03-28 10:53:22 - Base - INFO - Cross validation completed in 1487.18687487.  Scores:
    # 2014-03-28 10:53:22 - Base - INFO - [-0.11018943 -0.10946863]

    # Final submission
    crop, s, n_centroids = 150, 15, 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    feature_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(
        n_centroids)
    train_x = kmeans_generator.transform(images,
                                         save_to_file=feature_file,
                                         memmap=True)
    train_y = classes.train_solutions.data

    # Free the raw images before fitting the model
    del images
    gc.collect()

    ridge_rf = ModelWrapper(models.Ridge.RidgeRFEstimator,
                            {'alpha': 500, 'n_estimators': 500},
                            n_jobs=-1)
    # ridge_rf.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    ridge_rf.fit(train_x, train_y)

    # Crop/scale the test images with the same geometry as training.
    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_feature_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(
        n_centroids)
    test_x = kmeans_generator.transform(test_images,
                                        save_to_file=test_feature_file,
                                        memmap=True)
    classes.Submission(ridge_rf.predict(test_x)).to_file('sub_kmeans_006.csv')
Example #5
0
def extra_trees_submission():
    """Same pipeline as kmeans_006 but using the Ridge/ExtraTrees estimator."""
    # Somehow the submission on the leaderboard scores 0.22
    crop, s, n_centroids = 150, 15, 3000

    images = get_images(crop=crop, s=s)
    kmeans_generator = train_kmeans_generator(images, n_centroids=n_centroids)

    feature_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(
        n_centroids)
    train_x = kmeans_generator.transform(images,
                                         save_to_file=feature_file,
                                         memmap=True)
    train_y = classes.train_solutions.data

    # Free the raw images before fitting the model
    del images
    gc.collect()

    extra_trees = ModelWrapper(models.Ridge.RidgeExtraTreesEstimator,
                               {'alpha': 500, 'n_estimators': 500},
                               n_jobs=-1)
    # extra_trees.cross_validation(train_x, train_y, sample=0.5, parallel_estimator=True)
    extra_trees.fit(train_x, train_y)

    # Crop/scale the test images with the same geometry as training.
    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_feature_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(
        n_centroids)
    test_x = kmeans_generator.transform(test_images,
                                        save_to_file=test_feature_file,
                                        memmap=True)
    classes.Submission(extra_trees.predict(test_x)).to_file('sub_kmeans_008.csv')
Example #6
0
File: run.py  Project: lixunlove/galaxy-zoo
def train_set_average_benchmark(outfile="sub_average_benchmark_000.csv"):
    """
    What should be the actual baseline.  Takes the training set solutions, averages them, and uses that as the
    submission for every row in the test set

    Parameters
    ----------
    outfile : str
        Path of the submission CSV to write.
    """
    start_time = time.time()
    training_data = classes.TrainSolutions().data

    # Column-wise mean over all training rows -> one averaged solution vector.
    solutions = np.mean(training_data, axis=0)

    # Calculate an RMSE of the constant prediction against the training set
    train_solution = np.tile(solutions, (N_TRAIN, 1))
    rmse = classes.rmse(train_solution, training_data)
    # Fix: the RMSE was previously computed but silently discarded; report it.
    logger.info("Training RMSE of averaged solution: {}".format(rmse))

    # Repeat the averaged vector for every test row and write the submission.
    solution = classes.Submission(np.tile(solutions, (N_TEST, 1)))
    solution.to_file(outfile)

    end_time = time.time()
    logger.info("Model completed in {}".format(end_time - start_time))
Example #7
0
File: run.py  Project: lixunlove/galaxy-zoo
def kmeans_006_submission():
    """kmeans_006 end to end: learn centroids from patches, build kmeans
    features, fit Ridge/RF, and write the final submission."""
    # Final submission
    n_centroids = 3000
    s = 15
    crop = 150
    n_patches = 400000
    rf_size = 5
    logger.info("Training with n_centroids {}".format(n_centroids))

    # Crop training images and scale them down; cached as a memmap on disk.
    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=n_centroids,
        rf_size=rf_size,
        result_path='data/mdl_kmeans_006_centroids_{}'.format(n_centroids),
        n_iterations=20,
        n_jobs=-1,
    )

    # Sample random patches from the cropped/scaled images, then fit the
    # kmeans encoder on those patches.
    sampler = models.KMeansFeatures.PatchSampler(n_patches=n_patches,
                                                 patch_size=rf_size,
                                                 n_jobs=-1)
    images = train_x_crop_scale.transform()
    patches = sampler.transform(images)
    kmeans_generator.fit(patches)

    del patches
    gc.collect()

    feature_file = 'data/data_kmeans_features_006_centroids_{}.npy'.format(
        n_centroids)
    train_x = kmeans_generator.transform(images,
                                         save_to_file=feature_file,
                                         memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()

    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator,
                           {'alpha': 500, 'n_estimators': 500},
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)

    # Same crop/scale geometry for the test images.
    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
        crop_size=crop,
        scaled_size=s,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_feature_file = 'data/data_test_kmeans_features_006_centroids_{}.npy'.format(
        n_centroids)
    test_x = kmeans_generator.transform(test_images,
                                        save_to_file=test_feature_file,
                                        memmap=True)
    classes.Submission(wrapper.predict(test_x)).to_file('sub_kmeans_006.csv')
Example #8
0
File: run.py  Project: lixunlove/galaxy-zoo
def kmeans_003():
    """
    Grid search for Ridge RF parameters
    Not sure whether to use spherical or minibatch, so maybe do one run with both

    .106 on the leaderboard.  So the difference in CV scores narrowed

    Pipeline: cached crop/scale of training images -> kmeans feature encoding
    (loaded from a cached fit) -> grid search over Ridge/RF alpha -> refit at
    the best alpha -> predict on test images -> write sub_kmeans_003.csv.
    """

    # Crop training images to 150x150 and rescale to 15x15; cached on disk as
    # a memmap so reruns skip the expensive image work.
    train_x_crop_scale = CropScaleImageTransformer(
        training=True,
        result_path='data/data_train_crop_150_scale_15.npy',
        crop_size=150,
        scaled_size=15,
        n_jobs=-1,
        memmap=True)

    # spherical generator
    kmeans_generator = KMeansFeatureGenerator(
        n_centroids=1600,
        rf_size=5,
        result_path='data/mdl_kmeans_002_new',
        n_iterations=20,
        n_jobs=-1,
    )

    # minibatch generator
    # kmeans_generator = models.KMeansFeatures.KMeansFeatureGenerator(n_centroids=1600,
    #                                                                 rf_size=5,
    #                                                                 result_path='data/mdl_kmeans_002_new_minibatch',
    #                                                                 method='minibatch',
    #                                                                 n_init=1,
    #                                                                 n_jobs=-1,)

    # Don't need to fit, as already cached
    # NOTE(review): the empty string is a placeholder; fit() presumably loads
    # cached centroids from result_path -- confirm before running from a cold
    # cache, since '' is not real patch data.
    patches = ''
    kmeans_generator.fit(patches)
    images = train_x_crop_scale.transform()

    # Problematic here - memory usage spikes to ~ 11GB when threads return
    # train_x = kmeans_generator.transform(images, save_to_file='data/data_kmeans_features_002_new.npy', memmap=True)
    train_x = kmeans_generator.transform(
        images,
        save_to_file='data/data_kmeans_features_002_new.npy',
        memmap=True)
    train_y = classes.train_solutions.data
    # Unload some objects
    del images
    gc.collect()
    # mdl = models.Ridge.RidgeRFEstimator(alpha=14, n_estimators=250, n_jobs=-1)
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 14,
        'n_estimators': 500
    },
                           n_jobs=-1)
    params = {'alpha': [150, 250, 500, 750, 1000], 'n_estimators': [250]}

    # 500 trees and alpha 25 gives cv of .10972 on 2-fold CV, but 25 was on the upper range of the search space,
    # So need to re-run with larger range of alpha
    # Will hit 30GB of ram with 500 trees.
    # refit=False: only scores are wanted here; the final model is refit below.
    wrapper.grid_search(train_x,
                        train_y,
                        params,
                        refit=False,
                        parallel_estimator=True)

    # Grid-search results (scores are negative RMSE; higher is better):
    # [mean: -0.11024, std: 0.00018, params: {'n_estimators': 250, 'alpha': 20.0},
    # mean: -0.11000, std: 0.00019, params: {'n_estimators': 250, 'alpha': 25.0},
    # mean: -0.10969, std: 0.00018, params: {'n_estimators': 250, 'alpha': 35},
    # mean: -0.10934, std: 0.00019, params: {'n_estimators': 250, 'alpha': 50},
    # mean: -0.10892, std: 0.00025, params: {'n_estimators': 250, 'alpha': 75},
    # mean: -0.10860, std: 0.00025, params: {'n_estimators': 250, 'alpha': 100},
    # mean: -0.10828, std: 0.00019, params: {'n_estimators': 250, 'alpha': 150},
    # mean: -0.10789, std: 0.00016, params: {'n_estimators': 250, 'alpha': 250},
    # mean: -0.10775, std: 0.00024, params: {'n_estimators': 250, 'alpha': 500},
    # mean: -0.10779, std: 0.00022, params: {'n_estimators': 250, 'alpha': 750},
    # mean: -0.10784, std: 0.00023, params: {'n_estimators': 250, 'alpha': 1000}]

    # Fit the final model
    # alpha=500 was the best score in the table above; use more trees for the
    # final fit.
    wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
        'alpha': 500,
        'n_estimators': 500
    },
                           n_jobs=-1)
    wrapper.fit(train_x, train_y)
    test_x_crop_scale = CropScaleImageTransformer(
        training=False,
        result_path='data/data_test_crop_150_scale_15.npy',
        crop_size=150,
        scaled_size=15,
        n_jobs=-1,
        memmap=True)

    test_images = test_x_crop_scale.transform()
    test_x = kmeans_generator.transform(
        test_images,
        save_to_file='data/data_kmeans_test_features_003_new.npy',
        memmap=True)
    res = wrapper.predict(test_x)
    sub = classes.Submission(res)
    sub.to_file('sub_kmeans_003.csv')
Example #9
0
File: run.py  Project: lixunlove/galaxy-zoo
def kmeans_002():
    """
    Kmeans feature learning, first rescaling images down, then extracting patches, so we get more variation in each patch
    Rescaling to 15 x 15 then taking out patches of 5 x 5

    The centroids don't look like anything (splotches of color against mostly gray), but the CV score on 10000 samples and 20 trees
    was .128, which is quite promising.

    Training the kmeans then using RidgeRFEstimator got us to .107 on the leaderboard

    Broadly speaking, the pipe looks like this:

    Encoder:
    CropScaleImageTransformer -> PatchExtractorTransformer -> KMeansFeatureGenerator.fit

    Model:
    CropScaleImageTransformer -> KMeansFeatureGenerator.transform -> RidgeRFEstimator
    """
    train_mmap_path = 'data/train_cropped_150_scale_15.memmap'
    test_mmap_path = 'data/test_cropped_150_scale_15.memmap'

    # Build the cropped 150x150 memmaps once; later runs reuse the files.
    if not os.path.exists('data/train_cropped_150.memmap'):
        classes.crop_to_memmap(150, training=True)
    if not os.path.exists('data/test_cropped_150.memmap'):
        classes.crop_to_memmap(150, training=False)

    # Rescale the cropped training images to 15x15, or load the cached result.
    if not os.path.exists(train_mmap_path):
        logger.info("Prepping training images")
        pre_scale = np.memmap('data/train_cropped_150.memmap',
                              mode='r',
                              shape=(N_TRAIN, 150, 150, 3))
        trainX = classes.rescale_memmap(15, pre_scale, train_mmap_path)
        # Drop the reference to the full-resolution memmap promptly.
        del pre_scale
    else:
        trainX = np.memmap(train_mmap_path,
                           mode='r',
                           shape=(N_TRAIN, 15, 15, 3))

    # Same rescale-or-load logic for the test images.
    if not os.path.exists(test_mmap_path):
        logger.info("Prepping testing images")
        pre_scale = np.memmap('data/test_cropped_150.memmap',
                              mode='r',
                              shape=(N_TEST, 150, 150, 3))
        testX = classes.rescale_memmap(15, pre_scale, test_mmap_path)
        del pre_scale
    else:
        testX = np.memmap(test_mmap_path, mode='r', shape=(N_TEST, 15, 15, 3))

    n_jobs = multiprocessing.cpu_count()

    # Train the kmeans feature encoder, or load it from a previous run.
    if not os.path.exists('data/mdl_kmeans_002_centroids.npy'):
        logger.info("Pretraining KMeans feature encoder")
        km = models.KMeansFeatures.KMeansFeatures(rf_size=5,
                                                  num_centroids=1600,
                                                  num_patches=400000)
        km.fit(trainX)
        km.save_to_file('mdl_kmeans_002')
    else:
        logger.info("Loading KMeans feature encoder from file")
        km = models.KMeansFeatures.KMeansFeatures.load_from_file(
            'mdl_kmeans_002', rf_size=5)

    # Takes waaaay too long to finish.  At least an hour per tree.  Clearly too
    # many dimensions

    # Instead ran with ridge rf manually
    mdl = models.RandomForest.KMeansRandomForest(km,
                                                 trainX,
                                                 testX,
                                                 n_jobs=n_jobs,
                                                 cv_sample=0.5)
    # mdl.run('cv')
    mdl.run('train')
    res = mdl.run('predict')
    # Save raw predictions alongside the formatted submission CSV.
    np.save('submissions/sub_kmeans_rf_002.npy', res)
    output = classes.Submission(res)
    output.to_file('sub_kmeans_rf_002.csv')
Example #10
0
# NOTE(review): free-standing fragment (the tail of a kmeans_006-style run).
# It relies on `crop`, `s`, `n_centroids`, `train_x`, `train_y` and
# `kmeans_generator` being defined earlier -- not visible here.
# Fit a Ridge/RF model on the kmeans features, then predict on the test set.
wrapper = ModelWrapper(models.Ridge.RidgeRFEstimator, {
    'alpha': 500,
    'n_estimators': 500
},
                       n_jobs=-1)
wrapper.fit(train_x, train_y)

# Crop/scale transformer for the test images; result is cached as a memmap.
test_x_crop_scale = CropScaleImageTransformer(
    training=False,
    result_path='data/data_test_crop_{}_scale_{}.npy'.format(crop, s),
    crop_size=crop,
    scaled_size=s,
    n_jobs=-1,
    memmap=True)

# Crop and scale the test images
test_images = test_x_crop_scale.transform()

# Generate the test features
test_x = kmeans_generator.transform(
    test_images,
    save_to_file='data/data_test_kmeans_features_006_centroids_{}.npy'.format(
        n_centroids),
    memmap=True)

# Predict on the test features
res = wrapper.predict(test_x)

# Generate a submission file
sub = classes.Submission(res)
sub.to_file('sub_kmeans_006.csv')