def cross_validationcv(model, verbose=0):
    """Print the 5-fold cross-validation scores of a model and their mean.

    Args:
        model: Callable invoked as ``model(features, solutions, verbose)``.
            The first element of its return value is used as the
            estimator handed to ``cross_val_score``.
        verbose: Forwarded unchanged to ``model``.
    """
    galaxy_data = GalaxyData(feature_extraction.raw_1, scale_features=False)
    X, y = galaxy_data.get_training_data()

    # Only the estimator is needed; the second return value is ignored.
    estimator, _ = model(X, y, verbose)

    fold_scores = cross_validation.cross_val_score(
        estimator, X, y, cv=5, scoring=rmse_scorer, n_jobs=-1)
    print(fold_scores)

    mean_score = sum(fold_scores) / len(fold_scores)
    print("Cross validation error: ", mean_score)
def competition_run():
    """Build the default model and save its predictions for the test data.

    Loads training and test features through ``GalaxyData``, obtains an
    estimator from ``models.default_model``, predicts on the test features
    with ``models.predict`` and passes the result to ``save_solution``.
    """
    galaxy_data = GalaxyData()
    train_X, train_y = galaxy_data.get_training_data()
    # The test-side solutions are not used for a competition run.
    test_X, _ = galaxy_data.get_test_data()

    # NOTE(review): the literal 5 is presumably a verbosity level, since the
    # other entry points pass ``verbose`` in this position -- confirm.
    estimator, feature_columns = models.default_model(train_X, train_y, 5)
    predictions = models.predict(estimator, test_X, feature_columns)
    galaxy_data.save_solution(predictions)
def grid_search_cv(model, verbose=0):
    """Grid-search ``min_samples_split`` for a model using 5-fold CV.

    The estimator is obtained from ``model``, wrapped in ``GridSearchCV``
    scored with ``rmse_scorer``, fitted on the training data, and the
    resulting ``grid_scores_`` are printed.

    Args:
        model: Callable invoked as ``model(features, solutions, verbose)``.
            The first element of its return value is used as the
            estimator to tune.
        verbose: Forwarded unchanged to ``model``.
    """
    galaxy_data = GalaxyData(feature_extraction.hog_features,
                             scale_features=False)
    features, solutions = galaxy_data.get_training_data()

    # Only the estimator is needed; the second return value is ignored.
    clf, _ = model(features, solutions, verbose)

    # scikit-learn spells this parameter ``min_samples_split``. The previous
    # key ``min_sample_split`` is not an estimator parameter, so GridSearchCV
    # rejected it before evaluating any candidate.
    # NOTE(review): assumes ``model`` returns a tree-based scikit-learn
    # estimator that accepts fractional ``min_samples_split`` -- confirm.
    parameters = {
        'min_samples_split': [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    }
    gs = grid_search.GridSearchCV(clf, param_grid=parameters,
                                  scoring=rmse_scorer, n_jobs=-1, cv=5,
                                  verbose=5)
    gs.fit(features, solutions)
    print(gs.grid_scores_)
def run_training_test(model, verbose=0):
    """Run a model on the training data and print its test-set score.

    Args:
        model: Model function to run. It is called as
            ``model(training_features, training_solutions, verbose)`` and
            must return an ``(estimator, columns)`` pair.
        verbose: Forwarded unchanged to ``model``.
    """
    galaxy_data = GalaxyData(feature_extraction.raw_9, scale_features=False)

    # The test data is requested before the training data; keep this order
    # in case the GalaxyData loaders are order-sensitive.
    held_out_X, held_out_y = galaxy_data.get_test_data()
    train_X, train_y = galaxy_data.get_training_data()

    # Build the estimator and predict on the held-out features.
    estimator, feature_columns = model(train_X, train_y, verbose)
    predictions = models.predict(estimator, held_out_X, feature_columns)

    # Report the value computed by evaluate.get_rmse.
    print(evaluate.get_rmse(held_out_y, predictions))
def extract_features(extraction_method, index=None, percent_subset=100, classification=False):
    """Run an extraction method on selected galaxies and return ``(X, y)``.

    Args:
        extraction_method: Extraction method to use. See feature_extraction.
        index: Index of the galaxies for which to process data. If None,
            all galaxies are processed.
        percent_subset: Size, in percent, of the subset of the data to
            return. With 100 the full training data is used.
        classification: Forwarded to ``get_reduced_solutions``.

    Returns:
        A tuple ``(X, y)`` where X holds the features and y the labels
        after being passed through ``get_reduced_solutions``.
    """
    galaxy_data = GalaxyData(extraction_method, scale_features=False)
    if index is not None:
        galaxy_data.set_restricted_universe(index)

    if percent_subset != 100:
        # The split is asked for the complement of the requested percentage;
        # presumably that argument is the validation share -- TODO confirm.
        complement = 100 - percent_subset
        X, y, _, _ = galaxy_data.split_training_and_validation_data(
            complement, competition=True)
    else:
        X, y = galaxy_data.get_training_data(competition=True)

    reduced = get_reduced_solutions(y, classification=classification)
    return (X, reduced)
import numpy as np
from evaluate import cross_validate
from galaxy_data import GalaxyData
from sklearn import (ensemble, cross_validation)

# Load the data set without feature scaling.
data = GalaxyData(scale_features=False)
X_train, y_train = data.get_training_data()
# The test split is loaded but not used further in this script.
X_test, y_test = data.get_test_data()

# A random forest with a single tree, using all cores and chatty logging.
clf = ensemble.RandomForestRegressor(
    n_estimators=1,
    n_jobs=-1,
    verbose=5,
)

# NOTE(review): the trailing 2 is presumably the number of folds -- confirm
# against evaluate.cross_validate.
scores = cross_validate(clf, X_train, y_train, 2)

# Average the scores; ``scores`` must expose ``.shape``.
mean_score = sum(scores) / float(scores.shape[0])

print(scores)
print(mean_score)
import random

import numpy as np
import pandas as pd
import SimpleCV as cv
from sklearn import svm
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

import evaluate
import feature_extraction
from galaxy_data import GalaxyData

# Read the training solutions, indexed by galaxy id.
solutions_raw = pd.read_csv(
    "./input_data/training_solutions_rev1.csv",
    index_col="GalaxyID",
)

# Keep only the two Class1 columns.
upper_threshold = 1
solutions = solutions_raw[["Class1.1", "Class1.2"]]

# Masking with a boolean frame turns every value below the threshold into
# NaN; rows where both columns ended up NaN are then dropped.
solutions = solutions[solutions >= upper_threshold].dropna(how='all')

# Disabled alternative that collapsed the two columns into one:
#solutions.apply(lambda x: x[0] if not isnan(x[0]) else x[1], axis=1).to_frame()

# Replace the remaining NaN cells with 0, leaving other values untouched.
solutions = solutions.applymap(
    lambda value: 0 if np.isnan(value) else value
)

# Restrict feature extraction to the galaxies that survived the filter.
data = GalaxyData(feature_extraction.raw)
data.set_restricted_universe(solutions.index)
feature_vectors, _ = data.get_training_data()