# Imports used by the snippets below
import numpy as np
import pandas as pd
from sklearn import ensemble, linear_model, svm
from sklearn.ensemble import RandomForestRegressor

import ExploreRegressor


# SVR-based hyper-parameter search: grid-search an SVR, then compare the control
# estimator against the best estimator found.
def improveHyperParameters(train, targetId, tuned_parameter, controlEst):
    expReg = ExploreRegressor.ExploreRegressor(train, targetId, svm.SVR(), 0.25)
    best = expReg.gridSearch(tuned_parameters=tuned_parameter, cv=10, verbose=1)

    print '%s control est' % (targetId)
    control = ExploreRegressor.ExploreRegressor(train, targetId, controlEst, 0.25)
    control.reportCrossValidationError(cv=20)

    print '%s optimal est. Parameters: %s' % (targetId, best.best_params_)
    optimal = ExploreRegressor.ExploreRegressor(train, targetId, best.best_estimator_, 0.25)
    optimal.reportCrossValidationError(cv=10)
# ExtraTrees-based variant of the same helper: shuffles the rows first, then
# grid-searches an ExtraTreesRegressor and compares control vs. best estimator.
def improveHyperParameters(train, targetId, tuned_parameter, controlEst):
    # Shuffle rows so cross-validation folds do not overfit on sample groups
    train = train.reindex(np.random.permutation(train.index))

    expReg = ExploreRegressor.ExploreRegressor(
        train, targetId, ensemble.ExtraTreesRegressor(n_jobs=10), 0.25)
    best = expReg.gridSearch(tuned_parameters=tuned_parameter, cv=5, verbose=2)

    print '%s control est' % (targetId)
    control = ExploreRegressor.ExploreRegressor(train, targetId, controlEst, 0.25)
    control.reportCrossValidationError(cv=10)

    print '%s optimal est. Parameters: %s' % (targetId, best.best_params_)
    optimal = ExploreRegressor.ExploreRegressor(train, targetId, best.best_estimator_, 0.25)
    optimal.reportCrossValidationError(cv=10)
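# Usage sketch (assumption, not part of the original script): how the helper above
# might be called for one AfSIS target. The ExtraTrees grid values below are
# illustrative guesses based on settings that appear elsewhere in this file; the
# CSV path is the one used elsewhere in this project.
train = pd.read_csv(
    '/Users/carrillo/workspace/Kaggle/resources/AfSIS/trainingTransformed.csv', header=0)
treeGrid = [{'n_estimators': [400, 800], 'max_depth': [8, 16], 'min_samples_leaf': [1, 3]}]
improveHyperParameters(train, 'Ca', treeGrid, svm.SVR(C=10000.0))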
# Report the 20-fold cross-validation error of a single regressor/target pair.
def testRegressor(train, regressor, target, id):
    expReg = ExploreRegressor.ExploreRegressor(train, target, regressor, 0.25)
    print '%s model' % (id)
    expReg.reportCrossValidationError(cv=20)
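# Usage sketch (assumption): compare a Ridge baseline against one of the tuned SVRs
# from reportError below on a single target, reusing the `train` frame loaded in the
# sketch above. The id strings are placeholder labels.
testRegressor(train, linear_model.Ridge(alpha=0.05), 'P', 'Ridge baseline')
testRegressor(train, svm.SVR(kernel='rbf', C=11000, gamma=0.0025, degree=1), 'P', 'tuned SVR')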
def reportError(trainFile, testFile, dataId):
    train = pd.read_csv(trainFile, header=0)
    test = pd.read_csv(testFile, header=0)

    # Replace nominal parameters by 0s and 1s
    train.replace(to_replace='Topsoil', value=0, inplace=True)
    train.replace(to_replace='Subsoil', value=1, inplace=True)
    test.replace(to_replace='Topsoil', value=0, inplace=True)
    test.replace(to_replace='Subsoil', value=1, inplace=True)

    est = svm.SVR(C=10000.0, verbose=0)
    targets = ['Ca', 'P', 'pH', 'SOC', 'Sand']

    # Earlier tuning rounds, kept for reference:
    # ests = [
    #     svm.SVR(kernel='poly', C=10000.0, degree=2, gamma=0.001),   # Control SVR for Ca
    #     svm.SVR(kernel='rbf', C=15000.0, degree=1, gamma=0.001),    # Control SVR for P
    #     svm.SVR(kernel='rbf', C=10000.0, degree=1, gamma=0.001),    # Control SVR for pH
    #     svm.SVR(kernel='rbf', C=5000.0, degree=1, gamma=0),         # Control SVR for SOC
    #     svm.SVR(kernel='rbf', C=15000.0, degree=1, gamma=0),        # Control SVR for Sand
    # ]
    # ests = [
    #     svm.SVR(kernel='poly', C=9000.0, degree=2, gamma=0.0009),   # Control SVR for Ca
    #     svm.SVR(kernel='rbf', C=17500.0, degree=1, gamma=0.00125),  # Control SVR for P
    #     svm.SVR(kernel='rbf', C=7500.0, degree=1, gamma=0.00075),   # Control SVR for pH
    #     svm.SVR(kernel='rbf', C=5000.0, degree=1, gamma=0),         # Control SVR for SOC
    #     svm.SVR(kernel='rbf', C=12500.0, degree=1, gamma=0),        # Control SVR for Sand
    # ]
    # ests = [
    #     svm.SVR(C=10000.0),  # Control SVR for Ca
    #     svm.SVR(C=10000.0),  # Control SVR for P
    #     svm.SVR(C=10000.0),  # Control SVR for pH
    #     svm.SVR(C=10000.0),  # Control SVR for SOC
    #     svm.SVR(C=10000.0),  # Control SVR for Sand
    # ]
    ests = [
        svm.SVR(kernel='poly', C=17000, gamma=0.0075, degree=1),  # Control SVR for Ca
        svm.SVR(kernel='rbf', C=11000, gamma=0.0025, degree=1),   # Control SVR for P
        svm.SVR(kernel='rbf', C=5750, gamma=0, degree=1),         # Control SVR for pH
        svm.SVR(kernel='rbf', C=8250, gamma=0, degree=1),         # Control SVR for SOC
        svm.SVR(kernel='rbf', C=20500, gamma=0, degree=1),        # Control SVR for Sand
    ]

    errors = []
    for i in range(len(targets)):
        target = targets[i]
        # Skip targets that are not currently being evaluated
        if target == 'Ca':
            continue
        if target == 'P':
            continue
        if target == 'pH':
            continue
        # if target == 'SOC':
        #     continue
        # if target == 'Sand':
        #     continue

        svmReg = ExploreRegressor.ExploreRegressor(train, target, ests[i], 0.25)
        scores = svmReg.reportCrossValidationError(cv=20)
        errors.append(np.sqrt(-scores.mean()))

    print '%s, mean error %0.4f of errors %s' % (dataId, np.mean(errors), errors)
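# Usage sketch (assumption): reportError applied to the transformed AfSIS files.
# The training path appears elsewhere in this project; the test path and the data
# id label are hypothetical placeholders.
reportError('/Users/carrillo/workspace/Kaggle/resources/AfSIS/trainingTransformed.csv',
            '/Users/carrillo/workspace/Kaggle/resources/AfSIS/testTransformed.csv',  # hypothetical test file
            'transformed data, tuned SVRs')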
    }],  # Parameter tuning for Sand
]

# Control Ridge estimators from 20-fold CV
controlEsts = [
    linear_model.Ridge(alpha=0.06),    # Control Ridge for Ca
    linear_model.Ridge(alpha=0.05),    # Control Ridge for P
    linear_model.Ridge(alpha=0.0015),  # Control Ridge for pH
    linear_model.Ridge(alpha=0.05),    # Control Ridge for SOC
    linear_model.Ridge(alpha=0.015),   # Control Ridge for Sand
]

targetId = 'SOC'

print '%s control est' % (targetId)
control = ExploreRegressor.ExploreRegressor(train, targetId, linear_model.Ridge(alpha=0.1), 0.25)
control.reportCrossValidationError(cv=100)

print '%s optimal est. Parameters: %s' % (targetId, 'optimal')
optimal = ExploreRegressor.ExploreRegressor(train, targetId, linear_model.Ridge(alpha=0.05), 0.25)
optimal.reportCrossValidationError(cv=100)

for i in range(len(targets)):
    target = targets[i]
    if target == 'Ca':
        continue
# Load data from CSV file
df = pd.read_csv(
    '/Users/carrillo/workspace/Kaggle/resources/AfSIS/trainingTransformed.csv', header=0)

# Specify the regressor for the current target (here P)
targetId = 'P'
est = RandomForestRegressor(n_estimators=800, max_depth=16, min_samples_leaf=1,
                            n_jobs=10, oob_score=True, verbose=0)

# Baseline errors with all features
regAll = ExploreRegressor.ExploreRegressor(df, targetId, est, 0.25)
trainMSE, testMSE = regAll.getErrors(iterations=20)
print 'All features'
print 'meanTrainingMSE\tsdTrainingMSE\tmeanTestMSE\tsdTestMSE'
print '%f\t%f\t%f\t%f' % (np.mean(trainMSE), np.std(trainMSE), np.mean(testMSE), np.std(testMSE))

# Grid-search the regressor and report errors for the best estimator found
clv = regAll.gridSearch(verbose=0)
regNew = ExploreRegressor.ExploreRegressor(df, targetId, clv.best_estimator_, 0.25)
regNew.learn()
trainMSE, testMSE = regNew.getErrors(iterations=20)
print '%f\t%f\t%f\t%f\t%s' % (np.mean(trainMSE), np.std(trainMSE), np.mean(testMSE), np.std(testMSE), clv.best_params_)
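# --- Assumed interface of the custom ExploreRegressor module -------------------
# ExploreRegressor is not included in this excerpt. The class below is a minimal
# sketch of the interface the scripts above rely on (constructor taking a
# DataFrame, a target column, an estimator and a hold-out fraction, plus
# reportCrossValidationError, gridSearch, getErrors and learn). Method bodies are
# guesses built on the older scikit-learn APIs this project appears to use
# (sklearn.cross_validation / sklearn.grid_search, scoring='mean_squared_error'
# returning negative values), not the original implementation.
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error


class ExploreRegressor(object):
    def __init__(self, df, targetId, estimator, testFraction):
        # The real implementation presumably also drops id and other target columns
        self.X = df.drop(targetId, axis=1).values
        self.y = df[targetId].values
        self.estimator = estimator
        self.testFraction = testFraction

    def reportCrossValidationError(self, cv=10):
        # Negative MSE scores, matching the np.sqrt(-scores.mean()) usage above
        scores = cross_val_score(self.estimator, self.X, self.y,
                                 scoring='mean_squared_error', cv=cv)
        print 'RMSE %0.4f (+/- %0.4f)' % (np.sqrt(-scores.mean()), scores.std())
        return scores

    def gridSearch(self, tuned_parameters=None, cv=5, verbose=0):
        # The real class likely supplies a default grid; an empty grid just fits
        # the estimator with its current parameters
        if tuned_parameters is None:
            tuned_parameters = {}
        search = GridSearchCV(self.estimator, tuned_parameters,
                              scoring='mean_squared_error', cv=cv, verbose=verbose)
        search.fit(self.X, self.y)
        return search

    def learn(self):
        # Fit the estimator on the full data set
        self.estimator.fit(self.X, self.y)

    def getErrors(self, iterations=10):
        # Repeated random train/test splits; returns per-iteration MSE lists
        trainMSE, testMSE = [], []
        for _ in range(iterations):
            XTrain, XTest, yTrain, yTest = train_test_split(
                self.X, self.y, test_size=self.testFraction)
            self.estimator.fit(XTrain, yTrain)
            trainMSE.append(mean_squared_error(yTrain, self.estimator.predict(XTrain)))
            testMSE.append(mean_squared_error(yTest, self.estimator.predict(XTest)))
        return trainMSE, testMSE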