예제 #1
0
def improveHyperParameters(train, targetId, tuned_parameter, controlEst):
    """Grid-search an SVR for one target and report it next to a control.

    An ``svm.SVR`` is tuned with ``ExploreRegressor.gridSearch`` over
    ``tuned_parameter``. Afterwards ``reportCrossValidationError`` is called
    first for ``controlEst`` and then for the best estimator found by the
    search, each preceded by a printed heading.

    Args:
        train: Training data handed unchanged to ``ExploreRegressor``.
        targetId: Name of the target; also used in the printed headings.
        tuned_parameter: Parameter grid forwarded as ``tuned_parameters``.
        controlEst: Baseline estimator to compare the tuned SVR against.
    """
    # The fourth ExploreRegressor argument is 0.25 everywhere in this file;
    # presumably a hold-out fraction -- TODO confirm against ExploreRegressor.
    searcher = ExploreRegressor.ExploreRegressor(train, targetId, svm.SVR(),
                                                 0.25)
    best = searcher.gridSearch(tuned_parameters=tuned_parameter,
                               cv=10,
                               verbose=1)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('%s control est' % (targetId))
    control = ExploreRegressor.ExploreRegressor(train, targetId, controlEst,
                                                0.25)
    # NOTE(review): the control is scored with cv=20 while the tuned
    # estimator below uses cv=10, so the two reports use different folds.
    control.reportCrossValidationError(cv=20)

    print('%s optimal est. Parmeters: %s' % (targetId, best.best_params_))
    optimal = ExploreRegressor.ExploreRegressor(train, targetId,
                                                best.best_estimator_, 0.25)
    optimal.reportCrossValidationError(cv=10)
예제 #2
0
def improveHyperParameters(train, targetId, tuned_parameter, controlEst):
    """Grid-search an extra-trees regressor and report it next to a control.

    The rows of ``train`` are put into a random order, an
    ``ensemble.ExtraTreesRegressor(n_jobs=10)`` is tuned with
    ``ExploreRegressor.gridSearch`` over ``tuned_parameter``, and
    ``reportCrossValidationError`` is then called with 10 folds for
    ``controlEst`` and for the best estimator found by the search.

    Args:
        train: pandas object holding the training data; it is reindexed
            with a random permutation of its own index.
        targetId: Name of the target; also used in the printed headings.
        tuned_parameter: Parameter grid forwarded as ``tuned_parameters``.
        controlEst: Baseline estimator to compare the tuned model against.
    """
    # Shuffle rows to not over fit on groups
    train = train.reindex(np.random.permutation(train.index))

    searcher = ExploreRegressor.ExploreRegressor(
        train, targetId, ensemble.ExtraTreesRegressor(n_jobs=10), 0.25)
    best = searcher.gridSearch(tuned_parameters=tuned_parameter,
                               cv=5,
                               verbose=2)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('%s control est' % (targetId))
    control = ExploreRegressor.ExploreRegressor(train, targetId, controlEst,
                                                0.25)
    control.reportCrossValidationError(cv=10)

    print('%s optimal est. Parmeters: %s' % (targetId, best.best_params_))
    optimal = ExploreRegressor.ExploreRegressor(train, targetId,
                                                best.best_estimator_, 0.25)
    optimal.reportCrossValidationError(cv=10)
def testRegressor(train, regressor, target, id):
    """Print a heading and run 20-fold cross-validation for one regressor.

    Args:
        train: Training data handed unchanged to ``ExploreRegressor``.
        regressor: Estimator to evaluate.
        target: Name of the target to predict.
        id: Label printed in the heading (the name shadows the builtin but
            is kept because callers may pass it by keyword).
    """
    explorer = ExploreRegressor.ExploreRegressor(train, target, regressor,
                                                 0.25)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('%s model' % (id))
    explorer.reportCrossValidationError(cv=20)
예제 #4
0
def reportError(trainFile, testFile, dataId, skipTargets=('Ca', 'P', 'pH')):
    """Cross-validate one SVR per soil target and print the errors.

    Both CSV files are loaded and the values 'Topsoil' / 'Subsoil' are
    replaced by 0 / 1. Every target that is not listed in ``skipTargets`` is
    scored with ``ExploreRegressor.reportCrossValidationError(cv=20)``; its
    error is ``sqrt(-mean(scores))``. Finally the mean of these errors and
    the individual values are printed, prefixed with ``dataId``.

    Args:
        trainFile: Path of the training CSV (header in the first row).
        testFile: Path of the test CSV (header in the first row). It is
            loaded and recoded but not used for scoring.
        dataId: Label printed in front of the error summary.
        skipTargets: Collection of target names to leave out. The default
            reproduces the previously hard-coded behaviour of scoring only
            'SOC' and 'Sand'.
    """
    train = pd.read_csv(trainFile, header=0)
    # Kept although unused below, so a missing or unreadable test file still
    # fails here exactly as before.
    test = pd.read_csv(testFile, header=0)

    # Replace nominal parameters by 0 and 1s
    for frame in (train, test):
        for label, code in (('Topsoil', 0), ('Subsoil', 1)):
            frame.replace(to_replace=label, value=code, inplace=True)

    # One SVR configuration per target.
    # NOTE(review): gamma=0 is passed through unchanged -- confirm that the
    # installed scikit-learn version accepts it.
    estimators = [
        ('Ca', svm.SVR(kernel='poly', C=17000, gamma=0.0075, degree=1)),
        ('P', svm.SVR(kernel='rbf', C=11000, gamma=0.0025, degree=1)),
        ('pH', svm.SVR(kernel='rbf', C=5750, gamma=0, degree=1)),
        ('SOC', svm.SVR(kernel='rbf', C=8250, gamma=0, degree=1)),
        ('Sand', svm.SVR(kernel='rbf', C=20500, gamma=0, degree=1)),
    ]

    errors = []
    for target, estimator in estimators:
        if target in skipTargets:
            continue

        explorer = ExploreRegressor.ExploreRegressor(train, target, estimator,
                                                     0.25)
        scores = explorer.reportCrossValidationError(cv=20)
        # The mean score is negated before taking the root, so the scores
        # are expected to be non-positive on average.
        errors.append(np.sqrt(-scores.mean()))

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('%s, mean error %0.4f of errors %s' % (dataId, np.mean(errors),
                                                  errors))
    }],  # Parameter tuning for Sand
]

# Regular estimators from 20 fold cv: one Ridge baseline per target
controlEsts = [
    linear_model.Ridge(alpha=0.06),  # Control Ridge for Ca
    linear_model.Ridge(alpha=0.05),  # Control Ridge for P
    linear_model.Ridge(alpha=0.0015),  # Control Ridge for pH
    linear_model.Ridge(alpha=0.05),  # Control Ridge for SOC
    linear_model.Ridge(alpha=0.015),  # Control Ridge for Sand
]

# Compare two Ridge settings for SOC with 100-fold cross-validation:
# alpha=0.1 as the control and alpha=0.05 as the "optimal" candidate.
# `train` must already be defined by the surrounding script.
targetId = 'SOC'
# Single-argument print(...) emits the same text on Python 2 and 3.
print('%s control est' % (targetId))
control = ExploreRegressor.ExploreRegressor(train, targetId,
                                            linear_model.Ridge(alpha=0.1),
                                            0.25)
control.reportCrossValidationError(cv=100)

print('%s optimal est. Parmeters: %s' % (targetId, 'optimal'))
optimal = ExploreRegressor.ExploreRegressor(train, targetId,
                                            linear_model.Ridge(alpha=0.05),
                                            0.25)
optimal.reportCrossValidationError(cv=100)

for i in range(len(targets)):
    target = targets[i]

    if (target == 'Ca'):
        continue
예제 #6
0

# Evaluate a random forest for one target, first with fixed settings and
# then with the best estimator found by a grid search, printing the mean and
# standard deviation of the training/test errors for both.

# Load data from CSV file
df = pd.read_csv(
    '/Users/carrillo/workspace/Kaggle/resources/AfSIS/trainingTransformed.csv',
    header=0)

# Specify the regressor for target 'P'
targetId = 'P'
est = RandomForestRegressor(n_estimators=800,
                            max_depth=16,
                            min_samples_leaf=1,
                            n_jobs=10,
                            oob_score=True,
                            verbose=0)
regAll = ExploreRegressor.ExploreRegressor(df, targetId, est, 0.25)
trainMSE, testMSE = regAll.getErrors(iterations=20)
# Single-argument print(...) emits the same text on Python 2 and 3.
print('All features')
print('meanTrainingMSE\tsdTrainingMSE\tmeanTestMSE\tsdTestMSE')
print('%f\t%f\t%f\t%f' % (np.mean(trainMSE), np.std(trainMSE),
                          np.mean(testMSE), np.std(testMSE)))

# Repeat the measurement with the best estimator from the grid search; the
# extra last column holds the winning parameters.
clv = regAll.gridSearch(verbose=0)
regNew = ExploreRegressor.ExploreRegressor(df, targetId, clv.best_estimator_,
                                           0.25)
regNew.learn()
trainMSE, testMSE = regNew.getErrors(iterations=20)
print('%f\t%f\t%f\t%f\t%s' % (np.mean(trainMSE), np.std(trainMSE),
                              np.mean(testMSE), np.std(testMSE),
                              clv.best_params_))