import thesisFunctions

# Regions and months whose per-folder predictions get combined.
regions = ['IntMnt', 'Xeric']
months = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
]

# Source folders that are aggregated together (one per half).
dryFolder = 'AllMonthsDryHalf/'
wetFolder = 'AllMonthsWetHalf/'
baseFolders = [dryFolder, wetFolder]

# Where the aggregate file is written.
outputFolder = 'Output/'
outputFileName = 'AllMonthsData.csv'
aggregateFile = '{}{}'.format(outputFolder, outputFileName)

# Combine every half/region/month prediction into a single file.
thesisFunctions.aggregateSacPredictions(
    baseFolders,
    outputFolder,
    outputFileName,
    months,
    regions,
)

# Write the IntMnt file for one specific water year, for use in DWRAT.
waterYear = 1977
thesisFunctions.formatWaterYearPredictions(waterYear, aggregateFile)
import thesisFunctions

# Regions and months whose per-folder predictions get combined.
regions = ['IntMnt', 'Xeric']
months = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
]

# Source folders that are aggregated together (one per half).
dryFolder = 'AllMonthsDryHalf/'
wetFolder = 'AllMonthsWetHalf/'
baseFolders = [dryFolder, wetFolder]

# Where the aggregate file is written.
outputFolder = 'Output/'
outputFileName = 'AllMonthsData.csv'
aggregateFile = '{}{}'.format(outputFolder, outputFileName)

# Combine every half/region/month prediction into a single file.
thesisFunctions.aggregateSacPredictions(
    baseFolders,
    outputFolder,
    outputFileName,
    months,
    regions,
)

# Write the IntMnt file for one specific water year, for use in DWRAT.
waterYear = 1977
thesisFunctions.formatWaterYearPredictions(waterYear, aggregateFile)
def runModels(basePath, performanceEstimation=True, prediction=False,
              waterYear=1977):
    """Run the random forest workflow for every region and month.

    For each region/month pair the expert-selected feature list is loaded
    with getMonthVars and turned into a feature-selection configuration.
    Depending on the flags, the model is then scored on pre-split folds
    and/or trained on the full data set and used for prediction.

    Args:
        basePath: String prefix of the model folders. It is concatenated
            directly with the region name, so it should end with a path
            separator.
        performanceEstimation: When true, each of the 5 folds is copied into
            the month's 'CurrentFoldData/' folder, a model is trained and
            scored on it, and the per-fold and averaged scores are written to
            'Output/scoreModelResults_all.csv' and
            'Output/scoreModelResults_average.csv' inside the month folder.
        prediction: When true, a model is trained on the month's
            'Prediction/' training file, predictions are made for
            'sacramentoData.csv', rescaled, and written to
            'sacramentoPredictions.csv'. After all regions and months are
            done the predictions are aggregated and formatted for one
            water year.
        waterYear: Water year passed to
            thesisFunctions.formatWaterYearPredictions once predictions have
            been aggregated. Defaults to 1977, the value that used to be
            hard-coded here.

    Returns:
        None. All results are written to disk.
    """
    randomSeed = constants.randomSeed
    myFeaturesIndex = 6
    myLabelIndex = 5
    kFolds = 5
    regions = ['IntMnt', 'Xeric']
    months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

    # Metrics computed on each fold's test set.
    r2Method = mltypes.ModelScoreMethod('R Squared', sklearn.metrics.r2_score)
    meanOEMethod = mltypes.ModelScoreMethod('Mean O/E', mlmodel.meanObservedExpectedScore)
    sdOEMethod = mltypes.ModelScoreMethod('Standard Deviation O/E', mlmodel.sdObservedExpectedScore)
    mseMethod = mltypes.ModelScoreMethod('Mean Squared Error (cfs)', sklearn.metrics.mean_squared_error)
    testScoreMethods = [r2Method, meanOEMethod, sdOEMethod, mseMethod]

    randomForestParameters = {'n_estimators': 2000,
                              'max_features': .333,
                              'random_state': randomSeed,
                              'n_jobs': -1}
    randomForestMethod = mltypes.ModellingMethod(constants.randomForest,
                                                 sklearn.ensemble.RandomForestRegressor)

    for region in regions:
        for month in months:

            print('Processing:', region, month.capitalize())

            # Get expert features from text files
            selectedFeatures = getMonthVars(basePath, month, region)
            expertSelectedConfig = mltypes.FeatureEngineeringConfiguration(
                'Expert Selection',
                'selection',
                mltypes.ExtractSpecificFeatures,
                {'featureList': selectedFeatures})

            modelFolder = basePath + region + '/' + month + '/'

            # Run model once on each fold to get estimates of test metrics
            if performanceEstimation:

                allFoldScoreModelResultsDFs = []

                for fold in range(kFolds):

                    # Get dataset info
                    foldTestFilePath = modelFolder + '{}_{}_{}_test.csv'.format(month, region, fold)
                    foldTrainFilePath = modelFolder + '{}_{}_all_{}_train.csv'.format(month, region, fold)
                    testDescription = month.capitalize() + ' ' + region + ' Test'
                    trainDescription = month.capitalize() + ' ' + region + ' Train'

                    # Copy to CurrentFoldDataFolder
                    testFilePath = modelFolder + 'CurrentFoldData/' + '{}_{}_test.csv'.format(month, region)
                    trainFilePath = modelFolder + 'CurrentFoldData/' + '{}_{}_all_train.csv'.format(month, region)
                    shutil.copyfile(foldTestFilePath, testFilePath)
                    shutil.copyfile(foldTrainFilePath, trainFilePath)

                    # Get datasets
                    fullTestDataSet = mltypes.DataSet(testDescription,
                                                      testFilePath,
                                                      featuresIndex=myFeaturesIndex,
                                                      labelIndex=myLabelIndex)
                    fullTrainDataSet = mltypes.DataSet(trainDescription,
                                                       trainFilePath,
                                                       featuresIndex=myFeaturesIndex,
                                                       labelIndex=myLabelIndex)
                    fullTrainDataSet = makeLabelRunoffPerDrainageUnit(fullTrainDataSet, 'labeled')
                    fullTestDataSet = makeLabelRunoffPerDrainageUnit(fullTestDataSet, 'labeled')

                    # Select features
                    trainDataSet, transformer = mldata.engineerFeaturesForDataSet(fullTrainDataSet,
                                                                                  expertSelectedConfig)
                    testDataSet = mldata.engineerFeaturesByTransformer(fullTestDataSet, transformer)

                    # Apply model
                    applyRFModelConfig = mltypes.ApplyModelConfiguration('Apply ' + constants.randomForest,
                                                                         randomForestMethod,
                                                                         randomForestParameters,
                                                                         trainDataSet,
                                                                         testDataSet)
                    randomForestResult = mlmodel.applyModel(applyRFModelConfig)
                    applyModelResults = [randomForestResult]

                    # Score model and convert results to data frame
                    scoreModelResults = mlmodel.scoreModels(applyModelResults, testScoreMethods)
                    scoreModelResultsDF = mlutils.createScoreDataFrame(scoreModelResults)

                    # Add RMSE, then add to list of results for this month.
                    # A float exponent is used so the square root is taken
                    # even where 1/2 would be integer division.
                    scoreModelResultsDF['RMSE (cfs)'] = \
                        scoreModelResultsDF['Mean Squared Error (cfs)'].map(lambda mse: mse ** 0.5)
                    allFoldScoreModelResultsDFs.append(scoreModelResultsDF)

                    print(region, month, fold, 'processed')

                # Aggregate results into a single DataFrame. pandas.concat
                # replaces repeated DataFrame.append, which newer pandas
                # releases no longer provide.
                allResultsDF = pandas.concat(allFoldScoreModelResultsDFs, ignore_index=True)
                allResultsDF.to_csv(modelFolder + 'Output/scoreModelResults_all.csv', index=False)

                # Group by unique model & dataset combinations to average
                averageResultsDF = allResultsDF.groupby(['Base DataSet', 'Model Method']).mean().reset_index()
                # sort_values replaces DataFrame.sort(columns=...), which
                # newer pandas releases no longer provide.
                sortedAverageResultsDF = averageResultsDF.sort_values(by='R Squared', ascending=False)
                sortedAverageResultsDF.to_csv(modelFolder + 'Output/scoreModelResults_average.csv', index=False)

            # Prediction
            if prediction:

                predictionFolder = modelFolder + 'Prediction/'

                # Get data
                fullTrainDataSet = mltypes.DataSet(month.capitalize() + ' Training Data',
                                                   predictionFolder + '{}_{}_all.csv'.format(month, region),
                                                   featuresIndex=myFeaturesIndex,
                                                   labelIndex=myLabelIndex)
                fullPredictionDataSet = mltypes.DataSet(month.capitalize() + ' Prediction Data',
                                                        predictionFolder + 'sacramentoData.csv',
                                                        featuresIndex=3,
                                                        labelIndex=None)

                # Get scaled label (runoff/drainage unit)
                fullTrainDataSet = makeLabelRunoffPerDrainageUnit(fullTrainDataSet, 'labeled')
                fullPredictionDataSet = makeLabelRunoffPerDrainageUnit(fullPredictionDataSet, 'prediction')

                # Select features
                trainDataSet, transformer = mldata.engineerFeaturesForDataSet(fullTrainDataSet,
                                                                              expertSelectedConfig)
                predictionDataSet = mldata.engineerFeaturesByTransformer(fullPredictionDataSet, transformer)

                # Train model and predict for the Sacramento region
                applyRFModelConfig = mltypes.ApplyModelConfiguration('Apply ' + constants.randomForest,
                                                                     randomForestMethod,
                                                                     randomForestParameters,
                                                                     trainDataSet,
                                                                     predictionDataSet)
                applyRFModelResult = mlmodel.applyModel(applyRFModelConfig)
                rescalePredictions(applyRFModelResult, predictionDataSet)
                predictionOutputPath = predictionFolder + 'sacramentoPredictions.csv'
                thesisFunctions.outputPredictions(applyRFModelResult, predictionOutputPath)

    # Once every region/month has been predicted, combine the per-month
    # prediction files and format them for the requested water year.
    if prediction:
        print('Aggregating predictions.')
        aggregateFile = thesisFunctions.aggregateSacPredictions([basePath],
                                                                'Output/',
                                                                'RandomForestData.csv',
                                                                months,
                                                                regions)
        thesisFunctions.formatWaterYearPredictions(waterYear, aggregateFile)
def runModels(basePath, performanceEstimation=True, prediction=False,
              waterYear=1977):
    """Run the random forest workflow for every region and month.

    For each region/month pair the expert-selected feature list is loaded
    with getMonthVars and turned into a feature-selection configuration.
    Depending on the flags, the model is then scored on pre-split folds
    and/or trained on the full data set and used for prediction.

    Args:
        basePath: String prefix of the model folders. It is concatenated
            directly with the region name, so it should end with a path
            separator.
        performanceEstimation: When true, each of the 5 folds is copied into
            the month's 'CurrentFoldData/' folder, a model is trained and
            scored on it, and the per-fold and averaged scores are written to
            'Output/scoreModelResults_all.csv' and
            'Output/scoreModelResults_average.csv' inside the month folder.
        prediction: When true, a model is trained on the month's
            'Prediction/' training file, predictions are made for
            'sacramentoData.csv', rescaled, and written to
            'sacramentoPredictions.csv'. After all regions and months are
            done the predictions are aggregated and formatted for one
            water year.
        waterYear: Water year passed to
            thesisFunctions.formatWaterYearPredictions once predictions have
            been aggregated. Defaults to 1977, the value that used to be
            hard-coded here.

    Returns:
        None. All results are written to disk.
    """
    randomSeed = constants.randomSeed
    myFeaturesIndex = 6
    myLabelIndex = 5
    kFolds = 5
    regions = ['IntMnt', 'Xeric']
    months = [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct',
        'nov', 'dec'
    ]

    # Metrics computed on each fold's test set.
    r2Method = mltypes.ModelScoreMethod('R Squared', sklearn.metrics.r2_score)
    meanOEMethod = mltypes.ModelScoreMethod(
        'Mean O/E', mlmodel.meanObservedExpectedScore)
    sdOEMethod = mltypes.ModelScoreMethod(
        'Standard Deviation O/E', mlmodel.sdObservedExpectedScore)
    mseMethod = mltypes.ModelScoreMethod(
        'Mean Squared Error (cfs)', sklearn.metrics.mean_squared_error)
    testScoreMethods = [r2Method, meanOEMethod, sdOEMethod, mseMethod]

    randomForestParameters = {
        'n_estimators': 2000,
        'max_features': .333,
        'random_state': randomSeed,
        'n_jobs': -1
    }
    randomForestMethod = mltypes.ModellingMethod(
        constants.randomForest, sklearn.ensemble.RandomForestRegressor)

    for region in regions:
        for month in months:

            print('Processing:', region, month.capitalize())

            # Get expert features from text files
            selectedFeatures = getMonthVars(basePath, month, region)
            expertSelectedConfig = mltypes.FeatureEngineeringConfiguration(
                'Expert Selection', 'selection',
                mltypes.ExtractSpecificFeatures,
                {'featureList': selectedFeatures})

            modelFolder = basePath + region + '/' + month + '/'

            # Run model once on each fold to get estimates of test metrics
            if performanceEstimation:

                allFoldScoreModelResultsDFs = []

                for fold in range(kFolds):

                    # Get dataset info
                    foldTestFilePath = modelFolder + '{}_{}_{}_test.csv'.format(
                        month, region, fold)
                    foldTrainFilePath = modelFolder + '{}_{}_all_{}_train.csv'.format(
                        month, region, fold)
                    testDescription = month.capitalize() + ' ' + region + ' Test'
                    trainDescription = month.capitalize() + ' ' + region + ' Train'

                    # Copy to CurrentFoldDataFolder
                    testFilePath = modelFolder + 'CurrentFoldData/' + '{}_{}_test.csv'.format(
                        month, region)
                    trainFilePath = modelFolder + 'CurrentFoldData/' + '{}_{}_all_train.csv'.format(
                        month, region)
                    shutil.copyfile(foldTestFilePath, testFilePath)
                    shutil.copyfile(foldTrainFilePath, trainFilePath)

                    # Get datasets
                    fullTestDataSet = mltypes.DataSet(
                        testDescription,
                        testFilePath,
                        featuresIndex=myFeaturesIndex,
                        labelIndex=myLabelIndex)
                    fullTrainDataSet = mltypes.DataSet(
                        trainDescription,
                        trainFilePath,
                        featuresIndex=myFeaturesIndex,
                        labelIndex=myLabelIndex)
                    fullTrainDataSet = makeLabelRunoffPerDrainageUnit(
                        fullTrainDataSet, 'labeled')
                    fullTestDataSet = makeLabelRunoffPerDrainageUnit(
                        fullTestDataSet, 'labeled')

                    # Select features
                    trainDataSet, transformer = mldata.engineerFeaturesForDataSet(
                        fullTrainDataSet, expertSelectedConfig)
                    testDataSet = mldata.engineerFeaturesByTransformer(
                        fullTestDataSet, transformer)

                    # Apply model
                    applyRFModelConfig = mltypes.ApplyModelConfiguration(
                        'Apply ' + constants.randomForest, randomForestMethod,
                        randomForestParameters, trainDataSet, testDataSet)
                    randomForestResult = mlmodel.applyModel(applyRFModelConfig)
                    applyModelResults = [randomForestResult]

                    # Score model and convert results to data frame
                    scoreModelResults = mlmodel.scoreModels(
                        applyModelResults, testScoreMethods)
                    scoreModelResultsDF = mlutils.createScoreDataFrame(
                        scoreModelResults)

                    # Add RMSE, then add to list of results for this month.
                    # A float exponent is used so the square root is taken
                    # even where 1 / 2 would be integer division.
                    scoreModelResultsDF['RMSE (cfs)'] = scoreModelResultsDF[
                        'Mean Squared Error (cfs)'].map(lambda mse: mse**0.5)
                    allFoldScoreModelResultsDFs.append(scoreModelResultsDF)

                    print(region, month, fold, 'processed')

                # Aggregate results into a single DataFrame. pandas.concat
                # replaces repeated DataFrame.append, which newer pandas
                # releases no longer provide.
                allResultsDF = pandas.concat(
                    allFoldScoreModelResultsDFs, ignore_index=True)
                allResultsDF.to_csv(
                    modelFolder + 'Output/scoreModelResults_all.csv',
                    index=False)

                # Group by unique model & dataset combinations to average
                averageResultsDF = allResultsDF.groupby(
                    ['Base DataSet', 'Model Method']).mean().reset_index()
                # sort_values replaces DataFrame.sort(columns=...), which
                # newer pandas releases no longer provide.
                sortedAverageResultsDF = averageResultsDF.sort_values(
                    by='R Squared', ascending=False)
                sortedAverageResultsDF.to_csv(
                    modelFolder + 'Output/scoreModelResults_average.csv',
                    index=False)

            # Prediction
            if prediction:

                predictionFolder = modelFolder + 'Prediction/'

                # Get data
                fullTrainDataSet = mltypes.DataSet(
                    month.capitalize() + ' Training Data',
                    predictionFolder + '{}_{}_all.csv'.format(month, region),
                    featuresIndex=myFeaturesIndex,
                    labelIndex=myLabelIndex)
                fullPredictionDataSet = mltypes.DataSet(
                    month.capitalize() + ' Prediction Data',
                    predictionFolder + 'sacramentoData.csv',
                    featuresIndex=3,
                    labelIndex=None)

                # Get scaled label (runoff/drainage unit)
                fullTrainDataSet = makeLabelRunoffPerDrainageUnit(
                    fullTrainDataSet, 'labeled')
                fullPredictionDataSet = makeLabelRunoffPerDrainageUnit(
                    fullPredictionDataSet, 'prediction')

                # Select features
                trainDataSet, transformer = mldata.engineerFeaturesForDataSet(
                    fullTrainDataSet, expertSelectedConfig)
                predictionDataSet = mldata.engineerFeaturesByTransformer(
                    fullPredictionDataSet, transformer)

                # Train model and predict for the Sacramento region
                applyRFModelConfig = mltypes.ApplyModelConfiguration(
                    'Apply ' + constants.randomForest, randomForestMethod,
                    randomForestParameters, trainDataSet, predictionDataSet)
                applyRFModelResult = mlmodel.applyModel(applyRFModelConfig)
                rescalePredictions(applyRFModelResult, predictionDataSet)
                predictionOutputPath = predictionFolder + 'sacramentoPredictions.csv'
                thesisFunctions.outputPredictions(applyRFModelResult,
                                                  predictionOutputPath)

    # Once every region/month has been predicted, combine the per-month
    # prediction files and format them for the requested water year.
    if prediction:
        print('Aggregating predictions.')
        aggregateFile = thesisFunctions.aggregateSacPredictions(
            [basePath], 'Output/', 'RandomForestData.csv', months, regions)
        thesisFunctions.formatWaterYearPredictions(waterYear, aggregateFile)