import thesisFunctions

# Month abbreviations and region codes passed to the aggregation step.
months = [
    'jan', 'feb', 'mar',
    'apr', 'may', 'jun',
    'jul', 'aug', 'sep',
    'oct', 'nov', 'dec',
]
regions = ['IntMnt', 'Xeric']

# Base folders handed to the aggregation step (dry half and wet half).
dryFolder = 'AllMonthsDryHalf/'
wetFolder = 'AllMonthsWetHalf/'
baseFolders = [dryFolder, wetFolder]

# Location of the combined predictions file and the water year to extract.
outputFolder = 'Output/'
outputFileName = 'AllMonthsData.csv'
aggregateFile = outputFolder + outputFileName
waterYear = 1977

# Combine every half/region/month set of predictions into a single file.
thesisFunctions.aggregateSacPredictions(
    baseFolders,
    outputFolder,
    outputFileName,
    months,
    regions,
)

# Write the IntMnt file for the chosen water year, for use in DWRAT.
thesisFunctions.formatWaterYearPredictions(waterYear, aggregateFile)
# Example no. 2
# 0
import thesisFunctions

# Inputs for the aggregation: month abbreviations, region codes and the
# base folders (dry half first, then wet half).
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
          'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
regions = ['IntMnt', 'Xeric']

dryFolder = 'AllMonthsDryHalf/'
wetFolder = 'AllMonthsWetHalf/'
baseFolders = [dryFolder, wetFolder]

# Where the combined file is written.
outputFolder = 'Output/'
outputFileName = 'AllMonthsData.csv'

# Step 1: merge the predictions of every half/region/month into one file.
aggregationArguments = (baseFolders, outputFolder, outputFileName, months,
                        regions)
thesisFunctions.aggregateSacPredictions(*aggregationArguments)

# Step 2: produce the IntMnt file for one water year, for use in DWRAT.
waterYear = 1977
aggregateFile = ''.join((outputFolder, outputFileName))
thesisFunctions.formatWaterYearPredictions(waterYear, aggregateFile)
def runModels(basePath, performanceEstimation=True, prediction=False):
    """Run the expert-feature random forest for every region and month.

    The per-model folder is ``basePath + region + '/' + month + '/'``; paths
    are built by plain string concatenation, so ``basePath`` should end with
    a path separator.

    Args:
        basePath: Folder prefix under which the region/month folders live.
        performanceEstimation: When true, train and score the model on each
            of the ``kFolds`` train/test CSV pairs in the model folder, then
            write ``Output/scoreModelResults_all.csv`` (one row set per fold)
            and ``Output/scoreModelResults_average.csv`` (fold averages,
            sorted by descending R squared).
        prediction: When true, train on ``Prediction/{month}_{region}_all.csv``,
            predict for ``Prediction/sacramentoData.csv`` and write
            ``Prediction/sacramentoPredictions.csv``; after all models have
            run, aggregate the predictions and format water year 1977.

    Returns:
        None.
    """

    randomSeed = constants.randomSeed
    myFeaturesIndex = 6
    myLabelIndex = 5
    kFolds = 5
    regions = ['IntMnt', 'Xeric']
    months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

    # Metrics computed for every fold.
    r2Method = mltypes.ModelScoreMethod('R Squared', sklearn.metrics.r2_score)
    meanOEMethod = mltypes.ModelScoreMethod('Mean O/E', mlmodel.meanObservedExpectedScore)
    sdOEMethod = mltypes.ModelScoreMethod('Standard Deviation O/E', mlmodel.sdObservedExpectedScore)
    mseMethod = mltypes.ModelScoreMethod('Mean Squared Error (cfs)', sklearn.metrics.mean_squared_error)
    testScoreMethods = [r2Method, meanOEMethod, sdOEMethod, mseMethod]

    # The same random forest configuration is used for every region/month.
    randomForestParameters = {'n_estimators': 2000,
                              'max_features': .333,
                              'random_state': randomSeed,
                              'n_jobs': -1}
    randomForestMethod = mltypes.ModellingMethod(constants.randomForest,
                                                 sklearn.ensemble.RandomForestRegressor)

    for region in regions:
        for month in months:

            print('Processing:', region, month.capitalize())

            # Get expert features from text files
            selectedFeatures = getMonthVars(basePath, month, region)
            expertSelectedConfig = mltypes.FeatureEngineeringConfiguration('Expert Selection',
                                                                           'selection',
                                                                           mltypes.ExtractSpecificFeatures,
                                                                           {'featureList': selectedFeatures})

            modelFolder = basePath + region + '/' + month + '/'

            # Run model once on each fold to get estimates of test metrics
            if performanceEstimation:
                allFoldScoreModelResultsDFs = []

                for fold in range(kFolds):

                    # Get dataset info
                    foldTestFilePath = modelFolder + '{}_{}_{}_test.csv'.format(month, region, fold)
                    foldTrainFilePath = modelFolder + '{}_{}_all_{}_train.csv'.format(month, region, fold)
                    testDescription = month.capitalize() + ' ' + region + ' Test'
                    trainDescription = month.capitalize() + ' ' + region + ' Train'

                    # Copy to CurrentFoldDataFolder; the fold-independent
                    # file names are overwritten on every iteration.
                    testFilePath = modelFolder + 'CurrentFoldData/' + '{}_{}_test.csv'.format(month, region)
                    trainFilePath = modelFolder + 'CurrentFoldData/' + '{}_{}_all_train.csv'.format(month, region)
                    shutil.copyfile(foldTestFilePath, testFilePath)
                    shutil.copyfile(foldTrainFilePath, trainFilePath)

                    # Get datasets
                    fullTestDataSet = mltypes.DataSet(testDescription,
                                                      testFilePath,
                                                      featuresIndex=myFeaturesIndex,
                                                      labelIndex=myLabelIndex)
                    fullTrainDataSet = mltypes.DataSet(trainDescription,
                                                       trainFilePath,
                                                       featuresIndex=myFeaturesIndex,
                                                       labelIndex=myLabelIndex)

                    fullTrainDataSet = makeLabelRunoffPerDrainageUnit(fullTrainDataSet, 'labeled')
                    fullTestDataSet = makeLabelRunoffPerDrainageUnit(fullTestDataSet, 'labeled')

                    # Select features; the transformer fitted on the training
                    # set is reused for the test set.
                    trainDataSet, transformer = mldata.engineerFeaturesForDataSet(fullTrainDataSet,
                                                                                  expertSelectedConfig)
                    testDataSet = mldata.engineerFeaturesByTransformer(fullTestDataSet,
                                                                       transformer)

                    # Apply model
                    applyRFModelConfig = mltypes.ApplyModelConfiguration('Apply ' + constants.randomForest,
                                                                         randomForestMethod,
                                                                         randomForestParameters,
                                                                         trainDataSet,
                                                                         testDataSet)
                    randomForestResult = mlmodel.applyModel(applyRFModelConfig)
                    applyModelResults = [randomForestResult]

                    # Score model and convert results to data frame
                    scoreModelResults = mlmodel.scoreModels(applyModelResults, testScoreMethods)
                    scoreModelResultsDF = mlutils.createScoreDataFrame(scoreModelResults)

                    # Add RMSE, then add to list of results for this month.
                    # The exponent is written as 0.5 because 1/2 is 0 under
                    # Python 2 integer division, which would make every RMSE 1.
                    scoreModelResultsDF['RMSE (cfs)'] = scoreModelResultsDF['Mean Squared Error (cfs)'].map(lambda x: x ** 0.5)
                    allFoldScoreModelResultsDFs.append(scoreModelResultsDF)

                    print(region, month, fold, 'processed')

                # Aggregate results into a single DataFrame. One concat call
                # replaces repeated DataFrame.append, which was removed in
                # pandas 2.0 and copied the frame on every iteration.
                allResultsDF = pandas.concat(allFoldScoreModelResultsDFs, ignore_index=True)
                allResultsDF.to_csv(modelFolder + 'Output/scoreModelResults_all.csv', index=False)

                # Group by unique model & dataset combinations to average.
                # NOTE(review): on pandas >= 2.0 mean() raises on non-numeric
                # columns; confirm the score frame holds only numeric columns
                # besides the two grouping keys.
                averageResultsDF = allResultsDF.groupby(['Base DataSet', 'Model Method']).mean().reset_index()
                # DataFrame.sort(columns=...) no longer exists; sort_values is
                # the equivalent call.
                sortedAverageResultsDF = averageResultsDF.sort_values(by='R Squared', ascending=False)
                sortedAverageResultsDF.to_csv(modelFolder + 'Output/scoreModelResults_average.csv', index=False)

            # Prediction
            if prediction:

                predictionFolder = modelFolder + 'Prediction/'

                # Get data
                fullTrainDataSet = mltypes.DataSet(month.capitalize() + ' Training Data',
                                                  predictionFolder + '{}_{}_all.csv'.format(month, region),
                                                  featuresIndex=myFeaturesIndex,
                                                  labelIndex=myLabelIndex)
                fullPredictionDataSet = mltypes.DataSet(month.capitalize() + ' Prediction Data',
                                                    predictionFolder + 'sacramentoData.csv',
                                                    featuresIndex=3,
                                                    labelIndex=None)

                # Get scaled label (runoff/drainage unit)
                fullTrainDataSet = makeLabelRunoffPerDrainageUnit(fullTrainDataSet, 'labeled')
                fullPredictionDataSet = makeLabelRunoffPerDrainageUnit(fullPredictionDataSet, 'prediction')

                # Select features
                trainDataSet, transformer = mldata.engineerFeaturesForDataSet(fullTrainDataSet,
                                                                              expertSelectedConfig)
                predictionDataSet = mldata.engineerFeaturesByTransformer(fullPredictionDataSet,
                                                                   transformer)

                # Train model and predict for the Sacramento region
                applyRFModelConfig = mltypes.ApplyModelConfiguration('Apply ' + constants.randomForest,
                                                                     randomForestMethod,
                                                                     randomForestParameters,
                                                                     trainDataSet,
                                                                     predictionDataSet)
                applyRFModelResult = mlmodel.applyModel(applyRFModelConfig)
                rescalePredictions(applyRFModelResult, predictionDataSet)
                predictionOutputPath = predictionFolder + 'sacramentoPredictions.csv'
                thesisFunctions.outputPredictions(applyRFModelResult, predictionOutputPath)

    # Aggregation happens once, after every region/month has been predicted.
    if prediction:
        print('Aggregating predictions.')
        aggregateFile = thesisFunctions.aggregateSacPredictions([basePath],
                                                                'Output/',
                                                                'RandomForestData.csv',
                                                                months,
                                                                regions)
        waterYear = 1977
        thesisFunctions.formatWaterYearPredictions(waterYear, aggregateFile)
def runModels(basePath, performanceEstimation=True, prediction=False):
    """Fit and apply the expert-feature random forest per region and month.

    Each model works in ``basePath + region + '/' + month + '/'``. Paths are
    joined by string concatenation, so ``basePath`` should end with a path
    separator.

    Args:
        basePath: Folder prefix containing the region/month folders.
        performanceEstimation: When true, each of the ``kFolds`` train/test
            CSV pairs is copied into ``CurrentFoldData/``, modelled and
            scored; the per-fold scores go to
            ``Output/scoreModelResults_all.csv`` and their averages (sorted
            by descending R squared) to
            ``Output/scoreModelResults_average.csv``.
        prediction: When true, the model is trained on
            ``Prediction/{month}_{region}_all.csv`` and applied to
            ``Prediction/sacramentoData.csv``, writing
            ``Prediction/sacramentoPredictions.csv``. Once all models are
            done the predictions are aggregated and water year 1977 is
            formatted.

    Returns:
        None.
    """

    randomSeed = constants.randomSeed
    myFeaturesIndex = 6
    myLabelIndex = 5
    kFolds = 5
    regions = ['IntMnt', 'Xeric']
    months = [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct',
        'nov', 'dec'
    ]

    # Scores calculated on every fold's test set.
    r2Method = mltypes.ModelScoreMethod('R Squared', sklearn.metrics.r2_score)
    meanOEMethod = mltypes.ModelScoreMethod('Mean O/E',
                                            mlmodel.meanObservedExpectedScore)
    sdOEMethod = mltypes.ModelScoreMethod('Standard Deviation O/E',
                                          mlmodel.sdObservedExpectedScore)
    mseMethod = mltypes.ModelScoreMethod('Mean Squared Error (cfs)',
                                         sklearn.metrics.mean_squared_error)
    testScoreMethods = [r2Method, meanOEMethod, sdOEMethod, mseMethod]

    # One random forest configuration shared by all region/month models.
    randomForestParameters = {
        'n_estimators': 2000,
        'max_features': .333,
        'random_state': randomSeed,
        'n_jobs': -1
    }
    randomForestMethod = mltypes.ModellingMethod(
        constants.randomForest, sklearn.ensemble.RandomForestRegressor)

    for region in regions:
        for month in months:

            print('Processing:', region, month.capitalize())

            # Get expert features from text files
            selectedFeatures = getMonthVars(basePath, month, region)
            expertSelectedConfig = mltypes.FeatureEngineeringConfiguration(
                'Expert Selection', 'selection',
                mltypes.ExtractSpecificFeatures,
                {'featureList': selectedFeatures})

            modelFolder = basePath + region + '/' + month + '/'

            # Run model once on each fold to get estimates of test metrics
            if performanceEstimation:
                allFoldScoreModelResultsDFs = []

                for fold in range(kFolds):

                    # Get dataset info
                    foldTestFilePath = modelFolder + '{}_{}_{}_test.csv'.format(
                        month, region, fold)
                    foldTrainFilePath = modelFolder + '{}_{}_all_{}_train.csv'.format(
                        month, region, fold)
                    testDescription = month.capitalize(
                    ) + ' ' + region + ' Test'
                    trainDescription = month.capitalize(
                    ) + ' ' + region + ' Train'

                    # Copy to CurrentFoldDataFolder (the destination names do
                    # not include the fold, so each fold overwrites the last)
                    testFilePath = modelFolder + 'CurrentFoldData/' + '{}_{}_test.csv'.format(
                        month, region)
                    trainFilePath = modelFolder + 'CurrentFoldData/' + '{}_{}_all_train.csv'.format(
                        month, region)
                    shutil.copyfile(foldTestFilePath, testFilePath)
                    shutil.copyfile(foldTrainFilePath, trainFilePath)

                    # Get datasets
                    fullTestDataSet = mltypes.DataSet(
                        testDescription,
                        testFilePath,
                        featuresIndex=myFeaturesIndex,
                        labelIndex=myLabelIndex)
                    fullTrainDataSet = mltypes.DataSet(
                        trainDescription,
                        trainFilePath,
                        featuresIndex=myFeaturesIndex,
                        labelIndex=myLabelIndex)

                    fullTrainDataSet = makeLabelRunoffPerDrainageUnit(
                        fullTrainDataSet, 'labeled')
                    fullTestDataSet = makeLabelRunoffPerDrainageUnit(
                        fullTestDataSet, 'labeled')

                    # Select features: fit on the training set, then apply
                    # the same transformer to the test set
                    trainDataSet, transformer = mldata.engineerFeaturesForDataSet(
                        fullTrainDataSet, expertSelectedConfig)
                    testDataSet = mldata.engineerFeaturesByTransformer(
                        fullTestDataSet, transformer)

                    # Apply model
                    applyRFModelConfig = mltypes.ApplyModelConfiguration(
                        'Apply ' + constants.randomForest, randomForestMethod,
                        randomForestParameters, trainDataSet, testDataSet)
                    randomForestResult = mlmodel.applyModel(applyRFModelConfig)
                    applyModelResults = [randomForestResult]

                    # Score model and convert results to data frame
                    scoreModelResults = mlmodel.scoreModels(
                        applyModelResults, testScoreMethods)
                    scoreModelResultsDF = mlutils.createScoreDataFrame(
                        scoreModelResults)

                    # Add RMSE, then add to list of results for this month.
                    # 0.5 is spelled out because 1 / 2 evaluates to 0 under
                    # Python 2 integer division (x ** 0 == 1 for every row).
                    scoreModelResultsDF['RMSE (cfs)'] = scoreModelResultsDF[
                        'Mean Squared Error (cfs)'].map(lambda x: x**0.5)
                    allFoldScoreModelResultsDFs.append(scoreModelResultsDF)

                    print(region, month, fold, 'processed')

                # Aggregate results into a single DataFrame. pandas.concat
                # replaces the loop of DataFrame.append calls: append was
                # removed in pandas 2.0 and re-copied the frame every time.
                allResultsDF = pandas.concat(allFoldScoreModelResultsDFs,
                                             ignore_index=True)
                allResultsDF.to_csv(modelFolder +
                                    'Output/scoreModelResults_all.csv',
                                    index=False)

                # Group by unique model & dataset combinations to average.
                # NOTE(review): pandas >= 2.0 raises in mean() when a
                # non-numeric column is present; confirm the score frame only
                # has numeric columns apart from the grouping keys.
                averageResultsDF = allResultsDF.groupby(
                    ['Base DataSet', 'Model Method']).mean().reset_index()
                # sort_values(by=...) is the replacement for the removed
                # DataFrame.sort(columns=...).
                sortedAverageResultsDF = averageResultsDF.sort_values(
                    by='R Squared', ascending=False)
                sortedAverageResultsDF.to_csv(
                    modelFolder + 'Output/scoreModelResults_average.csv',
                    index=False)

            # Prediction
            if prediction:

                predictionFolder = modelFolder + 'Prediction/'

                # Get data
                fullTrainDataSet = mltypes.DataSet(
                    month.capitalize() + ' Training Data',
                    predictionFolder + '{}_{}_all.csv'.format(month, region),
                    featuresIndex=myFeaturesIndex,
                    labelIndex=myLabelIndex)
                fullPredictionDataSet = mltypes.DataSet(
                    month.capitalize() + ' Prediction Data',
                    predictionFolder + 'sacramentoData.csv',
                    featuresIndex=3,
                    labelIndex=None)

                # Get scaled label (runoff/drainage unit)
                fullTrainDataSet = makeLabelRunoffPerDrainageUnit(
                    fullTrainDataSet, 'labeled')
                fullPredictionDataSet = makeLabelRunoffPerDrainageUnit(
                    fullPredictionDataSet, 'prediction')

                # Select features
                trainDataSet, transformer = mldata.engineerFeaturesForDataSet(
                    fullTrainDataSet, expertSelectedConfig)
                predictionDataSet = mldata.engineerFeaturesByTransformer(
                    fullPredictionDataSet, transformer)

                # Train model and predict for the Sacramento region
                applyRFModelConfig = mltypes.ApplyModelConfiguration(
                    'Apply ' + constants.randomForest, randomForestMethod,
                    randomForestParameters, trainDataSet, predictionDataSet)
                applyRFModelResult = mlmodel.applyModel(applyRFModelConfig)
                rescalePredictions(applyRFModelResult, predictionDataSet)
                predictionOutputPath = predictionFolder + 'sacramentoPredictions.csv'
                thesisFunctions.outputPredictions(applyRFModelResult,
                                                  predictionOutputPath)

    # Runs once, after the region/month loops have written all predictions.
    if prediction:
        print('Aggregating predictions.')
        aggregateFile = thesisFunctions.aggregateSacPredictions(
            [basePath], 'Output/', 'RandomForestData.csv', months, regions)
        waterYear = 1977
        thesisFunctions.formatWaterYearPredictions(waterYear, aggregateFile)