Ejemplo n.º 1
0
def _newBaseRegressor():
    """Build one base regressor with the hyperparameters shared by every datagroup model."""
    return RandomForestRegressor(min_samples_leaf=9, n_estimators=59, n_jobs=-1, random_state=42)


def _buildSelectedRows(rows, predsByTag, keptIndexes):
    """Extend every row with the per-tag predictions and keep only the selected columns.

    Each row is copied before the predictions (one per entry of topTags, in
    topTags order) are appended, so the lists handed back by the split helpers
    are never modified. keptIndexes are positions into the extended row, i.e.
    positions into all_columns.
    """
    selectedRows = []
    for i, row in enumerate(rows):
        fullRow = list(row) + [predsByTag[tag][i] for tag in topTags]
        selectedRows.append([fullRow[k] for k in keptIndexes])
    return selectedRows


def evalColumns(columns):
    """Score a column mask for the classifier that picks a base model per sample.

    For every location in ``locations`` (treated as the held-out location):

    1. One RandomForestRegressor per datagroup in ``topDatagroups`` is fitted
       on the train part returned by splitDataForXValidation and predicts the
       test part; the predictions are stored per tag in ``testPreds``.
    2. For every other location (``location2``) and every datagroup, a
       regressor is fitted on trainX1 from splitDataForXValidationSampled2 and
       predicts trainX2; those predictions are pooled per tag in ``trainPreds``.
    3. Every pooled trainX2 sample is labelled with the index (into
       ``topTags``) of the model with the smallest absolute error.
    4. A RandomForestClassifier is fitted on the all_features rows extended
       with the per-tag predictions, restricted to the columns enabled in
       ``columns``, and then chooses a base model for every test sample.

    Progress and intermediate RMSE values are printed along the way.

    Args:
        columns: sequence of flags indexed in parallel with ``all_columns``;
            a truthy entry keeps the column at that position.

    Returns:
        Element ``[1]`` of ``rmseEval`` computed over the targets and selected
        predictions pooled across all held-out locations.
    """
    overallY = []
    overallPred = []

    # The mask is the same for every row, so resolve it to indexes once.
    keptIndexes = [i for i in range(len(all_columns)) if columns[i]]

    for location in locations:
        location2s = [other for other in locations if other != location]

        print("Location: " + str(location) + ", location2: " + str(location2s))

        # Base-model predictions for the held-out location, one entry per tag.
        testPreds = {}
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            trainX, testX, trainY, _ = splitDataForXValidation(location, "location", data, features, "target")

            model = _newBaseRegressor()
            model.fit(trainX, trainY)
            testPreds[tag] = model.predict(testX)

        # Base-model predictions on the trainX2 part of every location2 split,
        # pooled per tag in location2s order.
        trainPreds = defaultdict(list)
        for datagroup in topDatagroups:
            tag, features = getTagAndFeatures(datagroup)
            print("\ttag: " + str(tag) + ", features: " + str(features))
            for location2 in location2s:
                trainX1, trainX2, trainY1, trainY2, testX, testY = splitDataForXValidationSampled2(location, location2, "location", data, features, "target")
                model = _newBaseRegressor()
                model.fit(trainX1, trainY1)
                train1Prediction = model.predict(trainX1)
                train2Prediction = model.predict(trainX2)
                testPrediction = model.predict(testX)
                # train1/test predictions are only used for these diagnostics.
                print("\t\ttrain1 rmse: " + str(rmseEval(trainY1, train1Prediction)[1]))
                print("\t\ttrain2 rmse: " + str(rmseEval(trainY2, train2Prediction)[1]))
                print("\t\ttest rmse: " + str(rmseEval(testY, testPrediction)[1]))
                trainPreds[tag].extend(train2Prediction)

        # Rows and targets for the classifier are taken from the same split
        # call so they are aligned by construction (same location2s order as
        # the pooled trainPreds above).
        # NOTE(review): assumes splitDataForXValidationSampled2 returns the
        # same samples for the same (location, location2) regardless of the
        # feature list - confirm against its implementation.
        metaTrainRows = []
        metaTrainY = []
        for location2 in location2s:
            _, trainX2, _, trainY2, _, _ = splitDataForXValidationSampled2(location, location2, "location", data, all_features, "target")
            metaTrainRows.extend(trainX2)
            metaTrainY.extend(trainY2)

        # Label = index of the tag with the smallest absolute error; on ties
        # the earliest tag in topTags wins.
        labelTrain2Y = []
        for i, actual in enumerate(metaTrainY):
            errors = [abs(actual - trainPreds[tag][i]) for tag in topTags]
            labelTrain2Y.append(errors.index(min(errors)))

        # Test rows and their targets come from one call, so the targets used
        # for scoring below always belong to the rows that were predicted.
        _, testX, _, testY = splitDataForXValidation(location, "location", data, all_features, "target")

        model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=15)
        model.fit(_buildSelectedRows(metaTrainRows, trainPreds, keptIndexes), labelTrain2Y)
        pred = model.predict(_buildSelectedRows(testX, testPreds, keptIndexes))

        # For each test sample take the prediction of the model the classifier chose.
        finalPrediction = [testPreds[topTags[choice]][i] for i, choice in enumerate(pred)]
        print("\tRMSE: " + str(rmseEval(testY, finalPrediction)[1]))

        overallY.extend(testY)
        overallPred.extend(finalPrediction)

    # Same (targets, predictions) argument order as every other rmseEval call.
    return rmseEval(overallY, overallPred)[1]
Ejemplo n.º 2
0
        if isinstance(data[i], list):
            for j in range(0, len(data[i])):
                if j != 0:
                    output.write(",")
                output.write(str(data[i][j]))
        else:
            output.write(str(data[i]))
        output.write("\n")
    output.close()


# Resolve each tag in top16tags to the first generated datagroup that
# getTagAndFeatures maps to that tag; tags without a match are skipped.
data_groups = generateAllDataGroups()
top16datagroups = []
for tag in top16tags:
    firstMatch = next(
        (group for group in data_groups if getTagAndFeatures(group)[0] == tag),
        None,
    )
    if firstMatch is not None:
        top16datagroups.append(firstMatch)

# Tag and feature list returned for the datagroup made of all six group letters.
all_tags, all_features = getTagAndFeatures(['T', 'W', 'A', 'R', 'L', 'B'])

# For every location, split the data and dump the trainX2 and testX parts
# (all_features columns) to per-location CSV files.
for location in locations:
    print("Location: " + str(location))
    split = splitDataForXValidationSampled(
        location, "location", sampleRate, 42, data, all_features, "target")
    trainX1, trainX2, trainY1, trainY2, testX, testY = split

    # Shared file-name stem, e.g. "<OUTPUT_DIRECTORY>z_2" for location 2.0.
    filePrefix = OUTPUT_DIRECTORY + "z_" + str(int(location))
    for suffix, rows in (("_trainX.csv", trainX2), ("_testX.csv", testX)):
        writeOutData(filePrefix + suffix, all_features, rows)
Ejemplo n.º 3
0
# Location identifiers iterated over by evalColumns.
locations = [2.0, 3.0, 4.0, 6.0, 8.0]

# Names of the plain input feature columns.
all_features = [
    'hour', 'day_of_week', 'month', 'bank_holiday', 'race_day',
    'winddirection', 'windspeed', 'temperature', 'rain', 'pressure',
    'atc', 'lane_length', 'length', 'landuse_area', 'leisure_area',
    'buildings_area', 'buildings_number',
]

# Tags of the selected datagroups and the matching prediction column names.
topTags = ['TW', 'TWA', 'TWL', 'WA']
topPreds = ["pred_" + tag for tag in topTags]

# Full column layout: input features first, then one prediction column per tag.
all_columns = all_features + topPreds

# Resolve each tag in topTags to the first generated datagroup that
# getTagAndFeatures maps to that tag; tags without a match are skipped.
data_groups = generateAllDataGroups()
topDatagroups = []
for tag in topTags:
    firstMatch = next(
        (group for group in data_groups if getTagAndFeatures(group)[0] == tag),
        None,
    )
    if firstMatch is not None:
        topDatagroups.append(firstMatch)

def evalColumns(columns):

    overallY = []
    overallPred = []

    for location in locations:
        location2s = [l for l in locations if l != location]
        
        print("Location: " + str(location) + ", location2: " + str(location2s))
        
        # generating testPreds
Ejemplo n.º 4
0
from ex27.ex27_lib import generateAllDataGroups, getTagAndFeatures
from collections import defaultdict

# Input data file and the directory used for experiment output.
DATA_FILE = "/data/york3_hour_2013.csv"
OUTPUT_DIRECTORY = "/experiments/ex27/"

locations = [2.0, 3.0, 4.0, 6.0, 8.0]
sampleRate = 0.75
top10tags = ['TW', 'TWA', 'W', 'TWL', 'TWB', 'T', 'WA', 'WB', 'TA', 'A']

# loadData is handed the two empty containers below.
# NOTE(review): presumably it fills them in place - confirm in loadData.
columns = []
data = {}
loadData(DATA_FILE, ['timestamp'], data, columns)

# Every generated datagroup and the tag getTagAndFeatures assigns to it.
data_groups = generateAllDataGroups()
tags = [getTagAndFeatures(group)[0] for group in data_groups]

# Counters that start at 0 for any key not seen before.
overAllFreq = defaultdict(int)
overAllFreqT16 = defaultdict(int)

for location in locations:
    print("Location: " + str(location))

    trainPreds = {}
    testPreds = {}
    t2Y = None
    tY = None

    for datagroup in data_groups:
        tag, features = getTagAndFeatures(datagroup)