# ex25 evaluation setup: load per-tag predictions and the hourly sensor data,
# then prepare the lookup tables used by the analysis that follows.
from ex25.ex25_lib import loadEx25Data
from data.data import loadData
from ex25.crossvalidation import splitDataForXValidation
from eval.rmse import rmseEval
# BUGFIX: import from the public 'collections' module, not the CPython-private
# '_collections' implementation module (non-portable, undocumented).
from collections import defaultdict

DATA_FILE = "/data/york3_hour_2013.csv"
INPUT_FILE = "/experiments/ex25/file.csv"

predData = loadEx25Data(INPUT_FILE)
# every tag present in the prediction file (iterating the mapping yields keys)
tags = set(predData)

data = {}
columns = []
loadData(DATA_FILE, [], data, columns)

# timestamps are stored as floats; normalise them to integer strings
timestampDoubleData = data["timestamp"]
timestampData = [str(int(v)) for v in timestampDoubleData]

# monitoring-station id -> human readable name
stationsNames = {
    2.0: "Fulford",
    3.0: "Gillygate",
    4.0: "Heworth",
    6.0: "Lawrence",
    8.0: "Fishergate"
}
locations = [2.0, 3.0, 4.0, 6.0, 8.0]
# counter of wins per group; missing keys default to 0
bestCounter = defaultdict(int)
def doEval(landuse, topo, traffic_static, traffic_dynamic, weather, time, output):
    """Run a cross-validated random-forest evaluation for one combination of
    feature groups and append per-hour result rows to *output*.

    Each boolean flag enables one feature group; the combination is encoded
    in a group name such as "lu1to0ts1td0we1ti1".  A call with every flag
    False is a no-op.
    """
    flags = [("lu", landuse), ("to", topo), ("ts", traffic_static),
             ("td", traffic_dynamic), ("we", weather), ("ti", time)]
    if not any(enabled for _, enabled in flags):
        return

    # e.g. "lu1to0ts1td0we1ti1"
    groupName = "".join(tag + ("1" if enabled else "0") for tag, enabled in flags)
    print("Group: " + groupName)

    # the columns of every disabled feature group are skipped at load time
    columnsToSkip = ['timestamp']
    if not landuse:
        columnsToSkip.extend(['leisure_area', 'landuse_area'])
    if not topo:
        columnsToSkip.extend(['buildings_number', 'buildings_area'])
    if not traffic_static:
        columnsToSkip.extend(['lane_length', 'length'])
    if not traffic_dynamic:
        columnsToSkip.extend(['traffic_length_car', 'traffic_length_lgv',
                              'traffic_length_hgv'])
    if not weather:
        columnsToSkip.extend(['winddirection', 'windspeed', 'temperature',
                              'rain', 'pressure'])
    if not time:
        # 'hour' is intentionally NOT skipped here: it is needed for the
        # per-hour cross validation and is removed from the model columns below
        columnsToSkip.extend(['day_of_week', 'month', 'bank_holiday',
                              'race_day'])

    data = {}
    columns = {}
    for year in years:
        columns[year] = []
        data[year] = {}
        loadData(dataFiles[year], columnsToSkip, data[year], columns[year])

    for year in years:
        print("year " + str(year))
        modelColumns = columns[year]
        if not time:
            # drop 'hour' from the model inputs (it was kept for CV grouping)
            modelColumns.remove("hour")
        print(str(modelColumns))
        model = {"name": "rfr", "norm": NONORMALIZATION,
                 "train": trainRandomForest, "apply": applyRandomForest,
                 "data": data[year], "columns": modelColumns,
                 "parameters": {'estimators': 50, 'leaf': 10}}
        result = crossValidationLocationHour(model["data"], model["columns"],
                                             "target", model["norm"],
                                             model["train"], model["apply"],
                                             [evalFunction],
                                             model["parameters"])
        print(str(result["eval"]))
        for hourValues in result["eval"][evalFunctionName]:
            for value in result["eval"][evalFunctionName][hourValues]:
                output.write(str(groupName) + "," + str(hourValues) + "," + str(value) + "\n")
# ex27 feature-group experiment: shared setup (data, output file, logging).
from ex27.ex27_lib import generateAllDataGroups, getTagAndFeatures
from eval.rmse import rmseEval
from sklearn.ensemble.forest import RandomForestClassifier, RandomForestRegressor
from copy import deepcopy
from data.data import loadData
from ex27.crossvalidation import splitDataForXValidationSampled2, splitDataForXValidation
from collections import defaultdict

DATA_FILE = "/data/york3_hour_2013.csv"
OUTPUT_FILE = "/experiments/ex27/ex27_f.txt"

data = {}
columns = []
loadData(DATA_FILE, ['timestamp'], data, columns)

output = open(OUTPUT_FILE, 'w')


def log(message):
    """Echo *message* to stdout and append it, newline-terminated and
    flushed, to the experiment output file."""
    print(message)
    output.write(message)
    output.write("\n")
    output.flush()


# full candidate feature set for the experiment
all_features = ['hour', 'day_of_week', 'month', 'bank_holiday', 'race_day',
                'winddirection', 'windspeed', 'temperature', 'rain',
                'pressure', 'atc', 'lane_length', 'length', 'landuse_area',
                'leisure_area', 'buildings_area', 'buildings_number']

# best performing tag combinations and their prediction column names
topTags = ['TW', 'TWA', 'TWL', 'WA']
topPreds = ["pred_" + tag for tag in topTags]

locations = [2.0, 3.0, 4.0, 6.0, 8.0]
# Train a random forest on the yearly data and time the fit, before the
# model is applied over the prediction grid loaded below.
from data.data import loadData
from norm import NONORMALIZATION
from models.model_decisiontree import trainDecisionTree, applyDecisionTree
from models.model_randomforest import trainRandomForest, applyRandomForest
import time

# training data ("location"/"year" are identifiers, excluded as features)
dataFile1 = "/media/sf_Google_Drive/transfer/data/data_year.csv"
data1 = {}
columns1 = []
loadData(dataFile1, ["location", "year"], data1, columns1)

# fit the forest and report the wall-clock training time
start = time.time()
model = trainRandomForest(data1, columns1, "target",
                          {'estimators': 59, 'leaf': 9})
end = time.time()
print(end - start)

# grid data the trained model will be applied to
dataFile2 = "/media/sf_Google_Drive/transfer/data/data_grid.csv"
print("#locations: " + str(len(locations))) locations_grouped = [[], [], [], [], []] for i in range(0, 90): group = i % 5 locations_grouped[group].append(float(locations[i])) for i in range(0, 5): print("group_" + str(i) + ": " + str(locations_grouped[i])) DATA_FILE = "/data/london_hour_2015.csv" data = {} columns = [] loadData(DATA_FILE, [ "timestamp", 'natural_area', 'building_count', 'leisure_area', 'landuse_area', 'lane_length', 'length', 'building_area' ], data, columns) for iteration in range(0, 5): print("iter_" + str(iteration)) trainStations = [] testStations = [] for i in range(0, 5): if i == iteration: testStations = testStations + locations_grouped[i] else: trainStations = trainStations + locations_grouped[i] print("\ttrainStations: " + str(trainStations)) print("\ttestStations: " + str(testStations)) trainStationSet = set(s for s in trainStations)
6.0: "Lawrence", 7.0: "Nunnery", 8.0: "Fishergate" } predictions = {} observations = {} predictionsPerStation = defaultdict(lambda: defaultdict(list)) observationsPerStation = defaultdict(lambda: defaultdict(list)) predictionsNormal = defaultdict(list) observationsNormal = defaultdict(list) for method in methods: d = {} columns = [] loadData("/experiments/ex2/ex2_" + method + ".csv", [], d, columns) predictions[method] = d["prediction"] observations[method] = d["observation"] for i in range(0, len(d["prediction"])): p = d["prediction"][i] o = d["observation"][i] l = d["location"][i] if method == 'svm' and l == 6.0: continue predictionsPerStation[method][l].append(p) observationsPerStation[method][l].append(o) predictionsNormal[method].append(p) observationsNormal[method].append(o) rmseLevels = {} maeLevels = {}
from ex24.crossvalidation import splitDataForXValidation from sklearn.ensemble.forest import RandomForestRegressor from eval.rmse import rmseEval DATA_FILE = "/data/london3_hour_2016.csv" OUTPUT_FILE_TW = "/experiments/ex24/ex25_5_tw.csv" OUTPUT_FILE_TWA = "/experiments/ex24/ex25_5_twa.csv" OUTPUT_FILE_ALL = "/experiments/ex24/ex25_5_all.csv" features_TW = ['rain', 'temperature', 'windspeed', 'winddirection', 'humidity', 'pressure', 'bank_holiday', 'hour', 'month', 'day_of_week'] features_TWA = ['rain', 'temperature', 'windspeed', 'winddirection', 'humidity', 'pressure', 'bank_holiday', 'hour', 'month', 'day_of_week', 'atc'] features_ALL = ['leisure_area', 'rain', 'temperature', 'atc', 'windspeed', 'lane_length', 'building_area', 'winddirection', 'landuse_area', 'humidity', 'pressure', 'bank_holiday', 'hour', 'month', 'day_of_week', 'building_count', 'length', 'natural_area'] data = {} columns = [] loadData(DATA_FILE, ["timestamp"], data, columns) dataByStation = defaultdict(lambda: defaultdict(list)) for i in range(0, len(data["location"])): loc = data["location"][i] for c in columns: dataByStation[loc][c].append(data[c][i]) def evalTrainStationTestStation(trainStation, testStation, features): trainX, _, trainY, _ = splitDataForXValidation(set([trainStation]), set(), "location", dataByStation[trainStation], features, "target") _, testX2, _, testY2 = splitDataForXValidation(set(), set([testStation]), "location", dataByStation[testStation], features, "target") model = RandomForestRegressor(max_depth=10, n_estimators = 60, n_jobs = -1, random_state=42) model.fit(trainX, trainY) prediction = model.predict(testX2) rmse = rmseEval(testY2, prediction)[1]
#output.write("method,eval,result\n") years = [2012, 2013, 2014, 2015] #years = [2012] for year in years: # parameters dataFile1 = "/media/sf_lur/data/data_hour_" + str(year) + ".csv" dataFile2 = "/media/sf_lur/data/data2_hour_" + str(year) + ".csv" outputDir = "/media/sf_lur/model_output/eval/" # load the data, both of them data1 = {} columns1 = [] loadData(dataFile1, ["timestamp"], data1, columns1) data2 = {} columns2 = [] loadData(dataFile2, ["timestamp"], data2, columns2) random.seed(42) models = [] linearColumns = deepcopy(columns1) linearColumns.remove("landuse_area") linearColumns.remove("leisure_area") linearColumns2 = deepcopy(columns2) linearColumns2.remove("landuse_area") linearColumns2.remove("leisure_area") linearColumns3 = deepcopy(columns1)
loadOSPMData(OSPM_DATA_DIRECTORY + station + "_2013.dat", ospmData2013, station, "\t") print("DOne...") print("Load ospm 2014 data") ospmData2014 = {} for stationName in stationNames: station = stationNames[stationName] loadOSPMData(OSPM_DATA_DIRECTORY + station + "_2014.dat", ospmData2014, station, "\t") print("DOne...") print("Load data 2013...") data2013 = {} columns2013 = [] loadData(DATA_DIRECTORY + "data_hour_2013.csv", [], data2013, columns2013) print("Done...") print("Load data 2014...") data2014 = {} columns2014 = [] loadData(DATA_DIRECTORY + "data_hour_2014.csv", [], data2014, columns2014) print("Done...") for i in range(1, 53): week = 52 - i resOspm = ospm(week, timestampWeekCategory, stationNames, ospmData2013, ospmData2014, data2013, data2014) resRf = rf(week, timestampWeekCategory, stationNames, ospmData2013, ospmData2014, data2013, data2014) print(str(i) + "," + str(resOspm[1]) + "," + str(resRf[1]))
# PCA over per-station RAE errors: build a feature matrix from every loaded
# column except the target, standardise it, and set up the decomposition.
from sklearn import decomposition
from data.data import loadData
from sklearn import preprocessing

OUTPUT_DIRECTORY = "/media/sf_lur/experiments/ex5/"

data = {}
columns = []
loadData(OUTPUT_DIRECTORY + "errors_rae.csv", [], data, columns)

targetColumn = "error_rae"
# every loaded column except the target acts as a feature
trainColumns = [column for column in columns if column != targetColumn]

# row-major feature matrix: one record per sample, one value per feature
trainData = [[data[column][i] for column in trainColumns]
             for i in range(len(data[targetColumn]))]

print(str(trainColumns))
print(str(len(trainColumns)))

# standardise the features before PCA
trainDataScaled = preprocessing.scale(trainData)
pca = decomposition.PCA(n_components=20)
stationNames["5.0"] = "Holgate" stationNames["6.0"] = "Lawrence" stationNames["7.0"] = "Nunnery" stationNames["8.0"] = "Fishergate" locations = [2.0, 3.0, 4.0, 8.0] #locations = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0] models = ["tw", "twa", "all"] data = {} columns = {} for m in models: d = {} c = [] loadData(INPUT_DIRECTORY + m + ".csv", [], d, c) data[m] = d columns[m] = c errorsTimestamps = {} eData = {} for m in models: errors = {} for l in locations: errors[l] = 0 errorsTimestamps[m] = set() records = len(data[m]['target'])
from data.data import loadData from ex18_lib import doBoxplot INPUT_DIRECTORY = "/experiments/ex18/" OUTPUT_DIRECTORY = "/experiments/ex18/" dataTW = {} columnsTW = [] loadData(INPUT_DIRECTORY + "tw.csv", [], dataTW, columnsTW) dataTWA = {} columnsTWA = [] loadData(INPUT_DIRECTORY + "twa.csv", [], dataTWA, columnsTWA) dataWA = {} columnsWA = [] loadData(INPUT_DIRECTORY + "wa.csv", [], dataWA, columnsWA) names = [] data = [] names.append("e(T+W)") d = [] for i in range(0, len(dataTW["prediction"])): ae = abs(dataTW["prediction"][i] - dataTW["target"][i]) d.append(ae) data.append(d) names.append("e(T+W+A)") d = [] for i in range(0, len(dataTW["prediction"])):
for i in range(0, 50): overall1.append(0) overall2 = [] for i in range(0, 101): overall2.append(0) overall3 = [] for i in range(0, 300): overall3.append(0) overall4 = [] for i in range(-300, 300): overall4.append(0) # load data data = {} columns = [] loadData(DATA_DIRECTORY + "data_hour_2013.csv", [], data, columns) values = findOutKForValidation("location", data) for v in values: sName = stationNames[str(v)] print("location: " + str(v) + " -> " + sName) trainData, testData = splitDataForXValidation(v, "location", data) trainColumns = [] for c in trainData: if c != "target": trainColumns.append(c) model = trainRandomForest(trainData, trainColumns, "target", { 'estimators': 59, 'leaf': 9
DATA_DIRECTORY + "weather.csv", DATA_DIRECTORY + "yorktime.csv"] joinFiles( filesToJoin, DATA_DIRECTORY + "data.csv", True, "\t") print("Done...") # learn the model trainDataFile = DATAPRE_DIRECTORY + "data_hour_2015.csv" print("Load the data for training the model from " + trainDataFile + "...") trainData = {} trainColumns = [] loadData(trainDataFile, ["location", "timestamp"], trainData, trainColumns) print("Done...") print("Train the model...") model = trainRandomForest(trainData, trainColumns, "target", {'estimators': 59, 'leaf': 9}) print("Done...") # apply model on the joined data applyDataFile = DATA_DIRECTORY + "data.csv" print("Load data the prepared data from " + applyDataFile + "...") applyData = {}
ax2.yaxis.labelpad = y_axis_labelpad[data_group_code][i] ax2.tick_params(axis='y', colors=colors[i]) if limits[data_group_code][i] != None: ax2.set_ylim(limits[data_group_code][i][0], limits[data_group_code][i][1]) fig.subplots_adjust(right=right_border[data_group_code], left=0.12) plt.savefig(fileName) plt.close() # load the data data = {} columns = [] loadData("/data/york_hour_2013.csv", [], data, columns) for station in stations: stationName = stationNames[str(station)] print("Station " + stationName) for timestamp in timestamps: print("day: " + timestamp) #generate timestamps hourlyTimestamps = [] for i in range(0, 24): hour = "" if i < 10: hour = "0" hour = hour + str(i)
def doEval(landuse, topo, traffic_static, traffic_dynamic, weather, time, output):
    """Evaluate one feature-group combination with leave-one-location-out
    cross validation and write "<groupName>,<rmse>" rows to *output*.

    Each boolean flag enables one feature group; the combination is encoded
    in a group name such as "lu1to0ts1td0we1ti1".  A call with every flag
    False is a no-op.
    """
    flags = [("lu", landuse), ("to", topo), ("ts", traffic_static),
             ("td", traffic_dynamic), ("we", weather), ("ti", time)]
    if not any(enabled for _, enabled in flags):
        return

    # e.g. "lu1to0ts1td0we1ti1"
    groupName = "".join(tag + ("1" if enabled else "0") for tag, enabled in flags)
    print("Group: " + groupName)

    # the columns of every disabled feature group are skipped at load time
    columnsToSkip = ['timestamp']
    if not landuse:
        columnsToSkip.extend(['leisure_area', 'landuse_area'])
    if not topo:
        columnsToSkip.extend(['buildings_number', 'buildings_area'])
    if not traffic_static:
        columnsToSkip.extend(['lane_length', 'length'])
    if not traffic_dynamic:
        # this variant uses the single 'atc' count as the dynamic feature
        columnsToSkip.append('atc')
    if not weather:
        columnsToSkip.extend(['winddirection', 'windspeed', 'temperature',
                              'rain', 'pressure'])
    if not time:
        columnsToSkip.extend(['hour', 'day_of_week', 'month', 'bank_holiday',
                              'race_day'])

    columns = []
    data = {}
    loadData(DATA_FILE, columnsToSkip, data, columns)

    # leave-one-location-out cross validation over the monitoring stations
    for location in locations:
        print("Location: " + str(location))
        trainX, testX, trainY, testY = splitDataForXValidation1(
            location, "location", data, columns, "target")
        print("\tRFR #train: " + str(len(trainY)) + ", #test:" + str(len(testY)))
        model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59,
                                      n_jobs=-1, random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        rmse = rmseEval(testY, prediction)[1]
        print("\trmse: " + str(rmse))
        output.write(str(groupName) + "," + str(rmse) + "\n")
# Visualise a shallow decision tree as Graphviz; the code after exit() is
# kept for ad-hoc random-forest timing experiments and is never reached.
import numpy as np
from norm import NONORMALIZATION
from models.model_decisiontree import trainDecisionTree, applyDecisionTree
from models.model_randomforest import trainRandomForest, applyRandomForest
from sklearn import tree
import time
from data.data import loadData

# training data ("location"/"timestamp" are identifiers, excluded as features)
dataFile1 = "/media/sf_lur/data/data_hour.csv"
data1 = {}
columns1 = []
loadData(dataFile1, ["location", "timestamp"], data1, columns1)

# export a depth-limited tree for visual inspection
modelViz = trainDecisionTree(data1, columns1, "target", {'depth': 4})
print(str(modelViz.modelColumns))
tree.export_graphviz(modelViz.model, out_file='/media/sf_lur/data/dtr.dot',
                     feature_names=modelViz.modelColumns, max_depth=8)
exit()  # deliberate stop: only the visualisation is wanted on this run

start = time.time()
model = trainRandomForest(data1, columns1, "target",
                          {'estimators': 59, 'leaf': 9})
# NOTE(review): feature-importance reporting was prototyped here and is kept
# disabled; see model.model.feature_importances_ if it is needed again.
dayString = str(day) if month < 10: monthString = "0" + str(month) else: monthString = str(month) timestampString = year + monthString + dayString + hourString d[location][timestampString] = cLevel print("done...") # load the data data = {} columns = [] loadData(INPUT_DATA_FILE, [], data, columns) obsData = defaultdict(lambda: defaultdict(lambda: 0.0)) for i in range(0, len(data["target"])): l = data["location"][i] t = str(int(float(data["timestamp"][i]))) o = data["target"][i] obsData[l][t] = o ospmData = {} for location in stations: loadOspmData( INPUT_DIRECTORY + stationNames[location].lower() + "_2013.dat", ospmData, location)
daysInInterest.append(("Heworth", "20131012")) stationNames = {} stationNames["2.0"] = "Fulford" stationNames["3.0"] = "Gillygate" stationNames["4.0"] = "Heworth" stationNames["5.0"] = "Holgate" stationNames["6.0"] = "Lawrence" stationNames["7.0"] = "Nunnery" stationNames["8.0"] = "Fishergate" locations = [2.0, 3.0, 4.0, 8.0] data = {} columns = [] loadData(DATA_FILE, [], data, columns) observationData = {} TWpredictionData = {} TWAtcpredictionData = {} WAtcpredictionData = {} for loc in locations: observationData[str(loc)] = {} TWpredictionData[str(loc)] = {} TWAtcpredictionData[str(loc)] = {} WAtcpredictionData[str(loc)] = {} for i in range(0, len(data["target"])): location = str(data["location"][i]) timestamp = str(int(data["timestamp"][i]))
DATA_DIRECTORY = "/media/sf_lur/data/" OUTPUT_DIRECTORY = "/media/sf_lur/experiments/ex5/" stationNames = {} stationNames["2.0"] = "Fulford" stationNames["3.0"] = "Gillygate" stationNames["4.0"] = "Heworth" stationNames["5.0"] = "Holgate" stationNames["6.0"] = "Lawrence" stationNames["7.0"] = "Nunnery" stationNames["8.0"] = "Fishergate" # load data data = {} columns = [] loadData(DATA_DIRECTORY + "data_hour_2013.csv", ["timestamp"], data, columns) values = findOutKForValidation("location", data) output = open(OUTPUT_DIRECTORY + "errors_rae.csv", 'w') outputColumns = [] for v in values: sName = stationNames[str(v)] print("location: " + str(v) + " -> " + sName) trainData, testData = splitDataForXValidation(v, "location", data) trainColumns = [] for c in trainData: if c != "target": trainColumns.append(c)
from sklearn.ensemble.forest import RandomForestRegressor OUTPUT_DATA_FILE = "/experiments/ex1/ex1_rf2.csv" parametersList = [] for n in range(5, 200): for leaf in range(2, 200): parametersList.append({"n_estimators": n, "leaf": leaf}) locations = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0] # load the data data = {} columns = [] loadData("/data/york_hour_2013.csv", ["timestamp", "atc"], data, columns) all_features = deepcopy(columns) all_features.remove("target") all_features.remove("location") output = open(OUTPUT_DATA_FILE, 'w') output.write("method,value,rmse\n") def evalOne(parameters): all_obs = [] all_pred = [] for location in locations: trainX, testX, trainY, testY = splitDataForXValidation( location, "location", data, all_features, "target")
def doEval(dayNight, landuse, topo, traffic_static, traffic_dynamic, weather, time, output):
    """Evaluate one feature-group combination on day or night data
    (*dayNight*) with leave-one-location-out cross validation, writing
    "<dayNight>,<groupName>,<rmse>" rows to *output*.

    Each boolean flag enables one feature group; a call with every feature
    flag False is a no-op (the *dayNight* selector does not affect that
    guard).
    """
    flags = [("lu", landuse), ("to", topo), ("ts", traffic_static),
             ("td", traffic_dynamic), ("we", weather), ("ti", time)]
    if not any(enabled for _, enabled in flags):
        return

    # e.g. "lu1to0ts1td0we1ti1"
    groupName = "".join(tag + ("1" if enabled else "0") for tag, enabled in flags)
    print("Group: " + groupName)

    # unlike the skip-list variants, this evaluation builds an explicit
    # inclusion list of model columns
    columnsToUse = []
    if landuse:
        columnsToUse.extend(['leisure_area', 'landuse_area'])
    if topo:
        columnsToUse.extend(['buildings_number', 'buildings_area'])
    if traffic_static:
        columnsToUse.extend(['lane_length', 'length'])
    if traffic_dynamic:
        columnsToUse.extend(['traffic_length_car', 'traffic_length_lgv',
                             'traffic_length_hgv'])
    if weather:
        columnsToUse.extend(['winddirection', 'windspeed', 'temperature',
                             'rain', 'pressure'])
    if time:
        columnsToUse.extend(['hour', 'day_of_week', 'month', 'bank_holiday',
                             'race_day'])

    data = {}
    columns = []
    loadData(dataFile, ['timestamp'], data, columns)

    # leave-one-location-out cross validation restricted to day or night rows
    locationValues = findOutKForValidation("location", data)
    for location in locationValues:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, columnsToUse, "target", dayNight)
        print("\t" + str(len(trainX)) + "," + str(len(testX)))
        model = RandomForestRegressor(min_samples_leaf=9, n_estimators=59,
                                      n_jobs=1, random_state=42)
        model.fit(trainX, trainY)
        prediction = model.predict(testX)
        rmse = rmseEval(testY, prediction)
        print("\t" + str(rmse))
        output.write(str(dayNight) + ",")
        output.write(groupName + ",")
        output.write(str(rmse[1]) + "\n")
        output.flush()