def dDeriv(station1, station2, variable, startDate, endDate):
    # compute directional derivative using
    #   (unitVector . (windX2, windY2)) * (var1 - var2)
    import numpy as np
    import wUUtils as Util
    # load longitude and latitude of both stations
    lon, lat = Util.getStationLonLat([station1, station2])
    # compute unit vector from station2 to station1
    uVec = unitVector(lon, lat)
    # print("unit vector: " + str(uVec))
    # get mean wind vector at station2:
    windX2 = Util.loadDailyVariableRange(station2, startDate, endDate, \
                    'WindMeanX', castFloat=True)
    windY2 = Util.loadDailyVariableRange(station2, startDate, endDate, \
                    'WindMeanY', castFloat=True)
    # get variable at station1 and station2
    var1 = Util.loadDailyVariableRange(station1, startDate, endDate, \
                    variable, castFloat=True)
    var2 = Util.loadDailyVariableRange(station2, startDate, endDate, \
                    variable, castFloat=True)
    # construct wind vectors (2 x N array)
    windVec = np.vstack((windX2, windY2))
    # project wind vectors onto unit vector
    proj = np.dot(uVec, windVec)
    dD = (np.array(var1) - np.array(var2)) * proj
    return dD, uVec
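# A minimal usage sketch for dDeriv (not from the original source): the
# station codes and date range below are hypothetical placeholders, and the
# daily-variable database behind wUUtils is assumed to be populated.
def _demoDDeriv():
    # wind-projected difference of TempMax from 'KBUF' toward 'KTOR'
    dD, uVec = dDeriv('KTOR', 'KBUF', 'TempMax', '2013-01-01', '2013-12-31')
    print("unit vector KBUF -> KTOR: " + str(uVec))
    print("first five values: " + str(dD[:5]))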
def oneCityPredict(model_params, startDate, endDate, actual=True):
    # predict targetVar for a single station using
    # previously generated regression model
    import numpy as np
    import wUUtils as Util
    # extract city and feature data
    station = model_params['station']
    targetVar = model_params['targetVar']
    features = model_params['features']
    lag = model_params['lag']
    regr = model_params['regr']
    scale = model_params['scale']
    if scale:
        scaler = model_params['scaler']
    # build list of dates in datetime format
    date_list = Util.dateList(startDate, endDate)
    date_list = date_list[lag:]
    # if actual data available
    if actual:
        # load target variable data
        target = Util.loadDailyVariableRange(station, startDate, endDate, \
                           targetVar, castFloat=True)
        # "baseline" model predicts the target equal to its value on the
        # prediction day (persistence)
        baseline = target[:(-lag)]
        baseline = np.array(baseline)
        # shift vector by lag
        target = target[lag:]
        target = np.array(target)
    else:
        target = None
    # load feature data
    featureData = []
    for feature in features:
        # print("Adding " + feature)
        fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                     feature, castFloat=True)
        # shorten vector by lag
        fd = fd[:(-lag)]
        featureData.append(fd)
    # convert features to np arrays
    featureData = (np.array(featureData)).T
    if scale:
        featureData = scaler.transform(featureData)
    pred = regr.predict(featureData)
    if actual:
        print("R^2_mean:" + "\t" + str(regr.score(featureData, target)))
        sse = ((pred - target)**2).sum()
        ssm = ((baseline - target)**2).sum()
        print("R^2_base:" + "\t" + str(1 - sse/ssm))
        rmse = np.sqrt(((pred - target)**2).mean())
        rmse_base = np.sqrt(((baseline - target)**2).mean())
        print("RMSE:\t" + "\t" + str(rmse))
        print("RMSE_base:\t" + str(rmse_base))
        model_perf = { \
                'R2_mean': regr.score(featureData, target), \
                'R2_base': 1 - sse/ssm, \
                'RMSE': rmse}
    else:
        model_perf = None
    return date_list, pred, target, model_perf
def oneCityModel(station, startDate, endDate, \
                 features, targetVar='TempMax', lag=1, scale=False):
    # build regression model to predict targetVar for a single
    # station using training data from only the same station
    # between startDate and endDate
    # features is a list of variables to use as predictors
    import wUUtils as Util
    import numpy as np
    from sklearn import preprocessing
    from sklearn import linear_model
    # load target variable data
    target = Util.loadDailyVariableRange(station, startDate, endDate, \
                       targetVar, castFloat=True)
    # shift vector by lag
    target = target[lag:]
    # load feature data
    featureData = []
    for feature in features:
        # print("Adding " + feature)
        fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                     feature, castFloat=True)
        # shorten vector by lag
        fd = fd[:(-lag)]
        featureData.append(fd)
    # convert target and features to np arrays
    target = np.array(target)
    featureData = (np.array(featureData)).T
    # rescale features
    scaler = None
    if scale:
        scaler = preprocessing.StandardScaler().fit(featureData)
        featureData = scaler.transform(featureData)
    regr = linear_model.LinearRegression()
    regr.fit(featureData, target)
    model_params = { \
            'station': station, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'scale': scale, \
            'scaler': scaler}
    # report regression results:
    print("R^2: " + str(regr.score(featureData, target)))
    if scale:
        print("Regression coefficients (scaled, sorted):")
        print("  intercept" + ":\t" + str(regr.intercept_))
        for ii in np.argsort(-np.abs(regr.coef_)):
            print("  " + features[ii] + ":\t" + str(regr.coef_[ii]))
    else:
        print("Regression coefficients:")
        print("  intercept" + ":\t" + str(regr.intercept_))
        for ii in range(len(regr.coef_)):
            print("  " + features[ii] + ":\t" + str(regr.coef_[ii]))
    return featureData, target, model_params
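# A minimal train/predict round trip (illustration only): the station code,
# dates, and the 'PressMean' variable name are hypothetical placeholders.
def _demoOneCityModel():
    featureData, target, model_params = oneCityModel( \
            'KTOR', '2013-01-01', '2013-12-31', \
            ['TempMax', 'TempMin', 'PressMean'], \
            targetVar='TempMax', lag=1, scale=True)
    # evaluate on a held-out period using the saved model parameters
    date_list, pred, target, perf = oneCityPredict( \
            model_params, '2014-01-01', '2014-03-31', actual=True)
    return perf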
def isDaytimeMinPress(station, startDate, endDate):
    # binary variable for minimum pressure occurring during
    # daytime (between 8am and 8pm local time)
    import datetime
    import wUUtils as Util
    mTime = Util.loadDailyVariableRange(station, startDate, endDate, \
                    'PressMinTime', castFloat=False)
    timeZone = Util.loadDailyVariableRange(station, startDate, endDate, \
                       'TimeZone', castFloat=True)
    # convert minTime to datetime
    mTime = [datetime.datetime.strptime(mm, "%Y-%m-%d %H:%M:%S") for mm in mTime]
    # convert to fractional hours in local time
    hours = [(mm.hour + mm.minute/60.0 + timeZone[i]) % 24 \
             for i, mm in enumerate(mTime)]
    isDaytime = [int(hour > 8 and hour < 20) for hour in hours]
    return isDaytime
def isMorningMaxWind(station, startDate, endDate):
    # binary variable for maximum wind speed occurring before noon local time
    import datetime
    import wUUtils as Util
    mTime = Util.loadDailyVariableRange(station, startDate, endDate, \
                    'WindMaxTime', castFloat=False)
    timeZone = Util.loadDailyVariableRange(station, startDate, endDate, \
                       'TimeZone', castFloat=True)
    # convert maxTime to datetime
    mTime = [datetime.datetime.strptime(mm, "%Y-%m-%d %H:%M:%S") for mm in mTime]
    # convert to fractional hours in local time
    hours = [(mm.hour + mm.minute/60.0 + timeZone[i]) % 24 \
             for i, mm in enumerate(mTime)]
    isMorning = [int(hour < 12) for hour in hours]
    return isMorning
def contourPlotMeanVarOnMap(variable, startDate, endDate, \
                            npts=20, ncntrs=10, \
                            width_fac=16, height_fac=12):
    import numpy as np
    import wUUtils as Util
    import matplotlib.pyplot as plt
    import scipy.interpolate
    from mpl_toolkits.basemap import Basemap
    # open new figure window
    plt.figure()
    # setup Lambert Conformal basemap
    m = Basemap(width=width_fac*100000, height=height_fac*100000, \
                projection='lcc', resolution='i', \
                lat_1=45., lat_0=43.6, lon_0=-82.)
    # draw coastlines, country and state borders
    m.drawcoastlines()
    m.drawcountries()
    m.drawstates()
    # draw a boundary around the map, fill the background.
    # this background will end up being the ocean color, since
    # the continents/data will be drawn on top.
    m.drawmapboundary(fill_color='aqua')
    # load data
    stations = Util.getStationList()
    lon, lat = Util.getStationLonLat(stations)
    data = []
    for station in stations:
        vals = Util.loadDailyVariableRange(station, startDate, endDate, \
                       variable, castFloat=True)
        data.append(np.mean(vals))
    # print(zip(stations, data))
    # convert data to arrays:
    x, y, z = np.array(lon), np.array(lat), np.array(data)
    # map data points to projection coordinates
    xmap, ymap = m(x, y)
    # set up a regular grid of interpolation points
    xi, yi = np.linspace(x.min(), x.max(), npts), \
             np.linspace(y.min(), y.max(), npts)
    # map regular lon-lat grid to projection coordinates
    xi, yi = m(*np.meshgrid(xi, yi))
    # interpolate data to projected regular grid
    # function is one of 'linear', 'multiquadric', 'gaussian',
    #   'inverse', 'cubic', 'quintic', 'thin_plate'
    rbf = scipy.interpolate.Rbf(xmap, ymap, z, \
                                function='linear')
    zi = rbf(xi, yi)
    # draw filled contours
    cs = m.contourf(xi, yi, zi, ncntrs, cmap=plt.cm.jet)
    # plot circles at original (projected) data points
    m.scatter(xmap, ymap, c=z)
    # add colorbar
    cbar = m.colorbar(cs, location='bottom', pad="5%")
    cbar.set_label(variable)
    plt.title(variable + " -- Mean " + startDate + " to " + endDate)
    # display plot
    plt.show()
def plotMeanWindVectorsOnMap(startDate, endDate, showIt=True):
    import numpy as np
    import wUUtils as Util
    from mpl_toolkits.basemap import Basemap
    import matplotlib.pyplot as plt
    # setup Lambert Conformal basemap
    m = Basemap(width=3200000, height=2500000, projection='lcc',
                resolution='i', lat_1=45., lat_0=43.6, lon_0=-80.)
    # draw coastlines, country and state borders
    m.drawcoastlines()
    m.drawcountries()
    m.drawstates()
    # draw a boundary around the map, fill the background.
    # this background will end up being the ocean color, since
    # the continents will be drawn on top.
    m.drawmapboundary(fill_color='aqua')
    # fill continents, set lake color same as ocean color
    m.fillcontinents(color='wheat', lake_color='aqua')
    # get station locations
    stations = Util.getStationList()
    lon, lat = Util.getStationLonLat(stations)
    # convert to map projection coords
    # (lon, lat can be scalars, lists or numpy arrays)
    xpt, ypt = m(lon, lat)
    # plot a blue dot at each station
    m.plot(xpt, ypt, 'bo')
    # calculate mean wind at each station
    windX = []
    windY = []
    for station in stations:
        wX = Util.loadDailyVariableRange(station, startDate, endDate, \
                     'WindMeanX', castFloat=True)
        windX.append(np.mean(wX))
        wY = Util.loadDailyVariableRange(station, startDate, endDate, \
                     'WindMeanY', castFloat=True)
        windY.append(np.mean(wY))
    # draw an arrow for the time-mean wind at each station
    for istation in range(len(stations)):
        stretch = 50000
        dx, dy = stretch*windX[istation], stretch*windY[istation]
        plt.arrow(xpt[istation], ypt[istation], dx, dy, \
                  color='r', width=12000, \
                  head_length=40000, head_width=40000)
        plt.text(xpt[istation]+30000, ypt[istation]+20000, \
                 stations[istation], size='large')
    plt.title("Time-mean Wind: " + startDate + " to " + endDate)
    if showIt:
        plt.show()
def dailyTempRange(station, startDate, endDate):
    import datetime
    import wUUtils as Util
    # get daily range (positive if max occurs later than min)
    maximum = Util.loadDailyVariableRange(station, startDate, endDate, \
                      'TempMax', castFloat=True)
    minimum = Util.loadDailyVariableRange(station, startDate, endDate, \
                      'TempMin', castFloat=True)
    maxTime = Util.loadDailyVariableRange(station, startDate, endDate, \
                      'TempMaxTime', castFloat=False)
    minTime = Util.loadDailyVariableRange(station, startDate, endDate, \
                      'TempMinTime', castFloat=False)
    # +1 if maximum occurs later than minimum, else -1
    plusminus = [2*int(datetime.datetime.strptime(maxTime[i], "%Y-%m-%d %H:%M:%S") \
                       - datetime.datetime.strptime(minTime[i], "%Y-%m-%d %H:%M:%S") \
                       > datetime.timedelta(0)) - 1 \
                 for i in range(len(maxTime))]
    # calculate signed range
    vals = [float(plusminus[i]*(maximum[i]-minimum[i])) \
            for i in range(len(plusminus))]
    return vals
def advectionTaylorModel(stations, startDate, endDate, \
                         features, targetVar='TempMax', \
                         lag=1, order=0, verbose=False):
    # build regression model to predict targetVar for the first
    # station using training data from multiple stations
    # between startDate and endDate.  Uses a "Taylor expansion"
    # by combining information from several days (higher order
    # time derivatives)
    #
    # for each variable, at the target station, use its value; at
    # the other stations, use only the projection of its gradient
    # in the direction of the target station
    #
    # stations: a list of station codes; the first entry is
    #           the target station (for which the forecast is generated)
    # features: a list of variables to use as predictors
    # lag:      the number of days in the future to forecast
    # order:    the number of days in the past to include
    #           (also maximum order of time derivative)
    import numpy as np
    import wUUtils as Util
    import wUAdvection as Adv
    from sklearn import linear_model
    # load target variable data
    target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                       targetVar, castFloat=True)
    # shift vector by lag
    target = target[lag:]
    # load feature data
    featureData = []
    # add data for target station
    for feature in features:
        fd = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                     feature, castFloat=True)
        # shorten vector by lag
        fd = fd[:(-lag)]
        featureData.append(fd)
    # for other stations, add the advection of each feature in the
    # direction of the target station
    for station in stations[1:]:
        for feature in features:
            # print("Adding " + feature + " from " + station)
            fd, uVec = Adv.dDeriv(stations[0], station, \
                                  feature, startDate, endDate)
            # shorten vector by lag
            fd = fd[:(-lag)]
            featureData.append(fd)
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        ncols = len(stations)*len(features)
        for ii in range(ncols):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    target = target[-nrows:]
    # convert target and features to np arrays
    target = np.array(target)
    featureData = (np.array(featureData)).T
    regr = linear_model.LinearRegression()
    regr.fit(featureData, target)
    model_params = { \
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'order': order}
    # report regression results:
    print("R^2: " + str(regr.score(featureData, target)))
    if verbose:
        print("Regression coefficients:")
        print("  intercept" + ":\t" + str(regr.intercept_))
        column = 0
        for ideriv in range(order+1):
            print("  " + str(ideriv) + "th derivative:")
            for jj, station in enumerate(stations):
                if jj > 0:
                    print("    Station (Adv): " + station)
                else:
                    print("    Station: " + station)
                for ii, feature in enumerate(features):
                    print("      " + feature + ":\t" + str(regr.coef_[column]))
                    column += 1
    return featureData, target, model_params
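# Sketch of the advection model (illustration only; station codes and dates
# are hypothetical): the first station is the forecast target, the others
# contribute wind-projected differences via dDeriv.
def _demoAdvectionTaylor():
    stations = ['KTOR', 'KBUF', 'KDET']
    featureData, target, model_params = advectionTaylorModel( \
            stations, '2013-01-01', '2013-12-31', \
            ['TempMax', 'TempMin'], targetVar='TempMax', \
            lag=1, order=1)
    return advectionTaylorPredict(model_params, \
            '2014-01-01', '2014-03-31', actual=True)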
def multiCityInteractionPredict(model_params, startDate, endDate, actual=True):
    # predict targetVar for a single station using
    # previously generated regression model
    import numpy as np
    import wUUtils as Util
    # extract city and feature data
    stations = model_params['stations']
    targetVar = model_params['targetVar']
    features = model_params['features']
    regr = model_params['regr']
    lag = model_params['lag']
    order = model_params['order']
    scale = model_params['scale']
    prescalers = model_params['prescalers']
    if scale:
        scaler = model_params['scaler']
    # build list of dates in datetime format
    date_list = Util.dateList(startDate, endDate)
    date_list = date_list[(lag+order):]
    # if actual data available
    if actual:
        # load target variable data
        target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                           targetVar, castFloat=True)
        # "baseline" model is predicted target same as value on prediction day
        baseline = target[order:(-lag)]
        baseline = np.array(baseline)
        # shift vector by lag
        target = target[lag:]
        target = np.array(target)
    else:
        target = None
    # load feature data
    featureData = []
    idata = 0
    for station in stations:
        for feature in features:
            # check if feature contains an interaction
            if ':' in feature:
                feat1 = feature.split(':')[0]
                feat2 = feature.split(':')[1]
                fd1 = Util.loadDailyVariableRange(station, startDate, endDate, \
                              feat1, castFloat=True)
                fd2 = Util.loadDailyVariableRange(station, startDate, endDate, \
                              feat2, castFloat=True)
                # rescale factors in interaction
                prescaler1, prescaler2 = prescalers[idata]
                fd1 = prescaler1.transform(fd1)
                fd2 = prescaler2.transform(fd2)
                # compute interaction
                fd = (np.array(fd1)*np.array(fd2)).tolist()
            else:
                fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                             feature, castFloat=True)
            # shorten vector by lag
            fd = fd[:(-lag)]
            featureData.append(fd)
            # increment feature counter
            idata += 1
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        ncols = len(stations)*len(features)
        for ii in range(ncols):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    if actual:
        target = target[-nrows:]
    # convert features to np arrays
    featureData = (np.array(featureData)).T
    if scale:
        featureData = scaler.transform(featureData)
    pred = regr.predict(featureData)
    if actual:
        print("R^2_mean:" + "\t" + str(regr.score(featureData, target)))
        sse = ((pred - target)**2).sum()
        ssm = ((baseline - target)**2).sum()
        print("R^2_base:" + "\t" + str(1 - sse/ssm))
        rmse = np.sqrt(((pred - target)**2).mean())
        print("RMSE:\t" + "\t" + str(rmse))
        model_perf = { \
                'R2_mean': regr.score(featureData, target), \
                'R2_base': 1 - sse/ssm, \
                'RMSE': rmse}
    else:
        model_perf = None
    return date_list, pred, target, model_perf
def multiCityInteractionModel(stations, startDate, endDate, \
                              features, targetVar='TempMax', \
                              lag=1, order=0, verbose=False, scale=False):
    # build regression model to predict targetVar for a single
    # station using training data from multiple stations
    # between startDate and endDate.  Uses a "Taylor expansion"
    # by combining information from several days (higher order
    # time derivatives)
    #
    # stations: a list of station codes; the first entry is
    #           the station for which the forecast is generated
    # features: a list of variables to use as predictors
    #           *** if a feature string contains a ":" it is parsed as
    #               an interaction between two features ...
    #           *** features in interaction terms are pre-scaled!
    # lag:      the number of days in the future to forecast
    # order:    the number of days in the past to include
    #           (also maximum order of time derivative)
    import numpy as np
    import wUUtils as Util
    from sklearn import preprocessing
    from sklearn import linear_model
    # load target variable data
    target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                       targetVar, castFloat=True)
    # shift vector by lag
    target = target[lag:]
    # load feature data
    featureData = []
    prescalers = []
    for station in stations:
        for feature in features:
            # check if feature contains an interaction
            if ':' in feature:
                feat1 = feature.split(':')[0]
                feat2 = feature.split(':')[1]
                fd1 = Util.loadDailyVariableRange(station, startDate, endDate, \
                              feat1, castFloat=True)
                fd2 = Util.loadDailyVariableRange(station, startDate, endDate, \
                              feat2, castFloat=True)
                prescaler1 = preprocessing.StandardScaler().fit(fd1)
                fd1 = prescaler1.transform(fd1)
                prescaler2 = preprocessing.StandardScaler().fit(fd2)
                fd2 = prescaler2.transform(fd2)
                # save prescaler objects (for prediction)
                prescalers.append([prescaler1, prescaler2])
                # compute interaction
                fd = (np.array(fd1)*np.array(fd2)).tolist()
            else:
                fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                             feature, castFloat=True)
                prescalers.append(None)
            # shorten vector by lag
            fd = fd[:(-lag)]
            featureData.append(fd)
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        ncols = len(stations)*len(features)
        for ii in range(ncols):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    target = target[-nrows:]
    # convert target and features to np arrays
    target = np.array(target)
    featureData = (np.array(featureData)).T
    # rescale features
    scaler = None
    if scale:
        scaler = preprocessing.StandardScaler().fit(featureData)
        featureData = scaler.transform(featureData)
    # fit regression model
    regr = linear_model.LinearRegression()
    regr.fit(featureData, target)
    model_params = { \
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'order': order, \
            'scale': scale, \
            'scaler': scaler, \
            'prescalers': prescalers}
    # report regression results:
    print("R^2: " + str(regr.score(featureData, target)))
    if verbose:
        print("Regression coefficients:")
        print("  intercept" + ":\t" + str(regr.intercept_))
        column = 0
        for ideriv in range(order+1):
            print("  " + str(ideriv) + "th derivative:")
            for jj, station in enumerate(stations):
                print("    Station: " + station)
                for ii, feature in enumerate(features):
                    print("      " + feature + ":\t" + str(regr.coef_[column]))
                    column += 1
    return featureData, target, model_params
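# Sketch of the ':' interaction syntax (illustration only; station codes and
# dates are hypothetical): 'TempMax:WindMeanX' adds the product of the two
# pre-scaled features as a single predictor.
def _demoInteractionModel():
    featureData, target, model_params = multiCityInteractionModel( \
            ['KTOR', 'KBUF'], '2013-01-01', '2013-12-31', \
            ['TempMax', 'TempMax:WindMeanX'], \
            targetVar='TempMax', lag=1, order=0, scale=True)
    return multiCityInteractionPredict(model_params, \
            '2014-01-01', '2014-03-31', actual=True)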
def oneCityTaylorModel(station, startDate, endDate, \
                       features, targetVar='TempMax', \
                       lag=1, order=0, verbose=True, scale=False):
    # build regression model to predict targetVar for a single
    # station using training data from only the same station
    # between startDate and endDate
    # features is a list of variables to use as predictors
    # uses a "Taylor expansion" by combining information from
    # several days; order is the maximum order of derivative to use
    import numpy as np
    import wUUtils as Util
    from sklearn import preprocessing
    from sklearn import linear_model
    # load target variable data
    target = Util.loadDailyVariableRange(station, startDate, endDate, \
                       targetVar, castFloat=True)
    # shift vector by lag
    target = target[lag:]
    # load feature data
    featureData = []
    for feature in features:
        # print("Adding " + feature)
        fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                     feature, castFloat=True)
        # shorten vector by lag
        fd = fd[:(-lag)]
        featureData.append(fd)
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        for jfeat in range(len(features)):
            # print("Adding " + str(ideriv) + " derivative of " + features[jfeat])
            fd = np.diff(featureData[jfeat], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    # print("nrows ... " + str(nrows))
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    target = target[-nrows:]
    # convert target and features to np arrays
    target = np.array(target)
    featureData = (np.array(featureData)).T
    # rescale features
    scaler = None
    if scale:
        scaler = preprocessing.StandardScaler().fit(featureData)
        featureData = scaler.transform(featureData)
    regr = linear_model.LinearRegression()
    regr.fit(featureData, target)
    model_params = { \
            'station': station, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'order': order, \
            'scale': scale, \
            'scaler': scaler}
    # report regression results:
    print("R^2: " + str(regr.score(featureData, target)))
    if verbose:
        if scale:
            print("Regression coefficients (scaled, sorted):")
            print("  intercept" + ":\t" + str(regr.intercept_))
            for ii in np.argsort(-np.abs(regr.coef_)):
                # integer division recovers derivative order and feature index
                ideriv = ii // len(features)
                ifeat = ii - len(features)*ideriv
                print("  " + str(ideriv) + 'th deriv of ' \
                          + features[ifeat] + ":\t" + str(regr.coef_[ii]))
        else:
            print("Regression coefficients:")
            print("  intercept" + ":\t" + str(regr.intercept_))
            for ideriv in range(order+1):
                print("  " + str(ideriv) + "th derivative:")
                for ii, feature in enumerate(features):
                    column = len(features)*ideriv + ii
                    print("    " + feature + ":\t" + str(regr.coef_[column]))
    return featureData, target, model_params
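# Sketch of the "Taylor expansion" option (illustration only; station code
# and dates are hypothetical): order=2 appends first and second day-to-day
# differences of each feature as extra predictors.
def _demoOneCityTaylor():
    return oneCityTaylorModel( \
            'KTOR', '2013-01-01', '2013-12-31', \
            ['TempMax', 'TempMin', 'WindMeanX'], \
            targetVar='TempMax', lag=1, order=2, scale=True)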
def pcaClusterPredict(modelParams, startDate, endDate, actual=True):
    # predict targetVar for a single station using
    # previously generated regression models (one per cluster)
    import numpy as np
    import wUUtils as Util
    import wUCluster as Clust
    import wUPCA
    # extract city and feature data
    stations = modelParams['stations']
    targetVar = modelParams['targetVar']
    features = modelParams['features']
    regrs = modelParams['regrs']
    lag = modelParams['lag']
    order = modelParams['order']
    transformParams = modelParams['transformParams']
    ncomp = transformParams['ncomp']
    clusterVars = modelParams['clusterVars']
    clusterParams = modelParams['clusterParams']
    nclusters = clusterParams['nclusters']
    cols = clusterParams['cols']
    scaler = clusterParams['scaler']
    clusterer = clusterParams['clusterer']
    # build list of dates in datetime format
    date_list = Util.dateList(startDate, endDate)
    date_list = date_list[(lag+order):]
    # if actual data available
    if actual:
        # load target variable data
        target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                           targetVar, castFloat=True)
        # shift vector by lag
        target = target[lag:]
        target = np.array(target)
    else:
        target = None
    # load features data and compute PC
    pcaData = wUPCA.pcaPredict(transformParams, startDate, endDate)
    # flatten featureData into single list of lists, while shortening by lag
    featureData = [data[:(-lag)] for dataList in pcaData for data in dataList]
    # number of PC-transformed features
    nfeat = sum(ncomp)
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        for ii in range(nfeat):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    if actual:
        target = target[-nrows:]
    # assign points (rows) to clusters
    clusterData = np.array([featureData[ii] for ii in cols]).T
    classes = Clust.assignClusters(scaler, clusterer, clusterData)
    # separate data into clusters
    featureClusters = []
    dateClusters = []
    if actual:
        targetClusters = []
    for icl in range(nclusters):
        # features
        clust = [f for i, f in enumerate(zip(*featureData)) if classes[i] == icl]
        featureClusters.append([list(col) for col in zip(*clust)])
        if actual:
            # targetVar
            clust = [t for i, t in enumerate(target) if classes[i] == icl]
            targetClusters.append(clust)
        # dates
        dateClusters.append([t for i, t in enumerate(date_list) \
                                if classes[i] == icl])
    R2 = []
    RMSE = []
    preds = []
    for icl in range(nclusters):
        regr = regrs[icl]
        # convert features and target to arrays
        featureClusters[icl] = (np.array(featureClusters[icl])).T
        # make predictions
        if len(featureClusters[icl]) > 0:
            preds.append(regr.predict(featureClusters[icl]))
        else:
            preds.append([])
        if actual:
            targetClusters[icl] = np.array(targetClusters[icl])
            print('Cluster %d, %d rows:' % (icl, len(dateClusters[icl])))
            if len(featureClusters[icl]) > 0:
                r2 = regrs[icl].score(featureClusters[icl], targetClusters[icl])
                print('  R^2_mean:' + '\t' + str(r2))
                rmse = np.sqrt(((preds[icl] - targetClusters[icl])**2).mean())
                print('  RMSE:\t' + '\t' + str(rmse))
                RMSE.append(rmse)
                R2.append(r2)
            else:
                RMSE.append(None)
                R2.append(None)
    # assemble predictions into one list, restoring date order
    date_list_mixed = np.concatenate(dateClusters).tolist()
    pred_mixed = np.concatenate(preds).tolist()
    pred = [pr for (d, pr) in sorted(zip(date_list_mixed, pred_mixed))]
    if actual:
        rmse = np.sqrt(((np.array(pred) - np.array(target))**2).mean())
        print('\nOverall performance:')
        print('  RMSE:' + '\t' + str(rmse))
        modelPerf = {'RMSE': RMSE, 'R2': R2, 'RMSE_total': rmse}
    else:
        modelPerf = None
    return date_list, pred, target, featureData, classes, modelPerf
def windQuadrant(station, startDate, endDate):
    # integer variable for quadrant of maximum wind
    import wUUtils as Util
    windDir = Util.loadDailyVariableRange(station, startDate, endDate, \
                      'WindMaxDir', castFloat=True)
    # integer division maps direction in degrees to quadrant 0..3
    return [int(w) // 90 for w in windDir]
def pcaTaylorModel(stations, startDate, endDate, \
                   features, ncomp=None, targetVar='TempMax', \
                   lag=1, order=0, smooth_window=0, verbose=False):
    # build regression model to predict targetVar for the first
    # station using training data from multiple stations
    # between startDate and endDate.
    #
    # The set of values of each feature at all stations is converted
    # to a truncated list of principal components for purposes of
    # feature-reduction and reduction of multicollinearity
    #
    # Uses a "Taylor expansion" by combining information from
    # several days (higher order time derivatives)
    #
    # stations: a list of station codes; the first entry is
    #           the station for which the forecast is generated
    # features: a list of variables to use as predictors
    # ncomp:    a list of the same length as features containing the
    #           number of PCs to keep for each feature
    # lag:      the number of days in the future to forecast
    # order:    the number of days in the past to include
    #           (also maximum order of time derivative)
    import numpy as np
    import wUUtils as Util
    import wUPCA
    from sklearn import preprocessing
    from sklearn import linear_model
    # load target variable data
    target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                       targetVar, castFloat=True)
    if smooth_window > 0:
        target = Util.smooth(target, smooth_window)
    # shift vector by lag
    target = target[lag:]
    # load features data and compute PC
    pcaData, transform_params = wUPCA.pcaConvert(stations, features, \
                                        startDate, endDate, ncomp)
    # flatten featureData into single list of lists, while shortening by lag
    featureData = [data[:(-lag)] for dataList in pcaData for data in dataList]
    if smooth_window > 0:
        # rebuild the list (rebinding the loop variable would leave
        # featureData unchanged)
        featureData = [Util.smooth(data, smooth_window) \
                           for data in featureData]
    # number of PC-transformed features
    if ncomp is None:
        nfeat = len(stations)*len(features)
    else:
        nfeat = sum(ncomp)
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        for ii in range(nfeat):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    target = target[-nrows:]
    # convert target and features to np arrays
    target = np.array(target)
    featureData = (np.array(featureData)).T
    # fit regression model
    regr = linear_model.LinearRegression()
    regr.fit(featureData, target)
    model_params = { \
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'order': order, \
            'smooth_window': smooth_window, \
            'transform_params': transform_params}
    # report regression results:
    print("R^2: " + str(regr.score(featureData, target)))
    if verbose:
        print("Regression coefficients:")
        print("  intercept" + ":\t" + str(regr.intercept_))
        column = 0
        for ideriv in range(order+1):
            print("  " + str(ideriv) + "th derivative:")
            for ii, feature in enumerate(features):
                print("    " + feature)
                if ncomp is None:
                    nc = len(stations)
                else:
                    nc = ncomp[ii]
                for jj in range(nc):
                    print("      PC " + str(jj) + " :\t" + str(regr.coef_[column]))
                    column += 1
    return featureData, target, model_params
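# Sketch of the PCA variant (illustration only; station codes and dates are
# hypothetical): ncomp gives, per feature, how many principal components
# across stations to retain.
def _demoPcaTaylor():
    stations = ['KTOR', 'KBUF', 'KDET']
    featureData, target, model_params = pcaTaylorModel( \
            stations, '2013-01-01', '2013-12-31', \
            ['TempMax', 'TempMin'], ncomp=[2, 2], \
            targetVar='TempMax', lag=1, order=1)
    return pcaTaylorPredict(model_params, \
            '2014-01-01', '2014-03-31', actual=True)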
def isEasterly(station, startDate, endDate):
    # binary variable for max wind direction (1 = easterly, 0 = westerly)
    import wUUtils as Util
    windDir = Util.loadDailyVariableRange(station, startDate, endDate, \
                      'WindMaxDir', castFloat=True)
    return [int(w > 0.0 and w < 180.0) for w in windDir]
def isNotFoggy(station, startDate, endDate):
    # binary variable: 1 if not foggy (mean visibility above 5 km)
    import wUUtils as Util
    visibility = Util.loadDailyVariableRange(station, startDate, endDate, \
                         'VisibilityMean', castFloat=True)
    return [int(v > 5.0) for v in visibility]
def advectionTaylorPredict(model_params, startDate, endDate, actual=True):
    # predict targetVar for a single station using
    # previously generated regression model
    import numpy as np
    import wUUtils as Util
    import wUAdvection as Adv
    # extract city and feature data
    stations = model_params['stations']
    targetVar = model_params['targetVar']
    features = model_params['features']
    regr = model_params['regr']
    lag = model_params['lag']
    order = model_params['order']
    # build list of dates in datetime format
    date_list = Util.dateList(startDate, endDate)
    date_list = date_list[(lag+order):]
    # if actual data available
    if actual:
        # load target variable data
        target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                           targetVar, castFloat=True)
        # "baseline" model is predicted target same as value on prediction day
        baseline = target[order:(-lag)]
        baseline = np.array(baseline)
        # shift vector by lag
        target = target[lag:]
        target = np.array(target)
    else:
        target = None
    # load feature data
    featureData = []
    # add data for target station
    for feature in features:
        fd = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                     feature, castFloat=True)
        # shorten vector by lag
        fd = fd[:(-lag)]
        featureData.append(fd)
    # for other stations, add the advection of each feature in the
    # direction of the target station
    for station in stations[1:]:
        for feature in features:
            # print("Adding " + feature + " from " + station)
            fd, uVec = Adv.dDeriv(stations[0], station, \
                                  feature, startDate, endDate)
            # shorten vector by lag
            fd = fd[:(-lag)]
            featureData.append(fd)
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        ncols = len(stations)*len(features)
        for ii in range(ncols):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    if actual:
        target = target[-nrows:]
    # convert features to np arrays
    featureData = (np.array(featureData)).T
    pred = regr.predict(featureData)
    if actual:
        print("R^2_mean:" + "\t" + str(regr.score(featureData, target)))
        sse = ((pred - target)**2).sum()
        ssm = ((baseline - target)**2).sum()
        print("R^2_base:" + "\t" + str(1 - sse/ssm))
        rmse = np.sqrt(((pred - target)**2).mean())
        print("RMSE:\t" + "\t" + str(rmse))
        model_perf = { \
                'R2_mean': regr.score(featureData, target), \
                'R2_base': 1 - sse/ssm, \
                'RMSE': rmse}
    else:
        model_perf = None
    return date_list, pred, target, model_perf
def clusterRegression(stations, startDate, endDate, \
                      features, clusterFeatures=None, \
                      nclusters=1, ranseed=666, \
                      targetVar='TempMax', \
                      lag=1, order=0, scale=False, verbose=False):
    # build regression models to predict targetVar for the first
    # station using training data from multiple stations
    # between startDate and endDate.  Uses a "Taylor expansion"
    # by combining information from several days (higher order
    # time derivatives)
    #
    # stations: a list of station codes; the first entry is
    #           the station for which the forecast is generated
    # features: a list of variables to use as predictors
    #           *** if a feature string contains a ":" it is parsed as
    #               an interaction between two features ...
    #           *** features in interaction terms are pre-scaled!
    # clusterFeatures: subset of features with respect to which
    #           k-means clustering is applied before training
    #           regression models
    # nclusters: number of clusters to compute
    # lag:      the number of days in the future to forecast
    # order:    the number of days in the past to include
    #           (also maximum order of time derivative)
    import wUCluster as Clust
    import numpy as np
    import wUUtils as Util
    from sklearn import preprocessing
    from sklearn import linear_model
    # load target variable data
    target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                       targetVar, castFloat=True)
    # shift vector by lag
    target = target[lag:]
    # load feature data
    featureData = []
    prescalers = []
    for station in stations:
        for feature in features:
            # check if feature contains an interaction
            if ':' in feature:
                feat1 = feature.split(':')[0]
                feat2 = feature.split(':')[1]
                fd1 = Util.loadDailyVariableRange(station, startDate, endDate, \
                              feat1, castFloat=True)
                fd2 = Util.loadDailyVariableRange(station, startDate, endDate, \
                              feat2, castFloat=True)
                prescaler1 = preprocessing.StandardScaler().fit(fd1)
                fd1 = prescaler1.transform(fd1)
                prescaler2 = preprocessing.StandardScaler().fit(fd2)
                fd2 = prescaler2.transform(fd2)
                # save prescaler objects (for prediction)
                prescalers.append([prescaler1, prescaler2])
                # compute interaction
                fd = (np.array(fd1)*np.array(fd2)).tolist()
            else:
                fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                             feature, castFloat=True)
                prescalers.append(None)
            # shorten vector by lag
            fd = fd[:(-lag)]
            featureData.append(fd)
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        ncols = len(stations)*len(features)
        for ii in range(ncols):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    target = target[-nrows:]
    # apply k-means clustering
    if clusterFeatures is not None:
        classes, clusterParams = Clust.clusterFeatureData(featureData, stations, \
                                         features, clusterFeatures, \
                                         nclusters, ranseed)
        classes, featureClusters = Clust.assignClustersAllFeatures(featureData, \
                                           clusterParams)
        targetClusters = []
        for cl in range(nclusters):
            targetClusters.append([t for i, t in enumerate(target) \
                                      if classes[i] == cl])
    else:
        # everything is one cluster (all rows labelled 0)
        classes = [0]*len(target)
        featureClusters = [featureData]
        targetClusters = [target]
        clusterParams = {'nclusters': 1}
    # train separate regression model for each cluster
    regrs = []
    scalers = []
    for icl in range(nclusters):
        # convert features and target to arrays
        featureClusters[icl] = (np.array(featureClusters[icl])).T
        targetClusters[icl] = np.array(targetClusters[icl])
        scaler = None
        if scale:
            scaler = preprocessing.StandardScaler().fit(featureClusters[icl])
            featureClusters[icl] = scaler.transform(featureClusters[icl])
        scalers.append(scaler)
        regr = linear_model.LinearRegression()
        regr.fit(featureClusters[icl], targetClusters[icl])
        regrs.append(regr)
        print('Cluster %d, nrows %d, R^2 %f' \
                  % (icl, \
                     len(targetClusters[icl]), \
                     regr.score(featureClusters[icl], targetClusters[icl])))
        if verbose:
            print("Regression coefficients:")
            print("  intercept" + ":\t" + str(regr.intercept_))
            column = 0
            for ideriv in range(order+1):
                print("  " + str(ideriv) + "th derivative:")
                for jj, station in enumerate(stations):
                    print("    Station: " + station)
                    for ii, feature in enumerate(features):
                        print("      " + feature + ":\t" + str(regr.coef_[column]))
                        column += 1
    # save model parameters
    modelParams = { \
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regrs': regrs, \
            'clusterParams': clusterParams, \
            'classes': classes, \
            'lag': lag, \
            'order': order, \
            'scale': scale, \
            'scalers': scalers, \
            'prescalers': prescalers}
    return featureData, target, modelParams
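# Sketch of clustered regression (illustration only; station codes, dates,
# and the clusterFeatures format are hypothetical, inferred from the
# docstring above): rows are k-means clustered on 'WindMeanX' and a separate
# linear model is trained for each cluster.
def _demoClusterRegression():
    featureData, target, modelParams = clusterRegression( \
            ['KTOR', 'KBUF'], '2013-01-01', '2013-12-31', \
            ['TempMax', 'WindMeanX'], \
            clusterFeatures=['WindMeanX'], nclusters=3, \
            targetVar='TempMax', lag=1, order=0)
    return clusterRegressionPredict(modelParams, \
            '2014-01-01', '2014-03-31', actual=True)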
def clusterRegressionPredict(modelParams, startDate, endDate, actual=True):
    # predict targetVar for a single station using
    # previously generated regression models (one per cluster)
    import wUCluster as Clust
    import numpy as np
    import wUUtils as Util
    # extract city and feature data
    stations = modelParams['stations']
    targetVar = modelParams['targetVar']
    features = modelParams['features']
    regrs = modelParams['regrs']
    clusterParams = modelParams['clusterParams']
    nclusters = clusterParams['nclusters']
    lag = modelParams['lag']
    order = modelParams['order']
    scale = modelParams['scale']
    prescalers = modelParams['prescalers']
    scalers = modelParams['scalers']
    # build list of dates in datetime format
    date_list = Util.dateList(startDate, endDate)
    date_list = date_list[(lag+order):]
    # if actual data available
    if actual:
        # load target variable data
        target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                           targetVar, castFloat=True)
        # "baseline" model is predicted target same as value on prediction day
        baseline = target[order:(-lag)]
        baseline = np.array(baseline)
        # shift vector by lag
        target = target[lag:]
        target = np.array(target)
    else:
        target = None
    # load feature data
    featureData = []
    idata = 0
    for station in stations:
        for feature in features:
            # check if feature contains an interaction
            if ':' in feature:
                feat1 = feature.split(':')[0]
                feat2 = feature.split(':')[1]
                fd1 = Util.loadDailyVariableRange(station, startDate, endDate, \
                              feat1, castFloat=True)
                fd2 = Util.loadDailyVariableRange(station, startDate, endDate, \
                              feat2, castFloat=True)
                # rescale factors in interaction
                prescaler1, prescaler2 = prescalers[idata]
                fd1 = prescaler1.transform(fd1)
                fd2 = prescaler2.transform(fd2)
                # compute interaction
                fd = (np.array(fd1)*np.array(fd2)).tolist()
            else:
                fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                             feature, castFloat=True)
            # shorten vector by lag
            fd = fd[:(-lag)]
            featureData.append(fd)
            # increment feature counter
            idata += 1
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        ncols = len(stations)*len(features)
        for ii in range(ncols):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    if actual:
        target = target[-nrows:]
    # allocate features to clusters
    if clusterParams['nclusters'] > 1:
        classes, featureClusters = Clust.assignClustersAllFeatures(featureData, \
                                           clusterParams)
        dateClusters = []
        for icl in range(nclusters):
            dateClusters.append([t for i, t in enumerate(date_list) \
                                    if classes[i] == icl])
        if actual:
            targetClusters = []
            for icl in range(nclusters):
                targetClusters.append([t for i, t in enumerate(target) \
                                          if classes[i] == icl])
    else:
        # everything is one cluster (all rows labelled 0)
        classes = [0]*nrows
        featureClusters = [featureData]
        dateClusters = [date_list]
        if actual:
            targetClusters = [target]
    preds = []
    RMSE = []
    R2 = []
    for icl in range(nclusters):
        # convert features and target to arrays
        featureClusters[icl] = (np.array(featureClusters[icl])).T
        if scale:
            scaler = scalers[icl]
            featureClusters[icl] = scaler.transform(featureClusters[icl])
        regr = regrs[icl]
        preds.append(regr.predict(featureClusters[icl]))
        if actual:
            targetClusters[icl] = np.array(targetClusters[icl])
            print('Cluster %d, %d rows:' % (icl, len(dateClusters[icl])))
            r2 = regrs[icl].score(featureClusters[icl], targetClusters[icl])
            print('  R^2_mean:' + '\t' + str(r2))
            rmse = np.sqrt(((preds[icl] - targetClusters[icl])**2).mean())
            print('  RMSE:\t' + '\t' + str(rmse))
            RMSE.append(rmse)
            R2.append(r2)
    # assemble predictions into one list, restoring date order
    date_list_mixed = np.concatenate(dateClusters).tolist()
    pred_mixed = np.concatenate(preds).tolist()
    pred = [pr for (d, pr) in sorted(zip(date_list_mixed, pred_mixed))]
    if actual:
        rmse = np.sqrt(((np.array(pred) - np.array(target))**2).mean())
        print('\nOverall performance:')
        print('  RMSE:' + '\t' + str(rmse))
        modelPerf = {'RMSE': RMSE, 'R2': R2, 'RMSE_total': rmse}
    else:
        modelPerf = None
    return date_list, pred, target, featureData, classes, modelPerf
def isNortherly(station, startDate, endDate):
    # binary variable for max wind direction (1 = northerly, 0 = southerly)
    import wUUtils as Util
    windDir = Util.loadDailyVariableRange(station, startDate, endDate, \
                      'WindMaxDir', castFloat=True)
    return [int(w > 270.0 or w < 90.0) for w in windDir]
def pcaTaylorPredict(model_params, startDate, endDate, actual=True):
    # predict targetVar for a single station using
    # previously generated regression model
    import numpy as np
    import wUUtils as Util
    import wUPCA
    # extract city and feature data
    stations = model_params['stations']
    targetVar = model_params['targetVar']
    features = model_params['features']
    regr = model_params['regr']
    lag = model_params['lag']
    order = model_params['order']
    transform_params = model_params['transform_params']
    ncomp = transform_params['ncomp']
    # build list of dates in datetime format
    date_list = Util.dateList(startDate, endDate)
    date_list = date_list[(lag+order):]
    # if actual data available
    if actual:
        # load target variable data
        target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                           targetVar, castFloat=True)
        # "baseline" model is predicted target same as value on prediction day
        baseline = target[order:(-lag)]
        baseline = np.array(baseline)
        # shift vector by lag
        target = target[lag:]
        target = np.array(target)
    else:
        target = None
    # load features data and compute PC
    pcaData = wUPCA.pcaPredict(transform_params, startDate, endDate)
    # flatten featureData into single list of lists, while shortening by lag
    featureData = [data[:(-lag)] for dataList in pcaData for data in dataList]
    # number of PC-transformed features
    nfeat = sum(ncomp)
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        for ii in range(nfeat):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    if actual:
        target = target[-nrows:]
    # convert features to np arrays
    featureData = (np.array(featureData)).T
    pred = regr.predict(featureData)
    if actual:
        print("R^2_mean:" + "\t" + str(regr.score(featureData, target)))
        sse = ((pred - target)**2).sum()
        ssm = ((baseline - target)**2).sum()
        print("R^2_base:" + "\t" + str(1 - sse/ssm))
        rmse = np.sqrt(((pred - target)**2).mean())
        print("RMSE:\t" + "\t" + str(rmse))
        model_perf = { \
                'R2_mean': regr.score(featureData, target), \
                'R2_base': 1 - sse/ssm, \
                'RMSE': rmse}
    else:
        model_perf = None
    return date_list, pred, target, model_perf
def pcaClusterModel(stations, startDate, endDate, \
                    features, ncomp=None, \
                    clusterVars=[], nclusters=1, \
                    targetVar='TempMax', \
                    lag=1, order=0, ranseed=666, verbose=False):
    # build regression models to predict targetVar for the first
    # station using training data from multiple stations
    # between startDate and endDate.
    #
    # The set of values of each feature at all stations is converted
    # to a truncated list of principal components for purposes of
    # feature-reduction and reduction of multicollinearity
    #
    # Clustering is used to train multiple models for different
    # partitions of the data
    #
    # Uses a "Taylor expansion" by combining information from
    # several days (higher order time derivatives)
    #
    # stations: a list of station codes; the first entry is
    #           the station for which the forecast is generated
    # features: a list of variables to use as predictors
    # ncomp:    a list of the same length as features containing the
    #           number of PCs to keep for each feature
    # clusterVars: a list of pairs of the form ('feature', npc), where
    #           npc is the index of the PC to use for clustering
    # lag:      the number of days in the future to forecast
    # order:    the number of days in the past to include
    #           (also maximum order of time derivative)
    import numpy as np
    import wUUtils as Util
    import wUPCA
    import wUCluster as Clust
    from sklearn import preprocessing
    from sklearn import linear_model
    # load target variable data
    target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                       targetVar, castFloat=True)
    # shift vector by lag
    target = target[lag:]
    # load features data and compute PC
    pcaData, transformParams = wUPCA.pcaConvert(stations, features, \
                                       startDate, endDate, ncomp)
    # flatten featureData into single list of lists, while shortening by lag
    featureData = [data[:(-lag)] for dataList in pcaData for data in dataList]
    # number of PC-transformed features
    if ncomp is None:
        nfeat = len(stations)*len(features)
    else:
        nfeat = sum(ncomp)
    # add in "derivative" terms
    for ideriv in range(1, order+1):
        for ii in range(nfeat):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    target = target[-nrows:]
    # apply clustering:
    # locate columns to be used for clustering
    cols = []
    for clusterPair in clusterVars:
        ifeat = features.index(clusterPair[0])  # index of feature
        col = sum(ncomp[:ifeat]) + clusterPair[1]
        cols += [col]
        if clusterPair[1] >= ncomp[ifeat]:
            print('Requested cluster variable out of range')
            print(clusterPair[0] + ' ' + str(clusterPair[1]) \
                      + ' >= ' + str(ncomp[ifeat]))
            return
    print('columns for clustering: ' + str(cols))
    clusterData = np.array([featureData[ii] for ii in cols]).T
    scaler, clusterer = Clust.computeClusters(clusterData, nclusters, ranseed)
    classes = Clust.assignClusters(scaler, clusterer, clusterData)
    clusterParams = { \
            'scaler': scaler, \
            'clusterer': clusterer, \
            'nclusters': nclusters, \
            'ranseed': ranseed, \
            'cols': cols}
    # separate data into clusters
    featureClusters = []
    targetClusters = []
    for icl in range(nclusters):
        # features
        clust = [f for i, f in enumerate(zip(*featureData)) if classes[i] == icl]
        featureClusters.append([list(col) for col in zip(*clust)])
        # targetVar
        clust = [t for i, t in enumerate(target) if classes[i] == icl]
        targetClusters.append(clust)
    # train separate regression model for each cluster
    regrs = []
    for icl in range(nclusters):
        # convert features and target to arrays
        featureClusters[icl] = (np.array(featureClusters[icl])).T
        targetClusters[icl] = np.array(targetClusters[icl])
        regr = linear_model.LinearRegression()
        regr.fit(featureClusters[icl], targetClusters[icl])
        regrs.append(regr)
        print('Cluster %d, nrows %d, R^2 %f' \
                  % (icl, \
                     len(targetClusters[icl]), \
                     regr.score(featureClusters[icl], targetClusters[icl])))
        if verbose:
            print("\nCluster " + str(icl))
            print("Regression coefficients:")
            print("  intercept" + ":\t" + str(regr.intercept_))
            column = 0
            for ideriv in range(order+1):
                print("  " + str(ideriv) + "th derivative:")
                for ii, feature in enumerate(features):
                    print("    " + feature)
                    for jj in range(ncomp[ii]):
                        print("      PC " + str(jj) + " :\t" \
                                  + str(regr.coef_[column]))
                        column += 1
    modelParams = { \
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'clusterVars': clusterVars, \
            'clusterParams': clusterParams, \
            'classes': classes, \
            'regrs': regrs, \
            'lag': lag, \
            'order': order, \
            'transformParams': transformParams}
    return featureData, target, modelParams
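# Sketch of PCA plus clustering (illustration only; station codes and dates
# are hypothetical): each clusterVars pair names a feature and the index of
# the principal component used for k-means clustering.
def _demoPcaCluster():
    stations = ['KTOR', 'KBUF', 'KDET']
    featureData, target, modelParams = pcaClusterModel( \
            stations, '2013-01-01', '2013-12-31', \
            ['TempMax', 'TempMin'], ncomp=[2, 2], \
            clusterVars=[('TempMin', 0)], nclusters=2, \
            targetVar='TempMax', lag=1, order=0)
    return pcaClusterPredict(modelParams, \
            '2014-01-01', '2014-03-31', actual=True)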