Example 1: dDeriv
def dDeriv(station1, station2, variable, startDate, endDate):
     # compute directional derivative using
     # (unitVector . wind2) * (var1 - var2)
     import numpy as np
     import wUUtils as Util
     # load longitude and latitude of both stations
     lon, lat = Util.getStationLonLat([station1, station2])
     # compute unit vector from station2 to station1
     uVec = unitVector(lon, lat)
     # print("unit vector: " + str(uVec))
     # get mean wind vector at station2:
     windX2 = Util.loadDailyVariableRange(station2, startDate, endDate, \
                        'WindMeanX', castFloat=True)
     windY2 = Util.loadDailyVariableRange(station2, startDate, endDate, \
                        'WindMeanY', castFloat=True)
     # get variable at station1 and station2
     var1 = Util.loadDailyVariableRange(station1, startDate, endDate, \
                        variable, castFloat=True)
     var2 = Util.loadDailyVariableRange(station2, startDate, endDate, \
                        variable, castFloat=True)
     # construct wind vectors (2 x N array)
     windVec = np.vstack((windX2, windY2))
     # project wind vector onto unit vector
     proj = np.dot(uVec, windVec)
     dD = (np.array(var1) - np.array(var2)) * proj
     return dD, uVec
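To see the projection step in isolation, here is a minimal self-contained sketch; the unit vector, wind components, and station values below are made up for illustration (in dDeriv the unit vector comes from unitVector and the data from wUUtils):

import numpy as np

# hypothetical unit vector pointing from station2 toward station1
uVec = np.array([1.0, 1.0]) / np.sqrt(2.0)
# hypothetical daily mean wind components at station2 (three days)
windX2 = [2.0, -1.0, 0.5]
windY2 = [1.0,  3.0, 0.0]
# hypothetical values of the advected variable at the two stations
var1 = [10.0, 12.0, 11.0]
var2 = [ 8.0, 13.0, 11.0]

# stack wind components into a 2 x N array and project onto uVec
windVec = np.vstack((windX2, windY2))
proj = np.dot(uVec, windVec)               # N projections, one per day
# directional-derivative feature: projection times station difference
dD = (np.array(var1) - np.array(var2)) * proj
print(dD)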
Example 2: oneCityPredict
def oneCityPredict(model_params, startDate, endDate, actual=True):
     # predict targetVar for a single station using 
     # previously generated regression model
     import numpy as np
     import wUUtils as Util
     # extract city and feature data
     station = model_params['station']
     targetVar = model_params['targetVar']
     features = model_params['features']
     lag = model_params['lag']
     regr = model_params['regr']
     scale = model_params['scale']
     if scale:
          scaler = model_params['scaler']
     # build list of dates in datetime format
     date_list = Util.dateList(startDate, endDate)
     date_list = date_list[lag:]
     # if actual data available
     if actual:
          # load target variable data
          target = Util.loadDailyVariableRange(station, startDate, endDate, \
                             targetVar, castFloat=True)

          # "baseline" model is predicted target same as value on prediction day
          baseline = target[:(-lag)]
          baseline = np.array(baseline)
          
          # shift vector by lag
          target = target[lag:]
          target = np.array(target)
     else:
          target = None
     # load feature data
     featureData = []
     for feature in features:
          # print("Adding " + feature)
          fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                        feature, castFloat=True)
          # shorten vector by lag
          fd = fd[:(-lag)]
          featureData.append(fd)
     # convert features to np arrays
     featureData = (np.array(featureData)).T
     if scale:
          featureData = scaler.transform(featureData)
     pred = regr.predict(featureData)
     if actual:
          print("R^2_mean:" + "\t" + str(regr.score(featureData,target)))
          sse = ((pred-target)**2).sum()
          ssm = ((baseline-target)**2).sum()
          print("R^2_base:" + "\t" + str(1 - sse/ssm))
          rmse = np.sqrt(((pred - target)**2).mean())
          rmse_base = np.sqrt(((baseline - target)**2).mean())
          print("RMSE:\t" + "\t" + str(rmse))
          print("RMSE_base:\t" + str(rmse_base))
          model_perf = {
               'R2_mean': regr.score(featureData,target), \
               'R2_base': 1 - sse/ssm, \
               'RMSE': rmse}
     else:
          # no performance metrics without actual data
          model_perf = None
     return date_list, pred, target, model_perf
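The R^2_base figure printed above measures skill against a persistence baseline (predicting that tomorrow's value equals today's). A standalone sketch of that metric with made-up arrays:

import numpy as np

target   = np.array([10.0, 12.0,  9.0, 11.0])   # hypothetical actuals
pred     = np.array([10.5, 11.0,  9.5, 10.5])   # hypothetical model output
baseline = np.array([11.0, 10.0, 12.0,  9.0])   # previous day's values

sse = ((pred - target)**2).sum()       # model sum of squared errors
ssm = ((baseline - target)**2).sum()   # baseline sum of squared errors
# positive when the model beats persistence
print("R^2_base: " + str(1 - sse/ssm))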
Example 3: oneCityModel
def oneCityModel(station, startDate, endDate, \
                 features, targetVar='TempMax', lag=1, scale=False):
     # build regression model to predict "variable" for a single
     # station using training data from only the same station 
     # between startdate and enddate
     # features is a list of variables to use as predictors
     import wUUtils as Util
     import numpy as np
     from sklearn import preprocessing
     from sklearn import linear_model
     # load target variable data
     target = Util.loadDailyVariableRange(station, startDate, endDate, \
                        targetVar, castFloat=True)
     # shift vector by lag
     target = target[lag:]
     # load feature data
     featureData = []
     for feature in features:
          # print("Adding " + feature)
          fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                        feature, castFloat=True)
          # shorten vector by lag
          fd = fd[:(-lag)]
          featureData.append(fd)
     # convert target and features to np arrays
     target = np.array(target)
     featureData = (np.array(featureData)).T
     # rescale features
     scaler = None
     if scale:
          scaler = preprocessing.StandardScaler().fit(featureData)
          featureData = scaler.transform(featureData)          
     regr = linear_model.LinearRegression()
     regr.fit(featureData, target)
     model_params = {
            'station': station, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'scale': scale, \
            'scaler': scaler}
     # report regression results:
     print("R^2: " + str(regr.score(featureData,target)))
     if scale:
          print("Regression coefficients (scaled, sorted):")
          print("  intercept" + ":\t" + str(regr.intercept_))
          for ii in np.argsort(-np.abs(regr.coef_)):
               print("  " + features[ii] + ":\t" + str(regr.coef_[ii]))         
     else:
          print("Regression coefficients:")
          print("  intercept" + ":\t" + str(regr.intercept_))
          for ii in range(len(regr.coef_)):
               print("  " + features[ii] + ":\t" + str(regr.coef_[ii]))         
     return featureData, target, model_params
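A hypothetical end-to-end use of oneCityModel together with oneCityPredict from Example 2; the station code, feature names, and date ranges are placeholders, and both calls assume the wUUtils data store is populated:

# train on one period, predict on a later one (all names are placeholders)
featureData, target, model_params = oneCityModel(
     'KNYC', '2013-01-01', '2014-12-31',
     features=['TempMax', 'TempMin', 'PressMean'],
     targetVar='TempMax', lag=1, scale=True)

date_list, pred, target, model_perf = oneCityPredict(
     model_params, '2015-01-01', '2015-06-30', actual=True)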
Example 4: isDaytimeMinPress
def isDaytimeMinPress(station, startDate, endDate):
     # binary variable for minimum pressure occurring during daytime (8:00-20:00 local time)
     import datetime
     import wUUtils as Util
     mTime = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'PressMinTime', castFloat=False)
     timeZone = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'TimeZone', castFloat=True)
     # convert minTime to datetime
     mTime = [datetime.datetime.strptime(mm,"%Y-%m-%d %H:%M:%S") for mm in mTime]
     hours = [(mm.hour + mm.minute/60.0 + timeZone[i]) % 24 \
                                           for i,mm in enumerate(mTime)]
     isDaytime = [int(hour > 8 and hour < 20) for hour in hours]
     return(isDaytime)
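The local-time arithmetic above, isolated with made-up values (a UTC timestamp plus a time-zone offset in hours, wrapped modulo 24):

import datetime

stamp = "2015-03-10 03:45:00"    # hypothetical UTC timestamp
offset = -5.0                    # hypothetical TimeZone value (hours)
mm = datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
hour = (mm.hour + mm.minute/60.0 + offset) % 24
print(hour)                      # 22.75, i.e. outside the 8-20 daytime window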
Example 5: isMorningMaxWind
def isMorningMaxWind(station, startDate, endDate):
     # binary variable for maximum wind speed occurring before noon local time
     import datetime
     import wUUtils as Util
     mTime = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'WindMaxTime', castFloat=False)
     timeZone = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'TimeZone', castFloat=True)
     # convert maxTime to datetime
     mTime = [datetime.datetime.strptime(mm,"%Y-%m-%d %H:%M:%S") for mm in mTime]
     hours = [(mm.hour + mm.minute/60.0 + timeZone[i]) % 24 \
                                           for i,mm in enumerate(mTime)]
     isMorning = [int(hour < 12) for hour in hours]
     return(isMorning)
Example 6: contourPlotMeanVarOnMap
def contourPlotMeanVarOnMap(variable, startDate, endDate, \
                            npts = 20, ncntrs = 10, \
                            width_fac = 16, height_fac = 12):
     import numpy as np
     import wUUtils as Util
     import matplotlib.pyplot as plt
     import scipy.interpolate
     from mpl_toolkits.basemap import Basemap
     # open new figure window
     plt.figure()
     # setup Lambert Conformal basemap.
     m = Basemap(width=width_fac*100000,height=height_fac*100000, \
                 projection='lcc', resolution='i', \
                 lat_1=45.,lat_0=43.6,lon_0=-82.)
     # draw coastlines.
     m.drawcoastlines()
     m.drawcountries()
     m.drawstates()
     # draw a boundary around the map, fill the background.
     # this background will end up being the ocean color, since
     # the continents/data will be drawn on top.
     m.drawmapboundary(fill_color='aqua')
     # load data
     stations = Util.getStationList()
     lon, lat = Util.getStationLonLat(stations)
     data = []
     for station in stations:
          vals = Util.loadDailyVariableRange(station, startDate, endDate, \
                                        variable, castFloat=True)
          data.append(np.mean(vals))
     # print(zip(stations,data))
     # convert data to arrays:
     x, y, z = np.array(lon), np.array(lat), np.array(data)
     # map data points to projection coordinates
     xmap, ymap = m(x,y)
     # Set up a regular grid of interpolation points
     xi, yi = np.linspace(x.min(), x.max(), npts), \
              np.linspace(y.min(), y.max(), npts)
     # map regular lon-lat grid to projection coordinates
     xi, yi = m(*np.meshgrid(xi,yi))
     # Interpolate data to projected regular grid 
     # function is one of 'linear', 'multiquadric', 'gaussian',
     #                    'inverse', 'cubic', 'quintic', 'thin_plate'
     rbf = scipy.interpolate.Rbf(xmap, ymap, z, \
                                 function='linear')
     zi = rbf(xi, yi)
     # draw filled contours
     cs = m.contourf(xi,yi,zi,ncntrs,cmap=plt.cm.jet)
     # plot circles at original (projected) data points
     m.scatter(xmap,ymap,c=z)  
     # add colorbar.
     cbar = m.colorbar(cs,location='bottom',pad="5%")
     cbar.set_label(variable)
     plt.title(variable + " -- Mean " + startDate + " to " + endDate)
     # display plot
     plt.show()
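The interpolation step used above, in isolation: fit a radial basis function to scattered station values and evaluate it on a regular grid (coordinates and values below are made up):

import numpy as np
import scipy.interpolate

# hypothetical projected station coordinates and mean values
xmap = np.array([0.0, 1.0, 2.0, 0.5])
ymap = np.array([0.0, 1.5, 0.5, 2.0])
z    = np.array([5.0, 7.0, 6.0, 8.0])

# regular grid covering the stations' bounding box
xi, yi = np.meshgrid(np.linspace(0.0, 2.0, 20), np.linspace(0.0, 2.0, 20))

rbf = scipy.interpolate.Rbf(xmap, ymap, z, function='linear')
zi = rbf(xi, yi)
print(zi.shape)    # (20, 20) grid of interpolated values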
Example 7: plotMeanWindVectorsOnMap
def plotMeanWindVectorsOnMap(startDate, endDate, showIt=True):
     import numpy as np
     import wUUtils as Util
     from mpl_toolkits.basemap import Basemap
     import matplotlib.pyplot as plt
     # setup Lambert Conformal basemap.
     m = Basemap(width=3200000,height=2500000,projection='lcc',
            resolution='i',lat_1=45.,lat_0=43.6,lon_0=-80.)
     # draw coastlines.
     m.drawcoastlines()
     m.drawcountries()
     m.drawstates()
     # draw a boundary around the map, fill the background.
     # this background will end up being the ocean color, since
     # the continents will be drawn on top.
     m.drawmapboundary(fill_color='aqua')
     # fill continents, set lake color same as ocean color.
     m.fillcontinents(color='wheat',lake_color='aqua')
     # get station locations (Toronto, Montreal, Detroit)
     stations = Util.getStationList()
     lon, lat = Util.getStationLonLat(stations)
     # convert to map projection coords.
     # Note that lon,lat can be scalars, lists or numpy arrays.
     xpt,ypt = m(lon,lat)
     m.plot(xpt,ypt,'bo')  # plot a blue dot there
     # calculate mean wind at each station
     windX = []
     windY = []
     for station in stations:
          wX = Util.loadDailyVariableRange(station, startDate, endDate, \
                                        'WindMeanX', castFloat=True)
          windX.append(np.mean(wX))
          wY = Util.loadDailyVariableRange(station, startDate, endDate, \
                                        'WindMeanY', castFloat=True)
          windY.append(np.mean(wY))
     for istation in range(len(stations)):
          stretch = 50000
          dx, dy = stretch*windX[istation], stretch*windY[istation]
          plt.arrow(xpt[istation],ypt[istation],dx,dy,color='r',width=12000,head_length=40000,head_width=40000)
          plt.text(xpt[istation]+30000,ypt[istation]+20000,stations[istation], size='large')
     plt.title("Time-mean Wind: " + startDate + " to " + endDate)
     if showIt:
          plt.show()
Example 8: dailyTempRange
def dailyTempRange(station, startDate, endDate):
     import datetime
     import wUUtils as Util
     # get daily range (positive if max occurs later than min)
     maximum = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'TempMax', castFloat=True)
     minimum = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'TempMin', castFloat=True)
     maxTime = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'TempMaxTime', castFloat=False)
     minTime = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'TempMinTime', castFloat=False)
     # positive if maximum occurs later than minimum
     plusminus = [2*int(datetime.datetime.strptime(maxTime[i],"%Y-%m-%d %H:%M:%S")
                      - datetime.datetime.strptime(minTime[i],"%Y-%m-%d %H:%M:%S") 
                          > datetime.timedelta(0)) - 1 \
                                    for i in range(len(maxTime))]
     # calculate return value                                        
     vals = [float(plusminus[i]*(maximum[i]-minimum[i])) \
                                    for i in range(len(plusminus))]
     return vals
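The 2*int(condition) - 1 idiom above maps a boolean onto a plus/minus sign; in isolation, with made-up timestamps:

import datetime

maxTime = datetime.datetime.strptime("2015-01-02 15:00:00", "%Y-%m-%d %H:%M:%S")
minTime = datetime.datetime.strptime("2015-01-02 06:00:00", "%Y-%m-%d %H:%M:%S")
# +1 if the maximum occurs after the minimum, otherwise -1
plusminus = 2*int(maxTime - minTime > datetime.timedelta(0)) - 1
print(plusminus)   # 1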
Example 9: advectionTaylorModel
def advectionTaylorModel(stations, startDate, endDate, \
                     features, targetVar='TempMax', \
                     lag=1, order=0, verbose=False):
     # build regression model to predict "variable" for a single
     # station using training data from multiple stations 
     # between startdate and enddate.  Uses a "Taylor expansion" 
     # by combining information from several days (higher order
     # time derivatives)
     #
     # for each variable, at target station, use value, and
     # at other stations, only the projection of its gradient 
     # in the direction of the target station
     # 
     # stations: a list of station codes, the first entry is
     #           the target station (for which forecast is generated)
     # features: a list of variables to use as predictors
     #      lag: the number of days in the future to forecast
     #    order: the number of days in the past to include
     #           (also maximum order of time derivative)
     import numpy as np
     import wUUtils as Util
     import wUAdvection as Adv
     reload(Adv)
     from sklearn import linear_model
     # load target variable data
     target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                        targetVar, castFloat=True)
     # shift vector by lag
     target = target[lag:]
     # load feature data
     featureData = []
     # add data for target station
     for feature in features:
          fd = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                             feature, castFloat=True)
          # shorten vector by lag
          fd = fd[:(-lag)]
          featureData.append(fd)
     # for other stations, add the advection of each feature in the
     # direction of the target station
     for station in stations[1:]:
          for feature in features:
               # print("Adding " + feature + " from " + station)
               fd, uVec = Adv.dDeriv(stations[0], station, \
                                     feature, startDate, endDate)
               # shorten vector by lag
               fd = fd[:(-lag)]
               featureData.append(fd)
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          ncols = len(stations)*len(features)
          for ii in range(ncols):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     target = target[-nrows:]
     # convert target and features to np arrays
     target = np.array(target)
     featureData = (np.array(featureData)).T
     regr = linear_model.LinearRegression()
     regr.fit(featureData, target)
     model_params = {
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'order': order}
     # report regression results:
     print("R^2: " + str(regr.score(featureData,target)))
     if verbose:
          print("Regression coefficients:")
          print("  intercept" + ":\t" + str(regr.intercept_))
          column = 0
          for ideriv in range(order+1):
               print("  " + str(ideriv) + "th derivative:")
               for jj, station in enumerate(stations):
                    if jj > 0:
                         print("    Station (Adv): " + station)
                    else:
                         print("    Station: " + station)
                    for ii, feature in enumerate(features):
                         print("       " + feature + ":\t" + str(regr.coef_[column]))
                         column += 1
     return featureData, target, model_params
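The "Taylor expansion" construction above, reduced to essentials: each extra order appends np.diff of a base column as a finite-difference time derivative, and all columns are then trimmed to the length of the shortest (highest-order) one. A sketch with made-up data, one feature, order 2:

import numpy as np

featureData = [[1.0, 2.0, 4.0, 7.0, 11.0]]     # one hypothetical feature column
order = 2
for ideriv in range(1, order+1):
     featureData.append(np.diff(featureData[0], n=ideriv))

# trim every column (and, in the model, the target) to the shortest length
nrows = len(featureData[-1])
featureData = [np.asarray(col)[-nrows:] for col in featureData]
print(np.array(featureData).T)   # aligned rows: value, 1st diff, 2nd diff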
Example 10: multiCityInteractionPredict
def multiCityInteractionPredict(model_params, startDate, endDate, actual=True):
     # predict targetVar for a single station using 
     # previously generated regression model
     import numpy as np
     import wUUtils as Util
     # extract city and feature data
     stations = model_params['stations']
     targetVar = model_params['targetVar']
     features = model_params['features']
     regr = model_params['regr']
     lag = model_params['lag']
     order = model_params['order']
     scale = model_params['scale']
     prescalers = model_params['prescalers']
     if scale:
          scaler = model_params['scaler']
     # build list of dates in datetime format
     date_list = Util.dateList(startDate, endDate)
     date_list = date_list[(lag+order):]
     # if actual data available
     if actual:
          # load target variable data
          target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                             targetVar, castFloat=True)
          # "baseline" model is predicted target same as value on prediction day
          baseline = target[order:(-lag)]
          baseline = np.array(baseline)
          # shift vector by lag
          target = target[lag:]
          target = np.array(target)
     else:
          target = None
     # load feature data
     featureData = []
     idata = 0
     for station in stations:
          for feature in features:
               # check if feature contains an interaction
               if ':' in feature:
                    feat1 = feature.split(':')[0]
                    feat2 = feature.split(':')[1]
                    fd1 = Util.loadDailyVariableRange(station, startDate, endDate, \
                                    feat1, castFloat=True)
                    fd2 = Util.loadDailyVariableRange(station, startDate, endDate, \
                                    feat2, castFloat=True)
                    # rescale factors in interaction
                    prescaler1, prescaler2 = prescalers[idata]
                    fd1 = prescaler1.transform(fd1)
                    fd2 = prescaler2.transform(fd2)
                    # compute interaction
                    fd = (np.array(fd1)*np.array(fd2)).tolist()
               else:
                    fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                                  feature, castFloat=True)
               # shorten vector by lag
               fd = fd[:(-lag)]
               featureData.append(fd)
               # increment feature counter
               idata += 1
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          ncols = len(stations)*len(features)
          for ii in range(ncols):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     if actual:
          target = target[-nrows:]
     # convert features to np arrays
     featureData = (np.array(featureData)).T
     if scale:
          featureData = scaler.transform(featureData)
     pred = regr.predict(featureData)
     if actual:
          print("R^2_mean:" + "\t" + str(regr.score(featureData,target)))
          sse = ((pred-target)**2).sum()
          ssm = ((baseline-target)**2).sum()
          print("R^2_base:" + "\t" + str(1 - sse/ssm))
          rmse = np.sqrt(((pred - target)**2).mean())
          print("RMSE:\t" + "\t" + str(rmse))
          model_perf = {
               'R2_mean': regr.score(featureData,target), \
               'R2_base': 1 - sse/ssm, \
               'RMSE': rmse}
     else:
          model_perf = None
     return date_list, pred, target, model_perf
Example 11: multiCityInteractionModel
def multiCityInteractionModel(stations, startDate, endDate, \
                     features, targetVar='TempMax', \
                     lag=1, order=0, verbose=False, scale=False):
     # build regression model to predict "variable" for a single
     # station using training data from multiple stations 
     # between startdate and enddate.  Uses a "Taylor expansion" 
     # by combining information from several days (higher order
     # time derivatives)
     #
     # stations: a list of station codes, the first entry is
     #             the station for which forecast is generated
     # features: a list of variables to use as predictors
     #         *** if a feature string contains a ":" it is parsed as
     #             an interaction between two features ...
     #         *** features in interaction terms pre-scaled! 
     #      lag: the number of days in the future to forecast
     #    order: the number of days in the past to include
     #           (also maximum order of time derivative)
     import numpy as np
     import wUUtils as Util
     from sklearn import preprocessing
     from sklearn import linear_model
     # load target variable data
     target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                        targetVar, castFloat=True)
     # shift vector by lag
     target = target[lag:]
     # load feature data
     featureData = []
     prescalers = []
     for station in stations:
          for feature in features:
               # check if feature contains an interaction
               if ':' in feature:
                    feat1 = feature.split(':')[0]
                    feat2 = feature.split(':')[1]
                    fd1 = Util.loadDailyVariableRange(station, startDate, endDate, \
                                    feat1, castFloat=True)
                    fd2 = Util.loadDailyVariableRange(station, startDate, endDate, \
                                    feat2, castFloat=True)
                    prescaler1 = preprocessing.StandardScaler().fit(fd1)
                    fd1 = prescaler1.transform(fd1)
                    prescaler2 = preprocessing.StandardScaler().fit(fd2)
                    fd2 = prescaler2.transform(fd2)
                    # save prescaler objects (for prediction)
                    prescalers.append([prescaler1,prescaler2])
                    # compute interaction
                    fd = (np.array(fd1)*np.array(fd2)).tolist()
               else:
                    fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                                  feature, castFloat=True)
                    prescalers.append(None)
               # shorten vector by lag
               fd = fd[:(-lag)]
               featureData.append(fd)
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          ncols = len(stations)*len(features)
          for ii in range(ncols):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     target = target[-nrows:]
     # convert target and features to np arrays
     target = np.array(target)
     featureData = (np.array(featureData)).T
     # rescale features
     scaler = None
     if scale:
          scaler = preprocessing.StandardScaler().fit(featureData)
          featureData = scaler.transform(featureData)          
     # fit regression model
     regr = linear_model.LinearRegression()
     regr.fit(featureData, target)
     model_params = {
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'order': order, \
            'scale': scale, \
            'scaler': scaler, \
            'prescalers': prescalers}
     # report regression results:
     print("R^2: " + str(regr.score(featureData,target)))
     if verbose:
          print("Regression coefficients:")
          print("  intercept" + ":\t" + str(regr.intercept_))
          column = 0
          for ideriv in range(order+1):
               print("  " + str(ideriv) + "th derivative:")
               for jj, station in enumerate(stations):
                    print("    Station: " + station)
                    for ii, feature in enumerate(features):
                         print("       " + feature + ":\t" + str(regr.coef_[column]))
                         column += 1
     return featureData, target, model_params
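A standalone sketch of the interaction pre-scaling above: standardize each factor, then multiply elementwise. Note that recent scikit-learn releases require 2-D input to StandardScaler (hence the reshape below); the code above relies on an older API that accepted 1-D lists:

import numpy as np
from sklearn import preprocessing

fd1 = np.array([10.0, 12.0, 11.0, 15.0]).reshape(-1, 1)   # made-up factor 1
fd2 = np.array([ 3.0,  1.0,  2.0,  4.0]).reshape(-1, 1)   # made-up factor 2

prescaler1 = preprocessing.StandardScaler().fit(fd1)
prescaler2 = preprocessing.StandardScaler().fit(fd2)

# interaction feature: product of the standardized factors
fd = (prescaler1.transform(fd1) * prescaler2.transform(fd2)).ravel()
print(fd)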
Example 12: oneCityTaylorModel
def oneCityTaylorModel(station, startDate, endDate, \
                       features, targetVar='TempMax', \
                       lag=1, order=0, verbose=True, scale=False):
     # build regression model to predict "variable" for a single
     # station using training data from only the same station 
     # between startdate and enddate
     # features is a list of variables to use as predictors
     # use a "Taylor expansion" by combining information from
     # order is the maximum order of derivative to use
     import numpy as np
     import wUUtils as Util
     from sklearn import preprocessing
     from sklearn import linear_model
     # load target variable data
     target = Util.loadDailyVariableRange(station, startDate, endDate, \
                        targetVar, castFloat=True)
     # shift vector by lag
     target = target[lag:]
     # load feature data
     featureData = []
     for feature in features:
          # print("Adding " + feature)
          fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                        feature, castFloat=True)
          # shorten vector by lag
          fd = fd[:(-lag)]
          featureData.append(fd)
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          for jfeat in range(len(features)):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[jfeat],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     # print("nrows ... " + str(nrows))
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     target = target[-nrows:]
     # convert target and features to np arrays
     target = np.array(target)
     featureData = (np.array(featureData)).T
     # rescale features
     scaler = None
     if scale:
          scaler = preprocessing.StandardScaler().fit(featureData)
          featureData = scaler.transform(featureData)          
     regr = linear_model.LinearRegression()
     regr.fit(featureData, target)
     model_params = {
            'station': station, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'order': order, \
            'scale': scale, \
            'scaler': scaler}
     # report regression results:
     print("R^2: " + str(regr.score(featureData,target)))
     if verbose:
          if scale:
               print("Regression coefficients (scaled, sorted):")
               print("  intercept" + ":\t" + str(regr.intercept_))
               for ii in np.argsort(-np.abs(regr.coef_)):
                    # floor division recovers the derivative order from the column index
                    ideriv = ii // len(features)
                    ifeat = ii - len(features)*ideriv
                    print("  " + str(ideriv) + 'th deriv of ' \
                            + features[ifeat] + ":\t" + str(regr.coef_[ii]))         
          else:
               print("Regression coefficients:")
               print("  intercept" + ":\t" + str(regr.intercept_))
               for ideriv in range(order+1):
                    print("  " + str(ideriv) + "th derivative:")
                    for ii, feature in enumerate(features):
                         column = len(features)*ideriv + ii
                         print("    " + feature + ":\t" + str(regr.coef_[column]))
     return featureData, target, model_params
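Hypothetical usage, adding first- and second-order finite-difference terms for each predictor (the station code, feature names, and dates are placeholders):

featureData, target, model_params = oneCityTaylorModel(
     'KDTW', '2013-01-01', '2014-12-31',
     features=['TempMax', 'PressMean', 'WindMeanX'],
     targetVar='TempMax', lag=1, order=2, scale=True)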
Example 13: pcaClusterPredict
def pcaClusterPredict(modelParams, startDate, endDate, actual=True):
     # predict targetVar for a single station using 
     # previously generated regression model
     import numpy as np
     import wUUtils as Util
     import wUCluster as Clust
     import wUPCA
     reload(wUPCA)
     # extract city and feature data
     stations = modelParams['stations']
     targetVar = modelParams['targetVar']
     features = modelParams['features']
     regrs = modelParams['regrs']
     lag = modelParams['lag']
     order = modelParams['order']
     transformParams = modelParams['transformParams']
     ncomp = transformParams['ncomp']
     clusterVars = modelParams['clusterVars']
     clusterParams = modelParams['clusterParams']
     nclusters = clusterParams['nclusters']
     cols = clusterParams['cols']
     scaler = clusterParams['scaler']
     clusterer = clusterParams['clusterer']

     # build list of dates in datetime format
     date_list = Util.dateList(startDate, endDate)
     date_list = date_list[(lag+order):]
     # if actual data available
     if actual:
          # load target variable data
          target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                             targetVar, castFloat=True)
          # shift vector by lag
          target = target[lag:]
          target = np.array(target)
     else:
          target = None

     # load features data and compute PC
     pcaData = wUPCA.pcaPredict(transformParams, startDate, endDate)
     # flatten featureData into single list of lists, while shortening by lag
     featureData = [data[:(-lag)] for dataList in pcaData for data in dataList] 
     # number of PC-transformed features
     nfeat = sum(ncomp) 
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          for ii in range(nfeat):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     if actual:
          target = target[-nrows:]

     # assign points (rows) to clusters
     clusterData = np.array([featureData[ii] for ii in cols]).T
     classes = Clust.assignClusters(scaler, clusterer, clusterData)

     # separate data into clusters
     featureClusters = []
     dateClusters = []
     if actual:
          targetClusters = []
     for icl in range(nclusters):
         # features
         clust = [f for i,f in enumerate(zip(*featureData)) if classes[i]==icl]
         featureClusters.append( map(list,zip(*clust)) )
         if actual:
              # targetVar
              clust = [t for i,t in enumerate(target) if classes[i]==icl]
              targetClusters.append(clust)
         # dates
         dateClusters.append([t for i,t in enumerate(date_list) if classes[i] == icl])

     R2 = []
     RMSE = []
     preds = []
     for icl in range(nclusters):
          regr = regrs[icl]
          # convert features and target to arrays
          featureClusters[icl] = (np.array(featureClusters[icl])).T
          # make predictions
          if len(featureClusters[icl]) > 0:
               preds.append(regr.predict(featureClusters[icl]))
          else:
               preds.append([])
          if actual:
               targetClusters[icl] = np.array(targetClusters[icl])
               print('Cluster %d, %d rows:' % (icl,len(dateClusters[icl])) )
               if len(featureClusters[icl]) > 0:
                    r2 = regrs[icl].score(featureClusters[icl],targetClusters[icl])
                    print('  R^2_mean:' + '\t' + str(r2))
                    rmse = np.sqrt(((preds[icl] - targetClusters[icl])**2).mean())
                    print('  RMSE:\t' + '\t' + str(rmse))
                    RMSE.append(rmse)
                    R2.append(r2)
               else:
                    RMSE.append(None)
                    R2.append(None)
     
     # assemble predictions into one list
     date_list_mixed = np.concatenate(dateClusters).tolist()
     pred_mixed = np.concatenate(preds).tolist()
     pred = [pr for (d,pr) in sorted(zip(date_list_mixed,pred_mixed))]

     if actual:
          rmse = np.sqrt(((np.array(pred) - np.array(target))**2).mean()) 
          print('\nOverall performance:')
          print('  RMSE:' + '\t' + str(rmse))

          modelPerf = {'RMSE': RMSE, 'R2': R2, 'RMSE_total': rmse }
     else:
          modelPerf = None

     return date_list, pred, target, featureData, classes, modelPerf
Example 14: windQuadrant
def windQuadrant(station, startDate, endDate):
     # integer variable for quadrant of maximum wind
     import wUUtils as Util
     windDir = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'WindMaxDir', castFloat=True)
     # floor division keeps the quadrant an integer (also under Python 3)
     return [int(w) // 90 for w in windDir]
Example 15: pcaTaylorModel
def pcaTaylorModel(stations, startDate, endDate, \
                   features, ncomp=None, targetVar='TempMax', \
                   lag=1, order=0, smooth_window=0, verbose=False):
     # build regression model to predict "variable" for a single
     # station using training data from multiple stations 
     # between startdate and enddate.
     #
     # The set of values of each feature at all stations is converted
     # to a truncated list of principal components for purposes of 
     # feature-reduction and reduction of multicolinearity 
     # 
     # Uses a "Taylor expansion" by combining information from 
     # several days (higher order time derivatives)
     #
     # stations: a list of station codes, the first entry is
     #             the station for which forecast is generated
     # features: a list of variables to use as predictors
     #    ncomp: a list of same length as features containing the
     #           number of PCA to keep for each feature
     #      lag: the number of days in the future to forecast
     #    order: the number of days in the past to include
     #           (also maximum order of time derivative)
     import numpy as np
     import wUUtils as Util
     import wUPCA
     reload(wUPCA)
     from sklearn import preprocessing
     from sklearn import linear_model
     # load target variable data
     target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                        targetVar, castFloat=True)
     if smooth_window > 0:
          target = Util.smooth(target, smooth_window)
     # shift vector by lag
     target = target[lag:]
     # load features data and compute PC
     pcaData, transform_params = wUPCA.pcaConvert(stations, features, \
                                                  startDate, endDate, ncomp)
     # flatten featureData into single list of lists, while shortening by lag
     featureData = [data[:(-lag)] for dataList in pcaData for data in dataList] 
     if smooth_window > 0:
          # rebind the list entries; assigning to the loop variable
          # would leave featureData unchanged
          featureData = [Util.smooth(data, smooth_window) for data in featureData]
     # number of PC-transformed features
     if ncomp is None:
          nfeat = len(stations)*len(features)
     else:
          nfeat = sum(ncomp) 
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          for ii in range(nfeat):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     target = target[-nrows:]

     # convert target and features to np arrays
     target = np.array(target)
     featureData = (np.array(featureData)).T

     # fit regression model
     regr = linear_model.LinearRegression()
     regr.fit(featureData, target)
     model_params = {
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regr': regr, \
            'lag': lag, \
            'order': order, \
            'smooth_window': smooth_window, \
            'transform_params': transform_params}
     # report regression results:
     print("R^2: " + str(regr.score(featureData,target)))
     if verbose:
          print("Regression coefficients:")
          print("  intercept" + ":\t" + str(regr.intercept_))
          column = 0
          for ideriv in range(order+1):
               print("  " + str(ideriv) + "th derivative:")
               for ii, feature in enumerate(features):
                    print("    " + feature)
                    if ncomp is None:
                         nc = len(stations)
                    else:
                         nc = ncomp[ii]
                    for jj in range(nc):
                         print("      PC " + str(jj) + " :\t" + str(regr.coef_[column]))
                         column += 1
     return featureData, target, model_params
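The feature reduction performed by wUPCA.pcaConvert, sketched directly with scikit-learn: gather one variable's values at all stations as columns and keep only the leading principal components (made-up data; the real module also returns the fitted transform so that pcaPredict can reuse it):

import numpy as np
from sklearn.decomposition import PCA

# hypothetical daily values of one variable at three stations (columns)
X = np.array([[10.0,  9.0, 11.0],
              [12.0, 11.5, 12.5],
              [ 9.0,  8.5, 10.0],
              [14.0, 13.0, 15.0]])

pca = PCA(n_components=2).fit(X)   # keep 2 of 3 components
pcs = pca.transform(X)             # reduced, decorrelated features
print(pcs.shape)                   # (4, 2)
print(pca.explained_variance_ratio_)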
Example 16: isEasterly
def isEasterly(station, startDate, endDate):
     # binary variable for direction of max wind (1 = easterly, 0 = westerly)
     import wUUtils as Util
     windDir = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'WindMaxDir', castFloat=True)
     return [int(w > 0.0 and w < 180.0) for w in windDir]
Example 17: isNotFoggy
def isNotFoggy(station, startDate, endDate):
     # binary variable for absence of fog (1 = mean visibility above 5 km)
     import wUUtils as Util
     visibility = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'VisibilityMean', castFloat=True)
     return [int(v > 5.0) for v in visibility]
Example 18: advectionTaylorPredict
def advectionTaylorPredict(model_params, startDate, endDate, actual=True):
     # predict targetVar for a single station using 
     # previously generated regression model
     import numpy as np
     import wUUtils as Util
     import wUAdvection as Adv
     # extract city and feature data
     stations = model_params['stations']
     targetVar = model_params['targetVar']
     features = model_params['features']
     regr = model_params['regr']
     lag = model_params['lag']
     order = model_params['order']
     # build list of dates in datetime format
     date_list = Util.dateList(startDate, endDate)
     date_list = date_list[(lag+order):]
     # if actual data available
     if actual:
          # load target variable data
          target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                             targetVar, castFloat=True)
          # "baseline" model is predicted target same as value on prediction day
          baseline = target[order:(-lag)]
          baseline = np.array(baseline)
          # shift vector by lag
          target = target[lag:]
          target = np.array(target)
     else:
          target = None
     # load feature data
     featureData = []
     # add data for target station
     for feature in features:
          fd = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                             feature, castFloat=True)
          # shorten vector by lag
          fd = fd[:(-lag)]
          featureData.append(fd)
     # for other stations, add the advection of each feature in the
     # direction of the target station
     for station in stations[1:]:
          for feature in features:
               # print("Adding " + feature + " from " + station)
               fd, uVec = Adv.dDeriv(stations[0], station, \
                                     feature, startDate, endDate)
               # shorten vector by lag
               fd = fd[:(-lag)]
               featureData.append(fd)
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          ncols = len(stations)*len(features)
          for ii in range(ncols):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     if actual:
          target = target[-nrows:]
     # convert features to np arrays
     featureData = (np.array(featureData)).T
     pred = regr.predict(featureData)
     if actual:
          print("R^2_mean:" + "\t" + str(regr.score(featureData,target)))
          sse = ((pred-target)**2).sum()
          ssm = ((baseline-target)**2).sum()
          print("R^2_base:" + "\t" + str(1 - sse/ssm))
          rmse = np.sqrt(((pred - target)**2).mean())
          print("RMSE:\t" + "\t" + str(rmse))
          model_perf = {
                    'R2_mean': regr.score(featureData,target), \
                    'R2_base': 1 - sse/ssm, \
                    'RMSE': rmse}
     else:
          model_perf = None
     return date_list, pred, target, model_perf
Example 19: clusterRegression
def clusterRegression(stations, startDate, endDate, \
                      features, clusterFeatures=None, \
                      nclusters=1, ranseed=666, \
                      targetVar='TempMax', \
                      lag=1, order=0, scale=False, verbose=False):
     # build regression model to predict a variable for a single
     # station using training data from multiple stations 
     # between startdate and enddate.  Uses a "Taylor expansion" 
     # by combining information from several days (higher order
     # time derivatives)
     #
     #   stations: a list of station codes, the first entry is
     #             the station for which forecast is generated
     #   features: a list of variables to use as predictors
     #         *** if a feature string contains a ":" it is parsed as
     #             an interaction between two features ...
     #         *** features in interaction terms pre-scaled!
     # clusterFeatures: subset of features with respect to which
     #             k-means clustering is applied before training
     #             regression models
     #  nclusters: number of clusters to compute
     #        lag: the number of days in the future to forecast
     #      order: the number of days in the past to include
     #             (also maximum order of time derivative)
     import wUCluster as Clust
     reload(Clust)
     import numpy as np
     import wUUtils as Util
     from sklearn import preprocessing
     from sklearn import linear_model

     # load target variable data
     target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                        targetVar, castFloat=True)
     # shift vector by lag
     target = target[lag:]
     # load feature data
     featureData = []
     prescalers = []
     for station in stations:
          for feature in features:
               # check if feature contains an interaction
               if ':' in feature:
                    feat1 = feature.split(':')[0]
                    feat2 = feature.split(':')[1]
                    fd1 = Util.loadDailyVariableRange(station, startDate, endDate, \
                                                      feat1, castFloat=True)
                    fd2 = Util.loadDailyVariableRange(station, startDate, endDate, \
                                                      feat2, castFloat=True)
                    prescaler1 = preprocessing.StandardScaler().fit(fd1)
                    fd1 = prescaler1.transform(fd1)
                    prescaler2 = preprocessing.StandardScaler().fit(fd2)
                    fd2 = prescaler2.transform(fd2)
                    # save prescaler objects (for prediction)
                    prescalers.append([prescaler1,prescaler2])
                    # compute interaction
                    fd = (np.array(fd1)*np.array(fd2)).tolist()
               else:
                    fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                                                     feature, castFloat=True)
                    prescalers.append(None)
               # shorten vector by lag
               fd = fd[:(-lag)]
               featureData.append(fd)
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          ncols = len(stations)*len(features)
          for ii in range(ncols):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     target = target[-nrows:]

     # apply k-means clustering
     if clusterFeatures is not None:
          classes, clusterParams = Clust.clusterFeatureData(featureData, stations, features, \
                                                            clusterFeatures, nclusters, \
                                                            ranseed)
          classes, featureClusters = Clust.assignClustersAllFeatures(featureData, clusterParams)
          targetClusters = []
          for cl in range(nclusters):
               targetClusters.append([t for i,t in enumerate(target) if classes[i] == cl])
     else:
          # everything is one cluster
          classes = [0]*len(target)   # all rows belong to cluster 0
          featureClusters = [featureData]
          targetClusters = [target]
          clusterParams = { 'nclusters': 1 }

     # train separate regression model for each cluster
     regrs = []
     scalers = []
     for icl in range(nclusters):
          # convert features and target to arrays
          featureClusters[icl] = (np.array(featureClusters[icl])).T
          targetClusters[icl] = np.array(targetClusters[icl])

          scaler = None
          if scale:
               scaler = preprocessing.StandardScaler().fit(featureClusters[icl])
               featureClusters[icl] = scaler.transform(featureClusters[icl])
          scalers.append(scaler)
     
          regr = linear_model.LinearRegression()
          regr.fit(featureClusters[icl], targetClusters[icl])
          regrs.append(regr)
          print('Cluster %d, nrows %d, R^2 %f' \
                       % (icl, \
                          len(targetClusters[icl]), \
                          regr.score(featureClusters[icl],targetClusters[icl])) )
          if verbose:
               print("Regression coefficients:")
               print("  intercept" + ":\t" + str(regr.intercept_))
               column = 0
               for ideriv in range(order+1):
                    print("  " + str(ideriv) + "th derivative:")
                    for jj, station in enumerate(stations):
                         print("    Station: " + station)
                         for ii, feature in enumerate(features):
                              print("       " + feature + ":\t" + str(regr.coef_[column]))
                              column += 1

     # save model parameters
     modelParams = {
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'regrs': regrs, \
            'clusterParams': clusterParams, \
            'classes': classes, \
            'lag': lag, \
            'order': order, \
            'scale': scale, \
            'scalers': scalers, \
            'prescalers': prescalers}
     return featureData, target, modelParams
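The cluster-then-regress structure above, reduced to essentials with scikit-learn (synthetic data; the real code clusters on a chosen subset of feature columns via wUCluster and keeps per-cluster scalers):

import numpy as np
from sklearn.cluster import KMeans
from sklearn import linear_model

rng = np.random.RandomState(666)
X = rng.rand(200, 3)                            # synthetic features
y = 2*X[:, 0] - X[:, 1] + 0.1*rng.rand(200)     # synthetic target

# partition the rows, then fit one linear model per cluster
kmeans = KMeans(n_clusters=2, random_state=666).fit(X[:, :2])
classes = kmeans.labels_
regrs = []
for icl in range(2):
     rows = classes == icl
     regr = linear_model.LinearRegression().fit(X[rows], y[rows])
     regrs.append(regr)
     print('cluster %d: %d rows, R^2 %.3f'
           % (icl, rows.sum(), regr.score(X[rows], y[rows])))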
Example 20: clusterRegressionPredict
def clusterRegressionPredict(modelParams, startDate, endDate, actual=True):
     # predict targetVar for a single station using 
     # previously generated regression model
     import wUCluster as Clust
     reload(Clust)
     import numpy as np
     import wUUtils as Util
     # extract city and feature data
     stations = modelParams['stations']
     targetVar = modelParams['targetVar']
     features = modelParams['features']
     regrs = modelParams['regrs']
     clusterParams = modelParams['clusterParams']
     nclusters = clusterParams['nclusters']
     lag = modelParams['lag']
     order = modelParams['order']
     scale = modelParams['scale']
     prescalers = modelParams['prescalers']
     scalers = modelParams['scalers']

     # build list of dates in datetime format
     date_list = Util.dateList(startDate, endDate)
     date_list = date_list[(lag+order):]
     # if actual data available
     if actual:
          # load target variable data
          target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                             targetVar, castFloat=True)
          # "baseline" model is predicted target same as value on prediction day
          baseline = target[order:(-lag)]
          baseline = np.array(baseline)
          # shift vector by lag
          target = target[lag:]
          target = np.array(target)
     else:
          target = None
     # load feature data
     featureData = []
     idata = 0
     for station in stations:
          for feature in features:
               # check if feature contains an interaction
               if ':' in feature:
                    feat1 = feature.split(':')[0]
                    feat2 = feature.split(':')[1]
                    fd1 = Util.loadDailyVariableRange(station, startDate, endDate, \
                                                      feat1, castFloat=True)
                    fd2 = Util.loadDailyVariableRange(station, startDate, endDate, \
                                                      feat2, castFloat=True)
                    # rescale factors in interaction
                    prescaler1, prescaler2 = prescalers[idata]
                    fd1 = prescaler1.transform(fd1)
                    fd2 = prescaler2.transform(fd2)
                    # compute interaction
                    fd = (np.array(fd1)*np.array(fd2)).tolist()
               else:
                    fd = Util.loadDailyVariableRange(station, startDate, endDate, \
                                                     feature, castFloat=True)
               # shorten vector by lag
               fd = fd[:(-lag)]
               featureData.append(fd)
               # increment feature counter
               idata += 1
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          ncols = len(stations)*len(features)
          for ii in range(ncols):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     if actual:
          target = target[-nrows:]

     # allocate features to clusters
     if clusterParams['nclusters'] > 1:
          classes, featureClusters = Clust.assignClustersAllFeatures(featureData, clusterParams)
          dateClusters = []
          for icl in range(nclusters):
               dateClusters.append([t for i,t in enumerate(date_list) if classes[i] == icl])
          if actual:
               targetClusters = []
               for icl in range(nclusters):
                    targetClusters.append([t for i,t in enumerate(target) if classes[i] == icl])
     else:
          # everything is one cluster
          classes = [0]*len(date_list)   # all rows in cluster 0 (target may be None here)
          featureClusters = [featureData]
          dateClusters = [date_list]
          if actual:
               targetClusters = [target]

     preds = []
     RMSE = []
     R2 = []
     for icl in range(nclusters):
          # convert features and target to arrays
          featureClusters[icl] = (np.array(featureClusters[icl])).T

          if scale:
               scaler = scalers[icl]
               featureClusters[icl] = scaler.transform(featureClusters[icl])

          regr = regrs[icl]
          preds.append(regr.predict(featureClusters[icl]))
          if actual:
               targetClusters[icl] = np.array(targetClusters[icl])
               print('Cluster %d, %d rows:' % (icl,len(dateClusters[icl])) )
               r2 = regrs[icl].score(featureClusters[icl],targetClusters[icl])
               print('  R^2_mean:' + '\t' + str(r2))
               rmse = np.sqrt(((preds[icl] - targetClusters[icl])**2).mean())
               print('  RMSE:\t' + '\t' + str(rmse))
               RMSE.append(rmse)
               R2.append(r2)

     # assemble predictions into one list
     date_list_mixed = np.concatenate(dateClusters).tolist()
     pred_mixed = np.concatenate(preds).tolist()
     pred = [pr for (d,pr) in sorted(zip(date_list_mixed,pred_mixed))]

     if actual:
          rmse = np.sqrt(((np.array(pred) - np.array(target))**2).mean()) 
          print('\nOverall performance:')
          print('  RMSE:' + '\t' + str(rmse))

          modelPerf = {'RMSE': RMSE, 'R2': R2, 'RMSE_total': rmse }
     else:
          modelPerf = None

     return date_list, pred, target, featureData, classes, modelPerf
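The recombination step above in isolation: per-cluster predictions are concatenated, then restored to chronological order by sorting (date, prediction) pairs. Made-up lists; ISO date strings sort chronologically:

import numpy as np

dateClusters = [['2015-01-02', '2015-01-04'], ['2015-01-01', '2015-01-03']]
preds        = [[11.0, 13.0],                 [10.0, 12.0]]

date_list_mixed = np.concatenate(dateClusters).tolist()
pred_mixed = np.concatenate(preds).tolist()
pred = [pr for (d, pr) in sorted(zip(date_list_mixed, pred_mixed))]
print(pred)   # [10.0, 11.0, 12.0, 13.0]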
Example 21: isNortherly
def isNortherly(station, startDate, endDate):
     # binary variable for direction of max wind (1 = northerly, 0 = southerly)
     import wUUtils as Util
     windDir = Util.loadDailyVariableRange(station, startDate, endDate, \
                           'WindMaxDir', castFloat=True)
     return [int(w > 270.0 or w < 90.0) for w in windDir]
Example 22: pcaTaylorPredict
def pcaTaylorPredict(model_params, startDate, endDate, actual=True):
     # predict targetVar for a single station using 
     # previously generated regression model
     import numpy as np
     import wUUtils as Util
     import wUPCA
     reload(wUPCA)
     # extract city and feature data
     stations = model_params['stations']
     targetVar = model_params['targetVar']
     features = model_params['features']
     regr = model_params['regr']
     lag = model_params['lag']
     order = model_params['order']
     transform_params = model_params['transform_params']
     ncomp = transform_params['ncomp']
     # build list of dates in datetime format
     date_list = Util.dateList(startDate, endDate)
     date_list = date_list[(lag+order):]
     # if actual data available
     if actual:
          # load target variable data
          target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                             targetVar, castFloat=True)
          # "baseline" model is predicted target same as value on prediction day
          baseline = target[order:(-lag)]
          baseline = np.array(baseline)
          # shift vector by lag
          target = target[lag:]
          target = np.array(target)
     else:
          target = None

     # load features data and compute PC
     pcaData = wUPCA.pcaPredict(transform_params, startDate, endDate)
     # flatten featureData into single list of lists, while shortening by lag
     featureData = [data[:(-lag)] for dataList in pcaData for data in dataList] 
     # number of PC-transformed features
     nfeat = sum(ncomp) 
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          for ii in range(nfeat):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     if actual:
          target = target[-nrows:]
     # convert features to np arrays
     featureData = (np.array(featureData)).T
     pred = regr.predict(featureData)
     if actual:
          print("R^2_mean:" + "\t" + str(regr.score(featureData,target)))
          sse = ((pred-target)**2).sum()
          ssm = ((baseline-target)**2).sum()
          print("R^2_base:" + "\t" + str(1 - sse/ssm))
          rmse = np.sqrt(((pred - target)**2).mean())
          print("RMSE:\t" + "\t" + str(rmse))
          model_perf = {
               'R2_mean': regr.score(featureData,target), \
               'R2_base': 1 - sse/ssm, \
               'RMSE': rmse}
     else:
          model_perf = None
     return date_list, pred, target, model_perf
Example 23: pcaClusterModel
def pcaClusterModel(stations, startDate, endDate, \
                   features, ncomp=None, \
                   clusterVars=[], nclusters=1, \
                   targetVar='TempMax', \
                   lag=1, order=0, ranseed=666, verbose=False):
     # build regression model to predict "variable" for a single
     # station using training data from multiple stations 
     # between startdate and enddate.
     #
     # The set of values of each feature at all stations is converted
     # to a truncated list of principal components for purposes of 
     # feature-reduction and reduction of multicolinearity 
     # 
     # Clustering is used to train multiple models for different
     # partitions of the data
     #
     # Uses a "Taylor expansion" by combining information from 
     # several days (higher order time derivatives)
     #
     # stations: a list of station codes, the first entry is
     #             the station for which forecast is generated
     # features: a list of variables to use as predictors
     #    ncomp: a list of same length as features containing the
     #           number of PCA to keep for each feature
     # clusterVars: a list of pairs of form ('feature',npc), where
     #              where npc is the index of the PC to use for
     #              clustering
     #      lag: the number of days in the future to forecast
     #    order: the number of days in the past to include
     #           (also maximum order of time derivative)
     import numpy as np
     import wUUtils as Util
     import wUPCA
     import wUCluster as Clust
     from sklearn import preprocessing
     from sklearn import linear_model
     # load target variable data
     target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                        targetVar, castFloat=True)
     # shift vector by lag
     target = target[lag:]
     # load features data and compute PC
     pcaData, transformParams = wUPCA.pcaConvert(stations, features, \
                                                 startDate, endDate, ncomp)
     # flatten featureData into single list of lists, while shortening by lag
     featureData = [data[:(-lag)] for dataList in pcaData for data in dataList] 
     # number of PC-transformed features
     if ncomp is None:
          nfeat = len(stations)*len(features)
     else:
          nfeat = sum(ncomp) 
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          for ii in range(nfeat):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     target = target[-nrows:]

     # apply clustering
     # locate columns to be used for clustering
     cols = []
     for clusterPair in clusterVars:
          ifeat = features.index(clusterPair[0]) # index of feature
          col = sum(ncomp[:ifeat]) + clusterPair[1]
          cols += [col]
          if clusterPair[1] >= ncomp[ifeat]:
               print('Requested cluster variable out of range')
               print(clusterPair[0] + ' ' + str(clusterPair[1]) + ' >= ' + str(ncomp[ifeat]))
               return
     print('columns for clustering: ' + str(cols))

     clusterData = np.array([featureData[ii] for ii in cols]).T
     scaler, clusterer = Clust.computeClusters(clusterData, nclusters, ranseed)
     classes = Clust.assignClusters(scaler, clusterer, clusterData)
     clusterParams = { \
               'scaler': scaler, \
               'clusterer': clusterer, \
               'nclusters': nclusters, \
               'ranseed': ranseed, \
               'cols': cols }

     # separate data into clusters
     featureClusters = []
     targetClusters = []
     for icl in range(nclusters):
         # features
         clust = [f for i,f in enumerate(zip(*featureData)) if classes[i]==icl]
         featureClusters.append( map(list,zip(*clust)) )
         # targetVar
         clust = [t for i,t in enumerate(target) if classes[i]==icl]
         targetClusters.append(clust)

     # train separate regression model for each cluster
     regrs = []
     for icl in range(nclusters):
          # convert features and target to arrays
          featureClusters[icl] = (np.array(featureClusters[icl])).T
          targetClusters[icl] = np.array(targetClusters[icl])

          regr = linear_model.LinearRegression()
          regr.fit(featureClusters[icl], targetClusters[icl])
          regrs.append(regr)
          print('Cluster %d, nrows %d, R^2 %f' \
                       % (icl, \
                          len(targetClusters[icl]), \
                          regr.score(featureClusters[icl],targetClusters[icl])) )
          if verbose:
               print("\nCluster " + str(icl))
               print("Regression coefficients:")
               print("  intercept" + ":\t" + str(regr.intercept_))
               column = 0
               for ideriv in range(order+1):
                    print("  " + str(ideriv) + "th derivative:")
                    for ii, feature in enumerate(features):
                         print("    " + feature)
                         for jj in range(ncomp[ii]):
                              print("      PC " + str(jj) + " :\t" + str(regr.coef_[column]))
                              column += 1

     modelParams = {
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'clusterVars': clusterVars, \
            'clusterParams': clusterParams, \
            'classes': classes, \
            'regrs': regrs, \
            'lag': lag, \
            'order': order, \
            'transformParams': transformParams}

     return featureData, target, modelParams