def pcaTaylorPredict(model_params, startDate, endDate, actual=True):
    # predict targetVar for a single station using
    # a previously generated regression model
    import numpy as np
    import wUUtils as Util
    import wUPCA
    from importlib import reload  # reload is not a builtin in Python 3
    reload(wUPCA)
    # extract city and feature data
    stations = model_params['stations']
    targetVar = model_params['targetVar']
    features = model_params['features']
    regr = model_params['regr']
    lag = model_params['lag']
    order = model_params['order']
    transform_params = model_params['transform_params']
    ncomp = transform_params['ncomp']
    # build list of dates in datetime format
    date_list = Util.dateList(startDate, endDate)
    date_list = date_list[(lag+order):]
    # if actual data available
    if actual:
        # load target variable data
        target = Util.loadDailyVariableRange(stations[0], startDate, endDate,
                                             targetVar, castFloat=True)
        # "baseline" (persistence) model: predicted target equals the
        # target's value on the day the prediction is made
        baseline = target[order:(-lag)]
        baseline = np.array(baseline)
        # shift vector by lag
        target = target[lag:]
        target = np.array(target)
    else:
        target = None
    # load features data and compute principal components
    pcaData = wUPCA.pcaPredict(transform_params, startDate, endDate)
    # flatten pcaData into a single list of lists, shortening each series by lag
    featureData = [data[:(-lag)] for dataList in pcaData for data in dataList]
    # number of PC-transformed features
    nfeat = sum(ncomp)
    # add in "derivative" terms (finite differences up to the requested order)
    for ideriv in range(1, order+1):
        for ii in range(nfeat):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    if actual:
        target = target[-nrows:]
    # convert features to an np array with one row per day
    featureData = (np.array(featureData)).T
    pred = regr.predict(featureData)
    if actual:
        r2_mean = regr.score(featureData, target)
        print("R^2_mean:" + "\t" + str(r2_mean))
        sse = ((pred-target)**2).sum()
        ssm = ((baseline-target)**2).sum()
        print("R^2_base:" + "\t" + str(1 - sse/ssm))
        rmse = np.sqrt(((pred - target)**2).mean())
        print("RMSE:\t" + "\t" + str(rmse))
        model_perf = {
                'R2_mean': r2_mean,
                'R2_base': 1 - sse/ssm,
                'RMSE': rmse}
    else:
        model_perf = None
    return date_list, pred, target, model_perf
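# A minimal usage sketch for pcaTaylorPredict (illustrative only): it assumes
# 'model_params' was produced by the matching training step (not shown here),
# and the date range is a placeholder. Plots predicted vs. observed targetVar.
def _demoTaylorPredict(model_params, startDate='2015-01-01',
                       endDate='2015-03-31'):
    import matplotlib.pyplot as plt
    date_list, pred, target, perf = pcaTaylorPredict(
            model_params, startDate, endDate, actual=True)
    plt.plot(date_list, target, 'k.', label='observed')
    plt.plot(date_list, pred, 'r-', label='predicted')
    plt.legend(loc='best')
    plt.title('predicted vs observed ' + model_params['targetVar'])
    plt.show()
    return perf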
def pcaClusterPredict(modelParams, startDate, endDate, actual=True):
    # predict targetVar for a single station using
    # previously generated per-cluster regression models
    import numpy as np
    import wUUtils as Util
    import wUCluster as Clust
    import wUPCA
    from importlib import reload  # reload is not a builtin in Python 3
    reload(wUPCA)
    # extract city and feature data
    stations = modelParams['stations']
    targetVar = modelParams['targetVar']
    features = modelParams['features']
    regrs = modelParams['regrs']
    lag = modelParams['lag']
    order = modelParams['order']
    transformParams = modelParams['transformParams']
    ncomp = transformParams['ncomp']
    clusterVars = modelParams['clusterVars']
    clusterParams = modelParams['clusterParams']
    nclusters = clusterParams['nclusters']
    cols = clusterParams['cols']
    scaler = clusterParams['scaler']
    clusterer = clusterParams['clusterer']
    # build list of dates in datetime format
    date_list = Util.dateList(startDate, endDate)
    date_list = date_list[(lag+order):]
    # if actual data available
    if actual:
        # load target variable data
        target = Util.loadDailyVariableRange(stations[0], startDate, endDate,
                                             targetVar, castFloat=True)
        # shift vector by lag
        target = target[lag:]
        target = np.array(target)
    else:
        target = None
    # load features data and compute principal components
    pcaData = wUPCA.pcaPredict(transformParams, startDate, endDate)
    # flatten pcaData into a single list of lists, shortening each series by lag
    featureData = [data[:(-lag)] for dataList in pcaData for data in dataList]
    # number of PC-transformed features
    nfeat = sum(ncomp)
    # add in "derivative" terms (finite differences up to the requested order)
    for ideriv in range(1, order+1):
        for ii in range(nfeat):
            fd = np.diff(featureData[ii], n=ideriv)
            featureData.append(fd)
    # shorten vectors to length of highest order derivative
    nrows = len(featureData[-1])
    for column in range(len(featureData)):
        featureData[column] = featureData[column][-nrows:]
    if actual:
        target = target[-nrows:]
    # assign points (rows) to clusters
    clusterData = np.array([featureData[ii] for ii in cols]).T
    classes = Clust.assignClusters(scaler, clusterer, clusterData)
    # separate data into clusters
    featureClusters = []
    dateClusters = []
    if actual:
        targetClusters = []
    for icl in range(nclusters):
        # features: transpose back to one list per feature
        # (list(...) keeps this working on Python 3, where map is lazy)
        clust = [f for i, f in enumerate(zip(*featureData)) if classes[i] == icl]
        featureClusters.append(list(map(list, zip(*clust))))
        if actual:
            # targetVar
            clust = [t for i, t in enumerate(target) if classes[i] == icl]
            targetClusters.append(clust)
        # dates
        dateClusters.append([t for i, t in enumerate(date_list)
                             if classes[i] == icl])
    R2 = []
    RMSE = []
    preds = []
    for icl in range(nclusters):
        regr = regrs[icl]
        # convert features and target to arrays
        featureClusters[icl] = (np.array(featureClusters[icl])).T
        # make predictions (skip empty clusters)
        if len(featureClusters[icl]) > 0:
            preds.append(regr.predict(featureClusters[icl]))
        else:
            preds.append([])
        if actual:
            targetClusters[icl] = np.array(targetClusters[icl])
            print('Cluster %d, %d rows:' % (icl, len(dateClusters[icl])))
            if len(featureClusters[icl]) > 0:
                r2 = regrs[icl].score(featureClusters[icl], targetClusters[icl])
                print(' R^2_mean:' + '\t' + str(r2))
                rmse = np.sqrt(((preds[icl] - targetClusters[icl])**2).mean())
                print(' RMSE:\t' + '\t' + str(rmse))
                RMSE.append(rmse)
                R2.append(r2)
            else:
                RMSE.append(None)
                R2.append(None)
    # assemble predictions into one list, restoring chronological order
    date_list_mixed = np.concatenate(dateClusters).tolist()
    pred_mixed = np.concatenate(preds).tolist()
    pred = [pr for (d, pr) in sorted(zip(date_list_mixed, pred_mixed))]
    if actual:
        rmse = np.sqrt(((np.array(pred) - np.array(target))**2).mean())
        print('\nOverall performance:')
        print(' RMSE:' + '\t' + str(rmse))
        modelPerf = {'RMSE': RMSE, 'R2': R2, 'RMSE_total': rmse}
    else:
        modelPerf = None
    return date_list, pred, target, featureData, classes, modelPerf
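# A minimal usage sketch for pcaClusterPredict (illustrative only): it assumes
# 'modelParams' comes from the matching cluster-regression training step, i.e.
# it carries the per-cluster 'regrs' and the fitted 'scaler'/'clusterer'; the
# date range is a placeholder. Reports per-cluster day counts alongside the
# returned performance dict.
def _demoClusterPredict(modelParams, startDate='2015-01-01',
                        endDate='2015-03-31'):
    import numpy as np
    date_list, pred, target, featureData, classes, perf = \
            pcaClusterPredict(modelParams, startDate, endDate, actual=True)
    classes = np.asarray(classes)
    for icl in range(modelParams['clusterParams']['nclusters']):
        print('cluster %d: %d prediction days' % (icl, (classes == icl).sum()))
    return perf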