Python assignClusters Examples

Programming Language: Python

Namespace/Package Name: wUCluster

Method/Function: assignClusters

Examples at hotexamples.com: 2

Python assignClusters - 2 examples found. These are the top rated real world Python examples of wUCluster.assignClusters extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: wUClusterRegression.py Project: majorgowan/wpwp

def pcaClusterPredict(modelParams, startDate, endDate, actual=True):
     # predict targetVar for a single station using 
     # previously generated regression model
     import numpy as np
     import wUUtils as Util
     import wUCluster as Clust
     import wUPCA
     reload(wUPCA)
     # extract city and feature data
     stations = modelParams['stations']
     targetVar = modelParams['targetVar']
     features = modelParams['features']
     regrs = modelParams['regrs']
     lag = modelParams['lag']
     order = modelParams['order']
     transformParams = modelParams['transformParams']
     ncomp = transformParams['ncomp']
     clusterVars = modelParams['clusterVars']
     clusterParams = modelParams['clusterParams']
     nclusters = clusterParams['nclusters']
     cols = clusterParams['cols']
     scaler = clusterParams['scaler']
     clusterer = clusterParams['clusterer']

     # build list of dates in datetime format
     date_list = Util.dateList(startDate, endDate)
     date_list = date_list[(lag+order):]
     # if actual data available
     if actual:
          # load target variable data
          target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                             targetVar, castFloat=True)
          # shift vector by lag
          target = target[lag:]
          target = np.array(target)
     else:
          target = None

     # load features data and compute PC
     pcaData = wUPCA.pcaPredict(transformParams, startDate, endDate)
     # flatten featureData into single list of lists, while shortening by lag
     featureData = [data[:(-lag)] for dataList in pcaData for data in dataList] 
     # number of PC-transformed features
     nfeat = sum(ncomp) 
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          for ii in range(nfeat):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     if actual:
          target = target[-nrows:]

     # assign points (rows) to clusters
     clusterData = np.array([featureData[ii] for ii in cols]).T
     classes = Clust.assignClusters(scaler, clusterer, clusterData)

     # separate data into clusters
     featureClusters = []
     dateClusters = []
     if actual:
          targetClusters = []
     for icl in range(nclusters):
         # features
         clust = [f for i,f in enumerate(zip(*featureData)) if classes[i]==icl]
         featureClusters.append( map(list,zip(*clust)) )
         if actual:
              # targetVar
              clust = [t for i,t in enumerate(target) if classes[i]==icl]
              targetClusters.append(clust)
         # dates
         dateClusters.append([t for i,t in enumerate(date_list) if classes[i] == icl])

     R2 = []
     RMSE = []
     preds = []
     for icl in range(nclusters):
          regr = regrs[icl]
          # convert features and target to arrays
          featureClusters[icl] = (np.array(featureClusters[icl])).T
          # make predictions
          if len(featureClusters[icl]) > 0:
               preds.append(regr.predict(featureClusters[icl]))
          else:
               preds.append([])
          if actual:
               targetClusters[icl] = np.array(targetClusters[icl])
               print('Cluster %d, %d rows:' % (icl,len(dateClusters[icl])) )
               if len(featureClusters[icl]) > 0:
                    r2 = regrs[icl].score(featureClusters[icl],targetClusters[icl])
                    print('  R^2_mean:' + '\t' + str(r2))
                    rmse = np.sqrt(((preds[icl] - targetClusters[icl])**2).mean())
                    print('  RMSE:\t' + '\t' + str(rmse))
                    RMSE.append(rmse)
                    R2.append(r2)
               else:
                    RMSE.append(None)
                    R2.append(None)
     
     # assemble predictions into one list
     date_list_mixed = np.concatenate(dateClusters).tolist()
     pred_mixed = np.concatenate(preds).tolist()
     pred = [pr for (d,pr) in sorted(zip(date_list_mixed,pred_mixed))]

     if actual:
          rmse = np.sqrt(((np.array(pred) - np.array(target))**2).mean()) 
          print('\nOverall performance:')
          print('  RMSE:' + '\t' + str(rmse))

          modelPerf = {'RMSE': RMSE, 'R2': R2, 'RMSE_total': rmse }
     else:
          modelPerf = None

     return date_list, pred, target, featureData, classes, modelPerf

Example #2

Show file

File: wUClusterRegression.py Project: majorgowan/wpwp

def pcaClusterModel(stations, startDate, endDate, \
                   features, ncomp=None, \
                   clusterVars=[], nclusters=1, \
                   targetVar='TempMax', \
                   lag=1, order=0, ranseed=666, verbose=False):
     # build regression model to predict "variable" for a single
     # station using training data from multiple stations 
     # between startdate and enddate.
     #
     # The set of values of each feature at all stations is converted
     # to a truncated list of principal components for purposes of 
     # feature-reduction and reduction of multicolinearity 
     # 
     # Clustering is used to train multiple models for different
     # partitions of the data
     #
     # Uses a "Taylor expansion" by combining information from 
     # several days (higher order time derivatives)
     #
     # stations: a list of station codes, the first entry is
     #             the station for which forecast is generated
     # features: a list of variables to use as predictors
     #    ncomp: a list of same length as features containing the
     #           number of PCA to keep for each feature
     # clusterVars: a list of pairs of form ('feature',npc), where
     #              where npc is the index of the PC to use for
     #              clustering
     #      lag: the number of days in the future to forecast
     #    order: the number of days in the past to include
     #           (also maximum order of time derivative)
     import numpy as np
     import wUUtils as Util
     import wUPCA
     import wUCluster as Clust
     from sklearn import preprocessing
     from sklearn import linear_model
     # load target variable data
     target = Util.loadDailyVariableRange(stations[0], startDate, endDate, \
                        targetVar, castFloat=True)
     # shift vector by lag
     target = target[lag:]
     # load features data and compute PC
     pcaData, transformParams = wUPCA.pcaConvert(stations, features, \
                                                 startDate, endDate, ncomp)
     # flatten featureData into single list of lists, while shortening by lag
     featureData = [data[:(-lag)] for dataList in pcaData for data in dataList] 
     # number of PC-transformed features
     if ncomp == None:
          nfeat = len(stations)*len(features)
     else:
          nfeat = sum(ncomp) 
     # add in "derivative" terms
     for ideriv in range(1,order+1):
          for ii in range(nfeat):
               # print("Adding " + str(ideriv) + " derivative of " + feature[jfeat])
               fd = np.diff(featureData[ii],n=ideriv)
               featureData.append(fd)
     # shorten vectors to length of highest order derivative
     nrows = len(featureData[-1])
     for column in range(len(featureData)):
          featureData[column] = featureData[column][-nrows:]
     target = target[-nrows:]

     # apply clustering
     # locate columns to be used for clustering
     cols = []
     for clusterPair in clusterVars:
          ifeat = features.index(clusterPair[0]) # index of feature
          col = sum(ncomp[:ifeat]) + clusterPair[1]
          cols += [col]
          if clusterPair[1] >= ncomp[ifeat]:
               print('Requested cluster variable out of range')
               print(clusterPair[0] + ' ' + str(clusterPair[1]) + ' >= ' + str(ncomp[ifeat]))
               return
     print('columns for clustering: ' + str(cols))

     clusterData = np.array([featureData[ii] for ii in cols]).T
     scaler, clusterer = Clust.computeClusters(clusterData, nclusters, ranseed)
     classes = Clust.assignClusters(scaler, clusterer, clusterData)
     clusterParams = { \
               'scaler': scaler, \
               'clusterer': clusterer, \
               'nclusters': nclusters, \
               'ranseed': ranseed, \
               'cols': cols }

     # separate data into clusters
     featureClusters = []
     targetClusters = []
     for icl in range(nclusters):
         # features
         clust = [f for i,f in enumerate(zip(*featureData)) if classes[i]==icl]
         featureClusters.append( map(list,zip(*clust)) )
         # targetVar
         clust = [t for i,t in enumerate(target) if classes[i]==icl]
         targetClusters.append(clust)

     # train separate regression model for each cluster
     regrs = []
     for icl in range(nclusters):
          # convert features and target to arrays
          featureClusters[icl] = (np.array(featureClusters[icl])).T
          targetClusters[icl] = np.array(targetClusters[icl])

          regr = linear_model.LinearRegression()
          regr.fit(featureClusters[icl], targetClusters[icl])
          regrs.append(regr)
          print('Cluster %d, nrows %d, R^2 %f' \
                       % (icl, \
                          len(targetClusters[icl]), \
                          regr.score(featureClusters[icl],targetClusters[icl])) )
          if verbose:
               print("\nCluster " + str(icl))
               print("Regression coefficients:")
               print("  intercept" + ":\t" + str(regr.intercept_))
               column = 0
               for ideriv in range(order+1):
                    print("  " + str(ideriv) + "th derivative:")
                    for ii, feature in enumerate(features):
                         print("    " + feature)
                         for jj in range(ncomp[ii]):
                              print("      PC " + str(jj) + " :\t" + str(regr.coef_[column]))
                              column += 1

     modelParams = {
            'stations': stations, \
            'startDate': startDate, \
            'endDate': endDate, \
            'targetVar': targetVar, \
            'features': features, \
            'clusterVars': clusterVars, \
            'clusterParams': clusterParams, \
            'classes': classes, \
            'regrs': regrs, \
            'lag': lag, \
            'order': order, \
            'transformParams': transformParams}

     return featureData, target, modelParams