def f_Golden(y, x_glob, x_loc, y_off, coords, mType, wType, criterion, maxVal, minVal, tol, maxIter=200, flag=0):
    """
    Golden section search for the optimal bandwidth

    Starting from the bracket [a, c] returned by ini_band_dist, two interior
    bandwidths b and d are fitted and scored on every iteration. The one with
    the lower score becomes the current optimum and the bracket is shrunk
    towards it. The loop ends when |score(b) - score(d)| <= tol or after
    maxIter iterations.

    Arguments
    ----------
        y          : array
                     n*1, dependent variable.
        x_glob     : array
                     n*k1, fixed independent variable (None for a purely local model).
        x_loc      : array
                     n*k2, local independent variable, including constant.
        y_off      : array
                     n*1, offset variable for Poisson model
        coords     : dictionary
                     including (x,y) coordinates involved in the weight evaluation (including point i)
        mType      : integer
                     GWR model type, 0: Gaussian, 1: Poisson, 2: Logistic
        wType      : integer
                     kernel type, 0: fix_Gaussian, 1: adap_Gaussian, 2: fix_Bisquare, 3: adap_Bisquare
        criterion  : integer
                     bandwidth selection criterion, 0: AICc, 1: AIC, 2: BIC, 3: CV
        maxVal     : float
                     maximum value used in bandwidth searching
        minVal     : float
                     minimum value used in bandwidth searching
        tol        : float
                     tolerance used to determine convergence
        maxIter    : integer
                     maximum number of iteration if convergence cannot arrive at the tolerance
        flag       : integer
                     distance type, forwarded to Kernel.get_pairDist

    Return:
        opt_band   : float
                     optimal bandwidth
        opt_weit   : kernel
                     optimal kernel
        output     : list of tuple
                     searching process, one (bandwidth, score) entry per iteration
                     holding the best of the two interior points of that iteration
    """
    dist = Kernel.get_pairDist(coords, flag)  # pairwise distance between points

    # 1 set range of bandwidth
    nVar_glob = 0 if x_glob is None else len(x_glob[0])
    nVar_loc = 0 if x_loc is None else len(x_loc[0])
    a, c = ini_band_dist(dist, nVar_glob + nVar_loc, wType, maxVal, minVal)

    # adaptive kernels take a number of neighbours, so bandwidths are rounded
    is_adaptive = wType == 1 or wType == 3
    # Gaussian models are scored with the GWR diagnostics, all others with GWGLM
    diag = getDiag_GWR if mType == 0 else getDiag_GWGLM

    def _evaluate(bandwidth):
        # build the kernel, fit the local or mixed model, return (kernel, score)
        weit = Kernel.GWR_W(coords, bandwidth, wType, dist)
        if x_glob is None:  # local model
            model = GWGLM_Base(y, x_loc, weit, mType, y_off)
        else:  # mixed model
            model = semiGWR_Base(y, x_glob, x_loc, weit, mType, y_off)
        return weit, diag[criterion](model)

    # 2 get initial interior points
    output = []
    lamda = 0.38197  # 1 - (np.sqrt(5.0)-1.0)/2.0
    b = a + lamda * abs(c - a)
    d = c - lamda * abs(c - a)
    if is_adaptive:
        b = round(b, 0)
        d = round(d, 0)

    # 3 loop
    diff = 1.0e9
    nIter = 0
    while abs(diff) > tol and nIter < maxIter:
        nIter += 1

        # Only the interior points drive the search. The bracket ends a and c
        # were fitted on every iteration before but their scores were never
        # read, so those fits are no longer performed.
        weit_b, f_b = _evaluate(b)
        weit_d, f_d = _evaluate(d)

        # determine next triple
        if f_b <= f_d:
            # b is the current optimum: drop the (d, c] part of the bracket
            opt_weit = weit_b
            opt_band = b
            opt_cri = f_b
            c = d
            d = b
            b = a + lamda * abs(c - a)
            if is_adaptive:
                b = round(b, 0)
        else:
            # d is the current optimum: drop the [a, b) part of the bracket
            opt_weit = weit_d
            opt_band = d
            opt_cri = f_d
            a = b
            b = d
            d = c - lamda * abs(c - a)
            if is_adaptive:
                d = round(d, 0)

        output.append((opt_band, opt_cri))

        # convergence is measured on the two interior scores of this iteration
        diff = f_b - f_d

    return opt_band, opt_weit, output
def f_Interval(y, x_glob, x_loc, y_off, coords, mType, wType, criterion, maxVal, minVal, interval, flag=0):
    """
    Interval search, using interval as stepsize

    Fits the model at minVal, at maxVal and at every minVal + k*interval that
    is below maxVal, and keeps the bandwidth with the lowest criterion value.

    Arguments
    ----------
        y          : array
                     n*1, dependent variable.
        x_glob     : array
                     n*k1, fixed independent variable (None for a purely local model).
        x_loc      : array
                     n*k2, local independent variable, including constant.
        y_off      : array
                     n*1, offset variable for Poisson model
        coords     : dictionary
                     including (x,y) coordinates involved in the weight evaluation (including point i)
        mType      : integer
                     GWR model type, 0: M_Gaussian, 1: M_Poisson, 2: Logistic
        wType      : integer
                     kernel type, 0: fix_Gaussian, 1: adap_Gaussian, 2: fix_Bisquare, 3: adap_Bisquare
        criterion  : integer
                     bandwidth selection criterion, 0: AICc, 1: AIC, 2: BIC, 3: CV
        maxVal     : float
                     maximum value used in bandwidth searching
        minVal     : float
                     minimum value used in bandwidth searching
        interval   : float
                     interval used in interval search, must be positive
        flag       : integer
                     distance type, forwarded to Kernel.get_pairDist

    Return:
        opt_band   : float
                     optimal bandwidth
        opt_weit   : kernel
                     optimal kernel
        output     : list of tuple
                     report searching process, keep bandwidth and score, [(bandwidth, score),(bandwidth, score),...]

    Raises:
        ValueError : if interval is not positive although bandwidths remain
                     to be visited (the search could never terminate)
    """
    # The distance type used to be hard-coded to 0 here, silently ignoring
    # the caller's `flag`. Pass it through as f_Golden does.
    dist = Kernel.get_pairDist(coords, flag)

    # adaptive kernels take a number of neighbours, so the bounds are truncated
    is_adaptive = wType == 1 or wType == 3

    a = minVal
    c = maxVal
    # add codes to check whether a and c are valid
    if is_adaptive:
        a = int(a)
        c = int(c)

    # 1 get initial b value (distance or nn based on wType)
    b = a + interval
    if is_adaptive:
        b = int(b)
    # NOTE(review): only this first step is truncated. With an adaptive kernel
    # and a fractional interval the later steps are not integers - confirm
    # that this is intended.

    if b < c and interval <= 0:
        # b would never reach c: fail fast instead of looping forever
        raise ValueError("interval must be positive, got %r" % (interval,))

    # Gaussian models are scored with the GWR diagnostics, all others with GWGLM
    diag = getDiag_GWR if mType == 0 else getDiag_GWGLM

    def _evaluate(bandwidth):
        # build the kernel, fit the local or mixed model, return (kernel, score)
        weit = Kernel.GWR_W(coords, bandwidth, wType, dist)
        if x_glob is None:  # local model
            model = GWGLM_Base(y, x_loc, weit, mType, y_off)
        else:  # mixed model
            model = semiGWR_Base(y, x_glob, x_loc, weit, mType, y_off)
        return weit, diag[criterion](model)

    # 2-4 score both ends of the search range
    weit_a, f_a = _evaluate(a)
    weit_c, f_c = _evaluate(c)

    # 5 add to the output
    output = [(a, f_a), (c, f_c)]

    # the upper bound wins ties between the two ends
    if f_a < f_c:
        opt_weit, opt_band, opt_val = weit_a, a, f_a
    else:
        opt_weit, opt_band, opt_val = weit_c, c, f_c

    # 6 walk the interior of the range
    while b < c:
        weit_b, f_b = _evaluate(b)
        output.append((b, f_b))

        # keep b only when it is strictly better than the best so far
        if f_b < opt_val:
            opt_weit = weit_b
            opt_band = b
            opt_val = f_b

        # update b
        b = b + interval

    return opt_band, opt_weit, output
def pred(data, refData, band, y, x_local, y_hat=None, wType=0, mType=0, flag=0, y_offset=None, sigma2=1, y_fix=None, fMatrix=None):
    """
    predict values at unsampled locations

    Builds a kernel between the unsampled and the sampled locations, fits
    GWGLM_Base with it and derives standard errors, t statistics and a local
    goodness-of-fit measure for every unsampled location.

    Arguments:
        data       : dictionary,
                     (x,y) of unsampled locations, keyed 0..n_pred-1
        refData    : dictionary,
                     (x,y) of sampled locations
        band       : float
                     bandwidth
        y          : array
                     n*1, dependent variable
        x_local    : array
                     n*k1, local independent variable
        y_hat      : array
                     n*1, predicted y from original model, to calculate local statistics;
                     it is used for every mType in (0, 1, 2), so it must be supplied then
        wType      : integer
                     define which kernel function to use
        mType      : integer
                     model type, 0: Gaussian, 1: Poisson, 2: Logistic
        flag       : dummy,
                     0 or 1, 0: Euclidean distance; 1: spherical distance
        y_offset   : array
                     n*1, offset variable for Poisson model; the Poisson local
                     statistics use it directly, so it must be supplied for mType 1
        sigma2     : float
                     used to calculate std. error of betas for Gaussian model
        y_fix      : array
                     n*1, fixed part of y from global Xs, used in mixed model
        fMatrix    : array
                     n*n, hat matrix for global model, used in mixed model

    Return:
        Betas      : array
                     n*k, Beta estimation
        std_err    : array
                     n*k, standard errors of Beta
        t_stat     : array
                     n*k, local t-statistics
        localR2    : array
                     n*1, local R square (Gaussian) or local percent deviance
                     explained (Poisson, Logistic); stays zero for any other mType
    """
    # 1 get W matrix: distances from every unsampled obs to the sampled obs
    n_pred = len(data.keys())
    dicDist = {}
    for i in range(n_pred):
        dicDist[i] = Kernel.get_focusDist(data[i], refData, flag)
    weit = Kernel.GWR_W(data, band, wType, dicDist)

    # 2 get predicted local Beta estimation (GWGLM covers Gaussian and mixed models too)
    mod_loc = GWGLM_Base(y, x_local, weit, mType, y_offset, y_fix, fMatrix)
    pred_betas = mod_loc.Betas[:n_pred]

    # 3 get std errors of Betas
    pred_stdErr = np.sqrt(mod_loc.CCT * sigma2)

    # 4 get t statistics
    pred_tstat = pred_betas / pred_stdErr

    # 5 get local R2 or local p-dev
    localR2 = np.zeros(shape=(n_pred, 1))
    n_reg = len(y)

    if mType == 0:  # Gaussian model: weighted R2 = (TSS - RSS) / TSS
        for i in range(n_pred):
            w_i = np.reshape(np.array(weit.w[i]), (-1, 1))
            sum_yw = np.sum(y * w_i)
            ybar = 1.0 * sum_yw / np.sum(w_i)
            rss = np.sum(w_i * (y - y_hat)**2)
            tss = np.sum(w_i * (y - ybar)**2)
            localR2[i] = (tss - rss) / tss

    elif mType == 1:  # Poisson model: 1 - weighted deviance / weighted null deviance
        for i in range(n_pred):
            w_i = np.reshape(np.array(weit.w[i]), (-1, 1))
            sum_yw = np.sum(y * w_i)
            ybar = 1.0 * sum_yw / np.sum(w_i * y_offset)  # weighted mean rate
            dev = 0.0
            dev0 = 0.0
            for j in range(n_reg):
                # `!=` replaces the obsolescent `<>` spelling (same meaning).
                # The log terms are skipped for zero counts.
                if y[j] != 0:
                    dev += 2 * y[j] * (np.log(y[j]) - np.log(y_hat[j])) * w_i[j]
                    dev0 += 2 * y[j] * (np.log(y[j]) - np.log(ybar * y_offset[j])) * w_i[j]
                # NOTE(review): the nesting was lost in the flattened source.
                # These two updates are applied for every j, as the Poisson
                # deviance 2*sum(y*log(y/mu) - (y - mu)) requires - confirm.
                dev -= 2 * (y[j] - y_hat[j]) * w_i[j]
                dev0 -= 2 * (y[j] - ybar * y_offset[j]) * w_i[j]
            localR2[i] = 1.0 - dev / dev0

    elif mType == 2:  # Logistic model: 1 - weighted deviance / weighted null deviance
        for i in range(n_pred):
            w_i = np.reshape(np.array(weit.w[i]), (-1, 1))
            sum_yw = np.sum(y * w_i)
            ybar = 1.0 * sum_yw / np.sum(w_i)
            # null-model terms do not depend on j, so compute them once per i
            nu0 = np.log(ybar / (1 - ybar))
            log_one_minus_ybar = np.log(1.0 - ybar)
            dev = 0.0
            dev0 = 0.0
            for j in range(n_reg):
                if (1.0 - y_hat[j] < 1e-10):
                    # fitted probability is numerically 1: clamp 1 - y_hat to
                    # 1e-10 so neither the division nor the log blows up
                    nu = np.log(y_hat[j] / 1e-10)
                    dev += -2 * (y[j] * nu + np.log(1e-10)) * w_i[j]
                else:
                    nu = np.log(y_hat[j] / (1.0 - y_hat[j]))
                    dev += -2 * (y[j] * nu + np.log(1.0 - y_hat[j])) * w_i[j]
                dev0 += -2 * (y[j] * nu0 + log_one_minus_ybar) * w_i[j]
            localR2[i] = 1.0 - dev / dev0

    return pred_betas, pred_stdErr, pred_tstat, localR2
# Split every record into its (x, y) coordinates (first two fields) and its
# attribute values (remaining fields).
coords = {}
for i in range(nobs):
    coords[i] = dic_data[i][:2]
    lst_data.append(dic_data[i][2:])
arr_data = np.array(lst_data)

# Build the regression inputs: column 0 is y, column 1 the offset, the
# remaining columns are the explanatory variables, prefixed by a constant.
y = arr_data[:, 0].reshape((-1, 1))
y_off = arr_data[:, 1].reshape((-1, 1))
x = np.hstack((np.ones(y.shape), arr_data[:, 2:]))

# ---------------------------------------------------------------------------
# 1. GWR Poisson (adaptive bandwidth: bisquare)
# ---------------------------------------------------------------------------
band = 100
weit = Kernel.GWR_W(coords, band, 3)

# Print a timestamp before and after the fit to time it.
begin_t = datetime.now()
print(begin_t)
myMod = GWGLM(y, x, weit, 1, y_off, False, 1e-6, 200, 'db2564', 'eb2564',
              ['OCC_TEC', 'OWNH', 'POP65', 'UNEMP'], flePath, True)
end_t = datetime.now()
print(end_t)

# Attributes that were printed while debugging: myMod.Betas[:5],
# myMod.std_err[:5], myMod.nObs, myMod.nVars, myMod.tr_S, myMod.tr_SWSTW,
# myMod.tr_STS, myMod.y_pred[:5]
def G2L(y, x_glob, x_loc, coords, mType=0, wType=3, y_off=None, orig_mod=None, criterion=0, bdinfo=0, band=0, maxVal=0.0, minVal=0.0, interval=0.0, tol=1.0e-2, maxIter=50):
    """
    Variable selection: global to local

    Greedy search. Each global column in turn is tentatively moved into the
    local set and the model is refitted. The move is kept when the criterion
    decreases. Passes over the remaining global columns are repeated until a
    whole pass brings no improvement.

    Arguments
    ----------
        y          : array
                     n*1, dependent variable.
        x_glob     : array
                     n*k1, fixed independent variable.
        x_loc      : array
                     n*k2, local independent variable, including constant (None if there is none).
        coords     : dictionary
                     including (x,y) coordinates involved in the weight evaluation (including point i)
        mType      : integer
                     GWR model type, 0: Gaussian, 1: Poisson, 2: Logistic
        wType      : integer
                     weight type
        y_off      : array
                     n*1, offset variable for Poisson model
        orig_mod   : object of GWR model
                     original model; its `kernel` attribute is used as starting kernel
        criterion  : integer
                     bandwidth selection criterion, 0: AICc, 1: AIC, 2: BIC, 3: CV
        bdinfo     : integer
                     bandwidth searching method: 0: golden search 1: interval 2: fixed single bandwidth
        band       : float
                     given bandwidth if bdinfo=2
        maxVal     : float
                     maximum value used in bandwidth searching
        minVal     : float
                     minimum value used in bandwidth searching
        interval   : float
                     interval used in interval search
        tol        : float
                     tolerance used to determine convergence
        maxIter    : integer
                     maximum number of iteration if convergence cannot arrive at the tolerance

    Return:
        varsL      : list,
                     ids (column indices of x_glob) moved to the local Xs
        varsG      : list,
                     ids (column indices of x_glob) kept as global Xs
        optband    : list
                     info of optimal bandwidth searching results, one entry per search run
        optWeit    : kernel
                     kernel of best model, None if no kernel was ever built
        optcri     : float
                     criterion value for optimal model, None if x_glob has no columns
    """
    nObs = len(y)
    nVars_glob = len(x_glob[0])
    if x_loc is None:
        nVars_loc = 0
        tmp_loc = np.zeros(shape=(nObs, 0))
    else:
        nVars_loc = len(x_loc[0])
        tmp_loc = x_loc
    nVars = nVars_loc + nVars_glob
    optband = []

    # Defaults so that the return statement is always defined: previously
    # these names were unbound when x_glob had no columns, and optWeit was
    # unbound when the start model was global and nothing improved.
    orilist = []
    optWeit = None
    cri_old = None

    search_band = bdinfo == 0 or bdinfo == 1  # golden or interval search

    if nVars_glob > 0:
        # 1 set original model
        if orig_mod is None:
            if x_loc is None:  # global model
                gwrMod_old = GLM_Base(y, x_glob, mType, y_off)
                cri_old = getDiag_GLM[criterion](gwrMod_old)
            else:  # should be mixed model
                # check original bandwidth
                if search_band:
                    rs = M_selection.Band_Sel(y, x_glob, x_loc, coords, mType, y_off, wType, criterion, bdinfo, maxVal, minVal, interval, tol, maxIter)
                    band = rs[0]
                    weit = rs[1]
                    optband.append(rs)
                else:  # set original kernel
                    weit = Kernel.GWR_W(coords, band, wType)
                optWeit = weit
                gwrMod_old = semiGWR_Base(y, x_glob, x_loc, weit, mType, y_off)
                # get original diagnostics
                if mType == 0:
                    cri_old = getDiag_GWR[criterion](gwrMod_old)
                else:
                    cri_old = getDiag_GWGLM[criterion](gwrMod_old)
        else:
            gwrMod_old = orig_mod
            weit = orig_mod.kernel
            optWeit = weit
            # The baseline criterion was never computed on this path, so the
            # first `cri_new < cri_old` comparison raised a NameError. Score
            # the supplied model with the same rule as the branch above.
            # NOTE(review): assumes orig_mod is accepted by these diagnostics.
            if x_loc is None:
                cri_old = getDiag_GLM[criterion](gwrMod_old)
            elif mType == 0:
                cri_old = getDiag_GWR[criterion](gwrMod_old)
            else:
                cri_old = getDiag_GWGLM[criterion](gwrMod_old)

        # 2 loop
        orilist = list(range(nVars_glob))  # ids of original global Xs
        improved = True
        while improved:  # until no improvement in one pass over orilist
            improved = False
            outlist = []  # ids of Xs moved from global to local in this pass
            n_currXs = len(orilist)

            # set global x: columns of x_glob in orilist order
            tmp_glob = np.zeros(shape=(nObs, 0))
            for i in orilist:
                tmp_glob = np.hstack((tmp_glob, np.reshape(x_glob[:, i], (-1, 1))))

            for i in range(n_currXs):
                idx = orilist[i]

                # try to remove ith x from the global set. Rejected columns
                # are put back at the FRONT of tmp_glob and accepted ones are
                # gone, so the next untested column sits at i - len(outlist).
                x_out = np.reshape(x_glob[:, idx], (-1, 1))
                tmp_glob = np.delete(tmp_glob, i - len(outlist), 1)

                # get new x_loc
                tmp_loc = np.hstack((tmp_loc, x_out))

                # new bandwidth
                if search_band:
                    rs = M_selection.Band_Sel(y, tmp_glob, tmp_loc, coords, mType, y_off, wType, criterion, bdinfo, maxVal, minVal, interval, tol, maxIter)
                    band = rs[0]
                    weit = rs[1]
                    optband.append(rs)
                else:  # new kernel
                    weit = Kernel.GWR_W(coords, band, wType)

                # decide whether is a local model
                if len(tmp_loc[0]) == nVars:  # every column is local now
                    gwrMod_new = GWGLM_Base(y, tmp_loc, weit, mType, y_off)
                    # NOTE(review): the bandwidth searches score Gaussian
                    # GWGLM_Base fits with getDiag_GWR, but here getDiag_GWGLM
                    # is used for every mType - confirm the values are
                    # comparable with cri_old.
                    cri_new = getDiag_GWGLM[criterion](gwrMod_new)
                else:  # should be mixed model
                    gwrMod_new = semiGWR_Base(y, tmp_glob, tmp_loc, weit, mType, y_off)
                    if mType == 0:  # get diagnostics
                        cri_new = getDiag_GWR[criterion](gwrMod_new)
                    else:
                        cri_new = getDiag_GWGLM[criterion](gwrMod_new)

                # check improvements
                if cri_new < cri_old:
                    # keep the move: x goes from global to local
                    outlist.append(idx)
                    cri_old = cri_new  # update criteria
                    improved = True
                    optWeit = weit
                else:
                    # undo the move: x goes back to the global set
                    tmp_loc = np.delete(tmp_loc, -1, 1)
                    tmp_glob = np.hstack((x_out, tmp_glob))

            orilist = list(set(orilist) - set(outlist))

    varsG = orilist
    varsL = list(set(range(nVars_glob)) - set(orilist))

    return varsL, varsG, optband, optWeit, cri_old