def inverse_scg(w, data, scalar, transpose=False, symmetric=False,
                threshold=0.00001, max_iterations=None):
    #multiplier = SP.identity(w.n) - (scalar*w.sparse)       # A      n x n
    count = 0                                                # k      scalar (step 1)
    run_tot = copy.copy(data)                                # z_k    n x 1  (step 1)
    #residuals = data - run_tot * multiplier                 # r_k    n x 1  (step 2)
    residuals = data - pysal.lag_spatial(w, scalar*data)
    #test1 = la.norm(residuals)                              # G_k    scalar (step 3)
    test1 = norm(residuals)
    directions = copy.copy(residuals)                        # d_k    n x 1  (step 6)
    while test1 > threshold:                                 # (step 4)
        count += 1                                           # (step 5)
        #changes = multiplier * directions                   # t      n x 1  (step 7)
        changes = directions - pysal.lag_spatial(w, scalar*directions)
        intensity = test1 / (np.dot(directions.T, changes))  # alpha  scalar (step 8)
        #int_dir = intensity * directions                    # (step 8)
        run_tot += intensity * directions
        #run_tot += int_dir                                  # (step 8)
        #residuals -= int_dir                                # (step 8)
        residuals -= intensity * changes
        #test2 = la.norm(residuals)                          # (step 3)
        test2 = norm(residuals)
        directions = residuals + ((test2/test1)*directions)  # (step 6)
        test1 = test2
    return run_tot
def set_endog(y, x, w, yend, q, w_lags, lag_q):
    # Create spatial lag of y
    yl = lag_spatial(w, y)
    # spatial and non-spatial instruments
    if issubclass(type(yend), np.ndarray):
        if lag_q:
            lag_vars = sphstack(x, q)
        else:
            lag_vars = x
        spatial_inst = get_lags(w, lag_vars, w_lags)
        q = sphstack(q, spatial_inst)
        yend = sphstack(yend, yl)
    elif yend is None:  # spatial instruments only
        q = get_lags(w, x, w_lags)
        yend = yl
    else:
        raise Exception("invalid value passed to yend")
    return yend, q
def get_lags(w, x, w_lags):
    '''
    Calculates a given order of spatial lags and all the smaller orders

    Parameters
    ----------
    w       : weight
              PySAL weights instance
    x       : array
              nxk arrays with the variables to be lagged
    w_lags  : integer
              Maximum order of spatial lag

    Returns
    --------
    rs      : array
              nxk*w_lags array with the spatially lagged variables
              (lag orders 1 through w_lags)
    '''
    lag = lag_spatial(w, x)
    spat_lags = lag
    for i in range(w_lags - 1):
        lag = lag_spatial(w, lag)
        spat_lags = sphstack(spat_lags, lag)
    return spat_lags
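# --- Illustrative usage sketch (added, not from the original source) ---
# A small, assumed example of what get_lags returns: with w_lags=2 and two
# columns in x, the result stacks [Wx, W(Wx)] side by side, giving n x 4.
import numpy as np
import pysal

w = pysal.lat2W(5, 5)            # assumed 5x5 rook lattice
w.transform = 'r'
x = np.random.random((w.n, 2))   # two exogenous variables
inst = get_lags(w, x, 2)
print(inst.shape)                # (25, 4)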
def __calc(self, z):
    zl = pysal.lag_spatial(self.w, z)
    bb = sum(z * zl) / 2.0
    zw = 1 - z
    zl = pysal.lag_spatial(self.w, zw)
    ww = sum(zw * zl) / 2.0
    bw = self.J - (bb + ww)
    return (bb, ww, bw)
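# --- Illustrative check (added, not from the original source) ---
# The bb term above is the black-black join count: with binary weights the
# spatial lag counts, for each unit, how many neighbors are coded 1, and each
# 1-1 join is seen from both ends, hence the division by two. The toy setup
# below (lattice, binary transform, random 0/1 vector) is assumed for illustration.
import numpy as np
import pysal

w = pysal.lat2W(4, 4)
w.transform = 'b'
z = np.random.randint(0, 2, w.n)
zl = pysal.lag_spatial(w, z)      # per unit: number of neighbors coded 1
bb = (z * zl).sum() / 2.0
print(bb)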
def get_x_lag(self, w, regimes_att):
    if regimes_att:
        xlag = ps.lag_spatial(w, regimes_att['x'])
        xlag = REGI.Regimes_Frame.__init__(self, xlag,
                                           regimes_att['regimes'],
                                           constant_regi=None,
                                           cols2regi=regimes_att['cols2regi'])[0]
        xlag = xlag.toarray()
    else:
        xlag = ps.lag_spatial(w, self.x)
    return xlag
def __init__(self, y, w, permutations=0, significance_level=0.05): y = y.transpose() pml = pysal.Moran_Local ################################################################# # have to optimize conditional spatial permutations over a # time series - this is a place holder for the foreclosure paper ml = [pml(yi, w, permutations=permutations) for yi in y] ################################################################# q = np.array([mli.q for mli in ml]).transpose() classes = np.arange(1, 5) # no guarantee all 4 quadrants are visited Markov.__init__(self, q, classes) self.q = q self.w = w n, k = q.shape k -= 1 self.significance_level = significance_level move_types = np.zeros((n, k), int) sm = np.zeros((n, k), int) self.significance_level = significance_level if permutations > 0: p = np.array([mli.p_z_sim for mli in ml]).transpose() self.p_values = p pb = p <= significance_level else: pb = np.zeros_like(y.T) for t in range(k): origin = q[:, t] dest = q[:, t + 1] p_origin = pb[:, t] p_dest = pb[:, t] for r in range(n): move_types[r, t] = TT[origin[r], dest[r]] key = (origin[r], dest[r], p_origin[r], p_dest[r]) sm[r, t] = MOVE_TYPES[key] if permutations > 0: self.significant_moves = sm self.move_types = move_types # null of own and lag moves being independent ybar = y.mean(axis=0) r = y / ybar ylag = np.array([pysal.lag_spatial(w, yt) for yt in y]) rlag = ylag / ybar rc = r < 1. rlagc = rlag < 1. markov_y = pysal.Markov(rc) markov_ylag = pysal.Markov(rlagc) A = np.matrix([[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0]]) kp = A * np.kron(markov_y.p, markov_ylag.p) * A.T trans = self.transitions.sum(axis=1) t1 = np.diag(trans) * kp t2 = self.transitions t1 = t1.getA() self.chi_2 = pysal.spatial_dynamics.markov.chi2(t1, t2) self.expected_t = t1 self.permutations = permutations
def __calc(self, z, op):
    if op == 'c':     # cross-product
        zl = pysal.lag_spatial(self.w, z)
        g = (z * zl).sum()
    elif op == 's':   # squared difference
        zs = np.zeros(z.shape)
        z2 = z ** 2
        for i, i0 in enumerate(self.w.id_order):
            neighbors = self.w.neighbor_offsets[i0]
            wijs = self.w.weights[i0]
            zw = zip(neighbors, wijs)
            zs[i] = sum([wij * (z2[i] - 2.0 * z[i] * z[j] + z2[j])
                         for j, wij in zw])
        g = zs.sum()
    elif op == 'a':   # absolute difference
        zs = np.zeros(z.shape)
        for i, i0 in enumerate(self.w.id_order):
            neighbors = self.w.neighbor_offsets[i0]
            wijs = self.w.weights[i0]
            zw = zip(neighbors, wijs)
            zs[i] = sum([wij * abs(z[i] - z[j]) for j, wij in zw])
        g = zs.sum()
    else:             # any previously defined function op
        zs = np.zeros(z.shape)
        for i, i0 in enumerate(self.w.id_order):
            neighbors = self.w.neighbor_offsets[i0]
            wijs = self.w.weights[i0]
            zw = zip(neighbors, wijs)
            zs[i] = sum([wij * op(z, i, j) for j, wij in zw])
        g = zs.sum()
    return g
def moran_dispersao(IM, title='', xlabel='', ylabel=''):
    y_norm = normalizar(IM.y)
    y_lag = ps.lag_spatial(IM.w, IM.y)
    y_lag_norm = normalizar(y_lag)
    dados = pd.DataFrame({'y': IM.y,
                          'y_norm': y_norm,
                          'y_lag': y_lag,
                          'y_lag_norm': y_lag_norm})

    f, ax = plt.subplots(1, figsize=(7, 5))

    sns.regplot('y_norm', 'y_lag_norm', data=dados, ci=None,
                color='black', line_kws={'color': 'red'})
    plt.axvline(0, c='gray', alpha=0.7)
    plt.axhline(0, c='gray', alpha=0.7)

    limits = np.array([y_norm.min(), y_norm.max(),
                       y_lag_norm.min(), y_lag_norm.max()])
    limits = np.abs(limits).max()
    border = 0.02
    ax.set_xlim(-limits - border, limits + border)
    ax.set_ylim(-limits - border, limits + border)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()
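# --- Hypothetical call (added, not from the original source) ---
# Assumes the `normalizar` (z-score) helper plus the pandas/seaborn/matplotlib
# imports used inside moran_dispersao are already in scope; the columbus data
# and the variable choice are illustrative only.
import numpy as np
import pysal as ps

w = ps.queen_from_shapefile(ps.examples.get_path('columbus.shp'))
w.transform = 'r'
y = np.array(ps.open(ps.examples.get_path('columbus.dbf')).by_col('HOVAL'))
mi = ps.Moran(y, w)

moran_dispersao(mi, title='Moran scatterplot',
                xlabel='y (standardized)', ylabel='spatial lag of y (standardized)')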
def mplot(m, xlabel='', ylabel='', title='', custom=(7, 7)): """ Produce basic Moran Plot Parameters ---------- m : pysal.Moran instance values of Moran's I Global Autocorrelation Statistic xlabel : str label for x axis ylabel : str label for y axis title : str title of plot custom : tuple dimensions of figure size Returns ------- fig : Matplotlib Figure instance Moran scatterplot figure Examples -------- >>> import matplotlib.pyplot as plt >>> import pysal as ps >>> from pysal.contrib.pdio import read_files >>> from pysal.contrib.viz.plot import mplot >>> link = ps.examples.get_path('columbus.shp') >>> db = read_files(link) >>> y = db['HOVAL'].values >>> w = ps.queen_from_shapefile(link) >>> w.transform = 'R' >>> m = ps.Moran(y, w) >>> mplot(m, xlabel='Response', ylabel='Spatial Lag', ... title='Moran Scatterplot', custom=(7,7)) >>> plt.show() """ lag = ps.lag_spatial(m.w, m.z) fit = ps.spreg.OLS(m.z[:, None], lag[:, None]) # Customize plot fig = plt.figure(figsize=custom) ax = fig.add_subplot(111) ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) fig.suptitle(title) ax.scatter(m.z, lag, s=60, color='k', alpha=.6) ax.plot(lag, fit.predy, color='r') ax.axvline(0, alpha=0.5) ax.axhline(0, alpha=0.5) return fig
def moran_plot(IM):
    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt
    import numpy as np
    import pysal as ps

    y_norm = normalize(IM.y)
    y_lag = ps.lag_spatial(IM.w, IM.y)
    y_lag_norm = normalize(y_lag)
    dados = pd.DataFrame({'y': IM.y,
                          'y_norm': y_norm,
                          'y_lag': y_lag,
                          'y_lag_norm': y_lag_norm})

    f, ax = plt.subplots(1, figsize=(7, 5))

    sns.regplot('y_norm', 'y_lag_norm', data=dados, ci=None,
                color='black', line_kws={'color': 'red'})
    plt.axvline(0, c='gray', alpha=0.7)
    plt.axhline(0, c='gray', alpha=0.7)

    limits = np.array([y_norm.min(), y_norm.max(),
                       y_lag_norm.min(), y_lag_norm.max()])
    limits = np.abs(limits).max()
    border = 0.02
    ax.set_xlim(-limits - border, limits + border)
    ax.set_ylim(-limits - border, limits + border)
    plt.show()
def test_lag_spatial(self):
    yl = pysal.lag_spatial(self.w, self.y)
    np.testing.assert_array_almost_equal(yl, [1., 2., 1.])
    self.w.id_order = ['b', 'c', 'a']
    y = np.array([1, 2, 0])
    yl = pysal.lag_spatial(self.w, y)
    np.testing.assert_array_almost_equal(yl, [2., 1., 1.])
    w = pysal.lat2W(3, 3)
    y = np.arange(9)
    yl = pysal.lag_spatial(w, y)
    ylc = np.array([4., 6., 6., 10., 16., 14., 10., 18., 12.])
    np.testing.assert_array_almost_equal(yl, ylc)
    w.transform = 'r'
    yl = pysal.lag_spatial(w, y)
    ylc = np.array([2., 2., 3., 3.33333333, 4., 4.66666667, 5., 6., 6.])
    np.testing.assert_array_almost_equal(yl, ylc)
def rose(self, Y, w, k=8):
    sw = 2 * np.pi / k
    cuts = np.arange(0.0, 2 * np.pi + sw, sw)
    wY = ps.lag_spatial(w, Y)
    dx = Y[:, -1] - Y[:, 0]
    dy = wY[:, -1] - wY[:, 0]
    theta = np.arctan2(dy, dx)
    neg = theta < 0.0
    utheta = theta * (1 - neg) + neg * (2 * np.pi + theta)
    return cuts, utheta, dx, dy
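# --- Hypothetical follow-up sketch (added, not from the original source) ---
# One way the returned angles could be binned and drawn as a rose diagram;
# `obj`, `Y` and `w` are assumed to exist, and the polar plot is illustrative.
import numpy as np
import matplotlib.pyplot as plt

cuts, utheta, dx, dy = obj.rose(Y, w, k=8)
counts, _ = np.histogram(utheta, cuts)

ax = plt.subplot(111, projection='polar')
ax.bar(cuts[:-1], counts, width=2 * np.pi / 8)
plt.show()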
def drawW(rel, w, k):
    r = np.random.permutation(rel)
    wy = pysal.lag_spatial(w, r)
    y = wy[:, -1] - wy[:, 0]
    x = r[:, -1] - r[:, 0]
    theta = np.arctan2(y, x)
    neg = theta < 0.0
    utheta = theta * (1 - neg) + neg * (2 * np.pi + theta)
    k = 8  # note: overrides the k argument passed in
    width = 2 * np.pi / k
    cuts = np.arange(0.0, 2 * np.pi + width, width)
    counts, bin = np.histogram(utheta, cuts)
    return counts
def run_sim(df, start, end, sim, models=[], tsvars=[], spatvars=[],
            transformvars=[], transformvars_post=[]):
    nunits = len(df.loc[start].index)
    tsstreams = [streamers.init_order(nunits, tsvar) for tsvar in tsvars]

    # Seed the streamers
    for stream in tsstreams:
        for value, streamer in zip(df.loc[start - 1, stream['name']].values,
                                   stream['streamers']):
            streamer.seed(value)

    # load the weight matrices
    for sdict in spatvars:
        with open(sdict['path_weight'], 'rb') as p:
            w = pickle.load(p)
            #print(sdict['name'], "loaded", sdict['path_weight'])
            sdict.update({'w': w})

    for t in range(start, end + 1):
        for stream in tsstreams:
            update = streamers.tick(stream['streamers'],
                                    df.loc[t - 1, stream['var']].values)
            df.loc[t, stream['name']] = update

        for sdict in spatvars:
            update = pysal.lag_spatial(sdict['w'],
                                       df.loc[t, sdict['var']].values)
            df.loc[t, sdict['name']] = update

        for transform in transformvars:
            df = apply_transform(df, transform)

        for model in models:
            outputs, varnames = model.predict(sim=sim, data=df.ix[t])
            for output, varname in zip(outputs, varnames):
                df.loc[t, varname] = output

        for transform in transformvars_post:
            df = apply_transform(df, transform)

    return df
def log_lik_lag(ldet, w, b, X, y):
    n = w.n
    r = b[0]    # ml estimate of rho
    b = b[1:]   # ml for betas
    yl = ps.lag_spatial(w, y)
    ys = y - r * yl
    XX = np.dot(X.T, X)
    iXX = np.linalg.inv(XX)
    b = np.dot(iXX, np.dot(X.T, ys))
    yhat = r * yl + np.dot(X, b)
    e = y - yhat
    e2 = (e ** 2).sum()
    sig2 = e2 / n
    ln2pi = np.log(2 * np.pi)
    return ldet - n / 2. * ln2pi - n / 2. * np.log(sig2) - e2 / (2 * sig2)
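# --- Hedged evaluation sketch (added, not from the original source) ---
# Evaluates the lag log-likelihood at a trial rho using a brute-force dense
# log-Jacobian; w, X, y and an OLS `betas` (k x 1) are assumed to exist, and
# slogdet is only one of several ways to obtain ldet.
import numpy as np

rho = 0.4
W = w.full()[0]
sign, ldet = np.linalg.slogdet(np.eye(w.n) - rho * W)
b = np.vstack(([[rho]], betas))   # rho stacked on top of the betas, matching b[0]/b[1:] above
print(log_lik_lag(ldet, w, b, X, y))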
def _moran_scatterplot_calc(moran_loc, p):
    lag = ps.lag_spatial(moran_loc.w, moran_loc.z)
    fit = ps.spreg.OLS(moran_loc.z[:, None], lag[:, None])
    if p is not None:
        if not isinstance(moran_loc, Moran_Local):
            raise ValueError("`moran_loc` is not a esda.moran.Moran_Local instance")
        _, _, colors, _ = mask_local_auto(moran_loc, p=p)
    else:
        colors = 'black'

    data = {'moran_z': moran_loc.z, 'lag': lag,
            'colors': colors, 'fit_y': fit.predy.flatten(),
            'moranloc_psim': moran_loc.p_sim, 'moranloc_q': moran_loc.q}
    return data
def AddLagVars(dataframe, shp, idfield, SumVars, AvgVars, Adj=None):
    df = dataframe.copy()
    if idfield not in df.columns:
        print 'Dataframe missing id field'
    if idfield not in pysal.open(shp[:-3] + 'dbf').header:
        print 'Shp missing id field'
    if Adj is None:
        Adj = pysal.queen_from_shapefile(shp, idVariable=idfield)
    df.set_index(idfield, inplace=True)
    df = df.reindex(Adj.id_order)
    Adj.transform = 'o'
    for Var in SumVars:
        df[Var + '_LAG_SUM'] = pysal.lag_spatial(Adj, np.array(df[Var]))
    Adj.transform = 'r'
    for Var in AvgVars:
        df[Var + '_LAG_AVG'] = pysal.lag_spatial(Adj, np.array(df[Var]))
    df.reset_index(inplace=True)
    return df
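# --- Hypothetical call (added, not from the original source) ---
# Shapefile and column names below are made up; the helper appends
# '<VAR>_LAG_SUM' (binary weights) and '<VAR>_LAG_AVG' (row-standardized
# weights) columns to a copy of the dataframe.
df_lagged = AddLagVars(df, 'counties.shp', 'GEOID',
                       SumVars=['POP2010'], AvgVars=['MED_INCOME'])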
def run(self, path): W = self.loadWeights() if W == None: return False if self.verify() == False: return False else: print "running" # print self.data['wtFiles'][self.data['wtFile']] newVars = [var.run() for var in self.newVars] names = [v[0] for v in newVars] vars = [v[1] for v in newVars] db = self.db() xid = [db.header.index(i) for i in vars] X = [db[:, i] for i in xid] lag = [pysal.lag_spatial(W, y) for y in X] lag = zip(*lag) # transpose lag = map(list, lag) new_header = db.header + names if path.endswith('.dbf'): new_spec = db.field_spec + [('N', 20, 10) for n in names] data = db.read() db.close() newdb = pysal.open(path, 'w') newdb.header = new_header newdb.field_spec = new_spec for i, row in enumerate(data): newdb.write(row + lag[i]) newdb.close() elif path.endswith('.csv'): data = db.read() db.close() newdb = pysal.open(path, 'wb') writer = csv.writer(newdb) writer.writerow(new_header) for i, row in enumerate(data): writer.writerow(row + lag[i]) newdb.close()
def mplot(m, xlabel='', ylabel='', title='', custom=(7,7)): ''' Produce basic Moran Plot ... Parameters --------- m : array values of Moran's I xlabel : str label for x axis ylabel : str label for y axis title : str title of plot custom : tuple dimensions of figure size Returns --------- plot : png image file showing plot ''' lag = ps.lag_spatial(m.w, m.z) fit = ps.spreg.OLS(m.z[:, None], lag[:,None]) ## Customize plot fig = plt.figure(figsize=custom) plt.xlabel(xlabel) plt.ylabel(ylabel) plt.suptitle(title) plt.scatter(m.z, lag, s=60, color='k', alpha=.6) plt.plot(lag, fit.predy, color='r') plt.axvline(0, alpha=0.5) plt.axhline(0, alpha=0.5) plt.show() return None
def _calc(self, y, w, classes, k): # lag markov ly = pysal.lag_spatial(w, y) npm = np.matrix npa = np.array if self.fixed: l_classes = pysal.Quantiles(ly.flatten(), k=k).yb l_classes.shape = ly.shape else: l_classes = npa([ pysal.Quantiles(ly[:, i], k=k).yb for i in np.arange(self.cols) ]) l_classes = l_classes.transpose() l_classic = Markov(l_classes) T = np.zeros((k, k, k)) n, t = y.shape for t1 in range(t - 1): t2 = t1 + 1 for i in range(n): T[l_classes[i, t1], classes[i, t1], classes[i, t2]] += 1 P = np.zeros_like(T) F = np.zeros_like(T) # fmpt ss = np.zeros_like(T[0]) for i, mat in enumerate(T): row_sum = mat.sum(axis=1) row_sum = row_sum + (row_sum == 0) p_i = np.matrix(np.diag(1. / row_sum) * np.matrix(mat)) #print i #print mat #print p_i ss[i] = steady_state(p_i).transpose() try: F[i] = fmpt(p_i) except: #pylint; "No exception type(s) specified" print "Singlular fmpt matrix for class ", i P[i] = p_i return T, P, ss, F
def hac_multi(reg, gwk, constant=False): """ HAC robust estimation of the variance-covariance matrix for multi-regression object Parameters ---------- reg : Regression object (OLS or TSLS) output instance from a regression model gwk : PySAL weights object Spatial weights based on kernel functions Returns -------- psi : kxk array Robust estimation of the variance-covariance """ if not constant: reg.hac_var = check_constant(reg.hac_var) xu = spbroadcast(reg.hac_var, reg.u) gwkxu = lag_spatial(gwk, xu) psi0 = spdot(xu.T, gwkxu) counter = 0 for m in reg.multi: reg.multi[m].robust = 'hac' reg.multi[m].name_gwk = reg.name_gwk try: psi1 = spdot(reg.multi[m].varb, reg.multi[m].zthhthi) reg.multi[m].vm = spdot(psi1, np.dot(psi0, psi1.T)) except: reg.multi[m].vm = spdot( reg.multi[m].xtxi, np.dot(psi0, reg.multi[m].xtxi)) reg.vm[(counter * reg.kr):((counter + 1) * reg.kr), (counter * reg.kr):((counter + 1) * reg.kr)] = reg.multi[m].vm counter += 1
def moran_scatter_plot(shp, dbf, var, w):
    y = np.array(dbf.by_col[var])
    y_lag = pysal.lag_spatial(w, y)

    y_z = (y - y.mean()) / y.std()
    y_lag_z = (y_lag - y_lag.mean()) / y_lag.std()

    global SHP_DICT
    uuid = SHP_DICT[shp]

    global WS_SERVER
    ws = create_connection(WS_SERVER)
    msg = {
        "command": "moran_scatter_plot",
        "uuid": uuid,
        "title": "Moran Scatter plot for variable [%s]" % var,
        "data": {"x": y_z.tolist(), "y": y_lag_z.tolist()},
        "fields": [var, "lagged %s" % var]
    }
    str_msg = json.dumps(msg)
    ws.send(str_msg)
    #print "send:", str_msg
    ws.close()
def grafico(self): """Grafico de dispersion""" w=self.w y=self.y ystd=(y-np.mean(y))/np.std(y) w.transform = 'r' yl = pysal.lag_spatial(w,ystd) colors = np.random.rand(100) """area = np.pi * (15 * np.random.rand(100))**2 # 0 to 15 point radiuses""" fig, ax = plt.subplots() m, fit = np.polyfit(ystd, yl, deg=1) """ax.plot(self.y, fit[0] * self.y + fit[1], color='blue', alpha=0.4, linewidth=0.3, linestyle='dotted')""" ax.scatter(ystd, yl, c=colors, alpha=0.5) ax.plot(ystd, m*ystd + fit, color='blue', alpha=0.4, linewidth=0.3) fig.suptitle('Moran`s I: '+str(round(self.mi.I,5))) plt.xlabel(self.fieldName) plt.ylabel('Spatial Lag '+self.fieldName) ax.set_yticks([np.mean(yl)],minor=False) ax.yaxis.set_major_locator(FixedLocator([round(np.mean(yl),5)])) ax.yaxis.grid(True) ax.set_xticks([np.mean(ystd), np.amax(ystd)],minor=True) ax.xaxis.set_major_locator(FixedLocator([round(np.mean(ystd),5)])) ax.xaxis.grid(True) """ax.spines['left'].set_position((y,np.mean(y)))""" """ax.spines['bottom'].set_position((yl,np.mean(yl)))""" """ax.set_yticklabels(['Bill', 'Jim'])""" """plt.grid(True)""" """fig.savefig('test.jpg')""" fig.show()
def run(self, path): if self.verify(): print "running" # print self.data['wtFiles'][self.data['wtFile']] newVars = [var.run() for var in self.newVars] names = [v[0] for v in newVars] vars = [v[1] for v in newVars] db = self.db() xid = [db.header.index(i) for i in vars] X = [db[:, i] for i in xid] W = self.loadWeights() lag = [pysal.lag_spatial(W, y) for y in X] lag = zip(*lag) # transpose lag = map(list, lag) new_header = db.header + names if path.endswith('.dbf'): new_spec = db.field_spec + [('N', 20, 10) for n in names] data = db.read() db.close() newdb = pysal.open(path, 'w') newdb.header = new_header newdb.field_spec = new_spec for i, row in enumerate(data): newdb.write(row + lag[i]) newdb.close() elif path.endswith('.csv'): data = db.read() db.close() newdb = pysal.open(path, 'wb') writer = csv.writer(newdb) writer.writerow(new_header) for i, row in enumerate(data): writer.writerow(row + lag[i]) newdb.close()
def calculate_lag_value(x):
    return ps.lag_spatial(W, x)
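# --- Hedged usage sketch (added, not from the original source) ---
# The helper closes over a module-level weights object W; lag_spatial accepts
# either an n-vector or an n x k array, so it can be mapped over columns or
# applied to the whole block at once. X is an assumed data array.
import numpy as np

x_lag = calculate_lag_value(X[:, 0])
X_lag = np.column_stack([calculate_lag_value(X[:, j]) for j in range(X.shape[1])])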
def spatial_trend(self, subquery, time_cols, num_classes=7, w_type='knn', num_ngbrs=5, permutations=0, geom_col='the_geom', id_col='cartodb_id'): """ Predict the trends of a unit based on: 1. history of its transitions to different classes (e.g., 1st quantile -> 2nd quantile) 2. average class of its neighbors Inputs: @param subquery string: e.g., SELECT the_geom, cartodb_id, interesting_time_column FROM table_name @param time_cols list of strings: list of strings of column names @param num_classes (optional): number of classes to break distribution of values into. Currently uses quantile bins. @param w_type string (optional): weight type ('knn' or 'queen') @param num_ngbrs int (optional): number of neighbors (if knn type) @param permutations int (optional): number of permutations for test stats @param geom_col string (optional): name of column which contains the geometries @param id_col string (optional): name of column which has the ids of the table Outputs: @param trend_up float: probablity that a geom will move to a higher class @param trend_down float: probablity that a geom will move to a lower class @param trend float: (trend_up - trend_down) / trend_static @param volatility float: a measure of the volatility based on probability stddev(prob array) """ if len(time_cols) < 2: plpy.error('More than one time column needs to be passed') params = { "id_col": id_col, "time_cols": time_cols, "geom_col": geom_col, "subquery": subquery, "num_ngbrs": num_ngbrs } result = self.data_provider.get_markov(w_type, params) # build weight weights = pu.get_weight(result, w_type) weights.transform = 'r' # prep time data t_data = get_time_data(result, time_cols) sp_markov_result = ps.Spatial_Markov(t_data, weights, k=num_classes, fixed=False, permutations=permutations) # get lag classes lag_classes = ps.Quantiles(ps.lag_spatial(weights, t_data[:, -1]), k=num_classes).yb # look up probablity distribution for each unit according to class and # lag class prob_dist = get_prob_dist(sp_markov_result.P, lag_classes, sp_markov_result.classes[:, -1]) # find the ups and down and overall distribution of each cell trend_up, trend_down, trend, volatility = get_prob_stats( prob_dist, sp_markov_result.classes[:, -1]) # output the results return zip(trend, trend_up, trend_down, volatility, weights.id_order)
def __init__(self, w, target_est_count=None, target_moe_count=None, target_th_count=None,\ target_est_prop=None, target_moe_prop=None, target_th_prop=None,\ target_est_ratio=None, target_moe_ratio=None, target_th_ratio=None,\ target_th_all=None, count_est=None, count_th_min=None, count_th_max=None,\ exclude=None, auto_exclude=0, base_solutions=100,\ zscore=True, pca=True, local_improvement=True, local_params=None,\ compactness=None, points=None, anchor=None, cardinality=False,\ cv_exclude_count=0, cv_exclude_prop=0, cv_exclude_ratio=0): time1 = time.time() time_output = { 'prep': 0, 'base': 0, 'base_wrapup': 0, 'local': 0, 'local_wrapup': 0, 'wrapup': 0, 'total': 0 } # convert arbitrary IDs in W object to integers id2i = w.id2i neighbors = { id2i[key]: [id2i[neigh] for neigh in w.neighbors[key]] for key in w.id_order } w = ps.W(neighbors) # build KDTree for use in finding base solution if issubclass(type(points), scipy.spatial.KDTree): kd = points points = kd.data elif type(points).__name__ == 'ndarray': kd = ps.common.KDTree(points) elif issubclass(type(points), ps.core.IOHandlers.pyShpIO.PurePyShpWrapper): #loop to find centroids, need to be sure order matches W and data centroids = [] for i in points: centroids.append(i.centroid) kd = ps.common.KDTree(centroids) points = kd.data elif points is None: kd = None else: raise Exception, 'Unsupported type passed to points' # dictionary allowing multivariate and univariate flexibility target_parts = {'target_est_count':target_est_count,\ 'target_est_prop':target_est_prop,\ 'target_est_ratio':target_est_ratio,\ 'target_sde_count':target_moe_count ,\ 'target_sde_prop':target_moe_prop,\ 'target_sde_ratio':target_moe_ratio} # setup the holder for the variables to minimize; later we will put all # the count, ratio and proportion variables into this array. # Also, convert MOEs to standard errors when appropriate total_vars = 0 rows = 0 if target_est_count is not None: rows, cols = target_est_count.shape total_vars += cols target_parts['target_est_count'] = target_est_count * 1.0 target_parts['target_sde_count'] = target_moe_count / 1.645 if target_est_prop is not None: rows, cols = target_est_prop.shape total_vars += cols / 2 target_parts['target_est_prop'] = target_est_prop * 1.0 target_parts['target_sde_prop'] = target_moe_prop / 1.645 if target_est_ratio is not None: rows, cols = target_est_ratio.shape total_vars += cols / 2 target_parts['target_est_ratio'] = target_est_ratio * 1.0 target_parts['target_sde_ratio'] = target_moe_ratio / 1.645 if total_vars == 0: target_est = None print 'warning: optimization steps will not be run since no target_est variables provided' else: target_est = np.ones((rows, total_vars)) * -999 # organize and check the input data; prep data for actual computations position = 0 target_th = [] # IMPORTANT: maintain the order of count then proportion then ratio if target_est_count is not None: target_est, target_th, position = mv_data_prep(target_est_count,\ target_th_count, target_th_all,\ target_est, target_th, position,\ scale=1, ratio=False) if target_est_prop is not None: target_est, target_th, position = mv_data_prep(target_est_prop,\ target_th_prop, target_th_all,\ target_est, target_th, position,\ scale=2, ratio=False) if target_est_ratio is not None: target_est, target_th, position = mv_data_prep(target_est_ratio,\ target_th_ratio, target_th_all,\ target_est, target_th, position,\ scale=2, ratio=True) target_th = np.array(target_th) # compute zscores # NOTE: zscores computed using all data, i.e. 
we do not screen out # observations in the exclude list. if zscore and target_est is not None: if pca: # Python does not currently have a widely used tool for # computing PCA with missing values. In principle, # NIPALS (Nonlinear Iterative Partial Least Squares) # can accommodate missing values, but the implementation in MDP # 3.4 will return a matrix of NAN values if there is an NAN # value in the input data. # http://sourceforge.net/p/mdp-toolkit/mailman/mdp-toolkit-users/?viewmonth=201111 # http://stats.stackexchange.com/questions/35561/imputation-of-missing-values-for-pca # Therefore, we impute the missing values when the user # requests PCA; compute the z-scores on the imputed data; and # then pass this on to the PCA step. # The imputation replaces a missing value with the average of # its neighbors (i.e., its spatial lag). If missing values # remain (due to missing values in a missing value's neighbor # set), then that value is replaced by the column average. w_standardized = copy.deepcopy(w) w_standardized.transform = 'r' target_est_lag = ps.lag_spatial(w_standardized, target_est) # replace troublemakers with their spatial lag trouble = np.isfinite(target_est) trouble = np.bitwise_not(trouble) target_est[trouble] = target_est_lag[trouble] del target_est_lag del trouble # Pandas ignores missing values by default, so we can # compute the z-score and retain the missing values target_est = pd.DataFrame(target_est) target_est = (target_est - target_est.mean(axis=0)) / target_est.std(axis=0) target_est = target_est.values if pca: # For the PCA case we need to replace any remaining missing # values with their column average. Since we now have z-scores, # we know that the average of every column is zero. # If it's not the PCA case, then we can leave the missing # values in as they will be ignored down the line. if np.isfinite(target_est.sum()) == False: trouble = np.isfinite(target_est) trouble = np.bitwise_not(trouble) target_est[trouble] = 0. del trouble # run principle components on target data (skip PCA if pca=False) # NOTE: matplotlib has deprecated PCA function, also it only uses SVD # which can get tripped up by bad data # NOTE: the logic here is to first identify the principle components and # then weight each component in preparation for future SSD # computations; we weight the data here so that we don't need to # weight the data each time the SSD is computed; in effect we want # to compute the SSD on each raw component and then weight that # component's contribution to the total SSD by the component's share # of total variance explained, since the SSD computation has a # squared term we can take the square root of the data now and then # not have to weight it later # NOTE: PCA computed using all data, i.e. we do not screen out # observations in the exclude list. if pca and target_est is not None: try: # eigenvector approach pca_node = MDP.nodes.PCANode() target_est = pca_node.execute( target_est) # get principle components except: try: # singular value decomposition approach pca_node = MDP.nodes.PCANode(svd=True) target_est = pca_node.execute( target_est) # get principle components except: # NIPALS would be a better approach than imputing # missing values entirely, but MDP 3.4 does not handle # missing values. Leaving this code as a place holder in # case MDP is updated later. ###pca_node = MDP.nodes.NIPALSNode() ###target_est = pca_node.execute(target_est) # get principle components raise Exception, "PCA not possible given input data and settings. 
Set zscore=True to automatically impute missing values or address missing values in advance." pca_variance = np.sqrt(pca_node.d / pca_node.total_variance) target_est = target_est * pca_variance # weighting for SSD # NOTE: the target_est variable is passed to the SSD function, and the # target_parts variable is passed to the feasibility test function # set the appropriate objective function plan build_region, enclave_test, local_test = function_picker(count_est,\ count_th_min, count_th_max, target_th_count,\ target_th_prop, target_th_ratio, target_th_all) # setup the CV computation get_cv = UTILS.get_mv_cv cv_exclude = [cv_exclude_count, cv_exclude_prop, cv_exclude_ratio] # setup areas to be excluded from computations if exclude: exclude = [id2i[j] for j in exclude] original_exclude = exclude[:] # in integer ID form else: original_exclude = [] # might consider an automated process to drop observations where # count_est=0; at this time the user would be expected to add these # observations to the exclude list time2 = time.time() time_output['prep'] = time2 - time1 # find the feasible solution with the most number of regions regions, id2region, exclude, enclaves = BASE.base_region_iterator(\ w, count_th_min, count_th_max, count_est, target_th, target_est,\ exclude, auto_exclude, get_cv, base_solutions,\ target_parts, build_region, enclave_test, kd, points, anchor, cardinality, cv_exclude) time3 = time.time() time_output['base'] = time3 - time2 problem_ids = list(set(exclude).difference(original_exclude)) if id2region == False: # Infeasible base run exit = "no feasible solution" time3a = time4 = time4a = time.time() else: if target_est is not None: # only compute SSDs if there are target_est variables start_ssds = np.array([ UTILS.sum_squares(region, target_est) for region in regions ]) else: start_ssds = np.ones(len(regions)) * -999.0 if compactness: # capture compactness from base solution start_compactness = UTILS.compactness_global( regions, compactness) if local_improvement and len(regions) > 1: # only run the local improvement if the appropriate flag is set # (local_improvement=True) and if there is more then one region to # swap areas between # swap areas along region borders that improve SSD time3a = time.time() regions, id2region, exit = \ LOCAL.local_search(regions, id2region, w, count_th_min, count_th_max,\ count_est, target_th, target_parts,\ target_est, exclude, get_cv,\ local_test, local_params, cv_exclude) time4 = time.time() # collect stats on SSD for each region end_ssds = np.array([ UTILS.sum_squares(region, target_est) for region in regions ]) ssd_improvement = (end_ssds - start_ssds) / start_ssds ssd_improvement[np.isnan( ssd_improvement )] = 0.0 # makes singleton regions have 0 improvement ssds = np.vstack((start_ssds, end_ssds, ssd_improvement)).T if compactness: # capture compactness from final solution end_compactness = UTILS.compactness_global( regions, compactness) compact_change = \ (end_compactness - start_compactness) / start_compactness compacts = np.vstack( (start_compactness, end_compactness, compact_change)).T else: compacts = np.ones((len(regions), 3)) * -999.0 time4a = time.time() else: time3a = time4 = time.time() # capture start SSDs and compactness, insert -999 for "improvements" ssds = np.vstack((start_ssds, np.ones(start_ssds.shape)*-999,\ np.ones(start_ssds.shape)*-999)).T if compactness: compacts = np.vstack((start_compactness, np.ones(start_compactness.shape)*-999,\ np.ones(start_compactness.shape)*-999)).T else: compacts = np.ones((len(regions), 3)) 
* -999.0 exit = 'no local improvement' print "Did not run local improvement" time4a = time.time() time_output['base_wrapup'] = time3a - time3 time_output['local'] = time4 - time3a time_output['local_wrapup'] = time4a - time4 #################### # process regionalization results for user output #################### # setup header for the pandas dataframes (estimates, MOEs, CVs) header = [] if target_est_count is not None: if 'pandas' in str(type(target_est_count)): header.extend(target_est_count.columns.tolist()) else: header.extend([ 'count_var' + str(i) for i in range(target_est_count.shape[1]) ]) if target_est_prop is not None: if 'pandas' in str(type(target_est_prop)): header.extend(target_est_count.prop.tolist()) else: header.extend([ 'prop_var' + str(i) for i in range(target_est_prop.shape[1] / 2) ]) if target_est_ratio is not None: if 'pandas' in str(type(target_est_ratio)): header.extend(target_est_ratio.columns.tolist()) else: header.extend([ 'ratio_var' + str(i) for i in range(target_est_ratio.shape[1] / 2) ]) # initialize pandas dataframes (estimates, MOEs, CVs; regions and areas) regionID = pd.Index(range(len(regions)), name='regionID') ests_region = pd.DataFrame(index=regionID, columns=header) moes_region = pd.DataFrame(index=regionID, columns=header) cvs_region = pd.DataFrame(index=regionID, columns=header) areaID = pd.Index(range(w.n), name='areaID') ests_area = pd.DataFrame(index=areaID, columns=header) moes_area = pd.DataFrame(index=areaID, columns=header) cvs_area = pd.DataFrame(index=areaID, columns=header) # setup header and pandas dataframe (count variable, if applicable) header = ['count'] if count_est is not None: if 'pandas' in str(type(count_est)): header = [count_est.columns[0]] counts_region = pd.DataFrame(index=range(len(regions)), columns=header) counts_area = pd.DataFrame(index=range(w.n), columns=header) # create SSD and compactness dataframes if id2region == False: # Infeasible base run ssds = None compacts = None else: ssds = pd.DataFrame( ssds, index=regionID, columns=['start_ssd', 'end_ssd', 'ssd_improvement']) compacts = pd.DataFrame(compacts, index=regionID, columns=[ 'start_compactness', 'end_compactness', 'compactness_improvement' ]) # this one-dimensional list will contain the region IDs (ordered by area) ordered_region_ids = np.ones(w.n) * -9999 for i, region in enumerate(regions): if count_est is not None: # get region totals for count variable counts_region.ix[i] = count_est[region].sum() for j in region: counts_area.ix[j] = count_est[j] ests = [] sdes = [] if target_est_count is not None: # est, MOE and CV for count data est, sde = UTILS.get_est_sde_count(region, target_parts) est[np.isnan(est)] = 0.0 # clean up 0/0 case sde[np.isnan(sde)] = 0.0 # clean up 0/0 case ests.extend(est) sdes.extend(sde) if target_est_prop is not None: # est, MOE and CV for proportion data est, sde = UTILS.get_est_sde_prop(region, target_parts) est[np.isnan(est)] = 0.0 # clean up 0/0 case sde[np.isnan(sde)] = 0.0 # clean up 0/0 case ests.extend(est) sdes.extend(sde) if target_est_ratio is not None: # est, MOE and CV for ratio data est, sde = UTILS.get_est_sde_ratio(region, target_parts) est[np.isnan(est)] = 0.0 # clean up 0/0 case sde[np.isnan(sde)] = 0.0 # clean up 0/0 case ests.extend(est) sdes.extend(sde) ests_region, moes_region, cvs_region = wrapup_region(\ i, ests, sdes, target_parts, ests_region, moes_region, cvs_region) ests_area, moes_area, cvs_area = wrapup_areas(\ region, target_parts, ests_area, moes_area, cvs_area) ordered_region_ids[region] = i # set 
excluded areas to region ID -999 ordered_region_ids[exclude] = -999 time5 = time.time() time_output['wrapup'] = time5 - time4 time_output['total'] = time5 - time1 self.exit = exit self.time = time_output self.enclaves = enclaves self.p = len(regions) self.regions = regions self.region_ids = ordered_region_ids.tolist() self.ssds = ssds self.compactness = compacts self.ests_region = ests_region self.moes_region = moes_region self.cvs_region = cvs_region self.ests_area = ests_area self.moes_area = moes_area self.cvs_area = cvs_area self.counts_region = counts_region self.counts_area = counts_area self.problem_ids = problem_ids
def ml_error(y, X, w, precrit=0.0000001, verbose=False, method='full'): """ Maximum likelihood of spatial error model Parameters ---------- y: dependent variable (nx1 array) w: spatial weights object X: explanatory variables (nxk array) precrit: convergence criterion verbose: boolen to print iterations in estimation method: method to use for evaluating jacobian term in concentrated likelihood function (FULL|ORD) where FULL=Brute Force, ORD = eigenvalue based jacobian Returns ------- Results: dictionary with estimates, standard errors, vcv, and z-values """ n = w.n n,k = X.shape yy = (y**2).sum() yl = ps.lag_spatial(w, y) ylyl = (yl**2).sum() Xy = np.dot(X.T,y) Xl = ps.lag_spatial(w, X) Xly = np.dot(Xl.T,y) + np.dot(X.T, yl) Xlyl = np.dot(Xl.T, yl) XX = np.dot(X.T, X) XlX = np.dot(Xl.T,X) + np.dot(X.T, Xl) XlXl = np.dot(Xl.T, Xl) yly = np.dot(yl.T, y) yyl = np.dot(y.T, yl) ylyl = np.dot(yl.T, yl) lam = 0 dlik, b, sig2, tr, dd = defer(w, lam, yy, yyl, ylyl, Xy, Xly, Xlyl, XX, XlX, XlXl) roots = np.linalg.eigvals(w.full()[0]) maxroot = 1./roots.max() minroot = 1./roots.min() delta = 0.0001 if dlik > 0: ll = lam ul = maxroot - delta else: ul = lam ll = minroot + delta # bisection t = 10 lam0 = (ll + ul) /2. i = 0 if verbose: line ="\nMaximum Likelihood Estimation of Spatial error Model" print line line ="%-5s\t%12s\t%12s\t%12s\t%12s"%("Iter.","LL","LAMBDA","UL","dlik") print line while abs(t - lam0) > precrit: if verbose: print "%d\t%12.8f\t%12.8f\t%12.8f\t%12.8f" % (i,ll, lam0, ul, dlik) i += 1 dlik, b, sig2, tr, dd = defer(w, lam0, yy, yyl, ylyl, Xy, Xly, Xlyl, XX, XlX, XlXl) if dlik > 0: ll = lam0 else: ul = lam0 t = lam0 lam0 = (ul + ll)/ 2. ldet = _logJacobian(w, lam0, method) llik = log_lik_error(ldet, w, b, lam0, X, y, sig2) # Info Matrix # l = lambda # B = betas # s = sigma2 # Vl ClB Cls # CBl VB CBs # Csl CsB Vs # Vll n,k = X.shape W = w.full()[0] B_inv = np.linalg.inv(np.eye(n) - lam0 * W) WB = np.dot(W, B_inv) trWB = np.trace(WB) Vl = trWB**2 + np.trace(np.dot(WB.T, WB)) # Cls Cls = trWB / sig2 # Vs Vs = n / (2.0 * sig2 * sig2) # VB XL = X - lam0 * ps.lag_spatial(w, X) VB = sig2 * np.linalg.inv(np.dot(XL.T, XL)) #Variance for l and s is inverse of 2x2 information matrix Infols = np.zeros((2,2)) Infols[0,0] = Vl Infols[1,0] = Cls Infols[0,1] = Cls Infols[1,1] = Vs Varls = np.linalg.inv(Infols) results = {} results['betas'] = b results['lambda'] = lam0 results['llik'] = llik results['sig2'] = sig2 results['std.error_B'] = np.sqrt(np.diag(VB)) results['std.error_l'] = np.sqrt(Varls[0,0]) results['method'] = method return results
def __init__(self, y, x, w, method='full', epsilon=0.0000001): # set up main regression variables and spatial filters self.y = y self.x = x self.n, self.k = self.x.shape self.method = method self.epsilon = epsilon #W = w.full()[0] #Wsp = w.sparse ylag = ps.lag_spatial(w, y) # b0, b1, e0 and e1 xtx = spdot(self.x.T, self.x) xtxi = la.inv(xtx) xty = spdot(self.x.T, self.y) xtyl = spdot(self.x.T, ylag) b0 = np.dot(xtxi, xty) b1 = np.dot(xtxi, xtyl) e0 = self.y - spdot(x, b0) e1 = ylag - spdot(x, b1) methodML = method.upper() # call minimizer using concentrated log-likelihood to get rho if methodML in ['FULL', 'LU', 'ORD']: if methodML == 'FULL': W = w.full()[0] # moved here res = minimize_scalar(lag_c_loglik, 0.0, bounds=(-1.0, 1.0), args=( self.n, e0, e1, W), method='bounded', tol=epsilon) elif methodML == 'LU': I = sp.identity(w.n) Wsp = w.sparse # moved here res = minimize_scalar(lag_c_loglik_sp, 0.0, bounds=(-1.0,1.0), args=(self.n, e0, e1, I, Wsp), method='bounded', tol=epsilon) elif methodML == 'ORD': # check on symmetry structure if w.asymmetry(intrinsic=False) == []: ww = symmetrize(w) WW = ww.todense() evals = la.eigvalsh(WW) else: W = w.full()[0] # moved here evals = la.eigvals(W) res = minimize_scalar(lag_c_loglik_ord, 0.0, bounds=(-1.0, 1.0), args=( self.n, e0, e1, evals), method='bounded', tol=epsilon) else: # program will crash, need to catch print("{0} is an unsupported method".format(methodML)) self = None return self.rho = res.x[0][0] # compute full log-likelihood, including constants ln2pi = np.log(2.0 * np.pi) llik = -res.fun - self.n / 2.0 * ln2pi - self.n / 2.0 self.logll = llik[0][0] # b, residuals and predicted values b = b0 - self.rho * b1 self.betas = np.vstack((b, self.rho)) # rho added as last coefficient self.u = e0 - self.rho * e1 self.predy = self.y - self.u xb = spdot(x, b) self.predy_e = inverse_prod( w.sparse, xb, self.rho, inv_method="power_exp", threshold=epsilon) self.e_pred = self.y - self.predy_e # residual variance self.sig2 = self.sig2n # no allowance for division by n-k # information matrix a = -self.rho * W np.fill_diagonal(a, 1.0) ai = la.inv(a) wai = np.dot(W, ai) tr1 = np.trace(wai) wai2 = np.dot(wai, wai) tr2 = np.trace(wai2) waiTwai = np.dot(wai.T, wai) tr3 = np.trace(waiTwai) wpredy = ps.lag_spatial(w, self.predy_e) wpyTwpy = np.dot(wpredy.T, wpredy) xTwpy = spdot(x.T, wpredy) # order of variables is beta, rho, sigma2 v1 = np.vstack( (xtx / self.sig2, xTwpy.T / self.sig2, np.zeros((1, self.k)))) v2 = np.vstack( (xTwpy / self.sig2, tr2 + tr3 + wpyTwpy / self.sig2, tr1 / self.sig2)) v3 = np.vstack( (np.zeros((self.k, 1)), tr1 / self.sig2, self.n / (2.0 * self.sig2 ** 2))) v = np.hstack((v1, v2, v3)) self.vm1 = la.inv(v) # vm1 includes variance for sigma2 self.vm = self.vm1[:-1, :-1] # vm is for coefficients only
       '"Far West 3/"']
snames = [name for name in names if name not in out]
sids = [names.index(name) for name in snames]
states = data[sids, :]
us = data[0]
from pylab import *
years = np.arange(1969, 2009)
rel = states / (us * 1.)
gal = pysal.open('states48.gal')
w = gal.read()
rt = rel.transpose()
w.transform = 'r'
wrel = pysal.lag_spatial(w, rel)
y1 = rel[:, 0]
wy1 = wrel[:, 0]
y2 = rel[:, -1]
wy2 = wrel[:, -1]
minx, miny = rel.min(), rel.min()
maxx, maxy = rel.max(), rel.max()
import matplotlib.pyplot as plt
import matplotlib.patches as mpp
fig = plt.figure()
dx = y2 - y1
dy = wy2 - wy1
## build weight weights = pu.get_weight(query_result, w_type) weights.transform = "r" ## prep time data t_data = get_time_data(query_result, time_cols) plpy.debug("shape of t_data %d, %d" % t_data.shape) plpy.debug("number of weight objects: %d, %d" % (weights.sparse).shape) plpy.debug("first num elements: %f" % t_data[0, 0]) sp_markov_result = ps.Spatial_Markov(t_data, weights, k=num_classes, fixed=False, permutations=permutations) ## get lag classes lag_classes = ps.Quantiles(ps.lag_spatial(weights, t_data[:, -1]), k=num_classes).yb ## look up probablity distribution for each unit according to class and lag class prob_dist = get_prob_dist(sp_markov_result.P, lag_classes, sp_markov_result.classes[:, -1]) ## find the ups and down and overall distribution of each cell trend_up, trend_down, trend, volatility = get_prob_stats(prob_dist, sp_markov_result.classes[:, -1]) ## output the results return zip(trend, trend_up, trend_down, volatility, weights.id_order) def get_time_data(markov_data, time_cols): """ Extract the time columns and bin appropriately """
# I didn't understand QURBRURX db1['POPDENS'] = db.ACS12_5yr_B01003001 / (db.SE_T02A_002 * 1.) # if no home value, assign the spatial lag of the estimate and SE homeval = db1['MHSEVAL_ALT'].copy() homeval_se = db.ACS12_5yr_B25077001s.copy() dbf = ps.open(os.path.join(spath, 'USA_Counties_500k.dbf')) # Rename dbf GEOIDs to match homeval geoid = dbf.by_col('geoFIPS') shp_fips = pd.DataFrame(dbf.by_col('geoFIPS'), index=geoid) shp_fips = shp_fips.join(homeval) shp_fips = shp_fips.join(homeval_se) shp_fips['MHSEVAL_ALT_LAG'] = ps.lag_spatial(w, shp_fips.MHSEVAL_ALT) shp_fips['MHSEVAL_ALT_LAG_SE'] = ps.lag_spatial(w, shp_fips.ACS12_5yr_B25077001s) mh = shp_fips.ix[shp_fips.MHSEVAL_ALT_LAG == 0].MHSEVAL_ALT.tolist() # Reassign values to MHSEVAL_ALT_LAG shp_fips.ix[shp_fips.MHSEVAL_ALT_LAG == 0, 'MHSEVAL_ALT_LAG'] = mh # Reassign missing standard error values mhs = shp_fips.ix[shp_fips.MHSEVAL_ALT_LAG_SE == 0].ACS12_5yr_B25077001s.tolist() shp_fips.ix[shp_fips.MHSEVAL_ALT_LAG_SE == 0, 'MHSEVAL_ALT_LAG_SE'] = mhs # Get rid of nan values - reassign MHSEVAL_ALT(_SE) shp_fips.MHSEVAL_ALT_LAG[np.isnan(shp_fips.MHSEVAL_ALT_LAG)] = \ shp_fips.MHSEVAL_ALT[np.isnan(shp_fips.MHSEVAL_ALT_LAG)] # replace NA with lag shp_fips.MHSEVAL_ALT_LAG_SE[np.isnan(shp_fips.MHSEVAL_ALT_LAG_SE)] = \
for s in finalSet: XVarsdummy.append(s) lst[s] = lst[s].astype(float) XVars = ['median_income', 'LivingArea', 'Age', 'num_trees'] yxs = lst.loc[:, XVars + [YVar]].dropna() yxs_dummy = lst.loc[:, XVarsdummy + [YVar]].dropna() y = lst[YVar] w = pysal.knnW_from_array(lst.loc[\ yxs.index, \ ['centroid_long', 'centroid_lat']\ ].values, k=30) w.transform = 'R' yxs = yxs.assign(w_res=pysal.lag_spatial(w, yxs_dummy['residential'].values)) """ yxs = yxs.assign(w_mixed=pysal.lag_spatial(w, yxs_dummy['mixed'].values)) yxs = yxs.assign(w_retail=pysal.lag_spatial(w, yxs_dummy['retail'].values)) yxs = yxs.assign(w_apt=pysal.lag_spatial(w, yxs_dummy['apt'].values)) yxs = yxs.assign(w_industrial=pysal.lag_spatial(w, yxs_dummy['industrial'].values)) yxs = yxs.assign(w_office=pysal.lag_spatial(w, yxs_dummy['office'].values)) yxs = yxs.assign(w_school=pysal.lag_spatial(w, yxs_dummy['school'].values)) yxs = yxs.assign(w_auto_shop=pysal.lag_spatial(w, yxs_dummy['auto_shop'].values)) yxs = yxs.assign(w_religious=pysal.lag_spatial(w, yxs_dummy['religious'].values)) yxs = yxs.assign(w_food=pysal.lag_spatial(w, yxs_dummy['food'].values)) yxs = yxs.assign(w_charitable=pysal.lag_spatial(w, yxs_dummy['charitable'].values)) yxs = yxs.assign(w_gov=pysal.lag_spatial(w, yxs_dummy['gov'].values)) yxs = yxs.assign(w_medical=pysal.lag_spatial(w, yxs_dummy['medical'].values)) yxs = yxs.assign(w_gas_mart=pysal.lag_spatial(w, yxs_dummy['gas_mart'].values)) """
def ml_lag(y, X, w, precrit=0.0000001, verbose=False, method='full'): """ Maximum likelihood estimation of spatial lag model Parameters ---------- y: dependent variable (nx1 array) w: spatial weights object X: explanatory variables (nxk array) precrit: convergence criterion verbose: boolen to print iterations in estimation method: method to use for evaluating jacobian term in concentrated likelihood function (FULL|ORD) where FULL=Brute Force, ORD = eigenvalue based jacobian Returns ------- Results: dictionary with estimates, standard errors, vcv, and z-values """ # step 1 OLS of X on y yields b1 d = np.linalg.inv(np.dot(X.T, X)) b1 = np.dot(d, np.dot(X.T, y)) # step 2 OLS of X on Wy: yields b2 wy = ps.lag_spatial(w,y) b2 = np.dot(d, np.dot(X.T, wy)) # step 3 compute residuals e1, e2 e1 = y - np.dot(X,b1) e2 = wy - np.dot(X,b2) # step 4 given e1, e2 find rho that maximizes Lc # ols estimate of rho XA = np.hstack((wy,X)) bols = np.dot(np.linalg.inv(np.dot(XA.T, XA)), np.dot(XA.T,y)) rols = bols[0][0] while np.abs(rols) > 1.0: rols = rols/2.0 if rols > 0.0: r1 = rols r2 = r1 / 5.0 else: r2 = rols r1 = r2 / 5.0 df1 = 0 df2 = 0 tr = 0 df1, tr = defl_lag(r1, w, e1, e2) df2, tr = defl_lag(r2, w, e1, e2) if df1*df2 <= 0: ll = r2 ul = r1 elif df1 >= 0.0 and df1 >= df2: ll = -0.999 ul = r2 df1 = df2 df2 = -(10.0**10) elif df1 >= 0.0 and df1 < df2: ll = r1 ul = 0.999 df2 = df1 df1 = -(10.0**10) elif df1 < 0.0 and df1 >= df2: ul = 0.999 ll = r1 df2 = df1 df1 = 10.0**10 else: ul = r2 ll = -0.999 df1 = df2 df2 = 10.0**10 # main bisection iteration err = 10 t = rols ro = (ll+ul) / 2. if verbose: line ="\nMaximum Likelihood Estimation of Spatial lag Model" print line line ="%-5s\t%12s\t%12s\t%12s\t%12s"%("Iter.","LL","RHO","UL","DFR") print line i = 0 while err > precrit: if verbose: print "%d\t%12.8f\t%12.8f\t%12.8f\t%12.8f" % (i,ll, ro, ul, df1) dfr, tr = defl_lag(ro, w, e1, e2) if dfr*df1 < 0.0: ll = ro else: ul = ro df1 = dfr err = np.abs(t-ro) t = ro ro =(ul+ll)/2. i += 1 ro = t tr1 = tr bml = b1 - (ro * b2) b = [ro,bml] xb = np.dot(X, bml) eml = y - ro * wy - xb sig2 = (eml**2).sum() / w.n # Likelihood evaluation ldet = _logJacobian(w, ro, method) llik = log_lik_lag(ldet, w, b, X, y) # Information matrix # Ipp IpB Ips # IBp IBB IBs # Isp IsB Iss # Ipp n,k = X.shape W = w.full()[0] A_inv = np.linalg.inv(np.eye(w.n) - ro * W) WA = np.dot(W, A_inv) tr1 = np.trace(WA) tr1 = tr1**2 tr2 = np.trace(np.dot(WA.T, WA)) WAXB = np.dot(WA,xb) Ipp = tr1 + tr2 + np.dot(WAXB.T, WAXB)/sig2 I = np.eye(w.n) IpB = np.dot(X.T, WAXB).T / sig2 Ips = np.trace(WA) / sig2 IBp = IpB.T IBB = np.dot(X.T,X) / sig2 Isp = Ips Iss = n / (2 * sig2 * sig2) results = {} results['betas'] = bml results['rho'] = ro results['llik'] = llik results['sig2'] = sig2 dim = k + 2 Info = np.zeros((dim,dim)) Info[0,0] = Ipp Info[0,1:k+1] = IpB Info[0,k+1] = Ips Info[1:k+1,0 ] = IpB Info[1:k+1, 1:k+1] = IBB Info[k+1,0 ] = Ips Info[k+1, k+1] = Iss VCV = np.linalg.inv(Info) se_b = np.sqrt(np.diag(VCV)[1:k+1]) se_b.shape = (k,1) z_b = bml/se_b se_rho = np.sqrt(VCV[0,0]) z_rho = ro / se_rho se_sig2 = np.sqrt(VCV[k+1,k+1]) results['se_b'] = se_b results['z_b'] = z_b results['se_rho'] = se_rho results['z_rho'] = z_rho results['se_sig2'] = se_sig2 results['VCV'] = VCV results['method'] = method return results
def robust_vm(reg, gwk=None, sig2n_k=False): """ Robust estimation of the variance-covariance matrix. Estimated by White (default) or HAC (if wk is provided). Parameters ---------- reg : Regression object (OLS or TSLS) output instance from a regression model gwk : PySAL weights object Optional. Spatial weights based on kernel functions If provided, returns the HAC variance estimation sig2n_k : boolean If True, then use n-k to rescale the vc matrix. If False, use n. (White only) Returns -------- psi : kxk array Robust estimation of the variance-covariance Examples -------- >>> import numpy as np >>> import pysal >>> from ols import OLS >>> from twosls import TSLS >>> db=pysal.open(pysal.examples.get_path("NAT.dbf"),"r") >>> y = np.array(db.by_col("HR90")) >>> y = np.reshape(y, (y.shape[0],1)) >>> X = [] >>> X.append(db.by_col("RD90")) >>> X.append(db.by_col("DV90")) >>> X = np.array(X).T Example with OLS with unadjusted standard errors >>> ols = OLS(y,X) >>> ols.vm array([[ 0.17004545, 0.00226532, -0.02243898], [ 0.00226532, 0.00941319, -0.00031638], [-0.02243898, -0.00031638, 0.00313386]]) Example with OLS and White >>> ols = OLS(y,X, robust='white') >>> ols.vm array([[ 0.24515481, 0.01093322, -0.03441966], [ 0.01093322, 0.01798616, -0.00071414], [-0.03441966, -0.00071414, 0.0050153 ]]) Example with OLS and HAC >>> wk = pysal.kernelW_from_shapefile(pysal.examples.get_path('NAT.shp'),k=15,function='triangular', fixed=False) >>> wk.transform = 'o' >>> ols = OLS(y,X, robust='hac', gwk=wk) >>> ols.vm array([[ 0.29213532, 0.01670361, -0.03948199], [ 0.01655557, 0.02295829, -0.00116874], [-0.03941483, -0.00119077, 0.00568314]]) Example with 2SLS and White >>> yd = [] >>> yd.append(db.by_col("UE90")) >>> yd = np.array(yd).T >>> q = [] >>> q.append(db.by_col("UE80")) >>> q = np.array(q).T >>> tsls = TSLS(y, X, yd, q=q, robust='white') >>> tsls.vm array([[ 0.29569954, 0.04119843, -0.02496858, -0.01640185], [ 0.04119843, 0.03647762, 0.004702 , -0.00987345], [-0.02496858, 0.004702 , 0.00648262, -0.00292891], [-0.01640185, -0.00987345, -0.00292891, 0.0053322 ]]) Example with 2SLS and HAC >>> tsls = TSLS(y, X, yd, q=q, robust='hac', gwk=wk) >>> tsls.vm array([[ 0.41985329, 0.06823119, -0.02883889, -0.02788116], [ 0.06867042, 0.04887508, 0.00497443, -0.01367746], [-0.02856454, 0.00501402, 0.0072195 , -0.00321604], [-0.02810131, -0.01364908, -0.00318197, 0.00713251]]) """ if hasattr(reg, 'h'): # If reg has H, do 2SLS estimator. OLS otherwise. tsls = True xu = spbroadcast(reg.h, reg.u) else: tsls = False xu = spbroadcast(reg.x, reg.u) if gwk: # If gwk do HAC. White otherwise. gwkxu = lag_spatial(gwk, xu) psi0 = spdot(xu.T, gwkxu) else: psi0 = spdot(xu.T, xu) if sig2n_k: psi0 = psi0 * (1. * reg.n / (reg.n - reg.k)) if tsls: psi1 = spdot(reg.varb, reg.zthhthi) psi = spdot(psi1, np.dot(psi0, psi1.T)) else: psi = spdot(reg.xtxi, np.dot(psi0, reg.xtxi)) return psi
        ...

        Arguments
        ---------
        var      : array
                   values of variable
        w        : array
                   values of spatial weight
        '''
        self.var = var
        self.w = w
        w.transform = 'r'
        slag = ps.lag_spatial(w, var)
        zx = (var - var.mean()) / var.std()
        zy = (slag - slag.mean()) / slag.std()
        fit = ps.spreg.OLS(zx[:, None], zy[:, None])

        ## Customize plot
        fig1 = plt.figure(figsize=custom)
        plt.xlabel(xlabel, fontsize=20)
        plt.ylabel(ylabel, fontsize=20)
        plt.suptitle(title, fontsize=30)
        plt.scatter(zx, zy, s=60, color='k', alpha=.6)
        plot(zy, fit.predy, color='r')
# In[ ]:

# Now we would like to standardize all the weights. This can be
# done by specifying 'R' as the matrix transformation.

w_knn3.transform = 'R'
w_knn5.transform = 'R'
w_knn9.transform = 'R'


# In[ ]:

# and then compute the spatial lag for all neighborhoods based
# on the spatial weight matrix. We also store this as a column
# named 'w_percent_knn3' in the original table.

sl = ps.lag_spatial(w_knn3, Y)
data['w_percent_knn3'] = sl


# In[ ]:

data.head()


# In[ ]:

#calculate moran's i
moran = ps.Moran(Y, w_knn3)


# In[ ]:
def rose(Y, w, k=8, permutations=0): """ Calculation of rose diagram for local indicators of spatial association Parameters ---------- Y: array (n,2) variable observed on n spatial units over 2 time periods w: spatial weights object k: int number of circular sectors in rose diagram permutations: int number of random spatial permutations for calculation of pseudo p-values Returns ------- results: dictionary (keys defined below) counts: array (k,1) number of vectors with angular movement falling in each sector cuts: array (k,1) intervals defining circular sectors (in radians) random_counts: array (permutations,k) counts from random permutations pvalues: array (kx1) one sided (upper tail) pvalues for observed counts Notes ----- Based on Rey, Murray, and Anselin (2011) [1]_ Examples -------- Constructing data for illustration of directional LISA analytics. Data is for the 48 lower US states over the period 1969-2009 and includes per capita income normalized to the national average. Load comma delimited data file in and convert to a numpy array >>> f=open(pysal.examples.get_path("spi_download.csv"),'r') >>> lines=f.readlines() >>> f.close() >>> lines=[line.strip().split(",") for line in lines] >>> names=[line[2] for line in lines[1:-5]] >>> data=np.array([map(int,line[3:]) for line in lines[1:-5]]) Bottom of the file has regional data which we don't need for this example so we will subset only those records that match a state name >>> sids=range(60) >>> out=['"United States 3/"', ... '"Alaska 3/"', ... '"District of Columbia"', ... '"Hawaii 3/"', ... '"New England"', ... '"Mideast"', ... '"Great Lakes"', ... '"Plains"', ... '"Southeast"', ... '"Southwest"', ... '"Rocky Mountain"', ... '"Far West 3/"'] >>> snames=[name for name in names if name not in out] >>> sids=[names.index(name) for name in snames] >>> states=data[sids,:] >>> us=data[0] >>> years=np.arange(1969,2009) Now we convert state incomes to express them relative to the national average >>> rel=states/(us*1.) Create our contiguity matrix from an external GAL file and row standardize the resulting weights >>> gal=pysal.open(pysal.examples.get_path('states48.gal')) >>> w=gal.read() >>> w.transform='r' Take the first and last year of our income data as the interval to do the directional directional analysis >>> Y=rel[:,[0,-1]] Set the random seed generator which is used in the permutation based inference for the rose diagram so that we can replicate our example results >>> np.random.seed(100) Call the rose function to construct the directional histogram for the dynamic LISA statistics. We will use four circular sectors for our histogram >>> r4=rose(Y,w,k=4,permutations=999) What are the cut-offs for our histogram - in radians >>> r4['cuts'] array([ 0. , 1.57079633, 3.14159265, 4.71238898, 6.28318531]) How many vectors fell in each sector >>> r4['counts'] array([32, 5, 9, 2]) What are the pseudo-pvalues for these counts based on 999 random spatial permutations of the state income data >>> r4['pvalues'] array([ 0.02 , 0.001, 0.001, 0.001]) Repeat the exercise but now for 8 rather than 4 sectors >>> r8=rose(Y,w,permutations=999) >>> r8['counts'] array([19, 13, 3, 2, 7, 2, 1, 1]) >>> r8['pvalues'] array([ 0.445, 0.042, 0.079, 0.003, 0.005, 0.1 , 0.269, 0.002]) References ---------- .. [1] Rey, S.J., A.T. Murray and L. Anselin. 2011. "Visualizing regional income distribution dynamics." Letters in Spatial and Resource Sciences, 4: 81-90. 
""" results = {} sw = 2 * np.pi / k cuts = np.arange(0.0, 2 * np.pi + sw, sw) wY = pysal.lag_spatial(w, Y) dx = Y[:, -1] - Y[:, 0] dy = wY[:, -1] - wY[:, 0] theta = np.arctan2(dy, dx) neg = theta < 0.0 utheta = theta * (1 - neg) + neg * (2 * np.pi + theta) counts, bins = np.histogram(utheta, cuts) results['counts'] = counts results['cuts'] = cuts if permutations: n, k1 = Y.shape ids = np.arange(n) all_counts = np.zeros((permutations, k)) for i in range(permutations): rid = np.random.permutation(ids) YR = Y[rid, :] wYR = pysal.lag_spatial(w, YR) dx = YR[:, -1] - YR[:, 0] dy = wYR[:, -1] - wYR[:, 0] theta = np.arctan2(dy, dx) neg = theta < 0.0 utheta = theta * (1 - neg) + neg * (2 * np.pi + theta) c, b = np.histogram(utheta, cuts) c.shape = (1, k) all_counts[i, :] = c larger = sum(all_counts >= counts) p_l = permutations - larger extreme = (p_l) < larger extreme = np.where(extreme, p_l, larger) p = (extreme + 1.) / (permutations + 1.) results['pvalues'] = p results['random_counts'] = all_counts return results
plt.savefig(figName, bbox_inches='tight')
plt.show()

plt.figure(10)
#ax8 = plt.subplot(212)
sns.kdeplot(mi.sim, shade=True)
plt.vlines(mi.sim, 0, 1)
plt.vlines(mi.EI + .01, 0, 40, 'r')
plt.suptitle(filename)
figName = "lasso\\" + filename + "_LassoMoranStatistics2.png"
plt.savefig(figName, bbox_inches='tight')
plt.show()

# Moran scatterplot with statistically significant LISA values highlighted.
# spatial lags
Lag_response = pysal.lag_spatial(w, response)

# Plot the statistically significant LISA values in a different color than
# the others. Find all of the statistically significant LISAs. Since the
# p-values are in the same order as the I_i statistics, we can do this in
# the following way.
plt.figure(11)
sigs = response[lm.p_sim <= .001]
W_sigs = Lag_response[lm.p_sim <= .001]
insigs = response[lm.p_sim > .001]
W_insigs = Lag_response[lm.p_sim > .001]

b, a = np.polyfit(response, Lag_response, 1)

# Plot the statistically significant points in a dark red color.
plt.plot(sigs, W_sigs, '.', color='firebrick')
plt.plot(insigs, W_insigs, '.k', alpha=.2)
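The np.polyfit call above returns the slope b and intercept a of the regression of the lag on the response, which is what the scatterplot's fit line is usually built from. A sketch of how the plot might be finished, under the assumption that response and Lag_response are 1-D arrays as implied above (this closing step is not part of the original script):

# fitted line; with row-standardized weights and a standardized variable the
# slope b approximates Moran's I
plt.plot(response, a + b * response, 'r')
plt.vlines(response.mean(), Lag_response.min(), Lag_response.max(), linestyle='--')
plt.hlines(Lag_response.mean(), response.min(), response.max(), linestyle='--')
plt.xlabel('response')
plt.ylabel('spatial lag of response')
plt.show()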
def __init__(self, y, x, w, method='full', epsilon=0.0000001,
             regimes_att=None):
    # set up main regression variables and spatial filters
    self.y = y
    if regimes_att:
        self.x = x.toarray()
    else:
        self.x = x
    self.n, self.k = self.x.shape
    self.method = method
    self.epsilon = epsilon
    #W = w.full()[0]  # wait to build pending what is needed
    #Wsp = w.sparse
    ylag = ps.lag_spatial(w, self.y)
    xlag = self.get_x_lag(w, regimes_att)

    # call minimizer using concentrated log-likelihood to get lambda
    methodML = method.upper()
    if methodML in ['FULL', 'LU', 'ORD']:
        if methodML == 'FULL':
            W = w.full()[0]      # need dense here
            res = minimize_scalar(err_c_loglik, 0.0, bounds=(-1.0, 1.0),
                                  args=(self.n, self.y, ylag,
                                        self.x, xlag, W),
                                  method='bounded', tol=epsilon)
        elif methodML == 'LU':
            I = sp.identity(w.n)
            Wsp = w.sparse       # need sparse here
            res = minimize_scalar(err_c_loglik_sp, 0.0, bounds=(-1.0, 1.0),
                                  args=(self.n, self.y, ylag,
                                        self.x, xlag, I, Wsp),
                                  method='bounded', tol=epsilon)
        elif methodML == 'ORD':
            # check on symmetry structure
            if w.asymmetry(intrinsic=False) == []:
                ww = symmetrize(w)
                WW = ww.todense()
                evals = la.eigvalsh(WW)
            else:
                W = w.full()[0]  # need dense here
                evals = la.eigvals(W)
            res = minimize_scalar(err_c_loglik_ord, 0.0, bounds=(-1.0, 1.0),
                                  args=(self.n, self.y, ylag,
                                        self.x, xlag, evals),
                                  method='bounded', tol=epsilon)
    else:
        raise Exception("{0} is an unsupported method".format(method))

    self.lam = res.x

    # compute full log-likelihood, including constants
    ln2pi = np.log(2.0 * np.pi)
    llik = -res.fun - self.n / 2.0 * ln2pi - self.n / 2.0
    self.logll = llik

    # b, residuals and predicted values
    ys = self.y - self.lam * ylag
    xs = self.x - self.lam * xlag
    xsxs = np.dot(xs.T, xs)
    xsxsi = np.linalg.inv(xsxs)
    xsys = np.dot(xs.T, ys)
    b = np.dot(xsxsi, xsys)
    self.betas = np.vstack((b, self.lam))
    self.u = y - np.dot(self.x, b)
    self.predy = self.y - self.u

    # residual variance
    self.e_filtered = self.u - self.lam * ps.lag_spatial(w, self.u)
    self.sig2 = np.dot(self.e_filtered.T, self.e_filtered) / self.n

    # variance-covariance matrix betas
    varb = self.sig2 * xsxsi

    # variance-covariance matrix lambda, sigma
    a = -self.lam * W
    np.fill_diagonal(a, 1.0)
    ai = la.inv(a)
    wai = np.dot(W, ai)
    tr1 = np.trace(wai)
    wai2 = np.dot(wai, wai)
    tr2 = np.trace(wai2)
    waiTwai = np.dot(wai.T, wai)
    tr3 = np.trace(waiTwai)
    v1 = np.vstack((tr2 + tr3, tr1 / self.sig2))
    v2 = np.vstack((tr1 / self.sig2, self.n / (2.0 * self.sig2 ** 2)))
    v = np.hstack((v1, v2))
    self.vm1 = np.linalg.inv(v)

    # create variance matrix for beta, lambda
    vv = np.hstack((varb, np.zeros((self.k, 1))))
    vv1 = np.hstack((np.zeros((1, self.k)),
                     self.vm1[0, 0] * np.ones((1, 1))))
    self.vm = np.vstack((vv, vv1))
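The minimizer above is handed a callable err_c_loglik that must return the negative of the log-likelihood concentrated with respect to beta and sigma squared, as a function of lambda alone. The library's own implementation is not shown here; the following is a sketch consistent with the argument order used in the minimize_scalar call for the 'FULL' branch, with slogdet used for the Jacobian term as a numerical-stability choice:

import numpy as np

def err_c_loglik(lam, n, y, ylag, x, xlag, W):
    # spatially filtered variables for the error model
    ys = y - lam * ylag
    xs = x - lam * xlag
    # residual sum of squares of the filtered regression
    xsxsi = np.linalg.inv(np.dot(xs.T, xs))
    xsys = np.dot(xs.T, ys)
    ee = np.dot(ys.T, ys) - np.dot(xsys.T, np.dot(xsxsi, xsys))
    sig2 = ee[0][0] / n
    # negative concentrated log-likelihood (constants dropped):
    # (n/2) * log(sig2) - log|I - lam * W|
    a = -lam * W
    np.fill_diagonal(a, 1.0)
    sign, logdet = np.linalg.slogdet(a)
    return (n / 2.0) * np.log(sig2) - logdet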
# In[2]:

QueenWeight = ps.queen_from_shapefile(shape)
QueenWeightMatrix, ids = QueenWeight.full()
QueenWeightMatrix

# In[3]:

# Spatial lag:
# a variable that averages the neighboring values of a location.
# It accounts for the autocorrelation in the model through the weight matrix.
# data = ps.pdio.read_files(shape)
Queen = ps.queen_from_shapefile(shape)
Queen.transform = 'r'
percent16Lag = ps.lag_spatial(Queen, data.percent16)

# In[4]:

# This is a spatial lag map of the percentages of suicide for the year 2016.
# The spatial lag combines the weight matrix built from the shapefile with
# the variable of interest, giving each location the weighted average of its
# neighbors' values.
import matplotlib.pyplot as plt

us = file
percent16LagQ16 = ps.Quantiles(percent16Lag, k=10)
f, ax = plt.subplots(1, figsize=(150, 150))
us.assign(cl=percent16LagQ16.yb).plot(column='cl', categorical=True, k=10,
                                      cmap='OrRd',
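The comment above describes the spatial lag as an average of neighboring values. A small self-contained check of that claim, using a toy 3x3 lattice instead of the shapefile data (this example is added for illustration and is not part of the notebook):

import numpy as np
import pysal as ps

w = ps.lat2W(3, 3)          # rook contiguity on a 3x3 grid
w.transform = 'r'           # row-standardize the weights
y = np.arange(9, dtype=float)
wy = ps.lag_spatial(w, y)
# unit 0 has neighbors 1 and 3, so its lag is (1 + 3) / 2 = 2
print(wy[0])                # -> 2.0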
    ## prep time data
    t_data = get_time_data(query_result, time_cols)

    plpy.debug('shape of t_data %d, %d' % t_data.shape)
    plpy.debug('number of weight objects: %d, %d' % (weights.sparse).shape)
    plpy.debug('first num elements: %f' % t_data[0, 0])

    sp_markov_result = ps.Spatial_Markov(t_data,
                                         weights,
                                         k=num_classes,
                                         fixed=False,
                                         permutations=permutations)

    ## get lag classes
    lag_classes = ps.Quantiles(
        ps.lag_spatial(weights, t_data[:, -1]),
        k=num_classes).yb

    ## look up probability distribution for each unit according to class and lag class
    prob_dist = get_prob_dist(sp_markov_result.P,
                              lag_classes,
                              sp_markov_result.classes[:, -1])

    ## find the ups and downs and overall distribution of each cell
    trend_up, trend_down, trend, volatility = get_prob_stats(
        prob_dist, sp_markov_result.classes[:, -1])

    ## output the results
    return zip(trend, trend_up, trend_down, volatility, weights.id_order)


def get_time_data(markov_data, time_cols):
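The helpers get_prob_dist and get_prob_stats are not shown in this excerpt. Based only on how the first one is called above, a hypothetical sketch: Spatial_Markov.P holds one transition matrix per lag class, so each unit can be given the probability row for its own class conditioned on its neighbors' class.

import numpy as np

def get_prob_dist(transition_matrix, lag_indices, unit_indices):
    # Hypothetical helper, not the library's actual code: select the row
    # P[lag_class][own_class] of conditional transition probabilities for
    # each unit.
    return np.array([transition_matrix[lag_indices[i], unit_indices[i]]
                     for i in range(len(lag_indices))])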
D.head()

mi = ps.Moran(D.n_assaults.values[:, None], qW, two_tailed=False)
mi.I
mi.EI

y = D.n_assaults.values[:, None]
xs = D.n_subwayex.values[:, None]

# baseline OLS with spatial diagnostics
m1 = ps.spreg.OLS(y, xs, w=qW, spat_diag=True)
print(m1.summary)

# add the spatial lag of the regressor (SLX specification)
sl = ps.lag_spatial(qW, D.n_subwayex.values[:, None])
D_sl = D.assign(w_subway=sl)

m2 = ps.spreg.OLS(D.n_assaults.values[:, None],
                  D_sl[['n_subwayex', 'w_subway']].values,
                  w=qW, spat_diag=True,
                  name_x=D_sl[['n_subwayex', 'w_subway']].columns.tolist(),
                  name_y='assaults')

# spatial lag model estimated by instrumental variables / GM
m3 = ps.spreg.GM_Lag(D.n_assaults.values[:, None],
                     D_sl[['n_subwayex', 'w_subway']].values,
                     w=qW, spat_diag=True,
                     name_x=D_sl[['n_subwayex', 'w_subway']].columns.tolist(),
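A minimal sketch of how the fitted spreg models above are usually inspected, assuming the truncated GM_Lag call completes with the remaining name arguments; attribute names follow PySAL's spreg API:

print(m2.summary)   # SLX specification with the lagged regressor
print(m3.summary)   # IV / GM estimation of the spatial lag model
print(m3.betas)     # coefficients; the spatial autoregressive rho is the last row
print(m3.pr2)       # pseudo R-squared for the IV model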