def get_centroid_ca():
    """Return the centroid coordinates [x, y] of all 77 community areas."""
    cas = Tract.createAllCAObjects()
    centers = []
    for i in range(1, 78):
        ctd = cas[i].polygon.centroid
        centers.append([ctd.x, ctd.y])
    return centers
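# A minimal usage sketch for get_centroid_ca (a hypothetical demo, not part of
# the original pipeline). It assumes the Tract shapefile data shipped with this
# repo is on disk, so that Tract.createAllCAObjects() succeeds.
def _demo_get_centroid_ca():
    centers = np.array(get_centroid_ca())  # shape (77, 2); row i is CA id i+1
    print "centroid of CA 1: {0}".format(centers[0])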
def generate_geographical_SpatialLag_ca(knearest=True, leaveOut=-1):
    """
    Generate the inverse-distance weight matrix for CA pairs.

    If knearest is True, keep only the 6 nearest neighboring CAs;
    otherwise keep the inverse distance to all other CAs.
    leaveOut removes the given CA (valid values: 1 to 77).
    """
    cas = Tract.createAllCAObjects()
    centers = []
    iset = range(1, 78)
    if leaveOut > 0:
        iset.remove(leaveOut)
    for i in iset:
        centers.append(cas[i].polygon.centroid)

    W = np.zeros((len(iset), len(iset)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if src != dst:
                W[i][j] = 1 / src.distance(dst)
        # keep only the n largest weights (n=6), i.e. the 6 nearest CAs
        if knearest:
            threshold = heapq.nlargest(6, W[i, :])[-1]
            for j in range(len(W[i, :])):
                W[i][j] = 0 if W[i][j] < threshold else W[i][j]
    return W
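# Hypothetical sanity check for the spatial lag above (assumes Tract data is
# available). With knearest=True each row keeps the 6 largest weights (ties at
# the threshold may keep more); spatial lag matrices are then commonly
# row-normalized before use.
def _demo_spatial_lag():
    W = generate_geographical_SpatialLag_ca(knearest=True)
    assert all((W[i, :] > 0).sum() >= 6 for i in range(W.shape[0]))
    Wn = W / W.sum(axis=1, keepdims=True)  # row-stochastic version
    print "row sums after normalization: {0}".format(Wn.sum(axis=1)[:3])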
def generateTaxiFlow(gridLevel='ca'):
    """
    Generate the taxi flow matrix and write it to a file.

    This is slow to run.
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()
    n = len(cas)
    TF = np.zeros((n, n))  # taxi flow matrix
    ordKey = sorted(cas.keys())

    cnt = 0
    # import os
    # fnames = os.listdir("../data/ChicagoTaxi/")
    fnames = ['201401-03.txt']
    for fname in fnames:
        print "Count taxi flow in {0}".format(fname)
        with open('../data/ChicagoTaxi/{0}'.format(fname), 'rU') as fin:
            reader = csv.reader(fin, delimiter='\t')
            header = reader.next()  # skip the header row
            for row in reader:
                # initialize the pick-up / drop-off points
                start = Point(float(row[3]), float(row[4]))
                end = Point(float(row[5]), float(row[6]))
                sid = -1
                eid = -1
                for key, grid in cas.items():
                    # grid keys start from 1; map the start/end point of the
                    # trip into grids to get the flow
                    if grid.polygon.contains(start):
                        sid = ordKey.index(key)
                    if grid.polygon.contains(end):
                        eid = ordKey.index(key)
                    if sid != -1 and eid != -1:
                        break
                # only count trips where both endpoints fall inside a region;
                # the original incremented TF[-1, -1] for unmatched trips
                if sid != -1 and eid != -1:
                    TF[sid, eid] += 1
                cnt += 1
                if (cnt % 100000 == 0):
                    print "{0} trips have been added".format(cnt)
    if gridLevel == 'ca':
        np.savetxt(here + "/TF.csv", TF, delimiter=",")
    elif gridLevel == 'tract':
        np.savetxt(here + "/TF_tract.csv", TF, delimiter=",")
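# Optional speed-up sketch for the point-in-polygon loop above (an assumption,
# not part of the original code). shapely's prepared geometries make repeated
# contains() queries substantially faster when the same polygons are tested
# against millions of trip endpoints. Both helpers below are hypothetical.
def _prepare_regions(cas):
    # one-off preparation: prep() caches internal indexes for fast contains()
    from shapely.prepared import prep
    return dict((k, prep(v.polygon)) for k, v in cas.items())

def _point_to_region(point, prepared, ordKey):
    # return the ordKey index of the region containing point, or -1
    for idx, key in enumerate(ordKey):
        if prepared[key].contains(point):
            return idx
    return -1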
def generate_GWR_weight(h=1):
    """
    Generate the GWR weighting matrix with a Gaussian kernel,
    gamma_ij = exp(-0.5 * d_ij**2 / h**2), where h is the bandwidth.
    """
    cas = Tract.createAllCAObjects()
    centers = []
    for i in range(1, 78):
        centers.append(cas[i].polygon.centroid)

    gamma = np.ones((len(centers), len(centers)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if i != j:
                gamma[i][j] = np.exp(-0.5 * src.distance(dst)**2 / h**2)
    return gamma
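# Pure-numpy sanity check for the Gaussian kernel above; no Tract data needed.
# The weight decays with distance d as exp(-0.5 * d**2 / h**2), so a larger
# bandwidth h gives a flatter decay.
def _demo_gwr_kernel(h=1.0):
    for d in (0.0, 0.5, 1.0, 2.0):
        print "d = {0}: weight = {1:.4f}".format(d, np.exp(-0.5 * d**2 / h**2))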
def visualize_prediction_error(self, er, Y, title):
    """
    Plot the relative prediction error er / Y on the CA map.
    Relative errors above 0.4 render red, below -0.4 blue, otherwise white.
    """
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        re = er[k - 1] / Y[k - 1]
        if re > 0.4:
            c = 'r'
        elif re < -0.4:
            c = 'b'
        else:
            c = 'w'
        cak = cas[k].polygon
        ax.add_patch(descartes.PolygonPatch(cak, fc=c))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    ax.set_title(title)
    fig.show()
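# Hypothetical smoke test for the error map (assumes Tract data and an
# interactive matplotlib backend). The `self` parameter is unused by the
# function body, so None can be passed when calling it outside a class.
def _demo_error_map():
    Y = np.full(77, 100.0)               # fake observed counts
    er = np.random.uniform(-60, 60, 77)  # fake residuals, some beyond +/-40%
    visualize_prediction_error(None, er, Y, "synthetic residuals demo")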
def CA_clustering_with_embedding():
    """Cluster the 77 CAs with KMeans on their graph embedding features."""
    ge = get_graph_embedding_features("geo_all.txt")
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=6, max_iter=100).fit(ge)
    for idx, lab in enumerate(kmeans.labels_):
        print idx + 1, lab

    # visualize the cluster assignment on the CA map
    colorMaps = ['blue', 'red', 'g', 'c', 'y', 'm', 'k', 'w']
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        cak = cas[k].polygon
        ax.add_patch(descartes.PolygonPatch(cak, fc=colorMaps[kmeans.labels_[k - 1]]))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    fig.show()
    return kmeans, cas
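# Follow-up sketch: report cluster sizes from the fitted KMeans object
# (assumes the geo_all.txt embedding file exists on disk).
def _demo_cluster_sizes():
    kmeans, cas = CA_clustering_with_embedding()
    for lab, cnt in zip(*np.unique(kmeans.labels_, return_counts=True)):
        print "cluster {0}: {1} CAs".format(lab, cnt)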
def generate_transition_SocialLag(year=2010, lehd_type=0, region='ca',
                                  leaveOut=-1, normalization='source'):
    """
    Generate the spatial lag matrix from the transition flow connecting CAs.

    lehd_type selects the LEHD job-count column:
        0 - #total jobs
        1 - #jobs age under 29
        2 - #jobs age from 30 to 54
        3 - #jobs above 55
        4 - #jobs earning under $1250/month
        5 - #jobs earning from $1251 to $3333/month
        6 - #jobs above $3333/month
        7 - #jobs in goods producing
        8 - #jobs in trade transportation
        9 - #jobs in other services

    leaveOut removes the given region; it takes values from 1 to 77.
    """
    if region == 'ca':
        ts = Tract.createAllCAObjects()
        fn = here + '/../data/chicago_ca_od_{0}.csv'.format(year)
    elif region == 'tract':
        ts = Tract.createAllTractObjects()
        fn = here + '/../data/chicago_od_tract_{0}.csv'.format(year)
    ordkey = sorted(ts.keys())

    listIdx = {}
    fin = open(fn)
    for line in fin:
        ls = line.split(",")
        srcid = int(ls[0][5:])  # drop the 5-digit state/county FIPS prefix
        dstid = int(ls[1][5:])
        val = int(ls[2 + lehd_type])
        if srcid in listIdx:
            listIdx[srcid][dstid] = val
        else:
            listIdx[srcid] = {}
            listIdx[srcid][dstid] = val
    fin.close()

    if leaveOut > 0:
        ordkey.remove(leaveOut)

    # size by ordkey (not ts), so the matrix shrinks when leaveOut is used
    W = np.zeros((len(ordkey), len(ordkey)))
    for srcid in ordkey:
        if srcid in listIdx:
            sdict = listIdx[srcid]
            if leaveOut in sdict:
                del sdict[leaveOut]
            for dstid, val in sdict.items():
                W[ordkey.index(srcid)][ordkey.index(dstid)] = val
        else:
            W[ordkey.index(srcid)] = np.zeros((1, len(ordkey)))

    # update diagonal as 0
    # if normalization != 'none':
    #     for i in range(len(W)):
    #         W[i,i] = 0  # first make all self-factor 0
    assert W.dtype == "float64"

    # normalization section
    if normalization == 'source':
        # "source" means normalizing over the residence side
        W = np.transpose(W)
        sW = np.sum(W, axis=1, keepdims=True)
        W = W / sW
        assert abs(np.sum(W[1, ]) - 1) < 0.0000000001 and W.dtype == "float64"
    elif normalization == 'destination':
        # "destination" means normalizing over the workplace side
        sW = np.sum(W, axis=1)
        sW = sW.reshape((len(sW), 1))
        W = W / sW
    elif normalization == 'pair':
        sW = W + np.transpose(W)
        sW = np.sum(sW)
        W = W / sW
    # by default, the output is the workplace-to-residence count matrix
    return W
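# Hypothetical sanity check (assumes the LEHD OD csv files are in ../data):
# under 'source' normalization every row of the returned matrix should sum
# to 1, i.e. W is row-stochastic.
def _demo_social_lag():
    W = generate_transition_SocialLag(year=2010, normalization='source')
    assert np.allclose(W.sum(axis=1), 1.0)
    print "W is row-stochastic, shape {0}x{1}".format(*W.shape)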
def generatePOIfeature(gridLevel='ca'):
    """
    Generate POI features and write them out to a file.

    gridLevel could be "ca" or "tract".
    Categories:
    ['Food', 'Residence', 'Travel', 'Arts & Entertainment',
     'Outdoors & Recreation', 'College & Education', 'Nightlife',
     'Professional', 'Shops', 'Event']
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()
    ordKey = sorted(cas.keys())

    gcn = np.zeros((len(cas), 3))  # check-in count, user count, and POI count
    gcat = {}

    with open('../data/all_POIs_chicago', 'r') as fin:
        POIs = pickle.load(fin)
    with open('category_hierarchy.pickle', 'r') as f2:
        poi_cat = pickle.load(f2)

    cnt = 0
    for poi in POIs.values():
        loc = Point(poi.location.lon, poi.location.lat)
        if poi.cat in poi_cat:
            cat = poi_cat[poi.cat]
        else:
            continue
        for key, grid in cas.items():
            if grid.polygon.contains(loc):
                gcn[ordKey.index(key), 0] += poi.checkin_count
                gcn[ordKey.index(key), 1] += poi.user_count
                gcn[ordKey.index(key), 2] += 1
                # Build a two-level dictionary: first indexed by region id,
                # then by category id; the value is the number of POIs
                # under that category.
                if key in gcat:
                    if cat in gcat[key]:
                        gcat[key][cat] += 1
                    else:
                        gcat[key][cat] = 1
                else:
                    gcat[key] = {}
                    gcat[key][cat] = 1
                # break the polygon loop
                cnt += 1
                break

    s = 0
    hi_catgy = []
    for catdict in gcat.values():
        hi_catgy += catdict.keys()
        for c in catdict.values():
            s += c
    hi_catgy = list(set(hi_catgy))
    print hi_catgy

    gdist = np.zeros((len(cas), len(hi_catgy)))
    for key, distDict in gcat.items():
        for idx, cate in enumerate(hi_catgy):
            if cate in distDict:
                gdist[ordKey.index(key), idx] = distDict[cate]
            else:
                gdist[ordKey.index(key), idx] = 0

    if gridLevel == 'ca':
        np.savetxt(here + "/POI_dist.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt.csv", gcn, delimiter=",")
    elif gridLevel == 'tract':
        np.savetxt(here + "/POI_dist_tract.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt_tract.csv", gcn, delimiter=",")
        with open(here + "/POI_tract.pickle", 'w') as fout:
            pickle.dump(ordKey, fout)
            pickle.dump(gcat, fout)
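# Hypothetical read-back sketch for the feature files written above.
def _demo_load_poi_features():
    gdist = np.loadtxt(here + "/POI_dist.csv", delimiter=",")
    gcn = np.loadtxt(here + "/POI_cnt.csv", delimiter=",")
    print "POI category matrix {0}, count matrix {1}".format(gdist.shape, gcn.shape)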