Example #1
def get_centroid_ca():
    cas = Tract.createAllCAObjects()
    centers = []
    for i in range(1, 78):          # Chicago CA ids run from 1 to 77
        ctd = cas[i].polygon.centroid
        centers.append([ctd.x, ctd.y])
    return centers
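The centroid extraction above is plain shapely. A minimal self-contained sketch of the same pattern, with toy squares standing in for the project's Tract polygons:

from shapely.geometry import Polygon

# two toy "community areas"; the real code gets these from Tract.createAllCAObjects()
polygons = {1: Polygon([(0, 0), (2, 0), (2, 2), (0, 2)]),
            2: Polygon([(2, 0), (4, 0), (4, 2), (2, 2)])}

centers = []
for i in sorted(polygons):
    ctd = polygons[i].centroid      # shapely computes the geometric centroid
    centers.append([ctd.x, ctd.y])

print(centers)                      # [[1.0, 1.0], [3.0, 1.0]]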
Example #2
def generate_geographical_SpatialLag_ca(knearest=True, leaveOut=-1):
    """
    Generate the inverse-distance weight matrix for CA pairs.

    If knearest is true, keep only the 6 nearest neighboring CAs.
    Otherwise, return the weight to all other CAs.

    leaveOut selects a CA to exclude; takes a value from 1 to 77.
    """
    cas = Tract.createAllCAObjects()
    centers = []
    iset = range(1, 78)
    if leaveOut > 0:
        iset.remove(leaveOut)
    for i in iset:
        centers.append(cas[i].polygon.centroid)
    
    W = np.zeros((len(iset), len(iset)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if i != j:                      # leave the diagonal at zero
                W[i, j] = 1.0 / src.distance(dst)
                
        # keep only the 6 largest weights (i.e., the 6 nearest CAs)
        if knearest:
            threshold = heapq.nlargest(6, W[i,:])[-1]
            for j in range(len(W[i,:])):
                W[i][j] = 0 if W[i][j] < threshold else W[i][j]
    return W
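The heapq.nlargest(6, ...)[-1] expression returns the 6th-largest weight in the row, which then serves as a cutoff: every smaller weight is zeroed, leaving the 6 nearest CAs. A toy illustration of the same thresholding (k=2 for brevity, and using a vectorized mask instead of the inner loop):

import heapq
import numpy as np

row = np.array([0.0, 0.5, 0.1, 0.9, 0.3])
k = 2
threshold = heapq.nlargest(k, row)[-1]   # k-th largest value, here 0.5
row[row < threshold] = 0                 # keep only the k largest weights
print(row)                               # [0.  0.5 0.  0.9 0. ]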
Example #3
def generate_geographical_SpatialLag_ca(knearest=True, leaveOut=-1):
    """
    Generate the inverse-distance weight matrix for CA pairs.

    If knearest is true, keep only the 6 nearest neighboring CAs.
    Otherwise, return the weight to all other CAs.

    leaveOut selects a CA to exclude; takes a value from 1 to 77.
    """
    cas = Tract.createAllCAObjects()
    centers = []
    iset = range(1, 78)
    if leaveOut > 0:
        iset.remove(leaveOut)
    for i in iset:
        centers.append(cas[i].polygon.centroid)

    W = np.zeros((len(iset), len(iset)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if i != j:                      # leave the diagonal at zero
                W[i, j] = 1.0 / src.distance(dst)

        # keep only the 6 largest weights (i.e., the 6 nearest CAs)
        if knearest:
            threshold = heapq.nlargest(6, W[i, :])[-1]
            for j in range(len(W[i, :])):
                W[i][j] = 0 if W[i][j] < threshold else W[i][j]
    return W
Example #4
def generateTaxiFlow(gridLevel='ca'):
    """
    Generate taxi flow and write it to a file
    
    This is slow to run
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()
    n = len(cas)
    TF = np.zeros((n, n))   # taxi flow matrix
    
    ordKey = sorted(cas.keys())
    
    cnt = 0
#    import os
#    fnames = os.listdir("../data/ChicagoTaxi/")
    fnames = ['201401-03.txt']
    
    for fname in fnames:
        print "Count taxi flow in {0}".format(fname)
        with open('../data/ChicagoTaxi/{0}'.format(fname), 'rU') as fin:
            reader = csv.reader(fin, delimiter='\t')
            header = reader.next()
            for row in reader:
                # initialize points            
                start = Point(float(row[3]), float(row[4]))
                end = Point(float(row[5]), float(row[6]))
                
                sid = -1
                eid = -1
                for key, grid in cas.items():
                    """
                    grid key starts from 1
                    map the start/end point of trip into grids to get flow
                    """
                    if grid.polygon.contains(start):
                        sid = ordKey.index(key)
                    if grid.polygon.contains(end):
                        eid = ordKey.index(key)
                    if sid != -1 and eid != -1:
                        break
                
                if sid == -1 or eid == -1:
                    continue    # the trip fell outside every region; skip it
                TF[sid, eid] += 1
                cnt += 1
                if (cnt % 100000 == 0):
                    print "{0} trips have been added".format(cnt)
    if gridLevel == 'ca':
        np.savetxt(here + "/TF.csv", TF, delimiter="," )
    elif gridLevel == 'tract':
        np.savetxt(here + "/TF_tract.csv", TF, delimiter="," )
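Most of the running time goes into the point-in-polygon lookup that maps each pickup/drop-off coordinate to a region. A minimal sketch of that lookup, with toy rectangles in place of the real CA polygons:

from shapely.geometry import Point, Polygon

regions = {1: Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
           2: Polygon([(1, 0), (2, 0), (2, 1), (1, 1)])}
ordKey = sorted(regions)

def region_index(p):
    # matrix index of the region containing p, or -1 if no region matches
    for key, poly in regions.items():
        if poly.contains(p):
            return ordKey.index(key)
    return -1

print(region_index(Point(0.5, 0.5)))    # 0
print(region_index(Point(5.0, 5.0)))    # -1 -> such a trip should be skipped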
Example #5
def generate_GWR_weight(h=1):
    """
    Generate the GWR weighting matrix with a Gaussian kernel,
    gamma_ij = exp(-0.5 * d_ij**2 / h**2).
    """
    cas = Tract.createAllCAObjects()
    centers = []
    for i in range(1, 78):
        centers.append(cas[i].polygon.centroid)

    gamma = np.ones((len(centers), len(centers)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if i != j:
                gamma[i][j] = np.exp(-0.5 * src.distance(dst)**2 / h**2)
    return gamma
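The weight is the Gaussian kernel commonly used in GWR, gamma_ij = exp(-0.5 * d_ij**2 / h**2): a CA weights itself fully, and the bandwidth h controls how quickly the weight decays with distance. A quick numeric check:

import numpy as np

h = 1.0
for d in [0.0, 0.5, 1.0, 2.0]:
    print(d, round(np.exp(-0.5 * d**2 / h**2), 4))
# 0.0 1.0      -- zero distance gives full weight
# 0.5 0.8825
# 1.0 0.6065   -- at one bandwidth the weight falls to exp(-0.5)
# 2.0 0.1353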
Example #6
def generate_GWR_weight(h=1):
    """
    Generate the GWR weighting matrix with a Gaussian kernel,
    gamma_ij = exp(-0.5 * d_ij**2 / h**2).
    """
    cas = Tract.createAllCAObjects()
    centers = []
    for i in range(1, 78):
        centers.append(cas[i].polygon.centroid)
    
    gamma = np.ones((len(centers), len(centers)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if i != j:
                gamma[i][j] = np.exp(-0.5 * src.distance(dst)**2 / h**2)
    return gamma
Example #7
def visualize_prediction_error(self, er, Y, title):
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        re = er[k - 1] / Y[k - 1]   # relative error of CA k
        if re > 0.4:
            c = 'r'                 # large positive relative error
        elif re < -0.4:
            c = 'b'                 # large negative relative error
        else:
            c = 'w'
        cak = cas[k].polygon
        ax.add_patch(descartes.PolygonPatch(cak, fc=c))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    ax.set_title(title)
    fig.show()
Example #8
def visualize_prediction_error(self, er, Y, title):
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        re = er[k - 1] / Y[k - 1]   # relative error of CA k
        if re > 0.4:
            c = 'r'                 # large positive relative error
        elif re < -0.4:
            c = 'b'                 # large negative relative error
        else:
            c = 'w'
        cak = cas[k].polygon
        ax.add_patch(descartes.PolygonPatch(cak, fc=c))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    ax.set_title(title)
    fig.show()
Example #9
def CA_clustering_with_embedding():
    # cluster CAs by their graph-embedding features
    ge = get_graph_embedding_features("geo_all.txt")
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=6, max_iter=100).fit(ge)
    for idx, lab in enumerate(kmeans.labels_):
        print idx+1, lab
    
    colorMaps = ['blue', 'red', 'g', 'c', 'y', 'm', 'k', 'w']
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        cak = cas[k].polygon
        ax.add_patch(descartes.PolygonPatch(cak, fc=colorMaps[kmeans.labels_[k-1]]))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    fig.show()
    
    return kmeans, cas
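KMeans groups CAs by the similarity of their embedding vectors, and each label then indexes into colorMaps for the choropleth. A minimal, self-contained run on toy embeddings (n_init pinned explicitly; the fit interface is standard scikit-learn):

import numpy as np
from sklearn.cluster import KMeans

# toy 2-D "embeddings" for six regions, forming two obvious groups
ge = np.array([[0.1, 0.0], [0.2, 0.1], [0.0, 0.2],
               [5.0, 5.1], [5.2, 4.9], [4.8, 5.0]])
kmeans = KMeans(n_clusters=2, n_init=10, max_iter=100).fit(ge)
for idx, lab in enumerate(kmeans.labels_):
    print(idx + 1, lab)              # region id (1-based) and its cluster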
Example #10
def CA_clustering_with_embedding():
    ge = get_graph_embedding_features("geo_all.txt")
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=4, max_iter=100).fit(ge)
    for idx, lab in enumerate(kmeans.labels_):
        print idx + 1, lab

    colorMaps = ['blue', 'red', 'g', 'c', 'y', 'm', 'k', 'w']
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        cak = cas[k].polygon
        ax.add_patch(
            descartes.PolygonPatch(cak, fc=colorMaps[kmeans.labels_[k - 1]]))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    fig.show()

    return kmeans, cas
Example #11
def generate_transition_SocialLag(year=2010,
                                  lehd_type=0,
                                  region='ca',
                                  leaveOut=-1,
                                  normalization='source'):
    """
    Generate the spatial lag matrix from the transition flows connecting CAs.

    lehd_type selects the job-count column:
    0 - #total jobs,
    1 - #jobs age under 29,
    2 - #jobs age from 30 to 54, 
    3 - #jobs above 55, 
    4 - #jobs earning under $1250/month, 
    5 - #jobs earnings from $1251 to $3333/month, 
    6 - #jobs above $3333/month,
    7 - #jobs in goods producing, 
    8 - #jobs in trade transportation, 
    9 - #jobs in other services
    """

    if region == 'ca':
        ts = Tract.createAllCAObjects()
        fn = here + '/../data/chicago_ca_od_{0}.csv'.format(year)
    elif region == 'tract':
        ts = Tract.createAllTractObjects()
        fn = here + '/../data/chicago_od_tract_{0}.csv'.format(year)
    ordkey = sorted(ts.keys())

    listIdx = {}
    fin = open(fn)
    for line in fin:
        ls = line.split(",")
        srcid = int(ls[0][5:])
        dstid = int(ls[1][5:])
        val = int(ls[2 + lehd_type])
        if srcid in listIdx:
            listIdx[srcid][dstid] = val
        else:
            listIdx[srcid] = {}
            listIdx[srcid][dstid] = val
    fin.close()

    if leaveOut > 0:
        ordkey.remove(leaveOut)

    W = np.zeros((len(ordkey), len(ordkey)))   # sized after leaveOut removal
    for srcid in ordkey:
        if srcid in listIdx:
            sdict = listIdx[srcid]
            if leaveOut in sdict:
                del sdict[leaveOut]
            for dstid, val in sdict.items():
                W[ordkey.index(srcid)][ordkey.index(dstid)] = val
        else:
            W[ordkey.index(srcid)] = np.zeros((1, len(ordkey)))

    # optionally zero out the diagonal (self-flow) before normalizing
    # if normalization != 'none':
    #     for i in range(len(W)):
    #         W[i, i] = 0
    assert W.dtype == "float64"

    # normalization section
    if normalization == 'source':
        # 'source' means normalizing over the residence side
        W = np.transpose(W)
        sW = np.sum(W, axis=1, keepdims=True)
        W = W / sW
        assert abs(np.sum(W[1, ]) - 1) < 1e-10 and W.dtype == "float64"
    elif normalization == 'destination':
        # 'destination' means normalizing over the workplace side
        sW = np.sum(W, axis=1)
        sW = sW.reshape((len(sW), 1))
        W = W / sW
    elif normalization == 'pair':
        sW = W + np.transpose(W)
        sW = np.sum(sW)
        W = W / sW

    # by default, the output is the workplace-to-residence count matrix
    return W
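The three normalization modes rescale the raw OD counts differently: 'source' and 'destination' make each row sum to 1 (after transposing in the 'source' case), while 'pair' divides by the grand total of the symmetrized matrix. A toy check of the row-normalization step:

import numpy as np

W = np.array([[0., 2., 2.],
              [1., 0., 3.],
              [4., 4., 0.]])
sW = np.sum(W, axis=1, keepdims=True)    # row sums as a column vector
Wn = W / sW                              # broadcasting: each row now sums to 1
print(Wn.sum(axis=1))                    # [1. 1. 1.]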
Example #12
def generatePOIfeature(gridLevel='ca'):
    """
    Generate POI features and write them to a file.

    gridLevel can be "ca" or "tract".
    
    ['Food', 'Residence', 'Travel', 'Arts & Entertainment', 
    'Outdoors & Recreation', 'College & Education', 'Nightlife', 
    'Professional', 'Shops', 'Event']
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()

    ordKey = sorted(cas.keys())

    gcn = np.zeros((len(cas), 3))  # check-in count, user count, and POI count
    gcat = {}

    with open('../data/all_POIs_chicago', 'r') as fin:
        POIs = pickle.load(fin)

    with open('category_hierarchy.pickle', 'r') as f2:
        poi_cat = pickle.load(f2)

    cnt = 0
    for poi in POIs.values():
        loc = Point(poi.location.lon, poi.location.lat)
        if poi.cat in poi_cat:
            cat = poi_cat[poi.cat]
        else:
            continue

        for key, grid in cas.items():
            if grid.polygon.contains(loc):
                gcn[ordKey.index(key), 0] += poi.checkin_count
                gcn[ordKey.index(key), 1] += poi.user_count
                gcn[ordKey.index(key), 2] += 1
                """
                Build a two-level dictionary,
                first index by region id,
                then index by category id,
                finally, the value is number of POI under the category.
                """
                if key in gcat:
                    if cat in gcat[key]:
                        gcat[key][cat] += 1
                    else:
                        gcat[key][cat] = 1
                else:
                    gcat[key] = {}
                    gcat[key][cat] = 1

                # break the polygon loop
                cnt += 1
                break

    s = 0
    hi_catgy = []
    for catdict in gcat.values():
        hi_catgy += catdict.keys()
        for c in catdict.values():
            s += c

    hi_catgy = list(set(hi_catgy))
    print hi_catgy

    gdist = np.zeros((len(cas), len(hi_catgy)))
    for key, distDict in gcat.items():
        for idx, cate in enumerate(hi_catgy):
            if cate in distDict:
                gdist[ordKey.index(key), idx] = distDict[cate]
            else:
                gdist[ordKey.index(key), idx] = 0

    if gridLevel == 'ca':
        np.savetxt(here + "/POI_dist.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt.csv", gcn, delimiter=",")
    elif gridLevel == 'tract':
        np.savetxt(here + "/POI_dist_tract.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt_tract.csv", gcn, delimiter=",")
        with open(here + "/POI_tract.pickle", 'w') as fout:
            pickle.dump(ordKey, fout)
            pickle.dump(gcat, fout)
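The nested if/else that fills gcat is a hand-rolled two-level counter (region id -> category -> POI count). collections.defaultdict expresses the same bookkeeping more compactly; a sketch of the equivalent idiom, not how this repo actually writes it:

from collections import defaultdict

gcat = defaultdict(lambda: defaultdict(int))   # region id -> category -> count
observations = [(5, 'Food'), (5, 'Food'), (5, 'Shops'), (9, 'Nightlife')]
for key, cat in observations:
    gcat[key][cat] += 1

print(gcat[5]['Food'])                   # 2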
Example #13
def generatePOIfeature(gridLevel='ca'):
    """
    Generate POI features and write them to a file.

    gridLevel can be "ca" or "tract".
    
    ['Food', 'Residence', 'Travel', 'Arts & Entertainment', 
    'Outdoors & Recreation', 'College & Education', 'Nightlife', 
    'Professional', 'Shops', 'Event']
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()

    ordKey = sorted(cas.keys())
    
    gcn = np.zeros((len(cas), 3))   # check-in count, user count, and POI count
    gcat = {}
    
    with open('../data/all_POIs_chicago', 'r') as fin:
        POIs = pickle.load(fin)
        
    with open('category_hierarchy.pickle', 'r') as f2:
        poi_cat = pickle.load(f2)
    
    cnt = 0
    for poi in POIs.values():
        loc = Point(poi.location.lon, poi.location.lat)
        if poi.cat in poi_cat:
            cat = poi_cat[poi.cat]
        else:
            continue
        
        for key, grid in cas.items():
            if grid.polygon.contains(loc):
                gcn[ordKey.index(key),0] += poi.checkin_count
                gcn[ordKey.index(key),1] += poi.user_count
                gcn[ordKey.index(key),2] += 1
                """
                Build a two-level dictionary,
                first index by region id,
                then index by category id,
                finally, the value is number of POI under the category.
                """
                if key in gcat:
                    if cat in gcat[key]:
                        gcat[key][cat] += 1
                    else:
                        gcat[key][cat] = 1
                else:
                    gcat[key] = {}
                    gcat[key][cat] = 1
                    
                # break the polygon loop
                cnt += 1
                break
    
    s = 0
    hi_catgy = []
    for catdict in gcat.values():
        hi_catgy += catdict.keys()
        for c in catdict.values():
            s += c
            
    hi_catgy = list(set(hi_catgy))
    print hi_catgy
    
    
    gdist = np.zeros((len(cas), len(hi_catgy)))
    for key, distDict in gcat.items():
        for idx, cate in enumerate(hi_catgy):
            if cate in distDict:            
                gdist[ordKey.index(key), idx] = distDict[cate]
            else:
                gdist[ordKey.index(key), idx] = 0
                
    if gridLevel == 'ca':
        np.savetxt(here + "/POI_dist.csv", gdist, delimiter="," )
        np.savetxt(here + "/POI_cnt.csv", gcn, delimiter="," )
    elif gridLevel == 'tract':
        np.savetxt(here + "/POI_dist_tract.csv", gdist, delimiter="," )
        np.savetxt(here + "/POI_cnt_tract.csv", gcn, delimiter="," )
Example #14
def generate_transition_SocialLag(year=2010, lehd_type=0, region='ca', leaveOut=-1, normalization='source'):
    """
    Generate the spatial lag matrix from the transition flows connecting CAs.

    lehd_type selects the job-count column:
    0 - #total jobs,
    1 - #jobs age under 29,
    2 - #jobs age from 30 to 54, 
    3 - #jobs above 55, 
    4 - #jobs earning under $1250/month, 
    5 - #jobs earnings from $1251 to $3333/month, 
    6 - #jobs above $3333/month,
    7 - #jobs in goods producing, 
    8 - #jobs in trade transportation, 
    9 - #jobs in other services
    """
    
    if region == 'ca':
        ts = Tract.createAllCAObjects()
        fn = here + '/../data/chicago_ca_od_{0}.csv'.format(year)
    elif region == 'tract':
        ts = Tract.createAllTractObjects()
        fn = here + '/../data/chicago_od_tract_{0}.csv'.format(year)
    ordkey = sorted(ts.keys())
    
    
    listIdx = {}
    fin = open(fn)
    for line in fin:
        ls = line.split(",")
        srcid = int(ls[0])
        dstid = int(ls[1])
        val = int(ls[2 + lehd_type])
        if srcid in listIdx:
            listIdx[srcid][dstid] = val
        else:
            listIdx[srcid] = {}
            listIdx[srcid][dstid] = val                            
    fin.close()

    if leaveOut > 0:
        ordkey.remove(leaveOut)
    
    W = np.zeros((len(ordkey), len(ordkey)))   # sized after leaveOut removal
    for srcid in ordkey:
        if srcid in listIdx:
            sdict = listIdx[srcid]
            if leaveOut in sdict:
                del sdict[leaveOut]
            for dstid, val in sdict.items():
                W[ordkey.index(srcid)][ordkey.index(dstid)] = val
        else:
            W[ordkey.index(srcid)] = np.zeros((1, len(ordkey)))
            
    
    
    # optionally zero out the diagonal (self-flow) before normalizing
    # if normalization != 'none':
    #     for i in range(len(W)):
    #         W[i, i] = 0
    assert W.dtype == "float64"
        
    # normalization section
    if normalization == 'source':
        # 'source' means normalizing over the residence side
        W = np.transpose(W)
        sW = np.sum(W, axis=1, keepdims=True)
        W = W / sW
        assert abs(np.sum(W[1, ]) - 1) < 1e-10 and W.dtype == "float64"
    elif normalization == 'destination':
        # 'destination' means normalizing over the workplace side
        sW = np.sum(W, axis=1)
        sW = sW.reshape((len(sW), 1))
        W = W / sW
    elif normalization == 'pair':
        sW = W + np.transpose(W)
        sW = np.sum(sW)
        W = W / sW
    
    # by default, the output is the workplace-to-residence count matrix
    return W