Example #1
def NB_coefficients(year=2010):
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=year)
    C = generate_corina_features()
    D = C[1]

    popul = C[1][:,0].reshape(C[1].shape[0],1)
    Y = np.divide(Y, popul) * 10000
    
    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)
    
    f = np.concatenate( (D, f2, ftaxi, poi_dist), axis=1 )
    mms = MinMaxScaler(copy=False)
    f = mms.fit_transform(f)  # copy=False: f is scaled in place
    header = C[0] + [ 'spatiallag', 'taxiflow'] + \
        ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment', 
                       'POI outdoors recreation', 'POI education', 'POI nightlife', 
                       'POI professional', 'POI shops', 'POI event']
    df = pd.DataFrame(f, columns=header)
    
    np.savetxt("Y.csv", Y, delimiter=",")
    df.to_csv("f.csv", sep=",", index=False)
    
    # NB permute
    nbres = subprocess.check_output( ['Rscript', 'nbr_eval.R', 'ca', 'coefficient'] )
    print nbres
    
    ls = nbres.strip().split(" ")
    coef = [float(e) for e in ls]
    print coef
    return coef, header
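A minimal usage sketch for NB_coefficients, assuming the project modules are importable and nbr_eval.R is in the working directory; note the R output may include an intercept term that the header list does not, so the pairing below is illustrative only.

# Hypothetical usage; NB_coefficients and its data dependencies come from
# the surrounding project.
coef, header = NB_coefficients(year=2010)
# If the R script also emits an intercept, drop it first: coef = coef[1:]
for name, c in zip(header, coef):
    print("{0:<30} {1: .4f}".format(name, c))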
Example #2
def leaveOneOut_Input_v4(leaveOut):
    """
    Generate observation matrix and vectors
    Y, F

    Those observations are trimed for the leave-one-out evaluation. Therefore, the leaveOut 
    indicates the CA id to be left out, ranging from 1-77
    """
    des, X = generate_corina_features('ca')
    X = np.delete(X, leaveOut - 1, 0)
    popul = X[:, 0].reshape(X.shape[0], 1)
    pvt = X[:, 2]  # poverty index of each CA

    #    poi_cnt = getFourSquareCount(leaveOut)
    #    poi_cnt = np.divide(poi_cnt, popul) * 10000

    poi_dist = getFourSquarePOIDistribution(leaveOut)
    poi_dist = np.divide(poi_dist, popul) * 10000

    F_dist = generate_geographical_SpatialLag_ca(leaveOut=leaveOut)
    F_flow = generate_transition_SocialLag(year=2010,
                                           lehd_type=0,
                                           region='ca',
                                           leaveOut=leaveOut)
    F_taxi = getTaxiFlow(leaveOut=leaveOut)

    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.delete(Y, leaveOut - 1, 0)
    Y = np.divide(Y, popul) * 10000

    F = []
    n = Y.size
    Yd = []
    for i in range(n):
        for j in range(n):
            if i != j:
                wij = np.array([
                    F_dist[i, j],
                    actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i, j],
                    F_taxi[i, j]
                ])
                #                fij = np.concatenate( (X[i], poi_dist[i],  wij * Y[j][0]), 0)
                fij = np.concatenate((X[i], wij * Y[j][0]), 0)
                F.append(fij)
                Yd.append(Y[i])
    F = np.array(F)
    F = np.append(F, np.ones((F.shape[0], 1)), axis=1)  # np.append returns a new array; assign it, or the intercept column is lost
    Yd = np.array(Yd)
    Yd.resize((Yd.size, 1))

    return Yd, F
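The double loop above builds one row per ordered pair of distinct community areas, so with one CA left out (n = 76) F has n*(n-1) = 5700 rows. A self-contained sketch of the same pairwise construction on toy data:

import numpy as np

n, d = 4, 2                       # toy setup: 4 regions, 2 static features
X = np.arange(n * d).reshape(n, d).astype(float)
Y = np.arange(1, n + 1).reshape(n, 1).astype(float)
W = np.random.rand(n, n)          # one toy interaction weight per pair

rows = []
for i in range(n):
    for j in range(n):
        if i != j:
            # static features of i, plus the weighted value of j
            rows.append(np.concatenate((X[i], [W[i, j] * Y[j, 0]])))
F = np.array(rows)
assert F.shape == (n * (n - 1), d + 1)   # one row per ordered pair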
Example #3
def explore_POI_dist():
    des, X = generate_corina_features('ca')
    popul = X[:,0].reshape(X.shape[0],1)
    poi_dist = getFourSquarePOIDistribution()
#    poi_dist = np.divide(poi_dist, popul) * 10000
    
    avgd = np.sum(poi_dist, axis=0) / poi_dist.shape[0]
#    plot(avgd)
    cnt = 0
    for row in poi_dist:
        if cnt % 5 == 0:
            figure()
            title('{0} - {1}'.format(cnt, cnt + 4))
        plot(row)
        cnt += 1
Example #4
def explore_POI_dist():
    des, X = generate_corina_features('ca')
    popul = X[:, 0].reshape(X.shape[0], 1)
    poi_dist = getFourSquarePOIDistribution()
    #    poi_dist = np.divide(poi_dist, popul) * 10000

    avgd = np.sum(poi_dist, axis=0) / poi_dist.shape[0]
    #    plot(avgd)
    cnt = 0
    for row in poi_dist:
        if cnt % 5 == 0:
            figure()
            title('{0} - {1}'.format(cnt, cnt + 4))
        plot(row)
        cnt += 1
Example #5
def leaveOneOut_Input_v4( leaveOut ):
    """
    Generate observation matrix and vectors
    Y, F

    Those observations are trimed for the leave-one-out evaluation. Therefore, the leaveOut 
    indicates the CA id to be left out, ranging from 1-77
    """
    des, X = generate_corina_features('ca')
    X = np.delete(X, leaveOut-1, 0)
    popul = X[:,0].reshape(X.shape[0],1)
    pvt = X[:,2]    # poverty index of each CA
    
#    poi_cnt = getFourSquareCount(leaveOut)
#    poi_cnt = np.divide(poi_cnt, popul) * 10000
    
    poi_dist = getFourSquarePOIDistribution(leaveOut)
    poi_dist = np.divide(poi_dist, popul) * 10000
    
    F_dist = generate_geographical_SpatialLag_ca( leaveOut=leaveOut )
    F_flow = generate_transition_SocialLag(year=2010, lehd_type=0, region='ca', leaveOut=leaveOut)
    F_taxi = getTaxiFlow(leaveOut = leaveOut)
    
    
    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.delete(Y, leaveOut-1, 0)
    Y = np.divide(Y, popul) * 10000
    
    F = []
    n = Y.size
    Yd = []
    for i in range(n):
        for j in range(n):
            if i != j:
                wij = np.array( [F_dist[i,j], 
                                actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i,j],
                                F_taxi[i,j] ])
#                fij = np.concatenate( (X[i], poi_dist[i],  wij * Y[j][0]), 0)
                fij = np.concatenate( (X[i],   wij * Y[j][0]), 0)
                F.append(fij)
                Yd.append(Y[i])
    F = np.array(F)
    F = np.append(F, np.ones( (F.shape[0], 1) ), axis=1)  # np.append returns a new array; assign it, or the intercept column is lost
    Yd = np.array(Yd)
    Yd.resize( (Yd.size, 1) )
    
    
    return Yd, F
Example #6
def correlation_POIdist_crime():
    """
    we calculate the correlation between POI distribution and crime for each
    community area(CA).
    Within each CA, the crime count is number of crime in each tract.
    The POI count is number of POIs in each tract.
    """
    tracts = Tract.createAllTractObjects()
    ordkey = sorted(tracts.keys())
    CAs = {}
    for key, val in tracts.items():
        if val.CA not in CAs:
            CAs[val.CA] = [key]
        else:
            CAs[val.CA].append(key)
    
    Y = retrieve_crime_count(2010, col=['total'], region='tract')
    poi_dist = getFourSquarePOIDistribution(gridLevel='tract')
    
    
    Pearson = {}
    for cakey, calist in CAs.items():
        crime = []
        pois = []
        for tractkey in calist:
            crime.append(Y[tractkey])
            pois.append(poi_dist[ordkey.index(tractkey)])
        # calculate correlation
        pois = np.array(pois)
        crime = np.array(crime)
        pearson = []
        for i in range(pois.shape[1]):
            r = np.vstack( (pois[:,i], crime) )
            pearson.append( np.corrcoef(r)[0,1] )
            
        Pearson[cakey] = np.nan_to_num( pearson )

    P = []
    for key in range(1, 78):
        P.append(Pearson[key])
    
    np.savetxt("../R/poi_correlation_ca.csv", P, delimiter=",")
    return np.array(P)
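For reference, a minimal self-contained sketch of the per-category Pearson computation used above: np.corrcoef on a two-row stack returns a 2x2 matrix whose off-diagonal entry is the correlation, and np.nan_to_num guards against constant columns, whose zero variance yields NaN.

import numpy as np

pois = np.array([[1., 0.], [2., 0.], [3., 0.]])   # 3 tracts, 2 POI categories
crime = np.array([10., 20., 30.])

pearson = []
for i in range(pois.shape[1]):
    r = np.vstack((pois[:, i], crime))
    pearson.append(np.corrcoef(r)[0, 1])   # off-diagonal entry = Pearson r
print(np.nan_to_num(pearson))              # [1. 0.]; the zero-variance
                                           # column's NaN becomes 0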
Example #7
def extract_raw_samples(year=2010, crime_t=['total'], crime_rate=True):
    """
    Extract all samples with raw labels and features. Return None if the 
    corresponding feature is not selected.
    
    This function is called once only to avoid unnecessary disk I/O.
    
    Input:
    year        - which year to study
    crime_t     - crime types of interest, e.g. 'total'
    crime_rate  - predict crime_rate or not (count)
    
    Output:
    Y - crime rate / count
    D - demo feature
    P - POI feature
    Tf - taxi flow matrix (count)
    Gd - geo weight matrix
    """
    # Crime count
    y_cnt = retrieve_crime_count(year, col = crime_t)
    
    # Crime rate / count
    demo = generate_corina_features()
    population = demo[1][:,0].reshape(demo[1].shape[0], 1)
    Y = y_cnt / population * 10000 if crime_rate else y_cnt
    assert Y.shape == (N, 1)  # N: number of community areas (77)
    
    # Demo features
    D = demo[1]
    
    # POI features
    P = getFourSquarePOIDistribution(useRatio=False)
    
    # Taxi flow matrix
    Tf = getTaxiFlow(normalization="none")
    
    # Geo weight matrix
    Gd = generate_geographical_SpatialLag_ca()
    
    return Y, D, P, Tf, Gd
Example #8
def extract_raw_samples(year=2010, crime_t=['total'], crime_rate=True):
    """
    Extract all samples with raw labels and features. Return None if the 
    corresponding feature is not selected.
    
    This function is called once only to avoid unnecessary disk I/O.
    
    Input:
    year        - which year to study
    crime_t     - crime types of interest, e.g. 'total'
    crime_rate  - predict crime_rate or not (count)
    
    Output:
    Y - crime rate / count
    D - demo feature
    P - POI feature
    Tf - taxi flow matrix (count)
    Gd - geo weight matrix
    """
    # Crime count
    y_cnt = retrieve_crime_count(year, col = crime_t)
    
    # Crime rate / count
    demo = generate_corina_features()
    population = demo[1][:,0].reshape(demo[1].shape[0], 1)
    Y = y_cnt / population * 10000 if crime_rate else y_cnt
    assert(Y.shape == (77,1))
    
    # Demo features
    D = demo[1]
    
    # POI features
    P = getFourSquarePOIDistribution(useRatio=False)
    
    # Taxi flow matrix
    Tf = getTaxiFlow(normalization="none")
    
    # Geo weight matrix
    Gd = generate_geographical_SpatialLag_ca()
    
    return Y, D, P, Tf, Gd
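A hedged usage sketch for extract_raw_samples: downstream code in the other examples turns the two matrices into lag features by multiplying them with the label vector.

import numpy as np

# Assumes the project modules are importable; the year is illustrative.
Y, D, P, Tf, Gd = extract_raw_samples(year=2010, crime_t=['total'])

f_spatial = np.dot(Gd, Y)   # geographic spatial lag of the crime rate
f_taxi = np.dot(Tf, Y)      # taxi-flow lag of the crime rate
features = np.concatenate((D, f_spatial, f_taxi, P), axis=1)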
Example #9
def generateInput_v4(fout=False):
    """
    Generate complete observation matrix
    """
    des, X = generate_corina_features('ca')
    pvt = X[:, 2]  # poverty index of each CA
    popul = X[:, 0].reshape(X.shape[0], 1)

    #    poi_cnt = getFourSquareCount()
    #    poi_cnt = np.divide(poi_cnt, popul) * 10000

    poi_dist = getFourSquarePOIDistribution()
    poi_dist = np.divide(poi_dist, popul) * 10000

    F_dist = generate_geographical_SpatialLag_ca()
    F_flow = generate_transition_SocialLag(year=2010, lehd_type=0, region='ca')
    F_taxi = getTaxiFlow()

    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.divide(Y, popul) * 10000

    F = []
    n = Y.size
    for i in range(n):
        for j in range(n):
            if i != j:
                wij = np.array([
                    F_dist[i, j],
                    actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i, j],
                    F_taxi[i, j]
                ])
                #                fij = np.concatenate( (X[i], poi_dist[i], wij * Y[j][0]) , 0)
                fij = np.concatenate((X[i], wij * Y[j, 0]), 0)
                F.append(fij)
    F = np.array(F)
    F = np.append(F, np.ones((F.shape[0], 1)), axis=1)  # np.append returns a new array; assign it, or the intercept column is lost

    if fout:
        np.savetxt('../matlab/F.csv', F, delimiter=',')

    return Y, F
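A note on the intercept line fixed above: np.append never modifies its argument; it returns a new array, which the original listing discarded. A quick self-contained check:

import numpy as np

F = np.zeros((3, 2))
np.append(F, np.ones((3, 1)), axis=1)       # result discarded: F is unchanged
assert F.shape == (3, 2)
F = np.append(F, np.ones((3, 1)), axis=1)   # assign to keep the new column
assert F.shape == (3, 3)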
Example #10
def correlation_POI_crime(gridLevel='tract', poiRatio=False):
    """
    calculate correlation for different POI category
    """
    Y = retrieve_crime_count(2010, col=['total'], region=gridLevel)
    h, D = generate_corina_features(region='ca')
    popul = D[:,0].reshape(D.shape[0],1)
    poi_dist = getFourSquarePOIDistribution(gridLevel=gridLevel, useRatio=poiRatio)
    cate_label = ['Food', 'Residence', 'Travel', 'Arts & Entertainment', 
                'Outdoors & Recreation', 'College & Education', 'Nightlife', 
                'Professional', 'Shops', 'Event']
    
    if gridLevel == 'tract':
        tracts = Tract.createAllTractObjects()
        ordkey = sorted(tracts.keys())

        crime = []
        pois = []
        for tractkey in ordkey:
            crime.append(Y[tractkey])
            pois.append(poi_dist[ordkey.index(tractkey)])
        
        pois = np.array(pois)
        crime = np.array(crime)
    
        for i in range(pois.shape[1]):
            r = np.vstack( (pois[:,i], crime) )
            pcc = np.corrcoef(r)[0,1]
            print pcc
            
    elif gridLevel == 'ca':
        Y = np.divide(Y, popul) * 10000
        Y = Y.reshape( (len(Y),) )
        poi_dist = np.transpose(poi_dist)
        
        for i in range(poi_dist.shape[0]):
            poi = np.reshape(poi_dist[i,:], Y.shape )
            r, p = pearsonr(poi, Y)
            print cate_label[i], r, p
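For comparison with the np.corrcoef branch, scipy.stats.pearsonr (used in the 'ca' branch above) returns both the correlation and its two-sided p-value; a self-contained sketch on toy data:

import numpy as np
from scipy.stats import pearsonr

poi = np.array([5., 1., 4., 2., 8.])
crime_rate = np.array([50., 12., 39., 25., 80.])

r, p = pearsonr(poi, crime_rate)
print("r = {0:.3f}, p = {1:.3f}".format(r, p))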
Example #11
def generateInput_v4(fout=False):
    """
    Generate complete observation matrix
    """
    des, X = generate_corina_features('ca')
    pvt = X[:,2]    # poverty index of each CA
    popul = X[:,0].reshape(X.shape[0],1)
    
#    poi_cnt = getFourSquareCount()
#    poi_cnt = np.divide(poi_cnt, popul) * 10000
    
    poi_dist = getFourSquarePOIDistribution()
    poi_dist = np.divide(poi_dist, popul) * 10000
    
    F_dist = generate_geographical_SpatialLag_ca()
    F_flow = generate_transition_SocialLag(year=2010, lehd_type=0, region='ca')
    F_taxi = getTaxiFlow()

    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.divide(Y, popul) * 10000

    F = []
    n = Y.size
    for i in range(n):
        for j in range(n):
            if i != j:
                wij = np.array( [F_dist[i,j], 
                                actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i,j],
                                F_taxi[i,j] ] )
#                fij = np.concatenate( (X[i], poi_dist[i], wij * Y[j][0]) , 0)
                fij = np.concatenate( (X[i], wij * Y[j,0]) , 0)
                F.append(fij)
    F = np.array(F)
    F = np.append(F, np.ones( (F.shape[0], 1) ), axis=1)  # np.append returns a new array; assign it, or the intercept column is lost

    if fout:
        np.savetxt('../matlab/F.csv', F, delimiter=',')

    return Y, F
Example #12
def line_POI_crime():
    d = getFourSquarePOIDistribution(gridLevel='ca')
    y = retrieve_crime_count(2010, col=['total'], region='ca')
    h, D = generate_corina_features(region='ca')
    popul = D[:,0].reshape(D.shape[0],1)
    
    hd = getFourSquarePOIDistributionHeader()
    yhat = np.divide(y, popul) * 10000
    
    for i in range(6,8):
        plt.figure()
        plt.scatter(d[:,i], y)
        plt.xlim(0, 1000)
        plt.xlabel('POI count -- {0} category'.format(hd[i]))
        plt.ylabel('Crime count')

        plt.figure()
        plt.scatter(d[:,i], yhat)
        plt.xlim(0, 1000)
        plt.xlabel('POI count -- {0} category'.format(hd[i]))
        plt.ylabel('Crime rate (per 10,000)')
Example #13
def permutationTest_accuracy(iters, permute='taxiflow'):
    """
    Evaluate crime rate
    
    use full feature set:
        Corina, spaitallag, taxiflow, POIdist
    evaluate on 2013
    
    at CA level
    
    leave one out
    
    permutation
        permute one feature 1000 times takes roughly 30-40 minutes.
        The results are dumped as "permute-{feature}.pickle"
    """
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=2013)

    C = generate_corina_features()
    D = C[1]

    popul = C[1][:, 0].reshape(C[1].shape[0], 1)
    Y = np.divide(Y, popul) * 10000

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    nb_mae = []
    nb_mre = []
    lr_mae = []
    lr_mre = []
    for i in range(iters):
        if permute == 'corina':
            D = np.random.permutation(D)
        elif permute == 'spatiallag':
            yhat = np.random.permutation(Y)
            f2 = np.dot(W2, yhat)
        elif permute == 'taxiflow':
            yhat = np.random.permutation(Y)
            ftaxi = np.dot(F_taxi, yhat)  # use the permuted yhat, not Y, so the permutation takes effect
        elif permute == 'POIdist':
            poi_dist = np.random.permutation(poi_dist)
        f = np.ones(f2.shape)
        f = np.concatenate((f, D, f2, ftaxi, poi_dist), axis=1)
        header = ['intercept'] + C[0] + [ 'spatiallag', 'taxiflow'] + \
            ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment',
                           'POI outdoors recreation', 'POI education', 'POI nightlife',
                           'POI professional', 'POI shops', 'POI event']
        df = pd.DataFrame(f, columns=header)

        np.savetxt("Y.csv", Y, delimiter=",")
        df.to_csv("f.csv", sep=",", index=False)

        # NB permute
        nbres = subprocess.check_output(['Rscript', 'nbr_eval.R', 'ca'])
        ls = nbres.split(' ')
        nb_mae.append(float(ls[0]))
        nb_mre.append(float(ls[2]))

        mae2, mre2 = permutation_Test_LR(Y, f)
        lr_mae.append(mae2)
        lr_mre.append(mre2)

        if i % 10 == 0:
            print i

    print '{0} iterations finished.'.format(iters)
    print pvalue(412.305, lr_mae), pvalue(0.363, lr_mre), \
        pvalue(319.86, nb_mae), pvalue(0.281, nb_mre)
    return nb_mae, nb_mre, lr_mae, lr_mre
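The pvalue helper is not shown in these listings; a plausible minimal implementation (an assumption, not the project's code) computes the empirical one-sided p-value of the observed error against the permutation null distribution:

def pvalue(observed, null_samples):
    # Hypothetical helper: fraction of permuted-feature errors that are at
    # most as small as the observed error. A small value means the real
    # feature beats its permuted versions, i.e. it carries signal.
    cnt = sum(1 for e in null_samples if e <= observed)
    return float(cnt) / len(null_samples)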
Example #14
def leaveOneOut_evaluation_onChicagoCrimeData(year=2010,
                                              features=["all"],
                                              crime_t=['total'],
                                              flow_type=0,
                                              verboseoutput=False,
                                              region='ca',
                                              weightSocialFlow=True,
                                              useRate=True,
                                              logFeatures=[]):
    """
    Generate the social lag from previous year
    use income/race/education of current year
    """
    warnings.warn("The leave one out in nbr_eval.R is unfair")
    if 'sociallag' in features:
        W = generate_transition_SocialLag(year,
                                          lehd_type=flow_type,
                                          region=region,
                                          normalization='pair')

    # add POI distribution and taxi flow
    poi_dist = getFourSquarePOIDistribution(useRatio=False, gridLevel=region)
    F_taxi = getTaxiFlow(normalization="bydestination", gridLevel=region)

    if region == 'ca':
        W2 = generate_geographical_SpatialLag_ca()

        Yhat = retrieve_crime_count(year - 1, col=crime_t)
        #        h = retrieve_health_data()
        #        Y = h[0].reshape((77,1))
        Y = retrieve_crime_count(year, col=crime_t)
        C = generate_corina_features()
        popul = C[1][:, 0].reshape(C[1].shape[0], 1)

        if 'sociallag' in features:
            """ use poverty demographics to weight social lag """
            wC = 28  # 130.0 if useRate else 32.0     # constant parameter
            if weightSocialFlow:
                poverty = C[1][:, 2]
                for i in range(W.shape[0]):
                    for j in range(W.shape[1]):
                        W[i][j] *= np.exp(-np.abs(poverty[i] - poverty[j]) /
                                          wC)

        # crime count is normalized by the total population as crime rate
        # here we use the crime count per 10 thousand residents
        if useRate:
            Y = np.divide(Y, popul) * 10000
            Yhat = np.divide(Yhat, popul) * 10000
    elif region == 'tract':
        W2, tractkey = generate_geographical_SpatialLag()

        Yhat_map = retrieve_crime_count(year - 1, col=crime_t, region='tract')
        Yhat = np.array([Yhat_map[k]
                         for k in tractkey]).reshape(len(Yhat_map), 1)

        Y_map = retrieve_crime_count(year, col=crime_t, region='tract')
        Y = np.array([Y_map[k] for k in tractkey]).reshape(len(Y_map), 1)

        C = generate_corina_features(region='tract')
        C_mtx = []
        cnt = 0

        for k in tractkey:
            if k in C[1]:
                C_mtx.append(C[1][k])
            else:
                cnt += 1
                C_mtx.append([0 for i in range(7)])

        C = (C[0], np.array(C_mtx))

        # at tract level we don't normalize by population, since the tract is
        # defined as region with around 2000 population
        if useRate:
            pass

    i = retrieve_income_features()
    e = retrieve_education_features()
    r = retrieve_race_features()

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    # add intercept
    columnName = ['intercept']
    f = np.ones(f2.shape)
    lrf = np.copy(f)

    if "all" in features:
        f = np.concatenate((f, f1, i[1], e[1], r[1]), axis=1)
        f = pd.DataFrame(f, columns=['social lag'] + i[0] + e[0] + r[0])
    if "sociallag" in features:
        f1 = np.dot(W, Y)
        if 'sociallag' in logFeatures:
            f = np.concatenate((f, np.log(f1)), axis=1)
        else:
            f = np.concatenate((f, f1), axis=1)
        lrf = np.concatenate((f, f1), axis=1)
        columnName += ['social lag']
    if "income" in features:
        f = np.concatenate((f, i[1]), axis=1)
        lrf = np.concatenate((f, i[1]), axis=1)
        columnName += i[0]
    if "race" in features:
        f = np.concatenate((f, r[1]), axis=1)
        lrf = np.concatenate((f, r[1]), axis=1)
        columnName += r[0]
    if "education" in features:
        f = np.concatenate((f, e[1]), axis=1)
        lrf = np.concatenate((f, e[1]), axis=1)
        columnName += e[0]
    if 'corina' in features:
        f = np.concatenate((f, C[1]), axis=1)
        lrf = np.concatenate((f, C[1]), axis=1)
        columnName += C[0]
    if 'spatiallag' in features:
        if 'spatiallag' in logFeatures:
            f = np.concatenate((f, np.log(f2)), axis=1)
        else:
            f = np.concatenate((f, f2), axis=1)
        lrf = np.concatenate((f, f2), axis=1)
        columnName += ['spatial lag']
    if 'taxiflow' in features:
        if 'taxiflow' in logFeatures:
            f = np.concatenate((f, np.log(ftaxi)), axis=1)
        else:
            f = np.concatenate((f, ftaxi), axis=1)
        lrf = np.concatenate((f, ftaxi), axis=1)
        columnName += ['taxi flow']
    if 'POIdist' in features:
        f = np.concatenate((f, poi_dist), axis=1)
        lrf = np.concatenate((f, poi_dist), axis=1)
        columnName += [
            'POI food', 'POI residence', 'POI travel',
            'POI arts entertainment', 'POI outdoors recreation',
            'POI education', 'POI nightlife', 'POI professional', 'POI shops',
            'POI event'
        ]

    if 'temporallag' in features:
        f = np.concatenate((f, np.log(Yhat)), axis=1)
        lrf = np.concatenate((f, Yhat), axis=1)
        columnName += ['temporal lag']

    nbres = NB_training_R(f, columnName, Y, region, verboseoutput)
    print NB_training_python(f, Y)
    mae2, var2, mre2 = LR_training_python(lrf, Y, verboseoutput)

    if verboseoutput:
        print "Linear Regression MAE", mae2, "std", var2, "MRE", mre2
    else:
        print nbres
        print mae2, var2, mre2
        return np.array([[float(ele) for ele in nbres.split(" ")],
                         [mae2, var2, mre2]])
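The poverty weighting of the social-flow matrix above can be vectorized; a sketch equivalent to the i/j double loop, on toy data:

import numpy as np

n = 5
W = np.random.rand(n, n)            # toy social-flow matrix
poverty = np.random.rand(n) * 40    # toy poverty index per CA
wC = 28.0                           # same constant as in the listing

# np.subtract.outer builds the pairwise |poverty_i - poverty_j| matrix,
# so one elementwise product replaces the double loop.
W = W * np.exp(-np.abs(np.subtract.outer(poverty, poverty)) / wC)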
Example #15
def permutationTest_accuracy(iters, permute='taxiflow'):
    """
    Evaluate crime rate
    
    use full feature set:
        Corina, spaitallag, taxiflow, POIdist
    evaluate on 2013
    
    at CA level
    
    leave one out
    
    permutation
        permute one feature 1000 times takes roughly 30-40 minutes.
        The results are dumped as "permute-{feature}.pickle"
    """
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=2013)

    C = generate_corina_features()
    D = C[1]

    popul = C[1][:,0].reshape(C[1].shape[0],1)
    Y = np.divide(Y, popul) * 10000

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    nb_mae = []
    nb_mre = []
    lr_mae = []
    lr_mre = []
    for i in range(iters):
        if permute == 'corina':
            D = np.random.permutation(D)
        elif permute == 'spatiallag':
            yhat = np.random.permutation(Y)
            f2 = np.dot(W2, yhat)
        elif permute == 'taxiflow':
            yhat = np.random.permutation(Y)
            ftaxi = np.dot(F_taxi, yhat)  # use the permuted yhat, not Y, so the permutation takes effect
        elif permute == 'POIdist':
            poi_dist = np.random.permutation(poi_dist)
        f = np.ones(f2.shape)
        f = np.concatenate( (f, D, f2, ftaxi, poi_dist), axis=1 )
        header = ['intercept'] + C[0] + [ 'spatiallag', 'taxiflow'] + \
            ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment', 
                           'POI outdoors recreation', 'POI education', 'POI nightlife', 
                           'POI professional', 'POI shops', 'POI event']
        df = pd.DataFrame(f, columns=header)
        
        np.savetxt("Y.csv", Y, delimiter=",")
        df.to_csv("f.csv", sep=",", index=False)
        
        # NB permute
        nbres = subprocess.check_output( ['Rscript', 'nbr_eval.R', 'ca'] )
        ls = nbres.split(' ')
        nb_mae.append( float(ls[0]) )
        nb_mre.append( float(ls[2]) )

        mae2, mre2 = permutation_Test_LR(Y, f)
        lr_mae.append(mae2)
        lr_mre.append(mre2)
        
        if i % 10 == 0:
            print i
        
    print '{0} iterations finished.'.format(iters)
    print pvalue(412.305, lr_mae), pvalue(0.363, lr_mre), \
        pvalue(319.86, nb_mae), pvalue(0.281, nb_mre)
    return nb_mae, nb_mre, lr_mae, lr_mre
Example #16
def leaveOneOut_evaluation_onChicagoCrimeData(year=2010, features=["all"],
                                              crime_t=['total'], flow_type=0,
                                              verboseoutput=False, region='ca',
                                              weightSocialFlow=True,
                                              useRate=True, logFeatures=[]):
    """
    Generate the social lag from previous year
    use income/race/education of current year
    """
    warnings.warn("The leave one out in nbr_eval.R is unfair")
    if 'sociallag' in features:
        W = generate_transition_SocialLag(year, lehd_type=flow_type, region=region,
                                          normalization='pair')
    
    
    # add POI distribution and taxi flow
    poi_dist = getFourSquarePOIDistribution(useRatio=False, gridLevel=region)
    F_taxi = getTaxiFlow(normalization="bydestination", gridLevel=region)
        
        
    if region == 'ca':
        W2 = generate_geographical_SpatialLag_ca()
        
        Yhat = retrieve_crime_count(year-1, col = crime_t)
#        h = retrieve_health_data()
#        Y = h[0].reshape((77,1))
        Y = retrieve_crime_count(year, col = crime_t)
        C = generate_corina_features()
        popul = C[1][:,0].reshape(C[1].shape[0],1)
        
        
        if 'sociallag' in features:
            """ use poverty demographics to weight social lag """
            wC = 28 # 130.0 if useRate else 32.0     # constant parameter
            if weightSocialFlow:
                poverty = C[1][:,2]        
                for i in range(W.shape[0]):
                    for j in range (W.shape[1]):
                        W[i][j] *= np.exp( - np.abs(poverty[i] - poverty[j]) / wC )
        
        # crime count is normalized by the total population as crime rate
        # here we use the crime count per 10 thousand residents
        if useRate:
            Y = np.divide(Y, popul) * 10000
            Yhat = np.divide(Yhat, popul) * 10000
    elif region == 'tract':
        W2, tractkey = generate_geographical_SpatialLag()
    
        Yhat_map = retrieve_crime_count(year-1, col = crime_t, region='tract')
        Yhat = np.array( [Yhat_map[k] for k in tractkey] ).reshape( len(Yhat_map), 1)
        
        Y_map = retrieve_crime_count(year, col = crime_t, region='tract')
        Y = np.array( [Y_map[k] for k in tractkey] ).reshape( len(Y_map), 1 )
        
        C = generate_corina_features(region='tract')
        C_mtx = []
        cnt = 0
        
        for k in tractkey:
            if k in C[1]:
                C_mtx.append(C[1][k])
            else:
                cnt += 1
                C_mtx.append( [0 for i in range(7)] )
        
        C = ( C[0], np.array( C_mtx ) )
        
        
        # at tract level we don't normalize by population, since the tract is
        # defined as region with around 2000 population
        if useRate:
            pass

    i = retrieve_income_features()
    e = retrieve_education_features()
    r = retrieve_race_features()
    
    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)
    
    
    # add intercept
    columnName = ['intercept']
    f = np.ones(f2.shape)
    lrf = np.copy(f)

    if "all" in features:
        f = np.concatenate( (f, f1, i[1], e[1], r[1]), axis=1)
        f = pd.DataFrame(f, columns=['social lag'] + i[0] + e[0] + r[0])
    if "sociallag" in features:        
        f1 = np.dot(W, Y)
        if 'sociallag' in logFeatures:
            f = np.concatenate( (f, np.log(f1)), axis=1 )
        else:
            f = np.concatenate( (f, f1), axis=1)
        lrf = np.concatenate( (f, f1), axis=1)
        columnName += ['social lag']
    if  "income" in features:
        f = np.concatenate( (f, i[1]), axis=1)
        lrf = np.concatenate( (f, i[1]), axis=1)
        columnName += i[0]
    if "race" in features:
        f = np.concatenate( (f, r[1]), axis=1)
        lrf = np.concatenate( (f, r[1]), axis=1)
        columnName += r[0]
    if "education" in features :
        f = np.concatenate( (f, e[1]), axis=1)
        lrf = np.concatenate( (f, e[1]), axis=1)
        columnName += e[0]
    if 'corina' in features :
        f = np.concatenate( (f, C[1]), axis=1)
        lrf = np.concatenate( (f, C[1]), axis=1)
        columnName += C[0]
    if 'spatiallag' in features:
        if 'spatiallag' in logFeatures:
            f = np.concatenate( (f, np.log(f2)), axis=1)
        else:
            f = np.concatenate( (f, f2), axis=1)
        lrf = np.concatenate( (f, f2), axis=1)
        columnName += ['spatial lag']
    if 'taxiflow' in features:
        if 'taxiflow' in logFeatures:
            f = np.concatenate( (f, np.log(ftaxi)), axis=1 )
        else:
            f = np.concatenate( (f, ftaxi), axis=1 )
        lrf = np.concatenate( (f, ftaxi), axis=1 )
        columnName += ['taxi flow']
    if 'POIdist' in features:
        f = np.concatenate( (f, poi_dist), axis=1 )
        lrf = np.concatenate( (f, poi_dist), axis=1 )
        columnName += ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment', 
                       'POI outdoors recreation', 'POI education', 'POI nightlife', 
                       'POI professional', 'POI shops', 'POI event']

    
    if 'temporallag' in features:
        f = np.concatenate( (f, np.log(Yhat)), axis=1)
        lrf = np.concatenate( (f, Yhat), axis=1)
        columnName += ['temporal lag']
        
    nbres = NB_training_R(f, columnName, Y, region, verboseoutput)
    print NB_training_python(f, Y)
    mae2, var2, mre2 = LR_training_python(lrf, Y, verboseoutput)
    
    if verboseoutput:
        print "Linear Regression MAE", mae2, "std", var2, "MRE", mre2
    else:
        print nbres
        print mae2, var2, mre2
        return np.array([[float(ele) for ele in nbres.split(" ")], [mae2, var2, mre2]])
Example #17
def NB_coefficients(year=2010):
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=year)
    C = generate_corina_features()
    D = C[1]

    popul = C[1][:,0].reshape(C[1].shape[0],1)
    Y = np.divide(Y, popul) * 10000
    
    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)
    
    f = np.concatenate( (D, f2, ftaxi, poi_dist), axis=1 )
    mms = MinMaxScaler(copy=False)
    f = mms.fit_transform(f)  # copy=False: f is scaled in place
    header = C[0] + [ 'spatiallag', 'taxiflow'] + \
        ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment', 
                       'POI outdoors recreation', 'POI education', 'POI nightlife', 
                       'POI professional', 'POI shops', 'POI event']
    df = pd.DataFrame(f, columns=header)
    
    np.savetxt("Y.csv", Y, delimiter=",")
    df.to_csv("f.csv", sep=",", index=False)
    
    # NB permute
    nbres = subprocess.check_output( ['Rscript', 'nbr_eval.R', 'ca', 'coefficient'] )
    print nbres
    
    ls = nbres.strip().split(" ")
    coef = [float(e) for e in ls]
    print coef
    return coef, header