def generate_raw_samples(year=2012):
    """
    Generate raw features for all samples.

    NOTE(review): this definition is immediately shadowed by a second
    ``generate_raw_samples`` defined below in this module, so this version
    (which loads the ``*.vec`` embedding files) is effectively dead code
    unless the duplicate is removed or renamed.

    Parameters
    ----------
    year : int
        Year of the raw samples to extract (default 2012).

    Returns
    -------
    Y : Numpy.Array
        Crime counts
    D : Numpy.Array
        Demo features
    P : Numpy.Array
        POI features
    T : Numpy.Array
        Taxi flow graph embedding
    G : Numpy.Array
        Geographic graph embedding
    """
    # The raw taxi-flow and geo matrices returned by extract_raw_samples are
    # deliberately discarded; graph embeddings loaded from disk replace them.
    Y, D, P, _, _ = extract_raw_samples(year)
    T = get_graph_embedding_features('taxi-CA-static.vec')
    G = get_graph_embedding_features('geo-CA.vec')
    return Y, D, P, T, G
def generate_raw_samples(year=2014):
    """
    Generate raw features for all samples.

    NOTE(review): this redefinition shadows an earlier function of the same
    name in this module; the earlier version (which uses the ``*.vec``
    embedding files) becomes unreachable.

    Parameters
    ----------
    year : int
        Year of the raw samples to extract. Defaults to 2014, the value the
        original implementation hard-coded, so existing zero-argument
        callers are unaffected.

    Returns
    -------
    Y : Numpy.Array
        Crime counts
    D : Numpy.Array
        Demo features
    P : Numpy.Array
        POI features
    T : Numpy.Array
        Taxi flow graph embedding
    G : Numpy.Array
        Geographic graph embedding
    """
    # The raw taxi-flow and geo matrices returned by extract_raw_samples are
    # deliberately discarded; graph embeddings loaded from disk replace them.
    Y, D, P, _, _ = extract_raw_samples(year=year)
    T = get_graph_embedding_features('taxi_all.txt')
    G = get_graph_embedding_features('geo_all.txt')
    return Y, D, P, T, G
def plot_hourly_crime():
    """
    Plot the average hourly crime rate for 2013-2015 and save the figure as
    ``crime-rate-hourly.pdf``.

    Hourly crime counts are loaded from per-year pickle files and normalized
    into a rate per 10,000 residents using the first demographic column as
    the population.
    """
    plt.rc("axes", linewidth=2)
    plt.figure(figsize=(8, 6))
    for year in range(2013, 2016):
        Y, D, P, T, G = extract_raw_samples(year)
        population = D[:, 0]

        # Use a context manager in binary mode: the original
        # pickle.load(open(...)) leaked the file handle and relied on
        # platform-dependent text-mode reads.
        with open("../chicago-hourly-crime-{0}.pickle".format(year),
                  "rb") as fin:
            Yh = pickle.load(fin)
        Yh = Yh / population * 10000
        if year == 2015:
            # presumably compensates for only half a year of 2015 data --
            # TODO confirm against the data source
            Yh = Yh * 2
        plt.plot(Yh.mean(axis=1), lw=3)

    plt.legend(["2013", "2014", "2015"], fontsize=20, loc='best')
    plt.xlabel("Hour in day", fontsize=20)
    plt.ylabel("Average crime rate", fontsize=24)
    plt.axis([0, 23, 10, 70])
    plt.gca().set_xticks([0, 6, 12, 18, 23])
    plt.gca().set_xticklabels(("0:00", "6:00", "12:00", "18:00", "23:00"))
    plt.grid(b=True, axis="both", lw=1)
    plt.tick_params(labelsize=18)
    plt.savefig("crime-rate-hourly.pdf")
def evaluate_various_embedding_features_with_lag_model(year, spatial):
    """
    Evaluate MF, LINE and HDGE flow-embedding features with the
    spatio-temporal lag model, fitting one model per hour of the day.

    The prediction target is average income (the hourly-crime and
    house-price targets are kept below as commented-out alternatives).

    Parameters
    ----------
    year : int or str
        Year of the raw samples (converted with ``int()``).
    spatial : str
        One of "nospatial", "onlyspatial", "usespatial"; any other value
        falls back to demographic + POI features only.

    Returns
    -------
    mf_mre, line_mre, dw_mre, mf_mae, line_mae, dw_mae : list
        Six lists, each with one error value per hour (0-23).
    """
    import nimfa  # hoisted: was re-imported on every loop iteration

    Y, D, P, T, G = extract_raw_samples(int(year))

    # predict hourly crime
    #    population = D[:,0]
    #    Yh = pickle.load(open("../chicago-hourly-crime-{0}.pickle".format(year)))
    #    Yh = Yh / population * 10000

    # predict average income
    header, income = retrieve_income_features()
    Yh = np.repeat(income[:, 0, None], 24, axis=1)
    Yh = Yh.T

    # predict average house price
    #    Yh = retrieve_averge_house_price()
    #    Yh = np.repeat(Yh[:,None], 24, axis=1)
    #    Yh = Yh.T

    assert Yh.shape == (24, N)

    # "rb" added: pickle files must be read in binary mode to be portable.
    with open("CAflowFeatures.pickle", "rb") as fin:
        mf = pickle.load(fin)
        line = pickle.load(fin)
        dwt = pickle.load(fin)
        dws = pickle.load(fin)
        hdge = pickle.load(fin)

    # The feature set does not depend on the hour -- select it once instead
    # of re-evaluating the same branch 24 times.
    if spatial == "nospatial":
        features_ = ['demo', 'poi', 'taxi']
    elif spatial == "onlyspatial":
        features_ = ['demo', 'poi', 'geo']
    elif spatial == "usespatial":
        features_ = ['demo', 'poi', 'geo', 'taxi']
    else:
        features_ = ["demo", "poi"]

    # Loop-invariant: the LINE geographic embedding comes from a static
    # file, so load it once instead of re-reading it on every hour.
    Gline = get_graph_embedding_features('geo_all.txt')

    mf_mre = []
    mf_mae = []
    line_mre = []
    line_mae = []
    dw_mre = []
    dw_mae = []
    for h in range(24):
        print(h)
        Yhat = Yh[h, :].reshape((N, 1))

        # MF models
        Tmf = mf[h]  # sum([e for e in mf.values()])
        # NOTE(review): the NMF factorization of G does not depend on h, but
        # it is kept inside the loop so each hour draws a fresh random
        # initialization, matching the original behavior.
        nmf = nimfa.Nmf(
            G, rank=4, max_iter=100
        )  #, update="divergence", objective="conn", conn_change=50)
        nmf_fit = nmf()
        src = nmf_fit.basis()
        dst = nmf_fit.coef()
        Gmf = np.concatenate((src, dst.T), axis=1)

        mae, mre = leaveOneOut_error(Yhat,
                                     D,
                                     P,
                                     similarityMatrix(Tmf),
                                     Yhat,
                                     keep_topk(similarityMatrix(Gmf), 20),
                                     Yhat,
                                     features=features_,
                                     taxi_norm="bydestination")
        mf_mre.append(mre)
        mf_mae.append(mae)
        print("MF MRE: {0}".format(mre))

        # LINE model
        Tline = line[h]  # sum([e for e in line.values()])
        mae, mre = leaveOneOut_error(Yhat,
                                     D,
                                     P,
                                     similarityMatrix(Tline),
                                     Yhat,
                                     keep_topk(similarityMatrix(Gline)),
                                     Yhat,
                                     features=features_,
                                     taxi_norm="bydestination")
        line_mre.append(mre)
        line_mae.append(mae)
        print("LINE_slotted MRE: {0}".format(mre))

        # deepwalk / HDGE: per-hour taxi (dwt) and spatial (dws) embeddings
        #        TGdw = dw[h] # sum([e for e in dw.values()])
        mae, mre = leaveOneOut_error(
            Yhat,
            D,
            P,
            similarityMatrix(dwt[h]),
            Yhat,
            similarityMatrix(dws[h]),
            Yhat,
            features=features_,  #['demo', 'poi', 'geo'],
            taxi_norm="none")
        dw_mre.append(mre)
        dw_mae.append(mae)
        print("HDGE MRE: {0}".format(mre))

    return mf_mre, line_mre, dw_mre, mf_mae, line_mae, dw_mae
def evaluate_various_flow_features_with_concatenation_model(year, spatial):
    """
    Evaluate MF, LINE and HDGE flow-embedding features with the
    feature-concatenation model, fitting one model per hour of the day.

    The prediction target is the hourly crime rate per 10,000 residents
    (population taken from the first demographic column).

    Parameters
    ----------
    year : int or str
        Year of the raw samples (converted with ``int()``).
    spatial : str
        One of "nospatial", "onlyspatial", "usespatial". Any other value
        raises NameError inside the loop (unchanged from the original);
        callers must pass a valid value.

    Returns
    -------
    mf_mre, line_mre, dw_mre, mf_mae, line_mae, dw_mae : list
        Six lists, each with one error value per hour (0-23).
    """
    import nimfa  # hoisted: was re-imported on every loop iteration

    Y, D, P, T, G = extract_raw_samples(int(year))
    population = D[:, 0]

    # Context manager fixes the leaked file handle of the original
    # pickle.load(open(...)); "rb" makes the pickle read portable.
    with open("../chicago-hourly-crime-{0}.pickle".format(year),
              "rb") as fin:
        Yh = pickle.load(fin)
    Yh = Yh / population * 10000
    assert Yh.shape == (24, N)

    with open("CAflowFeatures.pickle", "rb") as fin:
        mf = pickle.load(fin)
        line = pickle.load(fin)
        dwt = pickle.load(fin)
        dws = pickle.load(fin)
        hdge = pickle.load(fin)

    # Loop-invariant: the LINE geographic embedding comes from a static
    # file, so load it once instead of re-reading it on every hour.
    Gline = get_graph_embedding_features('geo_all.txt')

    mf_mre = []
    mf_mae = []
    line_mre = []
    line_mae = []
    dw_mre = []
    dw_mae = []
    for h in range(24):
        print(h)
        # Target vector for this hour, shared by all three models below.
        target = Yh[h, :].reshape((N, 1))

        # MF models
        Tmf = mf[h]  # sum([e for e in mf.values()])
        # NOTE(review): the NMF factorization of G does not depend on h, but
        # it is kept inside the loop so each hour draws a fresh random
        # initialization, matching the original behavior.
        nmf = nimfa.Nmf(
            G, rank=4, max_iter=100
        )  #, update="divergence", objective="conn", conn_change=50)
        nmf_fit = nmf()
        src = nmf_fit.basis()
        dst = nmf_fit.coef()
        Gmf = np.concatenate((src, dst.T), axis=1)

        if spatial == "nospatial":
            X = np.concatenate((D, P, Tmf), axis=1)
        elif spatial == "onlyspatial":
            X = np.concatenate((D, P, Gmf), axis=1)
        elif spatial == "usespatial":
            X = np.concatenate((D, P, Tmf, Gmf), axis=1)
        mre, mae = leaveOneOut_eval(X, target)
        mf_mre.append(mre)
        mf_mae.append(mae)
        print("MF MRE: {0}".format(mre))

        # LINE model
        Tline = line[h]  # sum([e for e in line.values()])
        if spatial == "nospatial":
            X = np.concatenate((D, P, Tline), axis=1)
        elif spatial == "onlyspatial":
            X = np.concatenate((D, P, Gline), axis=1)
        elif spatial == "usespatial":
            X = np.concatenate((D, P, Tline, Gline), axis=1)
        mre, mae = leaveOneOut_eval(X, target)
        line_mre.append(mre)
        line_mae.append(mae)
        print("LINE_slotted MRE: {0}".format(mre))

        # deepwalk / HDGE: pick the per-hour embedding matching the mode
        if spatial == 'nospatial':
            TGdw = dwt[h]  # sum([e for e in dw.values()])
        elif spatial == 'onlyspatial':
            TGdw = dws[h]
        elif spatial == 'usespatial':
            TGdw = hdge[h]
        X = np.concatenate((D, P, TGdw), axis=1)
        mre, mae = leaveOneOut_eval(X, target)
        dw_mre.append(mre)
        dw_mae.append(mae)
        print("HDGE MRE: {0}".format(mre))

    return mf_mre, line_mre, dw_mre, mf_mae, line_mae, dw_mae