def mode_purpose_comparison(context, df_syn, df_act, suffix = None):
    """Plot the (mode, purpose) distribution of the synthetic and actual data.

    Synthetic shares are the number of ``person_id`` entries of ``df_syn``
    per (``mode``, ``following_purpose``) pair. Actual shares are the sums
    of ``weight_person`` in ``df_act`` per (``mode``,
    ``destination_purpose``) pair, restricted to a fixed list of modes.
    Both are normalised to sum to 1 and passed to
    ``myplottools.plot_comparison_bar``.

    Side effect: rows of ``df_act`` whose mode is spelt ``'car_passanger'``
    are renamed to ``'car_passenger'`` in place, so the caller's frame is
    modified.

    Args:
        context: forwarded unchanged to the plotting helper.
        df_syn: synthetic data with ``mode``, ``following_purpose`` and
            ``person_id`` columns.
        df_act: actual (HTS) data with ``mode``, ``destination_purpose``
            and ``weight_person`` columns.
        suffix: optional text appended to the plot title and file name.
    """
    # first in the synthetic data
    types = df_syn.groupby(["mode", "following_purpose"])["person_id"].count()
    syn = types / types.sum()

    # then in the actual data
    # NOTE(review): kept in place on purpose; later steps may rely on the
    # corrected spelling in the caller's frame -- confirm before copying.
    df_act.loc[df_act["mode"] == 'car_passanger', "mode"] = 'car_passenger'
    which = ["car", "car_passenger", "pt", "taxi", "walk"]
    # Select the weight column before aggregating so unrelated (possibly
    # non-numeric) columns are never summed.
    weights = df_act.groupby(["mode", "destination_purpose"])[["weight_person"]].sum()
    atypes = weights.loc[which, "weight_person"].reindex(index=which, level=0)
    act = atypes / atypes.sum()

    # One label per observed (mode, purpose) pair, in the same order as the
    # values of `syn`. Building the full product of the index levels would
    # yield more labels than values whenever a pair is unobserved.
    labels = [mode + " " + purpose for mode, purpose in types.index]

    # already ready to plot!
    title_plot = "Synthetic and HTS Mode-Purpose Distribution"
    title_figure = "modepurpose"

    if suffix:
        title_plot += " - " + suffix
        title_figure += "_" + suffix

    title_figure += ".png"

    myplottools.plot_comparison_bar(context, imtitle = title_figure, plottitle = title_plot,
                                    ylabel = "Percentage", xlabel = "", lab = labels,
                                    actual = act.values.tolist(), synthetic = syn.values.tolist(),
                                    t = 10, xticksrot = True )
def activity_counts_comparison(context, all_CC, suffix = None):
    """Plot the share of activity chains by number of activities.

    Every row of ``all_CC`` is assigned to a bin given by the number of
    ``-``-separated entries in its ``Chain`` string minus two (the first
    and last entries are not counted). The chains ``"-"`` and ``"h"`` go
    to bin 0 and everything with 7 or more goes to the ``"7+"`` bin. The
    synthetic and actual counts are summed per bin, converted to
    percentages and passed to ``myplottools.plot_comparison_bar``.

    Bins without any chain are plotted as 0 instead of raising
    ``KeyError``.

    Args:
        context: forwarded unchanged to the plotting helper.
        all_CC: DataFrame with ``Chain``, ``synthetic Count`` and
            ``actual Count`` columns; NaN counts are treated as 0.
        suffix: optional text appended to the plot title and file name.
    """
    max_bin = 7  # chains with this many activities or more share one bar

    counts_dic = {}
    for actchain in all_CC.to_dict('records'):
        chain = actchain["Chain"]
        s = actchain["synthetic Count"]
        a = actchain["actual Count"]
        if np.isnan(s):
            s = 0
        if np.isnan(a):
            a = 0

        if chain in ("-", "h"):
            n_activities = 0
        else:
            n_activities = len(chain.split("-")) - 2
        n_activities = min(n_activities, max_bin)

        totals = counts_dic.setdefault(n_activities, [0, 0])
        totals[0] += s
        totals[1] += a

    # Always emit the bins 0..7; a bin nobody fell into contributes zeros.
    bins = list(range(max_bin + 1))
    per_bin = [counts_dic.get(k, [0, 0]) for k in bins]
    counts = pd.DataFrame(
        {
            "number": [str(k) if k < max_bin else "7+" for k in bins],
            "synthetic Count": [totals[0] for totals in per_bin],
            "actual Count": [totals[1] for totals in per_bin],
        },
        index = bins,
        columns = ["number", "synthetic Count", "actual Count"],
    )

    # Get percentages, prepare for plotting
    counts["synthetic Count"] = counts["synthetic Count"] / counts["synthetic Count"].sum() * 100
    counts["actual Count"] = counts["actual Count"] / counts["actual Count"].sum() * 100

    # First step done: plot activity chain counts
    title_plot = "Synthetic and HTS activity counts comparison"
    title_figure = "activitycounts"
    if suffix:
        title_plot += " - " + suffix
        title_figure += "_" + suffix

    title_figure += ".png"

    myplottools.plot_comparison_bar(context, imtitle = title_figure, plottitle = title_plot,
                                    ylabel = "Percentage", xlabel = "Number of activities in the activity chain",
                                    lab = counts["number"], actual = counts["actual Count"],
                                    synthetic = counts["synthetic Count"])
def activity_chains_comparison(context, all_CC, suffix = None):
    """Plot the percentage of each activity chain, synthetic vs. actual.

    The ``synthetic Count`` and ``actual Count`` columns are each rescaled
    to percentages of their own column total, the rows are sorted by
    decreasing actual share, and the result is passed to
    ``myplottools.plot_comparison_bar``.

    The percentages are computed on a copy, so the caller's ``all_CC``
    keeps its original counts.

    Args:
        context: forwarded unchanged to the plotting helper.
        all_CC: DataFrame with ``Chain``, ``synthetic Count`` and
            ``actual Count`` columns.
        suffix: optional text appended to the plot title and file name.
    """
    # Get percentages, prepare for plotting (on a copy: do not overwrite
    # the counts of the frame the caller handed in)
    shares = all_CC.copy()
    shares["synthetic Count"] = shares["synthetic Count"] / shares["synthetic Count"].sum() * 100
    shares["actual Count"] = shares["actual Count"] / shares["actual Count"].sum() * 100
    shares = shares.sort_values(by=['actual Count'], ascending=False)

    # First step done: plot activity chain counts
    title_plot = "Synthetic and HTS activity chain comparison"
    title_figure = "activitychains"
    if suffix:
        title_plot += " - " + suffix
        title_figure += "_" + suffix

    title_figure += ".png"

    myplottools.plot_comparison_bar(context, imtitle = title_figure, plottitle = title_plot,
                                    ylabel = "Percentage", xlabel = "Activity chain",
                                    lab = shares["Chain"], actual = shares["actual Count"],
                                    synthetic = shares["synthetic Count"])
# Example no. 4
# (stray listing text "0" followed here; kept as a comment so the module parses)
def pipeline_menwomen(df_syn, df_act_trips, df_act_persons, gender, context):
    """Produce the chain, mode-purpose and crowfly-distance plots for a group.

    Three sets of figures are written through ``myplottools``, each with
    ``gender`` in the file name and title:

    1. activity chain percentages (synthetic vs. actual),
    2. the (mode, destination purpose) distribution,
    3. mean crowfly distance per destination purpose, plus histogram and
       CDF plots per purpose and per mode, for distances below 25.

    Side effects: a ``crowfly_distance`` column is written to ``df_syn``
    and to the trip frame returned by
    ``process_actual_activity_chain_counts``.

    Args:
        df_syn: synthetic data; must provide ``mode``,
            ``destination_purpose``, ``person_id`` and a ``geometry``
            accessor with a ``length`` attribute.
        df_act_trips: forwarded to ``process_actual_activity_chain_counts``.
        df_act_persons: forwarded to ``process_actual_activity_chain_counts``.
        gender: text used in file names and plot titles.
        context: forwarded unchanged to the plotting helpers.
    """
    # Comparing men and women activity chains
    CC = process_synthetic_activity_chain_counts(df_syn)

    act_CC, amdf = process_actual_activity_chain_counts(
        df_act_trips, df_act_persons)

    # Merging together
    all_CC = CC.merge(act_CC, on="Chain", how="left")

    # Get percentages, prepare for plotting
    all_CC["synthetic Count"] = (
        all_CC["synthetic Count"] / all_CC["synthetic Count"].sum() * 100)
    all_CC["actual Count"] = (
        all_CC["actual Count"] / all_CC["actual Count"].sum() * 100)
    all_CC = all_CC.sort_values(by=['actual Count'], ascending=False)

    # First step done: plot activity chain counts
    myplottools.plot_comparison_bar(
        context,
        imtitle="activitychains_" + gender + ".png",
        plottitle="Synthetic and HTS activity chain comparison - " + gender,
        ylabel="Percentage",
        xlabel="Activity chain",
        lab=all_CC["Chain"],
        actual=all_CC["actual Count"],
        synthetic=all_CC["synthetic Count"])

    # Second step: mode / purpose distribution
    # first in the synthetic data
    types = df_syn.groupby(["mode", "destination_purpose"])["person_id"].count()
    syn = types / types.sum()

    # then in the actual data; the weight column is selected before
    # aggregating so unrelated columns are never summed
    which = ["car", "car_passenger", "pt", "bike", "walk"]
    weights = amdf.groupby(["mode", "destination_purpose"])[["weight_person"]].sum()
    atypes = weights.loc[which, "weight_person"].reindex(index=which, level=0)
    act = atypes / atypes.sum()

    # One label per observed (mode, purpose) pair, aligned with `syn`; the
    # full product of the index levels would be longer than the values
    # whenever a pair is unobserved.
    labels = [mode + " " + purpose for mode, purpose in types.index]

    # already ready to plot!
    myplottools.plot_comparison_bar(
        context,
        imtitle="modepurpose_" + gender + ".png",
        plottitle="Synthetic and HTS Mode-Purpose Distribution - " + gender,
        ylabel="Percentage",
        xlabel="",
        lab=labels,
        actual=act.values.tolist(),
        synthetic=syn.values.tolist(),
        t=10,
        xticksrot=True)

    # Third step: look into the crowfly distances

    # Compute the distances (scaled by 0.001; the plots label them as km)
    amdf["crowfly_distance"] = 0.001 * np.sqrt(
        (amdf["origin_x"] - amdf["destination_x"])**2 +
        (amdf["origin_y"] - amdf["destination_y"])**2)
    df_syn["crowfly_distance"] = df_syn.geometry.length * 0.001

    # Only consider crowfly distances shorter than 25 km
    df2 = df_syn[df_syn["crowfly_distance"] < 25]
    # Copy the filtered frame: a column is added below, and assigning to a
    # slice would trigger pandas' SettingWithCopyWarning.
    amdf2 = amdf[amdf["crowfly_distance"] < 25].copy()

    # Finish to prepare for plotting: weighted mean distance per purpose
    amdf2["x"] = amdf2["weight_person"] * amdf2["crowfly_distance"]

    act_by_purpose = amdf2.groupby(["destination_purpose"])
    act = act_by_purpose["x"].sum() / act_by_purpose["weight_person"].sum()
    # Select the column before averaging; averaging every column fails on
    # non-numeric ones (e.g. geometry) in recent pandas versions.
    syn = df2.groupby(["destination_purpose"])["crowfly_distance"].mean()

    # Ready to plot!
    myplottools.plot_comparison_bar(context,
                                    imtitle="distancepurpose_" + gender +
                                    ".png",
                                    plottitle="Crowfly distance - " + gender,
                                    ylabel="Mean crowfly distance [km]",
                                    xlabel="",
                                    lab=syn.index,
                                    actual=act,
                                    synthetic=syn,
                                    t=None,
                                    xticksrot=True)

    # Histogram and CDF plots, per purpose and per mode, in the original order
    distribution_plots = [
        (myplottools.plot_comparison_hist_purpose, "distance_purpose_hist_"),
        (myplottools.plot_comparison_hist_mode, "distance_mode_hist_"),
        (myplottools.plot_comparison_cdf_purpose, "distance_purpose_cdf_"),
        (myplottools.plot_comparison_cdf_mode, "distance_mode_cdf_"),
    ]
    for plot, prefix in distribution_plots:
        plot(context,
             prefix + gender + ".png",
             amdf2,
             df2,
             bins=np.linspace(0, 25, 120),
             dpi=300,
             cols=3,
             rows=2)
def _select_sex_age(df, sex, min_age = 18, max_age = 40):
    """Return the rows of ``df`` with the given ``sex`` and ``min_age <= age <= max_age``."""
    df_sex = df[df["sex"] == sex]
    return df_sex[np.logical_and(df_sex["age"] >= min_age,
                                 df_sex["age"] <= max_age)]


def _merged_chain_counts(df_syn, df_syn_no_trip, df_act, df_aux, df_act_no_trip):
    """Build the merged synthetic/actual chain count table, with an extra "h" row from the no-trip frames."""
    syn_CC = myutils.process_synthetic_activity_chain_counts(df_syn)
    syn_CC.loc[len(syn_CC) + 1] = pd.Series({"Chain": "h",
                                             "synthetic Count": df_syn_no_trip.shape[0]})

    act_CC = myutils.process_actual_activity_chain_counts(df_act, df_aux)
    act_CC.loc[len(act_CC) + 1] = pd.Series({"Chain": "h",
                                             "actual Count": np.sum(df_act_no_trip["weight_person"].values.tolist())})

    return pd.merge(syn_CC, act_CC, on = "Chain", how = "left")


def _plot_crowfly_distances(context, df_syn, df_act, imtitle, plottitle, suffix = None):
    """Plot mean crowfly distance per purpose, then the detailed distance plots."""
    df_syn_dist = compute_distances_synthetic(df_syn)
    df_act_dist = compute_distances_actual(df_act)

    # Weighted mean for the actual data, plain mean for the synthetic data.
    # Columns are selected before aggregating so unrelated (possibly
    # non-numeric) columns are never summed or averaged.
    df_act_dist["x"] = df_act_dist["weight_person"] * df_act_dist["crowfly_distance"]
    act_by_purpose = df_act_dist.groupby(["destination_purpose"])
    act = act_by_purpose["x"].sum() / act_by_purpose["weight_person"].sum()
    syn = df_syn_dist.groupby(["following_purpose"])["crowfly_distance"].mean()

    myplottools.plot_comparison_bar(context, imtitle = imtitle, plottitle = plottitle,
                                    ylabel = "Mean crowfly distance [km]", xlabel = "",
                                    lab = syn.index, actual = act, synthetic = syn,
                                    t = None, xticksrot = True )

    # Omit the keyword entirely when there is no suffix so the callee's own
    # default applies.
    if suffix is None:
        all_the_plot_distances(context, df_act_dist, df_syn_dist)
    else:
        all_the_plot_distances(context, df_act_dist, df_syn_dist, suffix = suffix)


def _plot_education_distance_means(context, imtitle, overall, groups):
    """Plot mean home-education distances for the whole population and each group.

    ``overall`` is the ``(syn, act, act_w)`` triple for the whole population;
    ``groups`` yields ``(df_syn_group, df_act_group, suffix, label)`` tuples.
    """
    syn_0, act_0, act_w0 = overall
    syn_means = [np.mean(syn_0)]
    act_means = [np.average(act_0, weights = act_w0)]
    labels = ["All"]

    for df_syn_group, df_act_group, suf, lab in groups:
        syn, act, act_w = compare_dist_educ(context, df_syn_group, df_act_group, suffix = suf)
        syn_means.append(np.average(syn))
        act_means.append(np.average(act, weights = act_w))
        labels.append(lab)

    myplottools.plot_comparison_bar(context, imtitle, "Average distances from home to education",
                                    "Average distance [km]", "Population group",
                                    labels, act_means, syn_means)


def execute(context):
    """Run the full synthetic-vs-actual comparison and write all plots.

    Steps, in order:

    1. activity chains, activity counts and counts per purpose,
    2. mode / purpose distribution,
    3. crowfly distances, and home-education distances for everybody,
    4. steps 1-3 (without the education part) for men and for women aged
       18 to 40,
    5.-7. mean home-education distance by age group, by residence area
       and by sex.

    Args:
        context: passed to the data import functions and to every plotting
            helper.
    """
    # Import data, merging
    df_syn, df_syn_no_trip = import_data_synthetic(context)
    df_act, df_act_no_trip = import_data_actual(context)
    df_aux = aux_data_frame(df_act)

    # 1. ACTIVITY CHAINS
    all_CC = _merged_chain_counts(df_syn, df_syn_no_trip, df_act, df_aux, df_act_no_trip)
    activity_chains_comparison(context, all_CC)

    # Number of activities
    activity_counts_comparison(context, all_CC)

    # Number of activities per purposes
    activity_counts_per_purpose(context, all_CC)

    # 2. MODE AND DESTINATION PURPOSE
    mode_purpose_comparison(context, df_syn, df_act)

    # 3. CROWFLY DISTANCES
    _plot_crowfly_distances(context, df_syn, df_act,
                            imtitle = "distancepurpose.png",
                            plottitle = "Crowfly distance")

    # 3.4 Distance from home to education
    syn_0, act_0, act_w0 = compare_dist_educ(context, df_syn, df_act)
    overall_educ = (syn_0, act_0, act_w0)

    # 4. Do the same for men and women separated, aged 18 to 40

    # 4.1 Create the dataframes
    df_syn_men = _select_sex_age(df_syn, "male")
    df_syn_no_trip_men = _select_sex_age(df_syn_no_trip, "male")

    df_syn_women = _select_sex_age(df_syn, "female")
    df_syn_no_trip_women = _select_sex_age(df_syn_no_trip, "female")

    df_act_men = _select_sex_age(df_act, "male")
    df_aux_men = aux_data_frame(df_act_men)
    df_act_no_trip_men = _select_sex_age(df_act_no_trip, "male")

    df_act_women = _select_sex_age(df_act, "female")
    df_aux_women = aux_data_frame(df_act_women)
    df_act_no_trip_women = _select_sex_age(df_act_no_trip, "female")

    # 4.2 Activity chains
    M_all_CC = _merged_chain_counts(df_syn_men, df_syn_no_trip_men,
                                    df_act_men, df_aux_men, df_act_no_trip_men)
    W_all_CC = _merged_chain_counts(df_syn_women, df_syn_no_trip_women,
                                    df_act_women, df_aux_women, df_act_no_trip_women)

    activity_chains_comparison(context, M_all_CC, "men")
    activity_chains_comparison(context, W_all_CC, "women")

    activity_counts_comparison(context, M_all_CC, "men")
    activity_counts_comparison(context, W_all_CC, "women")

    activity_counts_per_purpose(context, M_all_CC, "men")
    activity_counts_per_purpose(context, W_all_CC, "women")

    # 4.3 Mode-purpose comparison
    mode_purpose_comparison(context, df_syn_men, df_act_men, "men")
    mode_purpose_comparison(context, df_syn_women, df_act_women, "women")

    # 4.4 Distance-purpose comparison
    # Each plot is labelled with its own group's index (the women's plot
    # previously reused the men's index as labels).
    _plot_crowfly_distances(context, df_syn_men, df_act_men,
                            imtitle = "distancepurpose_men.png",
                            plottitle = "Crowfly distances - men",
                            suffix = "men")
    _plot_crowfly_distances(context, df_syn_women, df_act_women,
                            imtitle = "distancepurpose_women.png",
                            plottitle = "Crowfly distances - women",
                            suffix = "women")

    # 5 Distance from home to education according to age
    ages = [[0, 14], [15, 18], [19, 24], [25, 1000]]
    # Generators keep only one filtered pair of frames alive at a time.
    age_groups = (
        (
            df_syn[np.logical_and(df_syn["age"] >= lo, df_syn["age"] <= hi)],
            df_act[np.logical_and(df_act["age"] >= lo, df_act["age"] <= hi)],
            "aged " + str(lo) + " to " + str(hi),
            # the open-ended last group gets a dedicated label
            "25 +" if hi == 1000 else str(lo) + " to " + str(hi) + " y. o.",
        )
        for lo, hi in ages
    )
    _plot_education_distance_means(context, "avdisthomeeduc - age.png",
                                   overall_educ, age_groups)

    # 6. Distance from home to education according to residence area
    # residence_area_index -> (suffix tail, bar label)
    area_names = {
        1: (" rest of the state", "rest of the state"),
        2: (" city", "city of Sao Paulo"),
        3: (" downtown", "downtown"),
    }
    area_groups = (
        (
            df_syn[df_syn["residence_area_index"] == area],
            df_act[df_act["residence_area_index"] == area],
            "agents living in " + suffix_tail,
            lab,
        )
        for area, (suffix_tail, lab) in area_names.items()
    )
    _plot_education_distance_means(context, "avdisthomeeduc - area.png",
                                   overall_educ, area_groups)

    # 7. Distance from home to education according to gender
    genders = ["male", "female"]
    gender_groups = (
        (
            df_syn[df_syn["sex"] == gender],
            df_act[df_act["sex"] == gender],
            gender,
            gender,
        )
        for gender in genders
    )
    _plot_education_distance_means(context, "avdisthomeeduc - gender.png",
                                   overall_educ, gender_groups)
def activity_counts_per_purpose(context, all_CC, suffix = None):
    """Plot how often each purpose is repeated within an activity chain.

    For every chain other than ``"-"`` and ``"h"``, the entries between the
    first and the last one are inspected. For each purpose that occurs at
    least once, the chain's synthetic and actual counts are added to a
    bucket named ``"<purpose> - <n> time(s)"``. Three or more repetitions,
    and two repetitions of any purpose other than ``h``, ``w`` or ``e``,
    go to a shared ``"Other"`` bucket. The buckets are converted to
    percentages, sorted by decreasing actual share with ``"Other"`` moved
    to the end, and passed to ``myplottools.plot_comparison_bar``.

    The function also works when no chain produced an ``"Other"`` bucket,
    which previously raised ``KeyError``.

    Args:
        context: forwarded unchanged to the plotting helper.
        all_CC: DataFrame with ``Chain``, ``synthetic Count`` and
            ``actual Count`` columns; NaN counts are treated as 0.
        suffix: optional text appended to the plot title and file name.
    """
    purposes = ['h', 'w', 'e', 's', 'l', 'o']
    # purposes for which a double occurrence keeps its own bucket
    own_bucket_when_twice = ('h', 'w', 'e')
    other = "Other"

    counts_dic = {}
    for actchain in all_CC.to_dict('records'):
        chain = actchain["Chain"]
        s = actchain["synthetic Count"]
        a = actchain["actual Count"]
        if np.isnan(s):
            s = 0
        if np.isnan(a):
            a = 0
        if chain == "-" or chain == "h":
            continue

        # drop the first and the last entry of the chain
        inner = chain.split("-")[1:-1]
        for p in purposes:
            cpt_purpose = inner.count(p)
            if cpt_purpose == 0:
                continue
            if cpt_purpose >= 3 or (cpt_purpose == 2 and p not in own_bucket_when_twice):
                identifier = other
            else:
                identifier = p + " - " + str(cpt_purpose)
                identifier += " times" if cpt_purpose > 1 else " time"

            totals = counts_dic.setdefault(identifier, [0, 0])
            totals[0] += s
            totals[1] += a

    identifiers = list(counts_dic)
    counts = pd.DataFrame(
        {
            "number": identifiers,
            "synthetic Count": [counts_dic[k][0] for k in identifiers],
            "actual Count": [counts_dic[k][1] for k in identifiers],
        },
        index = identifiers,
        columns = ["number", "synthetic Count", "actual Count"],
    )

    # Get percentages, prepare for plotting
    counts["synthetic Count"] = counts["synthetic Count"] / counts["synthetic Count"].sum() * 100
    counts["actual Count"] = counts["actual Count"] / counts["actual Count"].sum() * 100
    counts = counts.sort_values(by=['actual Count'], ascending=False)

    # Keep the catch-all bucket as the last bar, if it exists at all
    if other in counts.index:
        idx = counts.index.drop(other).tolist() + [other]
        counts = counts.reindex(idx)

    # First step done: plot activity chain counts
    title_plot = "Activity counts per purpose comparison"
    title_figure = "activitycountspurpose"
    if suffix:
        title_plot += " - " + suffix
        title_figure += "_" + suffix

    title_figure += ".png"

    myplottools.plot_comparison_bar(context, imtitle = title_figure, plottitle = title_plot,
                                    ylabel = "Percentage", xlabel = "Activities with the same purpose in the activity chain",
                                    lab = counts["number"], actual = counts["actual Count"],
                                    synthetic = counts["synthetic Count"], t = 20)