Exemple #1
0
def execute(context):
    """Plot the mean matching rate per variable together with its range.

    Every entry of the ``"data"`` stage is divided by entry ``0``; the mean
    of that ratio is drawn as a bar and its minimum and maximum as a vertical
    line. The stage must provide every integer key from 0 up to its largest
    key. The figure is saved as ``matching_rate.pdf`` below
    ``context.path()``.
    """
    data = context.stage("data")
    variables = max(data.keys()) + 1

    # Compute each ratio once and derive all three statistics from it. The
    # names deliberately avoid ``min``/``max``: binding those anywhere in this
    # function turns them into local names, which makes the ``max(...)`` call
    # above fail with UnboundLocalError.
    means, lows, highs = [], [], []

    for v in range(variables):
        ratio = data[v] / data[0]
        means.append(np.mean(ratio))
        lows.append(np.min(ratio))
        highs.append(np.max(ratio))

    # Prepare plot
    plotting.setup()

    plt.figure()
    plt.bar(range(variables), means, color=plotting.COLORS["synthetic"])

    # NOTE(review): the lines span min..max although the label says
    # "90% Conf."; no legend is drawn here, so the label is never displayed.
    for v, low, high in zip(range(variables), lows, highs):
        plt.plot([v, v], [low, high],
                 linewidth=1,
                 label="90% Conf.",
                 color="k")

    plt.xlabel("Variables")
    plt.ylabel("Matching rate")

    # Ticks every 0.2, rendered as percentages.
    plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 0.2))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%d%%" % (100 * x, )))

    plt.tight_layout()
    plt.savefig("%s/matching_rate.pdf" % context.path())
    # Release the figure so it does not stay registered with pyplot.
    plt.close()
def execute(context):
    """Plot the mean matching rate per variable with its 10%-90% band.

    ``ESTIMATION_SAMPLES`` positions are drawn with ``np.random.randint``
    (positions may repeat) and applied to every entry of the ``"data"``
    stage. Each sampled entry is divided by the sampled entry ``0``; the mean
    of that ratio is drawn as a bar and its 10th/90th percentiles as a
    vertical line. The figure is saved as ``matching_rate.pdf`` below
    ``context.path()``.
    """
    data = context.stage("data")
    variables = max(data.keys()) + 1

    indices = np.random.randint(0, len(data[0]), size=ESTIMATION_SAMPLES)
    reference = data[0][indices]

    # Build the sampled ratio once per variable instead of once per statistic.
    means, q10s, q90s = [], [], []

    for v in range(variables):
        ratio = data[v][indices] / reference
        means.append(np.mean(ratio))
        q10s.append(np.percentile(ratio, 10))
        q90s.append(np.percentile(ratio, 90))

    # Prepare plot
    plotting.setup()

    plt.figure()
    plt.bar(range(variables), means, color=plotting.COLORS["synthetic"])

    for v, q10, q90 in zip(range(variables), q10s, q90s):
        plt.plot([v, v], [q10, q90],
                 linewidth=1,
                 label="90% Conf.",
                 color="k")

    plt.xlabel("Variables")
    plt.ylabel("Matching rate")

    # Ticks every 0.2, rendered as percentages.
    plt.gca().yaxis.set_major_locator(tck.FixedLocator(np.arange(100) * 0.2))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%d%%" % (100 * x, )))

    plt.tight_layout()
    plt.savefig("%s/matching_rate.pdf" % context.path())
    # Release the figure so it does not stay registered with pyplot.
    plt.close()
def execute(context):
    """Plot ``get_error_probability`` over a weight grid for three ``s`` values.

    The grid holds 10000 evenly spaced weights between 0 and 2000. One dotted
    series is drawn per value of ``s`` (0.01, 0.1, 0.25) with ``q`` fixed at
    0.01; both are passed through to ``get_error_probability``, which is
    defined outside this block. The figure is saved as ``sampling_error.pdf``
    below ``context.path()``.
    """
    plotting.setup()

    q = 0.01

    # The grid does not depend on ``s``, so build it once for all series.
    ws = np.linspace(0, 2000, 10000)

    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    for s, color in zip([0.01, 0.1, 0.25], ["#000000", "#777777", "#cccccc"]):
        probs = get_error_probability(ws, s, q)
        plt.plot(ws, probs, ".", label="s = %.2f" % s, color=color,
                 markersize=2)

    plt.legend(loc="best")
    plt.grid()
    plt.xlabel("Reference weight")
    plt.ylabel("Probability")
    plt.tight_layout()

    plt.savefig("%s/sampling_error.pdf" % context.path())
    # Release the figure so it does not stay registered with pyplot.
    plt.close()
Exemple #4
0
def execute(context):
    """Plot trip-distance and age distributions of the compared surveys.

    Reads the ``"data.hts.comparison"`` stage and writes two figures below
    ``context.path()``:

    * ``distance_distribution.pdf`` - trip weight per distance class for the
      rows tagged ``"entd"`` and ``"egt"``.
    * ``age_distribution.pdf`` - person weight per age class for the rows
      tagged ``"census"``, ``"entd"`` and ``"egt"``, with two annotated
      arrows ("A" and "B").

    Weights are divided by 1e6 before plotting.
    """
    plotting.setup()

    hts_comparison = context.stage("data.hts.comparison")

    # Styling shared by every bar series in both figures.
    bar_style = dict(align="edge", linewidth=0.5, edgecolor="white")

    # Distance distribution plot
    df_distance = hts_comparison["distance_distribution"]

    df_entd = df_distance[df_distance["hts"] == "entd"]
    df_egt = df_distance[df_distance["hts"] == "egt"]

    plt.figure()

    plt.bar(df_entd["distance_class"].values,
            df_entd["trip_weight"].values / 1e6,
            width=0.4,
            label="ENTD (Routed)",
            color=plotting.COLORS["entd"],
            **bar_style)
    # Second series is shifted by one bar width so both sit side by side.
    plt.bar(df_egt["distance_class"].values + 0.4,
            df_egt["trip_weight"].values / 1e6,
            width=0.4,
            label="EGT (Euclidean)",
            color=plotting.COLORS["egt"],
            **bar_style)

    plt.gca().xaxis.set_major_locator(
        tck.FixedLocator(np.arange(0, 10, 2) + 0.4))
    plt.gca().xaxis.set_major_formatter(
        tck.FixedFormatter(["<%dkm" % d for d in np.arange(1, 10, 2)]))

    plt.gca().annotate(
        r"≥10 km",
        xy=(10.0, 8.0), xycoords='data', ha="right"
    )

    plt.grid()
    plt.gca().set_axisbelow(True)
    # Hide the vertical grid lines; only the horizontal ones remain visible.
    plt.gca().xaxis.grid(alpha=0.0)

    plt.xlabel("Trip distance")
    plt.ylabel("Number of trips [$10^6$]")

    plt.legend()

    plt.tight_layout()
    plt.savefig("%s/distance_distribution.pdf" % context.path())
    plt.close()

    # HTS Age distribution plot
    df_age = hts_comparison["age_distribution"]

    df_age_census = df_age[df_age["hts"] == "census"]
    df_age_entd = df_age[df_age["hts"] == "entd"]
    df_age_egt = df_age[df_age["hts"] == "egt"]

    plt.figure()

    # Three series per age class, each a quarter unit wide.
    plt.bar(df_age_census["age_class"].values,
            df_age_census["person_weight"].values / 1e6,
            width=0.25,
            label="Census",
            color=plotting.COLORS["census"],
            **bar_style)
    plt.bar(df_age_entd["age_class"].values + 0.25,
            df_age_entd["person_weight"].values / 1e6,
            width=0.25,
            label="ENTD",
            color=plotting.COLORS["entd"],
            **bar_style)
    plt.bar(df_age_egt["age_class"].values + 0.5,
            df_age_egt["person_weight"].values / 1e6,
            width=0.25,
            label="EGT",
            color=plotting.COLORS["egt"],
            **bar_style)

    # Ticks are centred below each group of three bars. Only one formatter is
    # installed: an earlier decade-based formatter was overwritten right away
    # by the age-bound labels and has been dropped.
    age_labels = ["<15", "15-29", "30-44", "45-59", "60-74", ">75"]
    plt.gca().xaxis.set_major_locator(
        tck.FixedLocator(np.arange(1000) + 0.75 / 2))
    plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(age_labels))

    # Both annotations share the arrow and the fully transparent text box.
    arrow_style = {"arrowstyle": "-|>", "facecolor": "black", "linewidth": 0.5}
    box_style = {"pad": 0.0, "linewidth": 0.0,
                 "facecolor": (1.0, 0.0, 0.0, 0.0)}

    plt.gca().annotate(
        "A",
        xy=(1.5 + 0.5 * 0.25, 2.0), xycoords='data',
        xytext=(1.5 + 0.5 * 0.25, 2.35), textcoords='data',
        arrowprops=dict(arrow_style),
        bbox=dict(box_style),
        ha='center'
    )

    plt.gca().annotate(
        "B",
        xy=(4.25 + 0.5 * 0.25, 1.3), xycoords='data',
        xytext=(4.25 + 0.5 * 0.25, 1.65), textcoords='data',
        arrowprops=dict(arrow_style),
        bbox=dict(box_style),
        ha='center'
    )

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.gca().xaxis.grid(alpha=0.0)

    plt.xlabel("Age")
    plt.ylabel("Number of persons [x$10^6$]")

    plt.legend()

    plt.tight_layout()
    plt.savefig("%s/age_distribution.pdf" % context.path())
    plt.close()
Exemple #5
0
def _load_scaled_flows(context, df_census, df_correction, part):
    """Return the flows of ``part["slot"]`` joined with their scaled reference.

    The flows from the ``"data"`` stage are merged with the census weights
    (renamed to ``reference``) on ``home`` and the slot column, then with the
    correction table on ``home``. ``scaled_reference`` is the reference
    multiplied by ``part["factor"]`` when the part defines one, and by the
    merged ``factor`` column otherwise.
    """
    slot = part["slot"]

    df = context.stage("data")[slot]
    df = pd.merge(df,
                  df_census[slot].rename(columns={"weight": "reference"}),
                  on=["home", slot])
    df = pd.merge(df, df_correction[slot], on="home")

    factor = part["factor"] if "factor" in part else df["factor"]
    df["scaled_reference"] = df["reference"] * factor

    return df


def execute(context):
    """Plot commute flows against the scaled census reference.

    Three figures are written below ``context.path()``:

    * ``commute_flows.pdf`` - bars for the largest flows (by scaled
      reference) of the work and education slots, with a q5-q95 line on the
      synthetic bars.
    * ``commute_scatter.pdf`` - log-log scatter of synthetic versus scaled
      reference flow with a +/-20% band.
    * ``commute_flow_boxplot.pdf`` - relative error per slot with its 5%/95%
      quantiles, mean and all individual points.

    Synthetic values are divided by ``SAMPLING_RATE`` before plotting.
    """
    plotting.setup()

    df_census = context.stage("census")
    # Only the correction table of the "hts" stage is needed here.
    _, df_correction = context.stage("hts")

    # PLOT: Work / education flows
    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    figures = [{
        "slot": "work",
        "title": "Work",
        "top": 12
    }, {
        "slot": "education",
        "title": "Education",
        "top": 12,
        "factor": 0.7
    }]

    bar_style = dict(width=0.4, align="edge", linewidth=0.5,
                     edgecolor="white")

    for index, figure in enumerate(figures):
        plt.subplot(1, 2, index + 1)

        df = _load_scaled_flows(context, df_census, df_correction, figure)
        df = df.sort_values(by="scaled_reference",
                            ascending=False).head(figure["top"])

        # Position the bars by the rows actually selected, so a slot with
        # fewer than ``top`` flows still plots instead of failing on a
        # length mismatch.
        positions = np.arange(len(df))

        # Unscaled reference, drawn faintly behind the scaled one.
        plt.bar(positions,
                df["reference"],
                color=plotting.COLORS["census"],
                alpha=0.25,
                **bar_style)
        plt.bar(positions,
                df["scaled_reference"],
                label="Census",
                color=plotting.COLORS["census"],
                **bar_style)
        plt.bar(positions + 0.4,
                df["mean"] / SAMPLING_RATE,
                label="Synthetic",
                color=plotting.COLORS["synthetic"],
                **bar_style)

        # A dedicated loop variable keeps the subplot ``index`` intact.
        for position, (q5, q95) in enumerate(
                zip(df["q5"].values, df["q95"].values)):
            center = position + (0.4 + 0.2)  # middle of the synthetic bar
            plt.plot([center, center],
                     [q5 / SAMPLING_RATE, q95 / SAMPLING_RATE],
                     color='k',
                     linewidth=1.0)

        plt.grid()
        plt.gca().set_axisbelow(True)
        plt.gca().xaxis.grid(alpha=0.0)

        # Ticks every 1e5, labelled in thousands.
        plt.gca().yaxis.set_major_locator(
            tck.FixedLocator(np.arange(100) * 1e5))
        plt.gca().yaxis.set_major_formatter(
            tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3, )))

        origins, destinations = df["home"].values, df[figure["slot"]].values

        plt.gca().xaxis.set_major_locator(tck.FixedLocator(positions + 0.4))
        plt.gca().xaxis.set_major_formatter(
            tck.FixedFormatter(
                ["%s\n%s" % item for item in zip(origins, destinations)]))

        plt.ylabel("Commuters [x1000]")
        plt.legend(loc="best")
        plt.title(figure["title"])

    plt.tight_layout()
    plt.savefig("%s/commute_flows.pdf" % context.path())
    plt.close()

    # PLOT: Scatter
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    parts = [{
        "slot": "work",
        "title": "Work",
        "marker": ".",
        "color": "k"
    }, {
        "slot": "education",
        "title": "Education",
        "factor": 0.7,
        "marker": ".",
        "color": plotting.COLORS["egt"]
    }]

    # Axis limits cover all parts with a 10% margin on either side.
    minimum = np.inf
    maximum = -np.inf

    for part in parts:
        df = _load_scaled_flows(context, df_census, df_correction, part)

        plt.loglog(df["scaled_reference"],
                   df["mean"] / SAMPLING_RATE,
                   markersize=2,
                   marker=part["marker"],
                   color=part["color"],
                   linestyle="none",
                   label=part["title"])

        minimum = min(minimum, df["scaled_reference"].min() * 0.9)
        maximum = max(maximum, df["scaled_reference"].max() * 1.1)

    x = np.linspace(minimum, maximum, 100)
    plt.fill_between(x,
                     x * 0.8,
                     x * 1.2,
                     color="k",
                     alpha=0.2,
                     linewidth=0.0,
                     label=r"20% Error")

    plt.xlim([minimum, maximum])
    plt.ylim([minimum, maximum])

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.legend()

    plt.xlabel("Reference flow")
    plt.ylabel("Synthetic flow")

    plt.tight_layout()
    plt.savefig("%s/commute_scatter.pdf" % context.path())
    plt.close()

    # PLOT: Histogram
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    parts = [{
        "slot": "work",
        "title": "Work"
    }, {
        "slot": "education",
        "title": "Education",
        "factor": 0.7
    }]

    box_style = dict(color="k", linewidth=1.0)

    for index, part in enumerate(parts):
        df = _load_scaled_flows(context, df_census, df_correction, part)

        # Relative deviation of the synthetic flow from the reference, in %.
        df["difference"] = 100 * (
            df["mean"] / SAMPLING_RATE -
            df["scaled_reference"]) / df["scaled_reference"]

        q5 = df["difference"].quantile(0.05)
        q95 = df["difference"].quantile(0.95)
        mean = df["difference"].mean()

        left, right = index - 0.2, index + 0.2

        # Hand-drawn box: bottom, top, dotted mean, then the two sides.
        plt.plot([left, right], [q5, q5], **box_style)
        plt.plot([left, right], [q95, q95], **box_style)
        plt.plot([left, right], [mean, mean], linestyle=":", **box_style)
        plt.plot([left, left], [q5, q95], **box_style)
        plt.plot([right, right], [q5, q95], **box_style)

        # Every point is drawn, not only those outside the 5%-95% box.
        values = df["difference"].values
        plt.plot([index] * len(values),
                 values,
                 color="k",
                 marker=".",
                 markersize=2,
                 linestyle="none")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([0, 1]))
    plt.gca().xaxis.set_major_formatter(
        tck.FixedFormatter(["Work", "Education"]))

    plt.ylabel("Error [%]")

    plt.xlim([-0.5, 1.5])
    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.gca().xaxis.grid(alpha=0.0)

    # Invisible artists that only exist to create the legend entries.
    plt.bar([np.nan], [np.nan],
            color="none",
            edgecolor="k",
            linewidth=1.0,
            label="5% - 95%")
    plt.plot([np.nan], color="k", linestyle=":", label="Mean")

    plt.legend(loc="best")

    plt.tight_layout()
    plt.savefig("%s/commute_flow_boxplot.pdf" % context.path())
    plt.close()
Exemple #6
0
def execute(context):
    """Plot the ten most frequent activity chains for men and women.

    The ``("age_range", "sex", "chain")`` marginal of the ``"data"`` stage is
    merged with the same marginal of the HTS chain reference (its ``weight``
    renamed to ``reference``) and reduced to the rows whose ``age_range``
    column is true. For each sex, the ten rows with the largest reference
    are shown as reference/synthetic bar pairs with a min-max line on the
    synthetic bar. Synthetic values are divided by ``SAMPLING_RATE``. The
    figure is saved as ``activity_chains.pdf`` below ``context.path()``.
    """
    plotting.setup()

    reference = context.stage("analysis.reference.hts.chains")
    data = context.stage("data")

    # PLOT: Activity chains by sex

    marginal = ("age_range", "sex", "chain")
    df = pd.merge(data[marginal],
                  reference[marginal].rename(columns={"weight": "reference"}))
    # NOTE(review): ``age_range`` is used as a boolean mask; the panel titles
    # suggest it marks ages 18-40 - confirm against the producing stage.
    df = df[df["age_range"]]

    df_female = df[df["sex"] == "female"].sort_values(by="reference",
                                                      ascending=False).head(10)
    df_male = df[df["sex"] == "male"].sort_values(by="reference",
                                                  ascending=False).head(10)

    plt.figure(figsize=plotting.WIDE_FIGSIZE)
    hts_name = context.config("hts")

    bar_style = dict(width=0.4, align="edge", linewidth=0.5,
                     edgecolor="white")

    for index, (df_sex, title) in enumerate(
            zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"])):
        plt.subplot(1, 2, index + 1)

        plt.bar(np.arange(10),
                df_sex["reference"],
                label="HTS",
                color=plotting.COLORS[hts_name],
                **bar_style)
        plt.bar(np.arange(10) + 0.4,
                df_sex["mean"] / SAMPLING_RATE,
                label="Synthetic",
                color=plotting.COLORS["synthetic"],
                **bar_style)

        # ``low``/``high`` rather than ``min``/``max`` to keep the builtins
        # usable inside this function.
        for location, (low, high) in enumerate(
                zip(df_sex["min"].values, df_sex["max"].values)):
            center = location + (0.4 + 0.2)  # middle of the synthetic bar
            plt.plot([center, center],
                     [low / SAMPLING_RATE, high / SAMPLING_RATE],
                     "k",
                     linewidth=1)

        plt.grid()
        plt.gca().set_axisbelow(True)
        plt.gca().xaxis.grid(alpha=0.0)

        if hts_name == "egt":
            plt.ylim([0, 3.5e5])
        else:
            plt.ylim([0, 5e5])

        # Invisible line that only provides the "Range" legend entry.
        plt.plot([np.nan], color="k", linewidth=1, label="Range")

        # Ticks every 1e5, labelled in thousands.
        plt.gca().yaxis.set_major_locator(
            tck.FixedLocator(np.arange(100) * 1e5))
        plt.gca().yaxis.set_major_formatter(
            tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3, )))

        plt.gca().xaxis.set_major_locator(
            tck.FixedLocator(np.arange(10) + 0.4))
        # The formatter runs when the figure is rendered, i.e. after this
        # loop has finished. Binding the chains as a default argument pins
        # this panel's labels; a plain closure over the loop variable would
        # make both panels show the chains of the last one.
        plt.gca().xaxis.set_major_formatter(
            tck.FuncFormatter(
                lambda x, p, chains=df_sex["chain"].values: "\n".join(
                    chains[p]).upper()))

        if index == 1:
            # The right panel shares the scale, so its tick labels are blank.
            plt.gca().yaxis.set_major_formatter(tck.FixedFormatter([""] *
                                                                   1000))
            plt.gca().yaxis.get_label().set_visible(False)

        # Reorder the last three legend entries.
        handles, labels = plt.gca().get_legend_handles_labels()
        handles = [handles[-2], handles[-1], handles[-3]]
        labels = [labels[-2], labels[-1], labels[-3]]
        plt.legend(handles=handles, labels=labels, loc="best", title=title)

        if index == 0:
            plt.ylabel("Number of persons [x1000]")

    plt.tight_layout()
    plt.savefig("%s/activity_chains.pdf" % context.path())
    plt.close()
def execute(context):
    """Plot the input distance distributions and the resulting distance CDFs.

    Two figures are written below ``context.path()``:

    * ``input_distributions.pdf`` - per mode, the weighted mean of each
      distribution against the bound values, with a q10-q90 band for the
      ``car`` and ``pt`` modes.
    * ``distance_distributions.pdf`` - reference and synthetic cumulative
      distributions for ``car`` and for ``bike`` (or ``walk`` when the
      reference has no ``bike`` entry).

    Axis limits depend on whether the ``"hts"`` config value is ``"egt"``.
    """
    plotting.setup()
    hts_name = context.config("hts")

    # PLOT: Input distributions
    distributions = context.stage(
        "synthesis.population.spatial.secondary.distance_distributions")
    # Fetched once; it provides the mode list here and the curves below.
    reference_data = context.stage("analysis.reference.hts.mode_distances")

    plt.figure()

    modes = list(reference_data.keys())

    for index, mode in enumerate(modes):
        mode_distribution = distributions[mode]

        # Work on a copy: the object returned by the stage may be shared with
        # other consumers, and capping the non-finite bounds in place would
        # silently change it for them.
        bounds = np.array(mode_distribution["bounds"])
        bounds[~np.isfinite(bounds)] = 6 * 3600

        # Every curve starts at the origin.
        xs = [0.0] + list(bounds)
        means = [0.0]
        q10 = [0.0]
        q90 = [0.0]

        for distribution in mode_distribution["distributions"]:
            weights = distribution["weights"] / np.sum(distribution["weights"])
            means.append(np.sum(weights * distribution["values"]))

            # First value whose cdf reaches the requested quantile.
            q10.append(distribution["values"][np.count_nonzero(
                distribution["cdf"] < 0.1)])
            q90.append(distribution["values"][np.count_nonzero(
                distribution["cdf"] < 0.9)])

        if mode in ("car", "pt"):
            plt.fill_between(xs,
                             q10,
                             q90,
                             color=plotting.COLORSET5[index],
                             alpha=0.25,
                             linewidth=0.0)

        plt.plot(xs,
                 means,
                 label="%s (%d)" % (plotting.MODE_LABELS[mode], len(bounds)),
                 linewidth=1.0,
                 marker=".",
                 markersize=3,
                 color=plotting.COLORSET5[index])

    # x ticks every 20 * 60 units, labelled after division by 60.
    plt.gca().xaxis.set_major_locator(
        tck.FixedLocator(np.arange(100) * 60 * 20))
    plt.gca().xaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: str(x // 60)))

    # y ticks every 5 * 1000 units, labelled after division by 1000.
    plt.gca().yaxis.set_major_locator(
        tck.FixedLocator(np.arange(100) * 5 * 1000))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: str(x // 1000)))

    plt.legend(loc="upper left")
    plt.xlim([0, 90 * 60 if hts_name == "egt" else 50 * 60])
    plt.ylim([0, 45 * 1000 if hts_name == "egt" else 25 * 1000])

    plt.grid()

    plt.xlabel("Travel time [min]")
    plt.ylabel("Euclidean distance [km]")

    plt.tight_layout()
    plt.savefig("%s/input_distributions.pdf" % context.path())
    plt.close()

    # PLOT: Distance distributions
    df_synthetic = context.stage("analysis.synthesis.mode_distances")

    plt.figure(figsize=(6.0, 2.5), dpi=100)  # 2.5 * 2.5

    # Upper x limit per mode, in the same unit as the distance values.
    limits = dict(car=20 * 1e3,
                  car_passenger=20 * 1e3,
                  pt=20 * 1e3,
                  bike=6 * 1e3,
                  walk=1 * 1e3)

    cdf_modes = ["car", "bike" if "bike" in modes else "walk"]

    for index, mode in enumerate(cdf_modes):
        plt.subplot(1, 2, index + 1)

        mode_reference = reference_data[mode]
        plt.plot(mode_reference["values"] * 1e-3,
                 mode_reference["cdf"],
                 linestyle='--',
                 color="k",
                 linewidth=1.0,
                 label="HTS")

        df_mode = df_synthetic[df_synthetic["mode"] == mode]
        plt.fill_betweenx(df_mode["cdf"],
                          df_mode["q5"] * 1e-3,
                          df_mode["q95"] * 1e-3,
                          linewidth=0.0,
                          color=plotting.COLORS[hts_name],
                          alpha=0.25,
                          label="90% Conf.")
        plt.plot(df_mode["mean"] * 1e-3,
                 df_mode["cdf"],
                 color=plotting.COLORS[hts_name],
                 linewidth=1.0,
                 label="Synthetic")

        plt.xlim([0, limits[mode] * 1e-3])
        plt.ylim([0, 1])

        plt.title(plotting.MODE_LABELS[mode], fontsize=plotting.FONT_SIZE)
        plt.xlabel("Euclidean distance [km]")
        plt.grid()

        # y label on the left panel, legend on the right one.
        if index % 2 == 0:
            plt.ylabel("Cumulative density")

        if index % 2 == 1:
            plt.legend(loc="best")

    plt.tight_layout()
    plt.savefig("%s/distance_distributions.pdf" % context.path())
    plt.close()
Exemple #8
0
def execute(context):
    """Plot person- and household-level marginals against their references.

    For each level, ``prepare_data``, ``reweight_hts`` and ``add_labels``
    (all defined outside this block) build a table with one row per plotted
    bar pair. Rows whose ``reference_source`` is ``"census"`` or ``"hts"``
    are drawn as horizontal reference/synthetic bar pairs, with a min-max
    line on the synthetic bar. One figure per level is saved as
    ``<level>.pdf`` below ``context.path()``.
    """
    plotting.setup()

    hts = context.stage("analysis.reference.hts.sociodemographics")
    census = context.stage("analysis.reference.census.sociodemographics")
    data = context.stage("data")

    # Looked up once; it does not change between figures.
    hts_name = context.config("hts")

    figures = [
        dict(level="person",
             label="Number of persons",
             size=(6.0, 5.0),
             marginals=[
                 "age_class", "sex", "employed", "studies", "has_license",
                 "has_pt_subscription", "socioprofessional_class"
             ]),
        dict(level="household",
             label="Number of households",
             size=plotting.WIDE_FIGSIZE,
             marginals=[
                 "household_size_class", "number_of_vehicles_class",
                 "number_of_bikes_class"
             ])
    ]

    bar_style = dict(height=0.4, align="edge", linewidth=0.5,
                     edgecolor="white")

    for figure in figures:
        plt.figure(figsize=figure["size"])

        df_figure = prepare_data(data, hts, census, figure["level"],
                                 figure["marginals"], SAMPLING_RATE)

        reweight_hts(df_figure, hts, census, figure["level"])
        add_labels(df_figure)

        locations = np.arange(len(df_figure))

        # Rows compared against the census.
        f = (df_figure["reference_source"] == "census").values
        plt.barh(locations[f],
                 df_figure["reference"].values[f],
                 label="Census",
                 color=plotting.COLORS["census"],
                 **bar_style)
        plt.barh(locations[f] + 0.4,
                 df_figure["mean"].values[f],
                 label="Synthetic",
                 color=plotting.COLORS["synthetic"],
                 **bar_style)

        # Rows compared against the HTS. The synthetic bars carry no label
        # here so "Synthetic" appears only once in the legend.
        f = (df_figure["reference_source"] == "hts").values
        plt.barh(locations[f],
                 df_figure["reference"].values[f],
                 label="HTS",
                 color=plotting.COLORS[hts_name],
                 **bar_style)
        plt.barh(locations[f] + 0.4,
                 df_figure["mean"].values[f],
                 label=None,
                 color=plotting.COLORS["synthetic"],
                 **bar_style)

        # ``low``/``high`` rather than ``min``/``max`` to keep the builtins
        # usable inside this function.
        for index, (low, high) in enumerate(
                zip(df_figure["min"].values, df_figure["max"].values)):
            location = index + 0.4 + 0.2  # middle of the synthetic bar
            plt.plot([low, high], [location, location],
                     "k",
                     linewidth=1,
                     label="Range")

        plt.gca().yaxis.set_major_locator(tck.FixedLocator(locations + 0.4))
        plt.gca().yaxis.set_major_formatter(
            tck.FixedFormatter(df_figure["label"].values))

        if figure["level"] == "person":
            # Ticks every 2e6, labelled in whole millions.
            plt.gca().xaxis.set_major_locator(
                tck.FixedLocator(np.arange(1, 100) * 1e6 * 2))
            plt.gca().xaxis.set_major_formatter(
                tck.FuncFormatter(lambda x, p: "%dM" % (x / 1e6, )))
        elif figure["level"] == "household":
            # Ticks every 0.5e6, labelled in millions with one decimal.
            plt.gca().xaxis.set_major_locator(
                tck.FixedLocator(np.arange(1, 100) * 1e6 * 0.5))
            plt.gca().xaxis.set_major_formatter(
                tck.FuncFormatter(lambda x, p: "%.1fM" % (x / 1e6, )))

        plt.grid()
        plt.gca().set_axisbelow(True)
        plt.gca().yaxis.grid(alpha=0.0)
        plt.gca().invert_yaxis()

        plt.xlabel(figure["label"])

        # Pick four entries from the end of the handle list for the legend;
        # every range line was given the same "Range" label.
        handles, labels = plt.gca().get_legend_handles_labels()
        handles = [handles[-2], handles[-1], handles[-3], handles[-4]]
        labels = [labels[-2], labels[-1], labels[-3], labels[-4]]
        plt.legend(handles=handles, labels=labels, loc="best")

        plt.tight_layout()
        plt.savefig("%s/%s.pdf" % (context.path(), figure["level"]))
        plt.close()
Exemple #9
0
def execute(context):
    """Plot cumulative commute-distance curves for work and education.

    For each slot, the synthetic mean curve with its q5-q95 band (from the
    ``"data"`` stage) and the reference curve (from the ``"hts"`` stage,
    distances multiplied by 1e-3) are drawn. Work uses solid lines and
    education dashed ones. The figure is saved as
    ``commute_distance_cdf.pdf`` below ``context.path()``.
    """
    plotting.setup()

    hts_data = context.stage("hts")
    data = context.stage("data")
    # Loaded for the census curve, which is currently not drawn.
    census_data = context.stage("census")

    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    reference_color = plotting.COLORS["egt"]

    # (slot, line style, legend title)
    series = (
        ("work", "-", "Work"),
        ("education", "--", "Educ."),
    )

    for slot, linestyle, title in series:
        synthetic = data[slot]
        observed = hts_data[slot]

        # Synthetic mean curve and its q5-q95 band.
        plt.plot(synthetic["mean"],
                 synthetic["cdf"],
                 color="k",
                 linestyle=linestyle,
                 linewidth=1.0)
        plt.fill_betweenx(synthetic["cdf"],
                          synthetic["q5"],
                          synthetic["q95"],
                          color="k",
                          linewidth=0.0,
                          alpha=0.25)

        # Reference curve.
        plt.plot(observed["euclidean_distance"] * 1e-3,
                 observed["cdf"],
                 color=reference_color,
                 linestyle=linestyle,
                 linewidth=1.0)

        # Empty line whose only purpose is the line-style legend entry.
        plt.plot([np.nan],
                 color="k",
                 linewidth=1.0,
                 linestyle=linestyle,
                 label=title)

    # Empty lines providing the colour legend entries.
    plt.plot([np.nan], color="k", linewidth=1.0, label="Synthetic")
    plt.plot([np.nan], color=reference_color, linewidth=1.0, label="EGT")

    plt.xlim([0, 40])
    plt.ylim([0, 1])

    plt.legend(loc="best", ncol=2)

    plt.grid()
    plt.gca().set_axisbelow(True)

    plt.xlabel("Euclidean commute distance [km]")
    plt.ylabel("Cumulative density")

    plt.tight_layout()
    plt.savefig("%s/commute_distance_cdf.pdf" % context.path())
    plt.close()
Exemple #10
0
def execute(context):
    """Compare synthetic and census marginals for two communes side by side.

    For communes 75106 and 94002, the household-size and age-class tables
    produced by ``prepare_data`` are concatenated and drawn as horizontal
    census/synthetic bar pairs with a q5-q95 line on the synthetic bar. Only
    the left panel shows row labels and a legend. The figure is saved as
    ``comparison.pdf`` below ``context.path()``.
    """
    plotting.setup()

    census = context.stage("analysis.reference.census.sociodemographics")
    data = context.stage("data")

    # (commune code, panel title)
    communes = (
        (75106, "16th Arrondissement"),
        (94002, "Alfortville"),
    )

    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    for case_index, (commune, title) in enumerate(communes):
        is_first_panel = case_index == 0

        case_census = filter_commune(census, commune)
        case_data = filter_commune(data, commune)

        # The commune's census table serves as both reference arguments.
        household_rows = prepare_data(case_data, case_census, case_census,
                                      "household", ["household_size_class"],
                                      SAMPLING_RATE)
        person_rows = prepare_data(case_data, case_census, case_census,
                                   "person", ["age_class"], SAMPLING_RATE)
        df_case = pd.concat([household_rows, person_rows])

        add_labels(df_case)

        plt.subplot(1, 2, case_index + 1)
        axes = plt.gca()

        locations = np.arange(len(df_case))
        reference_values = df_case["reference"].values
        mean_values = df_case["mean"].values

        plt.barh(locations,
                 reference_values,
                 height=0.4,
                 label="Census",
                 align="edge",
                 linewidth=0.5,
                 edgecolor="white",
                 color=plotting.COLORS["census"])
        plt.barh(locations + 0.4,
                 mean_values,
                 height=0.4,
                 label="Synthetic",
                 align="edge",
                 linewidth=0.5,
                 edgecolor="white",
                 color=plotting.COLORS["synthetic"])

        lower_values = df_case["q5"].values
        upper_values = df_case["q95"].values

        for row, (lower, upper) in enumerate(zip(lower_values, upper_values)):
            height = row + 0.4 + 0.2  # middle of the synthetic bar
            plt.plot([lower, upper], [height, height],
                     "k",
                     linewidth=1,
                     label="90% Conf.")

        axes.yaxis.set_major_locator(tck.FixedLocator(locations + 0.4))

        # Row labels only on the left panel; the right one gets blanks.
        if is_first_panel:
            tick_labels = df_case["label"].values
        else:
            tick_labels = [""] * 100
        axes.yaxis.set_major_formatter(tck.FixedFormatter(tick_labels))

        plt.grid()
        axes.set_axisbelow(True)
        axes.yaxis.grid(alpha=0.0)
        axes.invert_yaxis()

        plt.xlabel("Number of persons / households")
        plt.title(title)

        if is_first_panel:
            # Reorder the last three legend entries.
            handles, labels = axes.get_legend_handles_labels()
            order = (-2, -1, -3)
            plt.legend(handles=[handles[i] for i in order],
                       labels=[labels[i] for i in order],
                       loc="best")

    plt.tight_layout()
    plt.savefig("%s/comparison.pdf" % (context.path(), ))
    plt.close()
Exemple #11
0
def execute(context):
    """Compare synthetic and census marginals for two communes side by side.

    For communes 44109 and 44158, the household-size and age-class tables
    produced by ``prepare_data`` are concatenated and drawn as horizontal
    census/synthetic bar pairs with a min-max line on the synthetic bar. Row
    labels appear on the left panel only, the legend on the right panel
    only. The figure is saved as ``comparison.pdf`` below
    ``context.path()``.
    """
    plotting.setup()

    census = context.stage("analysis.reference.census.sociodemographics")
    data = context.stage("data")

    cases = [
        dict(commune=44109, title="Nantes Centre"),
        dict(commune=44158, title="Saint Etienne de Montluc"),
    ]

    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    bar_style = dict(height=0.4, align="edge", linewidth=0.5,
                     edgecolor="white")

    for case_index, case in enumerate(cases):
        case_census = filter_commune(census, case["commune"])
        case_data = filter_commune(data, case["commune"])

        # The commune's census table serves as both reference arguments.
        df_case = pd.concat([
            prepare_data(case_data, case_census, case_census, "household",
                         ["household_size_class"], SAMPLING_RATE),
            prepare_data(case_data, case_census, case_census, "person",
                         ["age_class"], SAMPLING_RATE),
        ])

        add_labels(df_case)

        plt.subplot(1, 2, case_index + 1)
        locations = np.arange(len(df_case))

        plt.barh(locations,
                 df_case["reference"].values,
                 label="Census",
                 color=plotting.COLORS["census"],
                 **bar_style)
        plt.barh(locations + 0.4,
                 df_case["mean"].values,
                 label="Synthetic",
                 color=plotting.COLORS["synthetic"],
                 **bar_style)

        # ``low``/``high`` rather than ``min``/``max`` to keep the builtins
        # usable inside this function.
        for index, (low, high) in enumerate(
                zip(df_case["min"].values, df_case["max"].values)):
            location = index + 0.4 + 0.2  # middle of the synthetic bar
            plt.plot([low, high], [location, location],
                     "k",
                     linewidth=1,
                     label="Range")

        plt.gca().yaxis.set_major_locator(tck.FixedLocator(locations + 0.4))

        # Row labels only on the left panel; the right one gets blanks.
        if case_index == 0:
            plt.gca().yaxis.set_major_formatter(
                tck.FixedFormatter(df_case["label"].values))
        else:
            plt.gca().yaxis.set_major_formatter(tck.FixedFormatter([""] * 100))

        # x labels in thousands.
        plt.gca().xaxis.set_major_formatter(
            tck.FuncFormatter(lambda x, p: "%dk" % (x // 1000, )))

        plt.grid()
        plt.gca().set_axisbelow(True)
        plt.gca().yaxis.grid(alpha=0.0)
        plt.gca().invert_yaxis()

        plt.xlabel("Number of persons / households")
        plt.title(case["title"])

        if case_index == 1:
            # Reorder the last three legend entries and pin the legend at a
            # fixed position with an opaque frame.
            handles, labels = plt.gca().get_legend_handles_labels()
            handles = [handles[-2], handles[-1], handles[-3]]
            labels = [labels[-2], labels[-1], labels[-3]]
            plt.legend(handles=handles,
                       labels=labels,
                       loc=(0.05, 0.32),
                       framealpha=1.0)

    plt.tight_layout()
    plt.savefig("%s/comparison.pdf" % (context.path(), ))
    plt.close()
Exemple #12
0
def execute(context):
    """Write the Monte Carlo error-probability table and two-panel figure.

    Reads the "analysis.synthesis.statistics.monte_carlo" and
    "analysis.reference.census.sociodemographics" stages from ``context`` and
    writes ``monte_carlo_table.tex`` and ``monte_carlo.pdf`` into
    ``context.path()``.

    Raises:
        AssertionError: if the number of rows selected for a stratum does not
            match what ``SAMPLING_RATES`` / ``ACQUISITION_SAMPLE_SIZE`` imply.
    """
    data = context.stage("analysis.synthesis.statistics.monte_carlo")

    # Prepare data for error probability table
    df_table = []

    for marginal in TABLE_MARGINALS:
        df_marginal = data[(marginal, )]
        values = np.sort(df_marginal[(marginal, )].drop_duplicates().values)

        for value in values:
            row = {"marginal": marginal, "value": value}

            # Keep only the rows computed with the full acquisition sample
            df_value = df_marginal[df_marginal[marginal] == value]
            df_value = df_value[df_value["samples"] == ACQUISITION_SAMPLE_SIZE]

            # Exactly one row per sampling rate is expected at this point
            assert len(df_value) == len(SAMPLING_RATES)
            probabilities = df_value.sort_values(
                by=["sampling_rate", "samples"])["error_probability"].values[:,
                                                                             0]

            # NOTE(review): pairing relies on SAMPLING_RATES being listed in
            # ascending order, matching the sort above - confirm.
            for sampling_rate, probability in zip(SAMPLING_RATES,
                                                  probabilities):
                row[sampling_rate] = probability

            df_table.append(row)

    df_table = pd.DataFrame.from_records(df_table)
    df_table = create_table(df_table)
    df_table.to_latex("%s/monte_carlo_table.tex" % context.path(),
                      escape=False)

    # Prepare data for plotting
    reference = context.stage(
        "analysis.reference.census.sociodemographics")["person"]

    # Perform plotting
    plotting.setup()

    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    # ... subplot on nominal stratum values
    plt.subplot(1, 2, 1)
    plt.title("(a) Monte Carlo analysis", fontsize=plotting.FONT_SIZE)

    df_marginal, reference_value = select(reference, data, SELECTED_MARGINAL,
                                          SELECTED_VALUES)
    assert len(df_marginal) == ACQUISITION_SAMPLE_SIZE * len(SAMPLING_RATES)

    # Only a subset of the sampling rates is drawn as q5-q95 bands; each
    # successive band is drawn more opaque than the previous one.
    display_sampling_rates = [0.001, 0.01, 0.05]

    for index, sampling_rate in enumerate(display_sampling_rates):
        df_rate = df_marginal[df_marginal["sampling_rate"] == sampling_rate]
        df_rate = df_rate.sort_values(by="samples")
        plt.fill_between(df_rate["samples"],
                         df_rate[("weight", "q5")],
                         df_rate[("weight", "q95")],
                         alpha=0.25 + index * 0.2,
                         color=plotting.COLORSET[0],
                         linewidth=0.0)

    # Reference value and the +/- 1% corridor around it
    plt.plot([1, ACQUISITION_SAMPLE_SIZE], [reference_value] * 2,
             'k--',
             label="Ref. $y$",
             linewidth=1.0)
    plt.plot([1, ACQUISITION_SAMPLE_SIZE], [reference_value * 0.99] * 2,
             'k:',
             label="1% Err.",
             linewidth=1.0)
    plt.plot([1, ACQUISITION_SAMPLE_SIZE], [reference_value * 1.01] * 2,
             'k:',
             linewidth=1.0)

    plt.xlabel("Sample size $N$")
    plt.ylabel("Stratum weight")

    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6, )))

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.xlim([1, ACQUISITION_SAMPLE_SIZE])

    # NaN-only band: it carries the single "90% Conf." legend label, since
    # the real bands above are drawn without labels.
    plt.fill_between([np.nan], [np.nan], [np.nan],
                     color=plotting.COLORSET[0],
                     alpha=0.25,
                     label="90% Conf.")
    plt.legend(loc="lower center", ncol=2)

    # ... subplot on error probability per sampling rate
    plt.subplot(1, 2, 2)
    plt.title("(b) Error probability", fontsize=plotting.FONT_SIZE)

    for index, values in enumerate(ADDITIONAL_VALUES):
        df_marginal, reference_value = select(reference, data,
                                              SELECTED_MARGINAL, values)
        assert len(
            df_marginal) == ACQUISITION_SAMPLE_SIZE * len(SAMPLING_RATES)

        # One point per sampling rate, taken at the full acquisition sample
        df_max = df_marginal[df_marginal["samples"] == ACQUISITION_SAMPLE_SIZE]
        df_max = df_max.sort_values(by="sampling_rate")

        plt.plot(100 * np.array(SAMPLING_RATES),
                 df_max[("error_probability", "mean")],
                 color=plotting.COLORSET[index],
                 label="Age %s" % ADDITIONAL_LABELS[index],
                 marker=".",
                 markersize=3.0,
                 linewidth=1.0)

    # Horizontal 90% guide line
    plt.plot([0, 100 * max(SAMPLING_RATES)], [0.9] * 2,
             'k:',
             label="90% Prob.",
             linewidth=1.0)
    plt.xlim([0, 100 * max(SAMPLING_RATES)])
    plt.ylim([0, 1.0])

    plt.xlabel("Sampling rate $s$ [%]")
    plt.ylabel("Error probability")

    plt.grid()
    plt.gca().set_axisbelow(True)

    plt.legend(loc="center", ncol=1)

    plt.tight_layout()
    plt.savefig("%s/monte_carlo.pdf" % context.path())
    plt.close()
Exemple #13
0
def execute(context):
    """Draw the income imputation scatter plot and the income CDF comparison.

    Reads the "data.income.municipality", "data" and
    "analysis.reference.income" stages from ``context`` and writes
    ``income_imputation.pdf`` and ``income_distributions.pdf`` into
    ``context.path()``. All incomes are scaled by 1e-3 before plotting.
    """
    plotting.setup()

    # --- Figure 1: income imputation -------------------------------------
    df_municipalities = context.stage("data.income.municipality")
    df_imputed = df_municipalities[df_municipalities["is_imputed"]]

    plt.figure()

    # Diagonal spanning the joint range of both plotted columns
    lower = min(df_imputed["reference_median"].min(),
                df_imputed["q5"].min()) * 1e-3
    upper = max(df_imputed["reference_median"].max(),
                df_imputed["q5"].max()) * 1e-3
    diagonal = [lower, upper]
    plt.plot(diagonal, diagonal, "k--")

    # Rows not flagged "is_missing" are drawn as dots, flagged rows as crosses
    not_missing = ~df_imputed["is_missing"]
    df_present = df_imputed[not_missing]
    df_absent = df_imputed[~not_missing]

    plt.plot(df_present["reference_median"] * 1e-3,
             df_present["q5"] * 1e-3,
             '.',
             markersize=3,
             color=plotting.COLORSET[0],
             label="y")
    plt.plot(df_absent["reference_median"] * 1e-3,
             df_absent["q5"] * 1e-3,
             'x',
             markersize=3,
             color=plotting.COLORSET[1])

    plt.xlabel("Reference median income [1000 EUR]")
    plt.ylabel("Imputed median income [1000 EUR]")
    plt.grid()

    plt.tight_layout()
    plt.savefig("%s/income_imputation.pdf" % context.path())
    plt.close()

    # --- Figure 2: income distributions ----------------------------------
    plt.figure()

    df_synthetic = context.stage("data")
    df_reference = context.stage("analysis.reference.income")

    # (source value, key into plotting.COLORS, legend label, extra kwargs)
    reference_curves = (
        ("entd", "entd", "ENTD", {}),
        ("egt", "egt", "EGT", {}),
        ("filo", "census", "Tax data", {"marker": ".", "markersize": 3}),
    )

    for source, color_key, curve_label, extra_style in reference_curves:
        df_source = df_reference[df_reference["source"] == source]
        plt.plot(df_source["income"].values * 1e-3,
                 df_source["cdf"].values,
                 color=plotting.COLORS[color_key],
                 label=curve_label,
                 linewidth=1.0,
                 **extra_style)

    # Synthetic curve (dotted) with a shaded band between "min" and "max"
    synthetic_cdf = df_synthetic["cdf"].values
    plt.plot(df_synthetic["mean"].values * 1e-3,
             synthetic_cdf,
             color="k",
             label="Synthetic",
             linewidth=1.0,
             linestyle=":")
    plt.fill_betweenx(synthetic_cdf,
                      df_synthetic["min"].values * 1e-3,
                      df_synthetic["max"].values * 1e-3,
                      color="k",
                      linewidth=0.0,
                      alpha=0.25)

    plt.xlim([0, 60])

    plt.xlabel("Household income [1000 EUR]")
    plt.ylabel("Cumulative density")

    plt.legend(loc="lower right")
    plt.grid()

    plt.tight_layout()
    plt.savefig("%s/income_distributions.pdf" % context.path())
    plt.close()
def execute(context):
    """Plot the bootstrapped "precision" metric against the sample size.

    For every rate in ``SAMPLING_RATES`` the selected stratum is extracted
    from the ``sample_<rate>`` stages, bootstrapped for sample sizes
    ``1..MAXIMUM_SAMPLE_SIZE`` and the resulting "precision" column is drawn
    as one curve per sampling rate. "precision" is the mean of the indicator
    ``|x / sampling_rate - reference| / reference <= ERROR_THRESHOLD``.
    The figure is written to ``error_probability.pdf`` in ``context.path()``.
    """
    # Obtain reference data
    reference = context.stage("analysis.reference.census.sociodemographics")
    reference = reference[MARGINAL_LEVEL][MARGINAL]

    # Reduce the reference table to the weight of the single selected stratum
    reference = reference[np.logical_and.reduce([
        reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]["weight"].values[0]

    # Gather marginal information
    df_data = []

    for sampling_rate in SAMPLING_RATES:
        df_marginals = []

        for df_stage in bt.get_stages(context,
                                      "sample_%f" % sampling_rate,
                                      sample_size=ACQUISITION_SAMPLE_SIZE):
            marginals.prepare_classes(df_stage)
            df_stage = stats.marginalize(df_stage, [MARGINAL],
                                         weight_column=None)[MARGINAL]
            df_stage["sampling_rate"] = sampling_rate
            df_marginals.append(df_stage)

        df_marginals = stats.collect_sample(df_marginals)
        df_marginals = df_marginals[np.logical_and.reduce([
            df_marginals[name] == value
            for name, value in zip(MARGINAL, VALUES)
        ])]

        df_data.append(df_marginals)

    df_data = pd.concat(df_data)

    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)
    df_figure = []

    for sampling_rate in SAMPLING_RATES:
        # The per-rate selection does not depend on the sample size, so it is
        # built once per rate instead of once per bootstrap call.
        df_marginals = df_data[df_data["sampling_rate"] == sampling_rate]
        df_marginals = df_marginals.drop(columns=["sampling_rate"])

        for sample_size in context.progress(
                sample_sizes, label="Calculating sample sizes ..."):
            df_bootstrap = stats.bootstrap(
                df_marginals,
                ESTIMATION_SAMPLES,
                sample_size,
                metrics={
                    "mean":
                    "mean",
                    "q5":
                    lambda x: x.quantile(0.05),
                    "q95":
                    lambda x: x.quantile(0.95),
                    # Share of estimates whose relative deviation from the
                    # reference stays within ERROR_THRESHOLD.
                    # NOTE(review): the lambda reads the loop variable
                    # sampling_rate; this assumes stats.bootstrap evaluates
                    # the metrics before returning - confirm.
                    "precision":
                    lambda x: np.mean(
                        np.abs(x / sampling_rate - reference) / reference <=
                        ERROR_THRESHOLD)
                })

            df_bootstrap["sample_size"] = sample_size
            df_bootstrap["sampling_rate"] = sampling_rate

            df_figure.append(df_bootstrap)

    df_figure = pd.concat(df_figure)

    # Plotting
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    for sampling_rate in SAMPLING_RATES:
        df_rate = df_figure[df_figure["sampling_rate"] == sampling_rate]
        plt.plot(df_rate["sample_size"],
                 df_rate["precision"],
                 label=SAMPLING_RATE_LABELS[sampling_rate],
                 color=SAMPLING_RATE_COLORS[sampling_rate])

    # Horizontal guide line at 90%
    plt.plot([0, MAXIMUM_SAMPLE_SIZE + 1], [0.9, 0.9], 'k:')

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.ylim([0, 1.05])
    plt.xlabel("Number of seeds $K$")
    plt.ylabel(r"Error probability")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([1, 10, 20, 30, 40]))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%d%%" % (x * 100, )))

    plt.grid()
    plt.gca().set_axisbelow(True)

    plt.legend(loc="best", title="Sampling rate $s$")

    plt.tight_layout()
    plt.savefig("%s/error_probability.pdf" % context.path())
    plt.close()
Exemple #15
0
def execute(context):
    """Plot the ten heaviest activity chains per sex for EGT versus ENTD.

    Reads the ``("age_range", "sex", "chain")`` marginal from the "egt" and
    "entd" stages, keeps the rows whose "age_range" flag is set, and draws
    one bar-chart panel per sex with the ten chains that have the largest
    EGT weight. The figure is written to ``activity_chains.pdf`` in
    ``context.path()`` and then shown.
    """
    plotting.setup()

    marginal = ("age_range", "sex", "chain")
    df_egt = context.stage("egt")[marginal].rename(columns={"weight": "egt"})
    df_entd = context.stage("entd")[marginal].rename(
        columns={"weight": "entd"})

    df = pd.merge(df_egt, df_entd, on=["age_range", "sex", "chain"])
    # "age_range" is used directly as a boolean row mask
    df = df[df["age_range"]]

    df_female = df[df["sex"] == "female"].sort_values(by="egt",
                                                      ascending=False).head(10)
    df_male = df[df["sex"] == "male"].sort_values(by="egt",
                                                  ascending=False).head(10)

    plt.figure(figsize=plotting.WIDE_FIGSIZE)

    for index, (df_sex, title) in enumerate(
            zip([df_male, df_female], ["Male (18-40)", "Female (18-40)"])):
        plt.subplot(1, 2, index + 1)

        plt.bar(np.arange(10),
                df_sex["egt"],
                width=0.4,
                label="EGT",
                align="edge",
                linewidth=0.5,
                edgecolor="white",
                color=plotting.COLORS["egt"])
        plt.bar(np.arange(10) + 0.4,
                df_sex["entd"],
                width=0.4,
                label="ENTD",
                align="edge",
                linewidth=0.5,
                edgecolor="white",
                color=plotting.COLORS["entd"])

        plt.grid()
        plt.gca().set_axisbelow(True)
        plt.gca().xaxis.grid(alpha=0.0)

        plt.gca().yaxis.set_major_locator(
            tck.FixedLocator(np.arange(100) * 1e5))
        plt.gca().yaxis.set_major_formatter(
            tck.FuncFormatter(lambda x, p: "%d" % (x * 1e-3, )))

        plt.gca().xaxis.set_major_locator(
            tck.FixedLocator(np.arange(10) + 0.4))

        # Tick labels are resolved when the figure is drawn, i.e. after this
        # loop has finished. A lambda reading the loop variable would label
        # every panel with the last frame's chains, so the labels are built
        # eagerly here and handed over as a fixed list.
        chain_labels = [
            "\n".join(chain).upper() for chain in df_sex["chain"].values
        ]
        plt.gca().xaxis.set_major_formatter(tck.FixedFormatter(chain_labels))

        if index == 1:
            # Right-hand panel: blank out the y tick labels and axis label
            plt.gca().yaxis.set_major_formatter(tck.FixedFormatter([""] *
                                                                   1000))
            plt.gca().yaxis.get_label().set_visible(False)

        plt.legend(loc="best", title=title)

        if index == 0:
            plt.ylabel("Number of persons [x1000]")

    plt.tight_layout()
    # Save before showing: on interactive backends the figure may already be
    # gone once show() returns, which would write an empty file.
    plt.savefig("%s/activity_chains.pdf" % context.path())
    plt.show()
    plt.close()
Exemple #16
0
def execute(context):
    """Plot the bootstrapped stratum weight against the sample size.

    The selected stratum is extracted from the "sample" stages, bootstrapped
    for sample sizes ``1..MAXIMUM_SAMPLE_SIZE``, divided by ``SAMPLING_RATE``
    and drawn (mean plus q5-q95 band) next to the census reference weight and
    a +/- 1% corridor. The figure is written to ``sample_count.pdf`` in
    ``context.path()``.
    """
    # Reference weight of the selected stratum
    census = context.stage("analysis.reference.census.sociodemographics")
    df_reference = census[MARGINAL_LEVEL][MARGINAL]
    reference_mask = np.logical_and.reduce([
        df_reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])
    reference = df_reference[reference_mask]["weight"].values[0]

    # Marginal of every acquired sample stage
    stage_marginals = []

    for df_stage in bt.get_stages(context,
                                  "sample",
                                  sample_size=ACQUISITION_SAMPLE_SIZE):
        marginals.prepare_classes(df_stage)
        marginalized = stats.marginalize(df_stage, [MARGINAL],
                                         weight_column=None)
        stage_marginals.append(marginalized[MARGINAL])

    df_marginals = stats.collect_sample(stage_marginals)
    stratum_mask = np.logical_and.reduce([
        df_marginals[name] == value for name, value in zip(MARGINAL, VALUES)
    ])
    df_marginals = df_marginals[stratum_mask]

    # One bootstrap result per sample size
    bootstrap_results = []

    for sample_size in context.progress(np.arange(1, MAXIMUM_SAMPLE_SIZE + 1),
                                        label="Calculating sample sizes ..."):
        df_bootstrap = stats.bootstrap(df_marginals, ESTIMATION_SAMPLES,
                                       sample_size)
        df_bootstrap["sample_size"] = sample_size
        bootstrap_results.append(df_bootstrap)

    df_figure = pd.concat(bootstrap_results)

    # Rescale the sampled statistics by the sampling rate
    for column in ("mean", "q5", "q95"):
        df_figure[column] /= SAMPLING_RATE

    # Prepare plot
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    plt.fill_between(df_figure["sample_size"],
                     df_figure["q5"],
                     df_figure["q95"],
                     alpha=0.25,
                     label="90% Conf.",
                     color=plotting.COLORSET[0],
                     linewidth=0.0)

    # Reference level and the +/- 1% corridor; only the lower corridor line
    # carries a legend label.
    horizontal_lines = (
        (1.0, 'k--', {"label": "Ref. $w$"}),
        (0.99, 'k:', {"label": "1% Err."}),
        (1.01, 'k:', {}),
    )

    for factor, line_format, line_options in horizontal_lines:
        level = reference if factor == 1.0 else reference * factor
        plt.plot([1, MAXIMUM_SAMPLE_SIZE], [level] * 2, line_format,
                 **line_options)

    plt.plot(df_figure["sample_size"],
             df_figure["mean"],
             label=r"$\mathrm{\mathbf{E}}[\tilde w_K]$",
             color=plotting.COLORSET[0])

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.xlabel("Number of seeds $K$")
    plt.ylabel("Stratum weight")

    axes = plt.gca()
    axes.xaxis.set_major_locator(tck.FixedLocator([1, 5, 10, 15, 20, 25]))
    axes.yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6, )))

    plt.grid()
    axes.set_axisbelow(True)

    plt.legend(loc="best", ncol=2)

    plt.tight_layout()
    plt.savefig("%s/sample_count.pdf" % context.path())
    plt.close()