Example no. 1
def execute_parallel(context, data):
    acquisition_sample_size = context.config("acquisition_sample_size")
    df_population, df_chains = data

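    # Build one ordered activity chain per person (via aggregate_chains)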
    df_chains = df_chains[["person_id", "activity_index", "purpose"]]
    df_chains = df_chains.sort_values(by=["person_id", "activity_index"])
    df_chains = aggregate_chains(df_chains)

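    # Attach socio-demographic classes and cap the chain length at CHAIN_LENGTH_LIMIT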
    marginals.prepare_classes(df_population)
    df_chains = pd.merge(
        df_population[["person_id", "age_class", "sex", "age"]],
        df_chains, on="person_id")
    df_chains["chain_length_class"] = np.minimum(
        df_chains["chain_length"], CHAIN_LENGTH_LIMIT)

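    # Keep only the CHAIN_TOP_K most frequent chains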
    top_k_chains = (df_chains.groupby("chain").size()
                    .reset_index(name="weight")
                    .sort_values(by="weight", ascending=False)
                    .head(CHAIN_TOP_K)["chain"].values)
    df_chains = df_chains[df_chains["chain"].isin(top_k_chains)]

    df_chains["age_range"] = (df_chains["age"] >= 18) & (df_chains["age"] <= 40)

    context.progress.update()
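    # weight_column=None: marginals are computed from plain observation counts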
    return stats.marginalize(df_chains, CHAIN_MARGINALS, weight_column=None)
Example no. 2
def execute(context):
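    # Reference variant of the chain analysis, computed from the HTS and weighted by person_weight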
    df_chains = context.stage("analysis.reference.hts.activities")
    df_chains = df_chains[["person_id", "activity_id", "purpose"]]
    df_chains = df_chains.sort_values(by=["person_id", "activity_id"])
    df_chains = aggregate_chains(df_chains)

    df_population = context.stage("hts")[1]
    marginals.prepare_classes(df_population)

    df_chains = pd.merge(
        df_population[["person_id", "age_class", "sex", "person_weight", "age"]],
        df_chains, on="person_id")
    df_chains["chain_length_class"] = np.minimum(
        df_chains["chain_length"], CHAIN_LENGTH_LIMIT)

    top_k_chains = (df_chains.groupby("chain")["person_weight"].sum()
                    .reset_index()
                    .sort_values(by="person_weight", ascending=False)
                    .head(CHAIN_TOP_K)["chain"].values)
    df_chains = df_chains[df_chains["chain"].isin(top_k_chains)]

    df_chains["age_range"] = (df_chains["age"] >= 18) & (df_chains["age"] <= 40)

    return stats.marginalize(
        df_chains, CHAIN_MARGINALS, weight_column="person_weight")
Example no. 3
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

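    # Compute unweighted person- and household-level marginals for every realization of the enriched population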
    for df in bs.get_stages(context, "synthesis.population.enriched",
                            acquisition_sample_size):
        marginals.prepare_classes(df)

        person_marginals.append(
            stats.marginalize(df,
                              marginals.ANALYSIS_PERSON_MARGINALS,
                              weight_column=None))
        household_marginals.append(
            stats.marginalize(df.drop_duplicates("household_id"),
                              marginals.ANALYSIS_HOUSEHOLD_MARGINALS,
                              weight_column=None))

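    # Pool the per-realization marginals, then summarize the sampling distribution of each marginal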
    person_marginals = stats.combine_marginals(person_marginals)
    household_marginals = stats.combine_marginals(household_marginals)

    person_marginals = stats.apply_per_marginal(
        person_marginals, stats.analyze_sample_and_flatten)
    household_marginals = stats.apply_per_marginal(
        household_marginals, stats.analyze_sample_and_flatten)

    return dict(person=person_marginals, household=household_marginals)
Example no. 4
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

    for df in bs.get_stages(context, "synthesis.population.enriched",
                            acquisition_sample_size):
        marginals.prepare_classes(df)

        person_marginals.append(
            stats.marginalize(df,
                              marginals.ANALYSIS_PERSON_MARGINALS,
                              weight_column=None))
        household_marginals.append(
            stats.marginalize(df.drop_duplicates("household_id"),
                              marginals.ANALYSIS_HOUSEHOLD_MARGINALS,
                              weight_column=None))

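    # Variant of Example no. 3: bootstrap the collected marginals (ESTIMATION_SAMPLE_SIZE presumably sets the number of draws)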
    person_marginals = stats.collect_marginalized_sample(person_marginals)
    household_marginals = stats.collect_marginalized_sample(
        household_marginals)

    person_marginals = stats.bootstrap_sampled_marginals(
        person_marginals, ESTIMATION_SAMPLE_SIZE)
    household_marginals = stats.bootstrap_sampled_marginals(
        household_marginals, ESTIMATION_SAMPLE_SIZE)

    return dict(person=person_marginals, household=household_marginals)
Example no. 5
def execute(context):
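    # Assemble the census marginal set: total, single attributes, pairwise crossings, and spatial crossings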
    person_marginals = marginals.combine(
        marginals.TOTAL_MARGINAL, marginals.CENSUS_PERSON_MARGINALS,
        marginals.CENSUS_HOUSEHOLD_MARGINALS,
        marginals.cross(marginals.CENSUS_PERSON_MARGINALS,
                        marginals.CENSUS_PERSON_MARGINALS),
        marginals.cross(marginals.CENSUS_HOUSEHOLD_MARGINALS,
                        marginals.CENSUS_HOUSEHOLD_MARGINALS),
        marginals.cross(marginals.CENSUS_PERSON_MARGINALS,
                        marginals.CENSUS_HOUSEHOLD_MARGINALS),
        marginals.SPATIAL_MARGINALS,
        marginals.cross(marginals.SPATIAL_MARGINALS,
                        marginals.CENSUS_PERSON_MARGINALS))

    household_marginals = marginals.combine(
        marginals.TOTAL_MARGINAL, marginals.CENSUS_HOUSEHOLD_MARGINALS,
        marginals.cross(marginals.CENSUS_HOUSEHOLD_MARGINALS,
                        marginals.CENSUS_HOUSEHOLD_MARGINALS),
        marginals.SPATIAL_MARGINALS,
        marginals.cross(marginals.SPATIAL_MARGINALS,
                        marginals.CENSUS_HOUSEHOLD_MARGINALS))

    df_persons = context.stage("data.census.filtered")
    marginals.prepare_classes(df_persons)

    df_households = df_persons.drop_duplicates("household_id").copy()

    return dict(person=stats.marginalize(df_persons, person_marginals),
                household=stats.marginalize(df_households,
                                            household_marginals))
Example no. 6
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

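    # Pair each enriched population realization with its matching home-zone realization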
    feeder = zip(
        bs.get_stages(context, "synthesis.population.enriched", acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.spatial.home.zones", acquisition_sample_size)
    )

    for df, df_home in feeder:
        df = pd.merge(df, df_home[["household_id", "departement_id", "commune_id"]])
        marginals.prepare_classes(df)

        person_marginals.append(stats.marginalize(
            df, marginals.SPATIAL_PERSON_MARGINALS, weight_column=None))
        household_marginals.append(stats.marginalize(
            df.drop_duplicates("household_id"),
            marginals.SPATIAL_HOUSEHOLD_MARGINALS, weight_column=None))

    person_marginals = stats.combine_marginals(person_marginals)
    household_marginals = stats.combine_marginals(household_marginals)

    person_marginals = stats.apply_per_marginal(
        person_marginals, stats.analyze_sample_and_flatten)
    household_marginals = stats.apply_per_marginal(
        household_marginals, stats.analyze_sample_and_flatten)

    return dict(person=person_marginals, household=household_marginals)
Example no. 7
def execute(context):
    df_households, df_persons, _ = context.stage("hts")

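    # Transfer household-only attributes onto the person table via household_id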
    person_columns = set(df_persons.columns)
    household_columns = set(df_households.columns)
    household_columns -= person_columns
    household_columns.add("household_id")

    df = pd.merge(df_persons,
                  df_households[list(household_columns)],
                  on="household_id")
    assert len(df_persons) == len(df)
    df_persons = df

    spatial_marginals = [("departement_id", )]

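    # HTS analogue of the census marginal set, with departement-level geography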
    person_marginals = marginals.combine(
        marginals.TOTAL_MARGINAL,
        marginals.HTS_PERSON_MARGINALS,
        marginals.HTS_HOUSEHOLD_MARGINALS,
        marginals.cross(marginals.HTS_PERSON_MARGINALS,
                        marginals.HTS_PERSON_MARGINALS),
        marginals.cross(marginals.HTS_HOUSEHOLD_MARGINALS,
                        marginals.HTS_HOUSEHOLD_MARGINALS),
        marginals.cross(marginals.HTS_PERSON_MARGINALS,
                        marginals.HTS_HOUSEHOLD_MARGINALS),
        spatial_marginals,
        marginals.cross(spatial_marginals, marginals.HTS_PERSON_MARGINALS))

    household_marginals = marginals.combine(
        marginals.TOTAL_MARGINAL,
        marginals.HTS_HOUSEHOLD_MARGINALS,
        marginals.cross(marginals.HTS_HOUSEHOLD_MARGINALS,
                        marginals.HTS_HOUSEHOLD_MARGINALS),
        spatial_marginals,
        marginals.cross(spatial_marginals, marginals.HTS_HOUSEHOLD_MARGINALS))

    marginals.prepare_classes(df_persons)
    df_households = df_persons.drop_duplicates("household_id").copy()

    df_persons = df_persons.rename(columns={"person_weight": "weight"})
    df_households = df_households.rename(
        columns={"household_weight": "weight"})

    return dict(person=stats.marginalize(df_persons, person_marginals),
                household=stats.marginalize(df_households,
                                            household_marginals))
Example no. 8
def execute(context):
    df = context.stage("sample")
    marginals.prepare_classes(df)
    return stats.marginalize(df, MARGINALS, weight_column=None)
Example no. 9
def execute(context):
    # Obtain reference data
    reference = context.stage("analysis.reference.census.sociodemographics")
    reference = reference[MARGINAL_LEVEL][MARGINAL]

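    # Select the reference weight of the stratum defined by MARGINAL and VALUES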
    reference = reference[np.logical_and.reduce([
        reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]["weight"].values[0]

    # Gather marginal information
    df_data = []

    for sampling_rate in SAMPLING_RATES:
        df_marginals = []

        for df_stage in bt.get_stages(context,
                                      "sample_%f" % sampling_rate,
                                      sample_size=ACQUISITION_SAMPLE_SIZE):
            marginals.prepare_classes(df_stage)
            df_stage = stats.marginalize(df_stage, [MARGINAL],
                                         weight_column=None)[MARGINAL]
            df_stage["sampling_rate"] = sampling_rate
            df_marginals.append(df_stage)

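        # Pool the realizations and keep only the stratum of interest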
        df_marginals = stats.collect_sample(df_marginals)
        df_marginals = df_marginals[np.logical_and.reduce([
            df_marginals[name] == value
            for name, value in zip(MARGINAL, VALUES)
        ])]

        df_data.append(df_marginals)

    df_data = pd.concat(df_data)

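    # Bootstrap the estimator for every number of seeds K at each sampling rate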
    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)
    df_figure = []

    for sampling_rate in SAMPLING_RATES:
        for sample_size in context.progress(
                sample_sizes, label="Calculating sample sizes ..."):
            df_marginals = df_data[df_data["sampling_rate"] == sampling_rate]
            df_marginals = df_marginals.drop(columns=["sampling_rate"])

            df_bootstrap = stats.bootstrap(
                df_marginals, ESTIMATION_SAMPLES, sample_size,
                metrics={
                    "mean": "mean",
                    "q5": lambda x: x.quantile(0.05),
                    "q95": lambda x: x.quantile(0.95),
                    "precision": lambda x: np.mean(
                        np.abs(x / sampling_rate - reference) / reference
                        <= ERROR_THRESHOLD)
                })

            df_bootstrap["sample_size"] = sample_size
            df_bootstrap["sampling_rate"] = sampling_rate

            df_figure.append(df_bootstrap)

    df_figure = pd.concat(df_figure)

    # Plotting
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    for index, sampling_rate in enumerate(SAMPLING_RATES):
        df_rate = df_figure[df_figure["sampling_rate"] == sampling_rate]
        plt.plot(df_rate["sample_size"],
                 df_rate["precision"],
                 label=SAMPLING_RATE_LABELS[sampling_rate],
                 color=SAMPLING_RATE_COLORS[sampling_rate])

    plt.plot([0, MAXIMUM_SAMPLE_SIZE + 1], [0.9, 0.9], 'k:')

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.ylim([0, 1.05])
    plt.xlabel("Number of seeds $K$")
    plt.ylabel(r"Error probability")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([1, 10, 20, 30, 40]))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%d%%" % (x * 100, )))

    plt.grid()
    plt.gca().set_axisbelow(True)

    plt.legend(loc="best", title="Sampling rate $s$")

    plt.tight_layout()
    plt.savefig("%s/error_probability.pdf" % context.path())
    plt.close()
Example no. 10
def execute(context):
    # Obtain reference data
    reference = context.stage("analysis.reference.census.sociodemographics")
    reference = reference[MARGINAL_LEVEL][MARGINAL]

    reference = reference[np.logical_and.reduce([
        reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]["weight"].values[0]

    # Gather information
    df_marginals = []

    for df_stage in bt.get_stages(context,
                                  "sample",
                                  sample_size=ACQUISITION_SAMPLE_SIZE):
        marginals.prepare_classes(df_stage)
        df_marginals.append(
            stats.marginalize(df_stage, [MARGINAL],
                              weight_column=None)[MARGINAL])

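    # Pool the realizations and keep only the stratum defined by MARGINAL and VALUES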
    df_marginals = stats.collect_sample(df_marginals)
    df_marginals = df_marginals[np.logical_and.reduce([
        df_marginals[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]

    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)
    df_figure = []

    for sample_size in context.progress(sample_sizes,
                                        label="Calculating sample sizes ..."):
        df_bootstrap = stats.bootstrap(df_marginals, ESTIMATION_SAMPLES,
                                       sample_size)
        df_bootstrap["sample_size"] = sample_size
        df_figure.append(df_bootstrap)

    df_figure = pd.concat(df_figure)

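    # Scale the sample-based estimates up to the full population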
    df_figure["mean"] /= SAMPLING_RATE
    df_figure["q5"] /= SAMPLING_RATE
    df_figure["q95"] /= SAMPLING_RATE

    # Prepare plot
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    plt.fill_between(df_figure["sample_size"],
                     df_figure["q5"],
                     df_figure["q95"],
                     alpha=0.25,
                     label="90% Conf.",
                     color=plotting.COLORSET[0],
                     linewidth=0.0)
    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference] * 2,
             'k--',
             label="Ref. $w$")
    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference * 0.99] * 2,
             'k:',
             label="1% Err.")
    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference * 1.01] * 2, 'k:')
    plt.plot(df_figure["sample_size"],
             df_figure["mean"],
             label=r"$\mathrm{\mathbf{E}}[\tilde w_K]$",
             color=plotting.COLORSET[0])

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.xlabel("Number of seeds $K$")
    plt.ylabel("Stratum weight")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([1, 5, 10, 15, 20, 25]))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6, )))

    plt.grid()
    plt.gca().set_axisbelow(True)

    plt.legend(loc="best", ncol=2)

    plt.tight_layout()
    plt.savefig("%s/sample_count.pdf" % context.path())
    plt.close()