def execute_parallel(context, data):
    acquisition_sample_size = context.config("acquisition_sample_size")

    df_population, df_chains = data

    # Sort activities per person and collapse them into activity chains
    df_chains = df_chains[[
        "person_id", "activity_index", "purpose"
    ]].sort_values(by=["person_id", "activity_index"])
    df_chains = aggregate_chains(df_chains)

    marginals.prepare_classes(df_population)

    df_chains = pd.merge(
        df_population[["person_id", "age_class", "sex", "age"]],
        df_chains, on="person_id")

    df_chains["chain_length_class"] = np.minimum(
        df_chains["chain_length"], CHAIN_LENGTH_LIMIT)

    # Keep only the CHAIN_TOP_K most frequent chains
    top_k_chains = df_chains.groupby("chain").size().reset_index(
        name="weight").sort_values(
            by="weight", ascending=False).head(CHAIN_TOP_K)["chain"].values
    df_chains = df_chains[df_chains["chain"].isin(top_k_chains)]

    df_chains["age_range"] = (df_chains["age"] >= 18) & (df_chains["age"] <= 40)

    context.progress.update()

    return stats.marginalize(df_chains, CHAIN_MARGINALS, weight_column=None)
def execute(context):
    # Reference chain distribution from the household travel survey (HTS)
    df_chains = context.stage("analysis.reference.hts.activities")[[
        "person_id", "activity_id", "purpose"
    ]].sort_values(by=["person_id", "activity_id"])
    df_chains = aggregate_chains(df_chains)

    df_population = context.stage("hts")[1]
    marginals.prepare_classes(df_population)

    df_chains = pd.merge(df_population[[
        "person_id", "age_class", "sex", "person_weight", "age"
    ]], df_chains, on="person_id")

    df_chains["chain_length_class"] = np.minimum(
        df_chains["chain_length"], CHAIN_LENGTH_LIMIT)

    # Keep only the CHAIN_TOP_K chains with the largest total weight
    top_k_chains = df_chains.groupby(
        "chain")["person_weight"].sum().reset_index().sort_values(
            by="person_weight",
            ascending=False).head(CHAIN_TOP_K)["chain"].values
    df_chains = df_chains[df_chains["chain"].isin(top_k_chains)]

    df_chains["age_range"] = (df_chains["age"] >= 18) & (df_chains["age"] <= 40)

    return stats.marginalize(df_chains, CHAIN_MARGINALS,
                             weight_column="person_weight")
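# The aggregate_chains helper called by both functions above is defined
# elsewhere in the pipeline. A minimal sketch of what it is assumed to do,
# inferred from the columns used here: collapse each person's sorted
# activities into one chain string and record the number of activities as
# chain_length. The _sketch suffix and the separator are illustrative
# assumptions, not the pipeline's actual implementation.
def aggregate_chains_sketch(df_activities):
    # Expects one row per activity, already sorted by person and activity order
    return df_activities.groupby("person_id")["purpose"].agg(
        chain=lambda purposes: "-".join(purposes),
        chain_length="size").reset_index()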
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

    for df in bs.get_stages(context, "synthesis.population.enriched",
                            acquisition_sample_size):
        marginals.prepare_classes(df)

        person_marginals.append(
            stats.marginalize(df, marginals.ANALYSIS_PERSON_MARGINALS,
                              weight_column=None))
        household_marginals.append(
            stats.marginalize(df.drop_duplicates("household_id"),
                              marginals.ANALYSIS_HOUSEHOLD_MARGINALS,
                              weight_column=None))

    person_marginals = stats.combine_marginals(person_marginals)
    household_marginals = stats.combine_marginals(household_marginals)

    person_marginals = stats.apply_per_marginal(
        person_marginals, stats.analyze_sample_and_flatten)
    household_marginals = stats.apply_per_marginal(
        household_marginals, stats.analyze_sample_and_flatten)

    return dict(person=person_marginals, household=household_marginals)
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

    for df in bs.get_stages(context, "synthesis.population.enriched",
                            acquisition_sample_size):
        marginals.prepare_classes(df)

        person_marginals.append(
            stats.marginalize(df, marginals.ANALYSIS_PERSON_MARGINALS,
                              weight_column=None))
        household_marginals.append(
            stats.marginalize(df.drop_duplicates("household_id"),
                              marginals.ANALYSIS_HOUSEHOLD_MARGINALS,
                              weight_column=None))

    # Unlike the variant above, the per-seed marginals are summarized by
    # bootstrapping rather than by flattening the sample directly
    person_marginals = stats.collect_marginalized_sample(person_marginals)
    household_marginals = stats.collect_marginalized_sample(
        household_marginals)

    person_marginals = stats.bootstrap_sampled_marginals(
        person_marginals, ESTIMATION_SAMPLE_SIZE)
    household_marginals = stats.bootstrap_sampled_marginals(
        household_marginals, ESTIMATION_SAMPLE_SIZE)

    return dict(person=person_marginals, household=household_marginals)
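# bootstrap_sampled_marginals belongs to this pipeline's stats module and is
# not shown in this section. A minimal sketch of the assumed idea, with
# hypothetical names: given one estimate per random seed, resample the seeds
# with replacement and summarize the distribution of the resampled means.
import numpy as np

def bootstrap_seed_estimates_sketch(seed_estimates, estimation_sample_size,
                                    random_seed=0):
    # seed_estimates: one scalar estimate per seed, e.g. a stratum weight
    rng = np.random.default_rng(random_seed)

    means = np.array([
        rng.choice(seed_estimates, size=len(seed_estimates),
                   replace=True).mean()
        for _ in range(estimation_sample_size)
    ])

    return dict(mean=means.mean(), q5=np.percentile(means, 5),
                q95=np.percentile(means, 95))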
def execute(context):
    person_marginals = marginals.combine(
        marginals.TOTAL_MARGINAL,
        marginals.CENSUS_PERSON_MARGINALS,
        marginals.CENSUS_HOUSEHOLD_MARGINALS,
        marginals.cross(marginals.CENSUS_PERSON_MARGINALS,
                        marginals.CENSUS_PERSON_MARGINALS),
        marginals.cross(marginals.CENSUS_HOUSEHOLD_MARGINALS,
                        marginals.CENSUS_HOUSEHOLD_MARGINALS),
        marginals.cross(marginals.CENSUS_PERSON_MARGINALS,
                        marginals.CENSUS_HOUSEHOLD_MARGINALS),
        marginals.SPATIAL_MARGINALS,
        marginals.cross(marginals.SPATIAL_MARGINALS,
                        marginals.CENSUS_PERSON_MARGINALS))

    household_marginals = marginals.combine(
        marginals.TOTAL_MARGINAL,
        marginals.CENSUS_HOUSEHOLD_MARGINALS,
        marginals.cross(marginals.CENSUS_HOUSEHOLD_MARGINALS,
                        marginals.CENSUS_HOUSEHOLD_MARGINALS),
        marginals.SPATIAL_MARGINALS,
        marginals.cross(marginals.SPATIAL_MARGINALS,
                        marginals.CENSUS_HOUSEHOLD_MARGINALS))

    df_persons = context.stage("data.census.filtered")
    marginals.prepare_classes(df_persons)

    df_households = df_persons.drop_duplicates("household_id").copy()

    return dict(
        person=stats.marginalize(df_persons, person_marginals),
        household=stats.marginalize(df_households, household_marginals))
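# marginals.combine and marginals.cross are pipeline helpers not defined in
# this section. Assuming each marginal is a tuple of attribute names and each
# marginal set is a list of such tuples, a plausible sketch of their behavior
# follows; the guard against pairing a tuple with itself is an assumption.
def combine_sketch(*marginal_lists):
    # Flatten several lists of marginals into one list
    return [marginal for marginal_list in marginal_lists
            for marginal in marginal_list]

def cross_sketch(first_marginals, second_marginals):
    # Build joint marginals as the Cartesian product of two marginal lists
    return [
        first + second
        for first in first_marginals for second in second_marginals
        if first != second
    ]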
def execute(context):
    acquisition_sample_size = context.config("acquisition_sample_size")

    person_marginals = []
    household_marginals = []

    # Iterate over matching seeds of the enriched population and home zones
    feeder = zip(
        bs.get_stages(context, "synthesis.population.enriched",
                      acquisition_sample_size),
        bs.get_stages(context, "synthesis.population.spatial.home.zones",
                      acquisition_sample_size))

    for df, df_home in feeder:
        df = pd.merge(df, df_home[[
            "household_id", "departement_id", "commune_id"]])
        marginals.prepare_classes(df)

        person_marginals.append(
            stats.marginalize(df, marginals.SPATIAL_PERSON_MARGINALS,
                              weight_column=None))
        household_marginals.append(
            stats.marginalize(df.drop_duplicates("household_id"),
                              marginals.SPATIAL_HOUSEHOLD_MARGINALS,
                              weight_column=None))

    person_marginals = stats.combine_marginals(person_marginals)
    household_marginals = stats.combine_marginals(household_marginals)

    person_marginals = stats.apply_per_marginal(
        person_marginals, stats.analyze_sample_and_flatten)
    household_marginals = stats.apply_per_marginal(
        household_marginals, stats.analyze_sample_and_flatten)

    return dict(person=person_marginals, household=household_marginals)
def execute(context):
    df_households, df_persons, _ = context.stage("hts")

    # Attach household attributes to persons, avoiding duplicated columns
    person_columns = set(df_persons.columns)
    household_columns = set(df_households.columns)
    household_columns -= person_columns
    household_columns.add("household_id")

    # Index with a list, as recent pandas versions reject set indexers
    df = pd.merge(df_persons, df_households[list(household_columns)],
                  on="household_id")
    assert len(df_persons) == len(df)
    df_persons = df

    spatial_marginals = [("departement_id",)]

    person_marginals = marginals.combine(
        marginals.TOTAL_MARGINAL,
        marginals.HTS_PERSON_MARGINALS,
        marginals.HTS_HOUSEHOLD_MARGINALS,
        marginals.cross(marginals.HTS_PERSON_MARGINALS,
                        marginals.HTS_PERSON_MARGINALS),
        marginals.cross(marginals.HTS_HOUSEHOLD_MARGINALS,
                        marginals.HTS_HOUSEHOLD_MARGINALS),
        marginals.cross(marginals.HTS_PERSON_MARGINALS,
                        marginals.HTS_HOUSEHOLD_MARGINALS),
        spatial_marginals,
        marginals.cross(spatial_marginals, marginals.HTS_PERSON_MARGINALS))

    household_marginals = marginals.combine(
        marginals.TOTAL_MARGINAL,
        marginals.HTS_HOUSEHOLD_MARGINALS,
        marginals.cross(marginals.HTS_HOUSEHOLD_MARGINALS,
                        marginals.HTS_HOUSEHOLD_MARGINALS),
        spatial_marginals,
        marginals.cross(spatial_marginals,
                        marginals.HTS_HOUSEHOLD_MARGINALS))

    marginals.prepare_classes(df_persons)
    df_households = df_persons.drop_duplicates("household_id").copy()

    df_persons = df_persons.rename(columns={"person_weight": "weight"})
    df_households = df_households.rename(
        columns={"household_weight": "weight"})

    return dict(
        person=stats.marginalize(df_persons, person_marginals),
        household=stats.marginalize(df_households, household_marginals))
def execute(context):
    df = context.stage("sample")

    marginals.prepare_classes(df)
    return stats.marginalize(df, MARGINALS, weight_column=None)
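# stats.marginalize is used throughout this section but defined elsewhere in
# the pipeline. A minimal sketch of the assumed behavior, inferred from the
# call sites: for each marginal (a tuple of attribute names), count the
# observations per combination of attribute values, or sum a weight column if
# one is given, and return the results keyed by marginal. The default weight
# column name is an assumption, inferred from the renaming to "weight" before
# the calls that omit the argument.
import pandas as pd

def marginalize_sketch(df, marginal_list, weight_column="weight"):
    result = {}

    for marginal in marginal_list:
        if weight_column is None:
            # Unweighted case: each row counts once
            result[marginal] = df.groupby(
                list(marginal)).size().reset_index(name="weight")
        else:
            # Weighted case: sum the observation weights per stratum
            result[marginal] = df.groupby(list(marginal))[
                weight_column].sum().reset_index(name="weight")

    return result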
def execute(context):
    # Obtain reference data
    reference = context.stage("analysis.reference.census.sociodemographics")
    reference = reference[MARGINAL_LEVEL][MARGINAL]
    reference = reference[np.logical_and.reduce([
        reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]["weight"].values[0]

    # Gather marginal information
    df_data = []

    for sampling_rate in SAMPLING_RATES:
        df_marginals = []

        for df_stage in bt.get_stages(context, "sample_%f" % sampling_rate,
                                      sample_size=ACQUISITION_SAMPLE_SIZE):
            marginals.prepare_classes(df_stage)
            df_stage = stats.marginalize(df_stage, [MARGINAL],
                                         weight_column=None)[MARGINAL]
            df_stage["sampling_rate"] = sampling_rate
            df_marginals.append(df_stage)

        df_marginals = stats.collect_sample(df_marginals)
        df_marginals = df_marginals[np.logical_and.reduce([
            df_marginals[name] == value
            for name, value in zip(MARGINAL, VALUES)
        ])]
        df_data.append(df_marginals)

    df_data = pd.concat(df_data)

    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)
    df_figure = []

    for sampling_rate in SAMPLING_RATES:
        for sample_size in context.progress(
                sample_sizes, label="Calculating sample sizes ..."):
            df_marginals = df_data[df_data["sampling_rate"] == sampling_rate]
            df_marginals = df_marginals.drop(columns=["sampling_rate"])

            df_bootstrap = stats.bootstrap(
                df_marginals, ESTIMATION_SAMPLES, sample_size,
                metrics={
                    "mean": "mean",
                    "q5": lambda x: x.quantile(0.05),
                    "q95": lambda x: x.quantile(0.95),
                    "precision": lambda x: np.mean(
                        np.abs(x / sampling_rate - reference) / reference
                        <= ERROR_THRESHOLD)
                })

            df_bootstrap["sample_size"] = sample_size
            df_bootstrap["sampling_rate"] = sampling_rate
            df_figure.append(df_bootstrap)

    df_figure = pd.concat(df_figure)

    # Plotting
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    for sampling_rate in SAMPLING_RATES:
        df_rate = df_figure[df_figure["sampling_rate"] == sampling_rate]
        plt.plot(df_rate["sample_size"], df_rate["precision"],
                 label=SAMPLING_RATE_LABELS[sampling_rate],
                 color=SAMPLING_RATE_COLORS[sampling_rate])

    plt.plot([0, MAXIMUM_SAMPLE_SIZE + 1], [0.9, 0.9], 'k:')

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.ylim([0, 1.05])

    plt.xlabel("Number of seeds $K$")
    plt.ylabel(r"Error probability")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([1, 10, 20, 30, 40]))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%d%%" % (x * 100,)))

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.legend(loc="best", title="Sampling rate $s$")

    plt.tight_layout()
    plt.savefig("%s/error_probability.pdf" % context.path())
    plt.close()
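# stats.collect_sample is assumed to stack the per-seed marginal frames into
# a single frame, tagging each frame with a realization index so that the
# bootstrap can later resample whole seeds. A hypothetical sketch:
import pandas as pd

def collect_sample_sketch(frames):
    frames = [frame.copy() for frame in frames]

    for realization, frame in enumerate(frames):
        frame["realization"] = realization

    return pd.concat(frames)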
def execute(context):
    # Obtain reference data
    reference = context.stage("analysis.reference.census.sociodemographics")
    reference = reference[MARGINAL_LEVEL][MARGINAL]
    reference = reference[np.logical_and.reduce([
        reference[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]["weight"].values[0]

    # Gather information
    df_marginals = []

    for df_stage in bt.get_stages(context, "sample",
                                  sample_size=ACQUISITION_SAMPLE_SIZE):
        marginals.prepare_classes(df_stage)
        df_marginals.append(
            stats.marginalize(df_stage, [MARGINAL],
                              weight_column=None)[MARGINAL])

    df_marginals = stats.collect_sample(df_marginals)
    df_marginals = df_marginals[np.logical_and.reduce([
        df_marginals[name] == value for name, value in zip(MARGINAL, VALUES)
    ])]

    sample_sizes = np.arange(1, MAXIMUM_SAMPLE_SIZE + 1)
    df_figure = []

    for sample_size in context.progress(
            sample_sizes, label="Calculating sample sizes ..."):
        df_bootstrap = stats.bootstrap(df_marginals, ESTIMATION_SAMPLES,
                                       sample_size)
        df_bootstrap["sample_size"] = sample_size
        df_figure.append(df_bootstrap)

    df_figure = pd.concat(df_figure)

    df_figure["mean"] /= SAMPLING_RATE
    df_figure["q5"] /= SAMPLING_RATE
    df_figure["q95"] /= SAMPLING_RATE

    # Prepare plot
    plotting.setup()
    plt.figure(figsize=plotting.SHORT_FIGSIZE)

    plt.fill_between(df_figure["sample_size"], df_figure["q5"],
                     df_figure["q95"], alpha=0.25, label="90% Conf.",
                     color=plotting.COLORSET[0], linewidth=0.0)

    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference] * 2, 'k--',
             label="Ref. $w$")
    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference * 0.99] * 2, 'k:',
             label="1% Err.")
    plt.plot([1, MAXIMUM_SAMPLE_SIZE], [reference * 1.01] * 2, 'k:')

    plt.plot(df_figure["sample_size"], df_figure["mean"],
             label=r"$\mathrm{\mathbf{E}}[\tilde w_K]$",
             color=plotting.COLORSET[0])

    plt.xlim([1, MAXIMUM_SAMPLE_SIZE])
    plt.xlabel("Number of seeds $K$")
    plt.ylabel("Stratum weight")

    plt.gca().xaxis.set_major_locator(tck.FixedLocator([1, 5, 10, 15, 20, 25]))
    plt.gca().yaxis.set_major_formatter(
        tck.FuncFormatter(lambda x, p: "%.2fM" % (x * 1e-6,)))

    plt.grid()
    plt.gca().set_axisbelow(True)
    plt.legend(loc="best", ncol=2)

    plt.tight_layout()
    plt.savefig("%s/sample_count.pdf" % context.path())
    plt.close()
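# The division by SAMPLING_RATE above is the upscaling step: the marginal
# weights are measured on a population sampled at rate s, so the estimator
# plotted against the census reference is assumed to be
#
#   w_tilde_K = (1 / s) * (1 / K) * sum_{k=1..K} w_k
#
# which scales the bootstrap mean and its quantiles uniformly by 1 / s.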