def check_employment_age_distribution(pop,
                                      n,
                                      datadir,
                                      figdir,
                                      location=None,
                                      state_location=None,
                                      country_location=None,
                                      file_path=None,
                                      use_default=False,
                                      test_prefix="",
                                      skip_stat_check=False,
                                      do_close=True):
    """
    Compare the population's employment rate by age with the reference data.

    Args:
        pop               : population dictionary
        n                 : population size
        datadir           : root data directory holding the reference data
        figdir            : base directory for figures; an "employment"
                            subdirectory of it is handed to the plot helper
        location          : name of the location
        state_location    : name of the state the location is in
        country_location  : name of the country the location is in
        file_path         : file path forwarded to the reference data readers
        use_default       : forwarded to the reference data readers (per the
                            original notes: fall back to default data when
                            location-specific data is not available)
        test_prefix       : text appended to the label given to each plot
        skip_stat_check   : skip the statistics check for the distribution
        do_close          : close the image immediately if set to True

    Returns:
        None.
        Plots are produced through utilities.plot_array with the
        "employment" subdirectory of figdir as their target directory.
    """
    figdir = os.path.join(figdir, "employment")

    # The employment-rate and age-distribution readers take the same arguments.
    reader_kwargs = dict(datadir=datadir,
                         location=location,
                         state_location=state_location,
                         country_location=country_location,
                         file_path=file_path,
                         use_default=use_default)
    er = sp.get_employment_rates(**reader_kwargs)
    brackets = sp.get_census_age_brackets(datadir=datadir,
                                          state_location=state_location,
                                          country_location=country_location)
    ageindex = sp.get_age_by_brackets_dic(brackets)
    age_dist = sp.read_age_bracket_distr(**reader_kwargs)

    # Count people by age, split on whether they hold a workplace id or a
    # school teacher/staff role.
    employed_by_age, unemployed_by_age = utilities.get_ids_count_by_param(
        pop, condition_name=['wpid', 'sc_teacher', 'sc_staff'], param='age')

    employed_ages = sorted(employed_by_age)
    utilities.plot_array([employed_by_age[age] for age in employed_ages],
                         datadir=figdir,
                         names=employed_ages,
                         expect_label='employed by age count',
                         xlabel_rotation=90,
                         testprefix="employeed count by age " + test_prefix)

    unemployed_ages = sorted(unemployed_by_age)
    utilities.plot_array([unemployed_by_age[age] for age in unemployed_ages],
                         datadir=figdir,
                         names=unemployed_ages,
                         expect_label='unemployed by age count',
                         xlabel_rotation=90,
                         testprefix="unemployed count by age " + test_prefix)

    # Line the observed rates up with the reference ages; ages that were not
    # observed get a rate of 0.
    employed_rate = utilities.calc_rate(employed_by_age, unemployed_by_age)
    actual_values = np.array([
        employed_rate[age] if age in employed_rate else 0
        for age in er.keys()
    ])
    expected_values = np.array(list(er.values()))

    if not skip_stat_check:
        utilities.statistic_test(expected_values,
                                 actual_values,
                                 test="x",
                                 comments="employment rate distribution check")

    # Pad with zeros below the youngest reference age so that the array
    # position equals the age (also gives a better display).
    youngest_age = min(er.keys())
    expected_values = np.insert(expected_values, 0, np.zeros(youngest_age))
    actual_values = np.insert(actual_values, 0, np.zeros(youngest_age))
    ages = list(range(max(er.keys()) + 1))

    # names is deliberately None here: passing the ages double-stacked the
    # bar for age 100 (per the original note).
    utilities.plot_array(expected_values,
                         actual_values,
                         names=None,
                         datadir=figdir,
                         testprefix="employment rate distribution " + test_prefix,
                         do_close=do_close)

    # Check that the total employment matches: accumulate per age bracket.
    expected_by_bracket = dict.fromkeys(brackets, 0)
    actual_by_bracket = dict.fromkeys(brackets, 0)
    for age in ages:
        bracket = ageindex[age]
        expected_by_bracket[bracket] += expected_values[age]
        if age in employed_by_age:
            actual_by_bracket[bracket] += employed_by_age[age]

    # Mean rate within the bracket * bracket share of population * n
    # gives the expected head count for the bracket.
    expected_by_bracket = {
        bracket: rate_sum / len(brackets[bracket]) * age_dist[bracket] * n
        for bracket, rate_sum in expected_by_bracket.items()
    }

    expected_total = np.array(list(expected_by_bracket.values()))
    actual_total = np.array(list(actual_by_bracket.values()))
    utilities.plot_array(expected_total,
                         actual_total,
                         names=brackets.keys(),
                         datadir=figdir,
                         testprefix="employment total " + test_prefix,
                         do_close=do_close)

    expected_etotal = np.round(np.sum(expected_total))
    actual_etotal = np.round(np.sum(actual_total))
    utilities.check_error_percentage(n,
                                     expected_etotal,
                                     actual_etotal,
                                     name="employee")
def check_school_size_distribution(pop,
                                   n,
                                   datadir,
                                   figdir,
                                   location=None,
                                   state_location=None,
                                   country_location=None,
                                   file_path=None,
                                   use_default=False,
                                   test_prefix="",
                                   skip_stat_check=True,
                                   do_close=True,
                                   school_type=None):
    """
    Check the school size distribution against the reference data

    Args:
        pop               : population dictionary
        n                 : population size
        datadir           : root data directory where the reference data resides
        figdir            : base directory for figures; a "school_size"
                            subdirectory of it is handed to the plot helper
        location          : name of the location
        state_location    : name of the state
        country_location  : name of the country the state_location is in
        file_path         : file path forwarded to the school size bracket reader
        use_default       : forwarded to the school size bracket reader (per the
                            original notes: fall back to default data when
                            location-specific data is not available)
        test_prefix       : text included in the label given to each plot
        skip_stat_check   : skip the statistics check for distribution
        do_close          : close the image immediately if set to True
        school_type       : list of school types e.g. ['pk', 'es', 'ms', 'hs', 'uv']

    Returns:
        None.
        Plots are produced through utilities.plot_array with the
        "school_size" subdirectory of figdir as their target directory.
    """
    figdir = os.path.join(figdir, "school_size")
    sb = sp.get_school_size_brackets(datadir=datadir,
                                     location=location,
                                     state_location=state_location,
                                     country_location=country_location,
                                     file_path=file_path,
                                     use_default=use_default)
    sb_index = sp.get_index_by_brackets_dic(sb)

    expected_school_size_by_brackets = sp.get_school_size_distr_by_brackets(
        datadir=datadir,
        location=location,
        state_location=state_location,
        country_location=country_location)

    # Only the first element of each result is used below.
    actual_school, _ = utilities.get_ids_count_by_param(pop, "scid")
    actual_school_student_only, _ = utilities.get_ids_count_by_param(
        pop, "sc_student", "scid")

    # "all" covers every school; each requested school type is added next to it.
    actual_per_school_type_dict = {"all": actual_school}
    actual_per_school_type_dict_student_only = {"all": actual_school_student_only}
    if school_type is not None:
        for sc in school_type:
            actual_per_school_type_dict[sc] = \
                utilities.get_ids_count_by_param(pop, "sc_type", param="scid", condition_value=sc)[0]
            actual_per_school_type_dict_student_only[sc] = \
                utilities.get_ids_count_by_param(pop, "sc_type", param="scid", condition_value=sc,
                                                 filter_expression={'sc_student':'1'})[0]

    # Everything derived purely from the reference data is the same for each
    # school type, so it is computed once here instead of once per iteration.
    expected_values = np.array(list(expected_school_size_by_brackets.values()))
    school_size_brackets = sp.get_school_size_brackets(
        datadir=datadir,
        location=location,
        country_location=country_location,
        state_location=state_location)
    # average school size per bracket
    average_school_size_in_bracket = [
        sum(sizes) / len(sizes) for sizes in school_size_brackets.values()
    ]
    # expected school size as sum(distribution * size); the bracket keys are
    # used as positions into the per-bracket averages.
    expected_average_school_size = sum(
        share * average_school_size_in_bracket[bracket]
        for bracket, share in expected_school_size_by_brackets.items())

    # get individual school type size distribution
    for k in actual_per_school_type_dict:
        # number of schools observed at each school size
        actual_scount = dict(Counter(actual_per_school_type_dict[k].values()))
        actual_school_size_by_brackets = sp.norm_dic(
            utilities.get_bucket_count(sb_index, sb, actual_scount))
        actual_values = np.array(list(actual_school_size_by_brackets.values()))

        utilities.plot_array(expected_values,
                             actual_values,
                             names=sb.keys(),
                             datadir=figdir,
                             testprefix="school size " + test_prefix + " " + k,
                             do_close=do_close)
        utilities.plot_array(
            actual_per_school_type_dict[k].values(),
            datadir=figdir,
            expect_label=f"school count: total {len(actual_per_school_type_dict[k])}",
            testprefix="school size total\n" + test_prefix + " " + k,
            binned=False,
            do_close=do_close)
        # NOTE(review): the label below reports the school count from the
        # all-members dict, not the student-only one - confirm this is intended.
        utilities.plot_array(
            actual_per_school_type_dict_student_only[k].values(),
            datadir=figdir,
            expect_label=f"school count: total {len(actual_per_school_type_dict[k])}",
            testprefix="school size total (student only)\n" + test_prefix + " " + k,
            binned=False,
            do_close=do_close)

        # statistic_test is not working yet because school sizes are now
        # available by school type. Also depends strongly on population size.
        if not skip_stat_check:
            utilities.statistic_test(expected_values,
                                     actual_values,
                                     test="x",
                                     comments="school size check")

        # check average school size
        actual_average_school_size = sum(
            size * count
            for size, count in actual_scount.items()) / sum(actual_scount.values())
        utilities.check_error_percentage(n,
                                         expected_average_school_size,
                                         actual_average_school_size,
                                         name=f"average school size:'{k}'")

    # check school count distribution
    utilities.plot_array(
        [len(schools) for schools in actual_per_school_type_dict.values()],
        names=list(actual_per_school_type_dict.keys()),
        datadir=figdir,
        expect_label="school count",
        testprefix="school count " + test_prefix,
        value_text=True)
def check_work_size_distribution(pop,
                                 n,
                                 datadir,
                                 figdir,
                                 location=None,
                                 state_location=None,
                                 country_location=None,
                                 file_path=None,
                                 use_default=False,
                                 test_prefix="",
                                 skip_stat_check=False,
                                 do_close=True):
    """
    Compare the population's workplace size distribution with the reference data.

    Args:
        pop               : population dictionary
        n                 : population size (not read by this check)
        datadir           : root data directory holding the reference data
        figdir            : base directory for figures; a "work_size"
                            subdirectory of it is handed to the plot helper
        location          : name of the location
        state_location    : name of the state the location is in
        country_location  : name of the country the location is in
        file_path         : file path forwarded to the reference data readers
        use_default       : forwarded to the reference data readers (per the
                            original notes: fall back to default data when
                            location-specific data is not available)
        test_prefix       : text appended to the label given to the plot
        skip_stat_check   : skip the statistics check for the distribution
        do_close          : close the image immediately if set to True

    Returns:
        None.
        The plot is produced through utilities.plot_array with the
        "work_size" subdirectory of figdir as its target directory.
    """
    figdir = os.path.join(figdir, "work_size")

    # Both reference readers take the same set of arguments.
    reader_kwargs = dict(datadir=datadir,
                         location=location,
                         state_location=state_location,
                         country_location=country_location,
                         file_path=file_path,
                         use_default=use_default)
    wb = sp.get_workplace_size_brackets(**reader_kwargs)
    ws = sp.norm_dic(sp.get_workplace_size_distr_by_brackets(**reader_kwargs))
    ws_index = sp.get_index_by_brackets_dic(wb)
    largest_known_size = max(ws_index.keys())

    workplace_sizes, _ = utilities.get_ids_count_by_param(pop, "wpid")

    # Tally workplaces per bracket; sizes above the largest size known to the
    # lookup are counted in that largest size's bracket.
    workplaces_per_bracket = {}
    for size in workplace_sizes.values():
        bracket = ws_index[min(size, largest_known_size)]
        workplaces_per_bracket[bracket] = workplaces_per_bracket.get(bracket, 0) + 1

    # One slot per reference bracket position, 0 where nothing was observed,
    # then normalised so the slots sum to 1.
    bracket_counts = np.array(
        [workplaces_per_bracket.get(position, 0) for position in range(len(ws))],
        dtype=float)
    actual_values = bracket_counts / np.nansum(bracket_counts)
    expected_values = np.array(list(ws.values()))

    # Label each bracket by its first and last member, e.g. "1-4".
    xlabels = []
    for bracket in sorted(wb.keys()):
        members = wb[bracket]
        xlabels.append(str(members[0]) + '-' + str(members[-1]))

    utilities.plot_array(expected_values,
                         actual_values,
                         names=xlabels,
                         datadir=figdir,
                         testprefix="work size distribution " + test_prefix,
                         do_close=do_close,
                         xlabel_rotation=50)

    if not skip_stat_check:
        utilities.statistic_test(expected_values,
                                 actual_values,
                                 test="x",
                                 comments="work size distribution check")
def check_household_distribution(pop,
                                 n,
                                 datadir,
                                 figdir,
                                 location=None,
                                 state_location=None,
                                 country_location=None,
                                 file_path=None,
                                 use_default=False,
                                 test_prefix="",
                                 skip_stat_check=False,
                                 do_close=True):
    """
    Check the household size distribution against the reference data

    Args:
        pop               : population dictionary
        n                 : population size
        datadir           : root data directory where the reference data resides
        figdir            : base directory for figures; a "household"
                            subdirectory of it is handed to the plot helper
        location          : name of the location
        state_location    : name of the state the location is in
        country_location  : name of the country the location is in
        file_path         : file path forwarded to the household size reader
        use_default       : forwarded to the household size reader (per the
                            original notes: fall back to default data when
                            location-specific data is not available)
        test_prefix       : text appended to the label given to the plot
        skip_stat_check   : skip the statistics check for distribution
        do_close          : close the image immediately if set to True

    Returns:
        None.
        The plot is produced through utilities.plot_array with the
        "household" subdirectory of figdir as its target directory.

    Raises:
        AssertionError: if any entry of the population has no household id.
    """
    figdir = os.path.join(figdir, "household")
    hs = sp.get_household_size_distr(datadir=datadir,
                                     location=location,
                                     state_location=state_location,
                                     country_location=country_location,
                                     file_path=file_path,
                                     use_default=use_default)

    actual_households, actual_households_none = utilities.get_ids_count_by_param(
        pop, "hhid")
    assert actual_households_none == {}, "all entries must have household ids"

    # number of households observed at each household size, ordered by size
    actual_household_count = dict(Counter(actual_households.values()))
    sorted_actual_household_count = {
        size: actual_household_count[size]
        for size in sorted(actual_household_count)
    }
    actual_values = np.array(
        list(sp.norm_dic(sorted_actual_household_count).values()))
    expected_values = np.array(list(hs.values()))

    utilities.plot_array(expected_values,
                         actual_values,
                         names=[x for x in list(hs.keys())],
                         datadir=figdir,
                         testprefix="household count percentage " + test_prefix,
                         do_close=do_close,
                         value_text=True)
    if not skip_stat_check:
        utilities.statistic_test(expected_values,
                                 actual_values,
                                 test="x",
                                 comments="household count percentage check")

    # check average household size
    # The average is the size-weighted sum of the shares. The previous
    # expression looped over the shares themselves and called np.where on each
    # scalar, so it never paired a share with its household size.
    # Reference sizes are taken as 1..N by position, which is the convention
    # the original "i + 1" expressed - TODO confirm hs is keyed 1..N in order.
    expected_sizes = np.arange(1, len(expected_values) + 1)
    expected_average_household_size = round(
        float(np.dot(expected_sizes, expected_values)), 3)
    # Observed sizes come from the counted households, so a size with no
    # households does not shift the weights of the larger sizes.
    # Assumes sp.norm_dic keeps the key order of its input, as the plotting
    # above already relies on.
    actual_sizes = np.array(list(sorted_actual_household_count), dtype=float)
    actual_average_household_size = round(
        float(np.dot(actual_sizes, actual_values)), 3)

    print(
        f"expected average household size: {expected_average_household_size}\n"
        f"actual average household size: {actual_average_household_size}")
    utilities.check_error_percentage(n,
                                     expected_average_household_size,
                                     actual_average_household_size,
                                     name="average household size")