def test_employment_age_distribution(do_show, do_save, create_sample_pop_e2e, get_fig_dir_by_module):
    """
    Test the generated employment age distribution against the expected rates
    from employment_rates_by_age.dat using a Kolmogorov-Smirnov test.

    Args:
        do_show (bool)              : pytest fixture; show figures if True
        do_save (bool)              : pytest fixture; save figures if True
        create_sample_pop_e2e (pop) : pytest fixture; shared sample population
        get_fig_dir_by_module (str) : pytest fixture; directory to save figures
    """
    sp.logger.info("Test employment age distribution vs the employment_rates_by_age.dat")
    plotting_kwargs = sc.objdict(do_show=do_show, do_save=do_save,
                                 figdir=get_fig_dir_by_module)
    actual_employment_age_count = create_sample_pop_e2e.count_employment_by_age()
    total_employee = sum(actual_employment_age_count.values())
    expected_employment_age_dist = sp.norm_dic(
        sp.get_employment_rates(**create_sample_pop_e2e.loc_pars))
    # scale the normalized expected distribution to the actual total employed
    expected_employment_age_count = {
        i: round(expected_employment_age_dist[i] * total_employee)
        for i in expected_employment_age_dist
    }

    # generate list of ages based on the actual count
    # (flat comprehensions replace the quadratic sum(list_of_lists, []) idiom)
    generated_actual = [age for age, count in actual_employment_age_count.items()
                        for _ in range(count)]
    generated_expected = [age for age, count in expected_employment_age_count.items()
                          for _ in range(count)]

    # run statistical tests for employment by age distribution
    # TODO: Need to refine the data for fair comparison
    sp.statistic_test(expected=generated_expected, actual=generated_actual,
                      test=st.kstest, verbose=True)

    # plot employment rates by age
    create_sample_pop_e2e.plot_employment_rates_by_age(**plotting_kwargs)
def test_work_size_distribution(do_show, do_save, create_sample_pop_e2e, get_fig_dir_by_module):
    """
    Test the generated workplace size distribution against the expected
    distribution from work_size_count.dat, bracketed by workplace size.

    Args:
        do_show (bool)              : pytest fixture; show figures if True
        do_save (bool)              : pytest fixture; save figures if True
        create_sample_pop_e2e (pop) : pytest fixture; shared sample population
        get_fig_dir_by_module (str) : pytest fixture; directory to save figures
    """
    sp.logger.info("Test workplace size distribution vs the work_size_count.dat")
    plotting_kwargs = sc.objdict(do_show=do_show, do_save=do_save,
                                 figdir=get_fig_dir_by_module)
    workplace_brackets_index = sp.get_index_by_brackets(
        sp.get_workplace_size_brackets(**create_sample_pop_e2e.loc_pars))
    actual_workplace_sizes = create_sample_pop_e2e.count_workplace_sizes()

    # aggregate the per-size workplace counts into their size brackets
    actual_count = {bracket: 0 for bracket in set(workplace_brackets_index.values())}
    for size, bracket in workplace_brackets_index.items():
        actual_count[bracket] += actual_workplace_sizes.get(size, 0)

    expected_distr = sp.norm_dic(
        sp.get_workplace_size_distr_by_brackets(**create_sample_pop_e2e.loc_pars))

    # calculate expected count by using actual number of workplaces
    total_workplaces = sum(actual_count.values())
    expected_count = {bracket: share * total_workplaces
                      for bracket, share in expected_distr.items()}

    # perform statistical check
    sp.statistic_test([expected_count[bracket] for bracket in sorted(expected_count)],
                      [actual_count[bracket] for bracket in sorted(actual_count)])

    create_sample_pop_e2e.plot_workplace_sizes(**plotting_kwargs)
def test_household_distribution(create_sample_pop_e2e):
    """
    Compare the generated household size counts with the expected household
    size distribution, using a paired t-test.

    Args:
        create_sample_pop_e2e (pop) : pytest fixture; shared sample population
    """
    pop = create_sample_pop_e2e

    # tally households by their number of members
    size_counter = Counter(len(hh['member_uids']) for hh in pop.households)
    actual_households_size = [size_counter[size] for size in sorted(size_counter)]

    # expected counts: distribution scaled by the total number of households
    expected_dist = sp.get_household_size_distr(**pop.loc_pars)
    expected_households_size = [expected_dist[size] * pop.n_households
                                for size in sorted(expected_dist)]

    # use t-test to compare instead
    sp.statistic_test(expected_households_size, actual_households_size,
                      scipy.stats.ttest_rel)
def test_statistic_test():
    """
    Exercise sp.statistic_test with samples that should and should not match:
    two draws from the same truncated normal distribution should pass, while a
    uniform draw should fail (raising a UserWarning).
    """
    sp.logger.info(
        "Test sp.statistic_test method. This performs specified scipy statistical tests on expected and actual data to see if they are likely to be from the same distribution. By default the test is the chi squared test."
    )
    low, high, size = 0, 10, 500
    mu, sigma = 5, 3
    bins = range(low, high, 1)

    # truncation bounds expressed in standard-deviation units
    lo_bound = (low - mu) / sigma
    hi_bound = (high - mu) / sigma

    # generate data from the truncated normal distribution
    expected = scipy.stats.truncnorm.rvs(lo_bound, hi_bound, loc=mu, scale=sigma, size=size)
    actual_good = scipy.stats.truncnorm.rvs(lo_bound, hi_bound, loc=mu, scale=sigma, size=size)

    # generate data uniformly from low+2 to high-2 --- this should not match
    actual_bad = np.random.randint(low=low + 2, high=high - 2, size=size)

    # default test is chisquare
    sp.statistic_test(np.histogram(expected, bins)[0],
                      np.histogram(actual_good, bins)[0])  # should pass
    with pytest.warns(UserWarning):
        sp.statistic_test(np.histogram(expected, bins)[0],
                          np.histogram(actual_bad, bins)[0])  # should fail

    # use t-test to compare instead
    paired_ttest = scipy.stats.ttest_rel
    sp.statistic_test(expected, actual_good, paired_ttest)  # should pass
    with pytest.warns(UserWarning):
        sp.statistic_test(expected, actual_bad, paired_ttest)  # should fail
def test_workplace_contact_distribution_2(create_sample_pop_e2e):
    """
    Exploratory (not a strict test): examine the per-workplace contact-count
    distribution. Fully connected workplaces (all members share one contact
    count) are asserted exact; larger workplaces are compared against a
    truncated Poisson expectation and against an Erdos-Renyi random graph's
    degree distribution, with pass rates summarized per size bracket.

    Args:
        create_sample_pop_e2e (pop) : pytest fixture; shared sample population
    """
    sp.logger.info(
        "Not a test - exploratory --- workplaces that don't match are quite close to expected results"
    )
    pop = create_sample_pop_e2e
    max_contacts = pop.max_contacts
    max_w_size = int(max_contacts['W'] // 2)
    wsize_brackets = sp.get_workplace_size_brackets(**pop.loc_pars)
    wsize_index = sp.get_index_by_brackets(wsize_brackets)
    contacts, contacts_by_id = cn.get_contact_counts_by_layer(
        pop.popdict, layer="w", with_layer_ids=True)
    wpids = sorted(contacts_by_id.keys())
    max_size_full_connected = 0  # largest workplace seen to be fully connected
    runs = 0                     # number of workplaces put through the Poisson check
    passed = 0                   # number of those that passed
    failedsize = []              # size-bracket index of each failing workplace
    allsize = []                 # size-bracket index of every workplace
    for nw, wpid in enumerate(wpids):
        wnc = set(contacts_by_id[wpid])
        wsize = len(contacts_by_id[wpid])
        allsize.append(wsize_index[wsize])
        if len(wnc) == 1:
            # a single distinct contact count implies a complete graph: each of
            # the wsize members has wsize - 1 contacts
            assert list(wnc)[0] + 1 == wsize, 'Check Failed'
            if max_size_full_connected < wsize:
                max_size_full_connected = wsize
        else:
            print(
                f"workplace id is {wpid}, no.contacts, {wnc}, size {wsize}, mu {max_w_size}"
            )
            N = wsize
            # edge probability chosen so expected degree matches max_contacts['W'] - 1
            p = (max_contacts['W'] - 1) / N
            # degree distribution for an ER random graph follows a binomial distribution that is truncated
            # to the max size N. When N is large this approximates the poisson distribution. Perhaps our
            # test could look at the zero-N truncated binomial distribution
            # G = nx.erdos_renyi_graph(N, p, seed=0)
            G = nx.fast_gnp_random_graph(N, p, seed=0)
            degree = [G.degree(i) for i in G.nodes()]
            # sp.statistic_test(degree, contacts_by_id[wpid], verbose=True)
            # sp.check_truncated_poisson(contacts_by_id[wpid], mu=max_contacts['W'] - 2, lowerbound=max_contacts['W'] // 2, upperbound=wsize - 1)
            runs += 1
            result = sp.check_truncated_poisson(contacts_by_id[wpid],
                                                mu=max_contacts['W'] - 2,
                                                lowerbound=max_contacts['W'] // 2,
                                                upperbound=wsize - 1,
                                                skipcheck=0,
                                                do_show=0)
            passed += int(result)
            if not result:
                # record the failure and show how far off the ER comparison is
                failedsize.append(wsize_index[wsize])
                sp.statistic_test(degree, contacts_by_id[wpid], verbose=True)
                print('workplace id', wpid)
    print('\n\n')
    print(
        f'total workplaces: {runs}, passing checks: {passed}, passed rate:{round(passed/runs,2) *100} %'
    )
    print("size brackets:\tcount")
    # per-bracket failure counts, keyed in sorted bracket order
    failed_counts = {
        i: dict(Counter(failedsize))[i]
        for i in sorted(dict(Counter(failedsize)).keys())
    }
    all_counts = {
        i: dict(Counter(allsize))[i]
        for i in sorted(dict(Counter(allsize)).keys())
    }
    for k, v in failed_counts.items():
        print(
            f"{min(wsize_brackets[k])}-{max(wsize_brackets[k])}:\t{v}, {v/all_counts[k] * 100:.2f}"
        )
    print('max_size_full_connected', max_size_full_connected)