Example #1
def test_employment_age_distribution(do_show, do_save, create_sample_pop_e2e,
                                     get_fig_dir_by_module):
    sp.logger.info(
        "Test employment age distribution vs the employment_rates_by_age.dat")

    plotting_kwargs = sc.objdict(do_show=do_show,
                                 do_save=do_save,
                                 figdir=get_fig_dir_by_module)
    actual_employment_age_count = create_sample_pop_e2e.count_employment_by_age()
    total_employee = sum(actual_employment_age_count.values())
    expected_employment_age_dist = sp.norm_dic(
        sp.get_employment_rates(**create_sample_pop_e2e.loc_pars))

    expected_employment_age_count = {
        i: round(expected_employment_age_dist[i] * total_employee)
        for i in expected_employment_age_dist
    }

    # generate list of ages based on the actual count
    generated_actual = sum([[i] * actual_employment_age_count[i]
                            for i in actual_employment_age_count], [])
    generated_expected = sum([[i] * expected_employment_age_count[i]
                              for i in expected_employment_age_count], [])
    # run statistical tests for employment by age distribution
    # TODO: Need to refine the data for fair comparison
    sp.statistic_test(expected=generated_expected,
                      actual=generated_actual,
                      test=st.kstest,
                      verbose=True)
    # plot employment rates by age
    create_sample_pop_e2e.plot_employment_rates_by_age(**plotting_kwargs)
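
The test above expands per-age counts into flat lists of ages before handing them to the KS test. A minimal, self-contained sketch of that expansion on hypothetical toy counts (independent of the synthpops fixtures and of sp.statistic_test):

import scipy.stats as st

def expand_counts(count_by_age):
    # repeat each age as many times as it was counted
    return [age for age, count in count_by_age.items() for _ in range(count)]

expected_counts = {20: 28, 30: 47, 40: 25}  # hypothetical expected counts per age
actual_counts = {20: 30, 30: 45, 40: 25}    # hypothetical generated counts per age

# two-sample KS test on the expanded samples (ties make this approximate for discrete ages)
statistic, pvalue = st.kstest(expand_counts(expected_counts), expand_counts(actual_counts))
print(f"KS statistic={statistic:.3f}, p-value={pvalue:.3f}")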
Example #2
def test_work_size_distribution(do_show, do_save, create_sample_pop_e2e,
                                get_fig_dir_by_module):
    sp.logger.info(
        "Test workplace size distribution vs the work_size_count.dat")

    plotting_kwargs = sc.objdict(do_show=do_show,
                                 do_save=do_save,
                                 figdir=get_fig_dir_by_module)

    workplace_brackets_index = sp.get_index_by_brackets(
        sp.get_workplace_size_brackets(**create_sample_pop_e2e.loc_pars))

    actual_workplace_sizes = create_sample_pop_e2e.count_workplace_sizes()
    # count the workplaces by size bracket
    actual_count = {k: 0 for k in set(workplace_brackets_index.values())}
    for i in workplace_brackets_index:
        actual_count[workplace_brackets_index[i]] += actual_workplace_sizes.get(i, 0)

    expected_distr = sp.norm_dic(
        sp.get_workplace_size_distr_by_brackets(
            **create_sample_pop_e2e.loc_pars))

    # calculate expected count by using actual number of workplaces
    expected_count = {
        k: expected_distr[k] * sum(actual_count.values())
        for k in expected_distr
    }
    # perform statistical check
    sp.statistic_test([expected_count[i] for i in sorted(expected_count)],
                      [actual_count[i] for i in sorted(actual_count)])

    create_sample_pop_e2e.plot_workplace_sizes(**plotting_kwargs)
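
The bracket aggregation above maps each workplace size to a bracket index and sums counts per bracket; the map returned by sp.get_index_by_brackets serves as that size-to-bracket lookup. A small sketch with hypothetical brackets and counts:

# hypothetical workplace size brackets: bracket index -> list of sizes it covers
workplace_size_brackets = {0: [1, 2, 3, 4], 1: [5, 6, 7, 8, 9]}

# size -> bracket index lookup, analogous to what the test gets from sp.get_index_by_brackets
size_to_bracket = {size: b for b, sizes in workplace_size_brackets.items() for size in sizes}

observed_sizes = {2: 10, 5: 4, 7: 1}  # hypothetical counts of workplaces by exact size

bracket_counts = {b: 0 for b in workplace_size_brackets}
for size, count in observed_sizes.items():
    bracket_counts[size_to_bracket[size]] += count

print(bracket_counts)  # {0: 10, 1: 5}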
Example #3
def test_household_distribution(create_sample_pop_e2e):
    actual_households_count = Counter([len(i['member_uids']) for i in create_sample_pop_e2e.households])
    actual_households_size = [actual_households_count[i] for i in sorted(actual_households_count)]
    expected_households_dist = sp.get_household_size_distr(**create_sample_pop_e2e.loc_pars)
    expected_households_size = [expected_households_dist[i] * create_sample_pop_e2e.n_households for i in sorted(expected_households_dist)]

    # use a paired t-test instead of the default chi-squared test
    test = scipy.stats.ttest_rel
    sp.statistic_test(expected_households_size, actual_households_size, test)
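
scipy.stats.ttest_rel performs a paired t-test, so the two count vectors must be the same length and aligned by household size. A toy run outside the fixtures:

import scipy.stats

expected_sizes = [120, 300, 260, 180, 90]  # hypothetical expected counts per household size
actual_sizes = [118, 305, 255, 185, 87]    # hypothetical generated counts, same size order

statistic, pvalue = scipy.stats.ttest_rel(expected_sizes, actual_sizes)
print(f"t={statistic:.3f}, p={pvalue:.3f}")  # a large p-value means no detectable difference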
Example #4
def test_statistic_test():
    sp.logger.info(
        "Test the sp.statistic_test method. It runs a specified scipy statistical test on expected and actual data to check whether they are likely to come from the same distribution; by default the chi-squared test is used."
    )
    low, high, size = 0, 10, 500
    mu, sigma = 5, 3
    bins = range(low, high, 1)

    # generate data from the truncated normal distribution
    expected = scipy.stats.truncnorm.rvs((low - mu) / sigma,
                                         (high - mu) / sigma,
                                         loc=mu,
                                         scale=sigma,
                                         size=size)
    actual_good = scipy.stats.truncnorm.rvs((low - mu) / sigma,
                                            (high - mu) / sigma,
                                            loc=mu,
                                            scale=sigma,
                                            size=size)

    # generate data uniformly from low+2 to high-2 --- this should not match
    actual_bad = np.random.randint(low=low + 2, high=high - 2, size=size)

    # default test is chisquare
    sp.statistic_test(
        np.histogram(expected, bins)[0],
        np.histogram(actual_good, bins)[0])  # should pass
    with pytest.warns(UserWarning):
        sp.statistic_test(
            np.histogram(expected, bins)[0],
            np.histogram(actual_bad, bins)[0])  # should fail

    # use t-test to compare instead
    test = scipy.stats.ttest_rel
    sp.statistic_test(expected, actual_good, test)  # should pass

    with pytest.warns(UserWarning):
        sp.statistic_test(expected, actual_bad, test)  # should fail
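
The behaviour exercised above (chi-squared by default, an alternative scipy test on request, and a UserWarning when the distributions disagree) can be pictured with a minimal analogue. simple_statistic_test below is a hypothetical sketch, not the actual synthpops implementation:

import warnings
import numpy as np
import scipy.stats

def simple_statistic_test(expected, actual, test=scipy.stats.chisquare, alpha=0.05, verbose=False):
    # chisquare takes observed frequencies first, expected second
    result = test(np.asarray(actual), np.asarray(expected))
    if verbose:
        print(result)
    if result.pvalue < alpha:
        warnings.warn(f"distributions differ (p={result.pvalue:.4f} < alpha={alpha})", UserWarning)
    return result

counts_expected = [10, 20, 30, 20, 10]
counts_actual = [12, 18, 31, 19, 10]  # sums match, as chisquare expects
simple_statistic_test(counts_expected, counts_actual)                              # chi-squared
simple_statistic_test(counts_expected, counts_actual, test=scipy.stats.ttest_rel)  # paired t-test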
Example #5
def test_workplace_contact_distribution_2(create_sample_pop_e2e):
    sp.logger.info(
        "Not a test --- exploratory: workplaces that don't match are quite close to the expected results"
    )
    pop = create_sample_pop_e2e
    max_contacts = pop.max_contacts
    max_w_size = int(max_contacts['W'] // 2)
    wsize_brackets = sp.get_workplace_size_brackets(**pop.loc_pars)
    wsize_index = sp.get_index_by_brackets(wsize_brackets)
    contacts, contacts_by_id = cn.get_contact_counts_by_layer(
        pop.popdict, layer="w", with_layer_ids=True)

    wpids = sorted(contacts_by_id.keys())

    max_size_full_connected = 0

    runs = 0
    passed = 0
    failedsize = []
    allsize = []
    for nw, wpid in enumerate(wpids):
        wnc = set(contacts_by_id[wpid])
        wsize = len(contacts_by_id[wpid])
        allsize.append(wsize_index[wsize])

        if len(wnc) == 1:
            # everyone in the workplace reports the same number of contacts, so the
            # workplace is fully connected: that contact count plus one equals the size
            assert list(wnc)[0] + 1 == wsize, 'Check failed: workplace is not fully connected'
            if max_size_full_connected < wsize:
                max_size_full_connected = wsize

        else:
            print(
                f"workplace id {wpid}, distinct contact counts {wnc}, size {wsize}, max_w_size {max_w_size}"
            )
            N = wsize

            p = (max_contacts['W'] - 1) / N
            # degree distribution for an ER random graph follows a binomial distribution that is truncated
            # to the max size N. When N is large this approximates the poisson distribution. Perhaps our
            # test could look at the zero-N truncated binomial distribution
            # G = nx.erdos_renyi_graph(N, p, seed=0)
            G = nx.fast_gnp_random_graph(N, p, seed=0)
            degree = [G.degree(i) for i in G.nodes()]

            # sp.statistic_test(degree, contacts_by_id[wpid], verbose=True)
            # sp.check_truncated_poisson(contacts_by_id[wpid], mu=max_contacts['W'] - 2, lowerbound=max_contacts['W'] // 2, upperbound=wsize - 1)
            runs += 1
            result = sp.check_truncated_poisson(contacts_by_id[wpid],
                                                mu=max_contacts['W'] - 2,
                                                lowerbound=max_contacts['W'] // 2,
                                                upperbound=wsize - 1,
                                                skipcheck=0,
                                                do_show=0)
            passed += int(result)
            if not result:
                failedsize.append(wsize_index[wsize])
                sp.statistic_test(degree, contacts_by_id[wpid], verbose=True)
            print('workplace id', wpid)
            print('\n\n')
    print(
        f'workplaces checked: {runs}, passing checks: {passed}, pass rate: {passed / runs * 100:.2f}%'
    )
    print("size brackets:\tcount")
    failed_counts = {
        i: dict(Counter(failedsize))[i]
        for i in sorted(dict(Counter(failedsize)).keys())
    }
    all_counts = {
        i: dict(Counter(allsize))[i]
        for i in sorted(dict(Counter(allsize)).keys())
    }
    for k, v in failed_counts.items():
        print(
            f"{min(wsize_brackets[k])}-{max(wsize_brackets[k])}:\t{v}, {v/all_counts[k] * 100:.2f}"
        )
    print('max_size_full_connected', max_size_full_connected)
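
The commented-out comparison above relies on the degree distribution of an Erdős–Rényi graph G(N, p) being Binomial(N - 1, p), which is approximately Poisson(N * p) for large N. A self-contained sketch with hypothetical sizes, independent of the population object:

import networkx as nx
import scipy.stats

N, mean_degree = 200, 8  # hypothetical workplace size and target mean degree
G = nx.fast_gnp_random_graph(N, mean_degree / N, seed=0)
degrees = [d for _, d in G.degree()]

# two-sample KS test between the graph's degrees and Poisson draws with the same mean
poisson_draws = scipy.stats.poisson.rvs(mean_degree, size=N, random_state=0)
print(scipy.stats.kstest(degrees, poisson_draws))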