Exemple #1
0
def test_output(df):
    """Check the purchase dataframe's columns, size, and value ranges.

    Returns a log string naming the checks that ran; raises AssertionError
    on the first failed check.
    """
    log = "test_output\n"

    assert unique(df.columns)

    # Tolerance must be much looser when subsample == 10; see note below.
    assert near(len(df),
                num_purchases_surviving / com.subsample,
                tol_frac=(0.6 if com.subsample == 10 else 1 / 20))
    # TODO | BUG? Why is the previous conditional necessary? That is, why,
    # in the special case of subsample = 1/10, is the size of the
    # purchase data so different from what you'd expect.
    # This isn't necessarily wrong, since the data is subsampled by households,
    # and households can make different numbers of purchases.
    # That's why `tol_frac` needs to be substantial in both cases.
    # But it's surprising, because for subsample = 10,
    # the reality is much less than the expectation.

    # Per-column range checks.
    column_specs = {
        "where-got": cla.InRange(1, 26),
        "weight": cla.InRange(0, 1e4),
        "value": cla.InRange(0, 1e9),
        "quantity": cla.InRange(0, 1e8),
        "is-purchase": cla.InRange(0, 1),
        "household": cla.InRange(1, 1e7),
        "per month": cla.InRange(1, 11),
        "coicop": cla.InRange(1, 1e8),
        "25-broad-categs": cla.InRange(1, 25),
    }
    for col_name, checker in column_specs.items():
        log += f"  {col_name}\n"
        assert checker.test(df[col_name])

    def row_fraction(mask):
        # Share of rows for which `mask` holds.
        return len(df[mask]) / len(df)

    log += "Specs cover all column names."
    assert set(df.columns) == set(column_specs.keys())

    log += "Very few missing quantity values."
    assert row_fraction(pd.isnull(df["quantity"])) < 1e-5

    log += "Very few negative quantity values."
    assert row_fraction(df["quantity"] <= 0) < 1e-5

    log += "Negative quantity purchases are for very little money."
    assert (df[df["quantity"] < 0]["value"] < 1e4).all()

    log += 'Very few purchases with a frequency of "never".'
    assert row_fraction(df["per month"] > 10) < 1e-5

    log += 'Those few frequency="never" purchases are for very little money.'
    assert (df[df["per month"] > 10]["value"] < 1e4).all()

    return log
Exemple #2
0
def test_mk_salud_employer():
    """Exercise sf.mk_salud_employer for both worker types."""
    # Contractors never owe the employer health contribution.
    for salary in (0, 1000 * min_wage):
        assert near(sf.mk_salud_employer(contractor, salary), 0)

    # Employees: zero below a threshold; above it, 8.5% of a base that
    # appears to be 70% of salary, capped at 25 minimum wages.
    employee_cases = [
        (0.5 * min_wage, 0),
        (9 * min_wage, 0),
        (20 * min_wage, 0.085 * 0.7 * 20 * min_wage),
        (100 * min_wage, 0.085 * 25 * min_wage),  # capped
    ]
    for salary, expected in employee_cases:
        assert near(sf.mk_salud_employer(employee, salary), expected)
Exemple #3
0
def test_mk_pension_employer():
    """Exercise sf.mk_pension_employer for both worker types."""
    # Contractors never owe the employer pension contribution.
    for salary in (0, 1000 * min_wage):
        assert near(sf.mk_pension_employer(contractor, salary), 0)

    # Employees: 12% of the contribution base, which appears to shrink to
    # 70% of salary at high incomes and cap at 25 minimum wages.
    employee_cases = [
        (0.5 * min_wage, 0),
        (5 * min_wage, 0.12 * 5 * min_wage),
        (20 * min_wage, 0.12 * 0.7 * 20 * min_wage),
        (100 * min_wage, 0.12 * 25 * min_wage),  # capped
    ]
    for salary, expected in employee_cases:
        assert near(sf.mk_pension_employer(employee, salary), expected)
def test_output(df):
    """Check the purchase dataframe's size, columns, and missingness.

    Returns the accumulated log string; raises AssertionError on the
    first failed check.
    """
    log = "test_output()\n"

    assert unique(df.columns)

    assert near(len(df),
                num_purchases_surviving / com.subsample,
                tol_frac=(1 / 10 if not com.subsample == 10 else 0.6))
    # TODO | BUG? Why is the previous conditional necessary? That is, why,
    # in the special case of subsample = 1/10, is the size of the
    # purchase data so different from what you'd expect.
    # This isn't necessarily wrong, since the data is subsampled by households,
    # and households can make different numbers of purchases.
    # That's why `tol_frac` needs to be substantial in both cases.
    # But it's surprising, because for subsample = 10,
    # the reality is much less than the expectation.

    assert (set(df.columns) == set(Purchase_2_Columns_missing.all_columns()))

    # coicop and 25-broad-categs are each individually missing substantially,
    # but exactly one of them is always present.
    # Never both present:
    assert len(df[(~pd.isnull(df["coicop"]))
                  & (~pd.isnull(df["25-broad-categs"]))]) == 0
    assert len(df[(pd.isnull(df["coicop"])) |
                  (pd.isnull(df["25-broad-categs"]))]) == len(df)
    # Never both missing. (BUG FIX: the two assertions above are logically
    # equivalent by De Morgan -- both only establish "never both present" --
    # so this half of the documented invariant was previously untested.)
    assert len(df[(pd.isnull(df["coicop"]))
                  & (pd.isnull(df["25-broad-categs"]))]) == 0

    # These columns must have no missing values at all.
    for c in Purchase_2_Columns_missing.never:
        assert (len(df[pd.isnull(df[c])]) == 0)

    # These may be missing in at most 3% of rows.
    for c in Purchase_2_Columns_missing.slightly:
        assert ((len(df[pd.isnull(df[c])]) / len(df)) < 0.03)

    # These may be missing in at most 25% of rows.
    for c in Purchase_2_Columns_missing.very:
        assert ((len(df[pd.isnull(df[c])]) / len(df)) < 0.25)

    return log
Exemple #5
0
# Validation script for the `out` dataframe built earlier; `out`, `cols1`,
# and `in_rows` are defined above this excerpt.
cols2 = set(out.columns)
new_cols = {
    "estrato", 'recently bought this house', "region-1", "region-2",
    "age-decile", "income-decile", "IT", "IC", "ICM", "ICMD", "GT", "GC",
    "GCM", "female head"
}

# Column names are unique, both in the output and in the new-column set.
assert util.unique(out.columns)
assert util.unique(new_cols)

# The new columns are exactly those added on top of `cols1`.
assert set.intersection(cols1, new_cols) == set()
assert set.union(cols1, new_cols) == cols2
assert set.difference(cols2, cols1) == new_cols

# Row count is preserved and roughly matches the subsampled population.
assert len(in_rows) == len(out)
assert util.near(len(out), misc.num_people / com.subsample, tol_frac=1 / 5)

# Every cell of these columns must fall inside the given range.
per_cell_spec = {
    "age-decile": cl.InRange(0, 9),
    "income-decile": cl.InRange(0, 9),
    "female head": cl.InRange(0, 1)
}

# Each of these columns, taken as a whole, should cover the given range.
# NOTE(review): `per_column_spec` is never exercised within this excerpt --
# either it is used past the visible end, or the CoversRange loop was
# accidentally dropped. TODO confirm.
per_column_spec = {
    "age-decile": cl.CoversRange(0, 9),
    "income-decile": cl.CoversRange(0, 9),
    "female head": cl.CoversRange(0, 1)
}

for k, v in per_cell_spec.items():
    assert v.test(out[k])
Exemple #6
0
def test_mk_solidaridad():
    """Exercise sf.mk_solidaridad across its rate brackets.

    The rate steps up from 1% to 2% with salary; the contribution base
    appears to be 40% of salary for contractors and 70% for employees,
    capped at 25 minimum wages.
    """
    contractor_cases = [
        (0, 0),
        (3 * min_wage, 0),
        (5 * min_wage, 0.01 * 0.4 * 5 * min_wage),
        (15 * min_wage, 0.01 * 0.4 * 15 * min_wage),
        (16.5 * min_wage, 0.012 * 0.4 * 16.5 * min_wage),
        (17.5 * min_wage, 0.014 * 0.4 * 17.5 * min_wage),
        (18.5 * min_wage, 0.016 * 0.4 * 18.5 * min_wage),
        (19.5 * min_wage, 0.018 * 0.4 * 19.5 * min_wage),
        (21 * min_wage, 0.02 * 0.4 * 21 * min_wage),
        (100 * min_wage, 0.02 * 25 * min_wage),  # capped
    ]
    for income, expected in contractor_cases:
        assert near(sf.mk_solidaridad(contractor, income), expected)

    employee_cases = [
        (0, 0),
        (3 * min_wage, 0),
        # NOTE(review): unlike every other employee case, this expectation
        # omits the 0.7 base factor -- confirm whether that is intentional.
        (5 * min_wage, 0.01 * 5 * min_wage),
        (13.1 * min_wage, 0.01 * 0.7 * 13.1 * min_wage),
        (16.5 * min_wage, 0.012 * 0.7 * 16.5 * min_wage),
        (17.5 * min_wage, 0.014 * 0.7 * 17.5 * min_wage),
        (18.5 * min_wage, 0.016 * 0.7 * 18.5 * min_wage),
        (19.5 * min_wage, 0.018 * 0.7 * 19.5 * min_wage),
        (21 * min_wage, 0.02 * 0.7 * 21 * min_wage),
        (100 * min_wage, 0.02 * 25 * min_wage),  # capped
    ]
    for income, expected in employee_cases:
        assert near(sf.mk_solidaridad(employee, income), expected)
Exemple #7
0
     # NOTE(review): this excerpt begins mid-expression -- the enclosing
     # `for c, ts in [ ... ]:` header lies above the visible chunk, so the
     # code below is left byte-identical and only annotated.
     ( "value, non-purchase",
       [ cla.MeanBounds    (1e6,1e7),
         cla.CoversRange   (0 ,1e6),
         cla.InRange       (0 ,3.3e9),
         cla.MissingAtMost (0) ] ),

    ( "value, purchase",
       [ cla.MeanBounds    (1e6 ,5e6),
         cla.CoversRange   (1e2 ,4e7), # TODO ? This minimum is nuts.
         cla.InRange       (0   ,2e8),
         cla.MissingAtMost (0) ] ),

    ( "value, spending",
       [ cla.MeanBounds    (1e6 ,5e6),
         cla.CoversRange   (1e2 ,4e7), # TODO ? This minimum is nuts.
         cla.InRange       (0   ,2e8),
         cla.MissingAtMost(0) ] ) ]:
      # Apply every check in the list to the named column of `sums`.
      for t in ts:
          assert t.test( sums[c] )

# Each household appears at most once in the sums table.
assert sums["household"].is_unique

# Row count roughly matches the expected number of households.
assert util.near( len(sums),
                  num_households / com.subsample,
                  tol_frac = 1/5 )

# Record that this build step's tests passed.
oio.test_write( com.subsample,
                "build_purchase_sums",
                "It worked." )
Exemple #8
0
    # NOTE(review): these indented imports belong to an enclosing scope that
    # is cut off above this excerpt -- left unchanged.
    import python.build.purchases.articulos as articulos
    import python.build.purchases.capitulo_c as capitulo_c

# Concatenate the purchase files (articulos + capitulo_c + nice_purchases),
# subsampled via com.subsample.
purchases = com.collect_files(
    (
        articulos.files
        # + medios.files
        # The tax only applies if the purchase is more than 880 million pesos,
        # and the data only records purchases of a second home.
        + capitulo_c.files + nice_purchases.files),
    subsample=com.subsample)

# Sanity-check the row count against the expected number of purchases.
assert util.near(
    # PITFALL: This differs from the usual idiom which separates testing
    # from production. That's because the only thing tested here is
    # the number of rows; reading the entire data set into memory again
    # for such a simple test seems unworth the added execution time.
    len(purchases),
    misc.num_purchases / com.subsample,
    tol_frac=(1 / 20 if not com.subsample == 10 else 1 / 2))
# TODO | BUG? Why is the previous conditional necessary? That is, why,
# in the special case of subsample = 1/10, is the size of the
# purchase data so different from what you'd expect.
# This isn't necessarily wrong, since the data is subsampled by households,
# and households can make different numbers of purchases.
# That's why `tol_frac` needs to be substantial in both cases.
# But it's surprising, because for subsample = 10,
# the reality is much less than the expectation.

# Persist the combined purchase table as stage 'purchases_0'.
oio.saveStage(com.subsample, purchases, 'purchases_0')
Exemple #9
0

# TODO : extend to all the old variables
def test_means(ppl: pd.DataFrame) -> None:
    for (col, theMin, theMax) in [
        ("used savings", 0.005, 0.05),
        ("empleado", 0.20, 0.6),
        ("desempleado", 0.03, 0.12),
        ("in labor force", 0.25, 0.6),
    ]:
        x = ppl[col].mean()
        assert (x >= theMin) & (x <= theMax)


# Entry point: run the unit tests, then the integration tests against the
# subsampled 'people_1' stage, and record the result.
if True:  # run tests
    log = "starting\n"

    # unit tests
    test_count_num_matches_in_space_separated_list()

    # integration tests
    ppl = oio.readStage(com.subsample, 'people_1')
    # Presumably decodes the categorical education column using the labels
    # in files.edu_key -- see util.interpretCategorical. TODO confirm.
    ppl["edu"] = util.interpretCategorical(ppl["edu"], files.edu_key.values())
    test_ranges(ppl)
    test_upper_bound_on_fraction_missing(ppl)
    test_means(ppl)

    # Row count should roughly match the subsampled population size.
    assert util.near(len(ppl), num_people / com.subsample, tol_frac=1 / 5)

    # Record that the people_main tests passed.
    oio.test_write(com.subsample, "people_main", log)