def test_output(df):
    """Validate the purchase table `df`.

    Checks: column-name uniqueness, row count vs. the subsample-scaled
    expectation, per-column value ranges, exact column coverage, and a
    few data-quality bounds on quantity / frequency values.
    Returns a log string for the test harness.
    """
    log = "test_output\n"
    assert unique(df.columns)
    assert near(len(df),
                num_purchases_surviving / com.subsample,
                tol_frac=(1 / 20 if com.subsample != 10 else 0.6))
    # TODO | BUG? Why is the previous conditional necessary? That is, why,
    # in the special case of subsample = 1/10, is the size of the
    # purchase data so different from what you'd expect.
    # This isn't necessarily wrong, since the data is subsampled by households,
    # and households can make different numbers of purchases.
    # That's why `tol_frac` needs to be substantial in both cases.
    # But it's surprising, because for subsample = 10,
    # the reality is much less than the expectation.

    # Per-column allowed value ranges.
    spec = {"where-got":       cla.InRange(1, 26),
            "weight":          cla.InRange(0, 1e4),
            "value":           cla.InRange(0, 1e9),
            "quantity":        cla.InRange(0, 1e8),
            "is-purchase":     cla.InRange(0, 1),
            "household":       cla.InRange(1, 1e7),
            "per month":       cla.InRange(1, 11),
            "coicop":          cla.InRange(1, 1e8),
            "25-broad-categs": cla.InRange(1, 25)}
    for col, checker in spec.items():
        log += f" {col}\n"
        assert checker.test(df[col])

    # NOTE(review): the log entries below have no trailing "\n" (unlike the
    # ones above) — looks unintentional, but preserved as-is.
    log += "Specs cover all column names."
    assert set(df.columns) == set(spec.keys())

    n_rows = len(df)
    log += "Very few missing quantity values."
    assert len(df[pd.isnull(df["quantity"])]) / n_rows < 1e-5

    log += "Very few negative quantity values."
    # NOTE(review): this also counts zero quantities (`<= 0`), although the
    # message says "negative" — preserved as-is; confirm whether zero is
    # intended to count.
    assert len(df[df["quantity"] <= 0]) / n_rows < 1e-5

    log += "Negative quantity purchases are for very little money."
    assert (df[df["quantity"] < 0]["value"] < 1e4).all()

    log += "Very few purchases with a frequency of \"never\"."
    assert len(df[df["per month"] > 10]) / n_rows < 1e-5

    log += "Those few frequency=\"never\" purchases are for very little money."
    assert (df[df["per month"] > 10]["value"] < 1e4).all()
    return log
def test_mk_salud_employer():
    """Employer-side health (salud) contribution, table-driven."""
    # Contractors: the employer share is always zero, at any income.
    for income in (0, 1000 * min_wage):
        assert near(sf.mk_salud_employer(contractor, income), 0)
    # Employees: zero below the exoneration threshold, 8.5% on a 70%
    # base above it, capped at a 25-minimum-wage base.
    employee_cases = [
        (0.5 * min_wage, 0),
        (9 * min_wage, 0),
        (20 * min_wage, 0.085 * 0.7 * 20 * min_wage),
        (100 * min_wage, 0.085 * 25 * min_wage),
    ]
    for income, expected in employee_cases:
        assert near(sf.mk_salud_employer(employee, income), expected)
def test_mk_pension_employer():
    """Employer-side pension contribution, table-driven."""
    # Contractors: the employer share is always zero, at any income.
    for income in (0, 1000 * min_wage):
        assert near(sf.mk_pension_employer(contractor, income), 0)
    # Employees: zero below a threshold, 12% of the contribution base
    # above it (70% base at high incomes, capped at 25 minimum wages).
    employee_cases = [
        (0.5 * min_wage, 0),
        (5 * min_wage, 0.12 * 5 * min_wage),
        (20 * min_wage, 0.12 * 0.7 * 20 * min_wage),
        (100 * min_wage, 0.12 * 25 * min_wage),
    ]
    for income, expected in employee_cases:
        assert near(sf.mk_pension_employer(employee, income), expected)
def test_output(df):
    """Validate the stage-2 purchase table `df`.

    Checks: column-name uniqueness, row count vs. the subsample-scaled
    expectation, exact column coverage, the coicop / 25-broad-categs
    exclusivity invariant, and per-column missingness bounds.
    Returns a log string for the test harness.

    NOTE(review): another function named `test_output` appears earlier in
    this source; if both live in the same module, this one shadows it —
    confirm they belong to separate files.
    """
    log = "test_output()\n"
    assert unique(df.columns)
    assert near(len(df),
                num_purchases_surviving / com.subsample,
                tol_frac=(1 / 10 if not com.subsample == 10 else 0.6))
    # TODO | BUG? Why is the previous conditional necessary? That is, why,
    # in the special case of subsample = 1/10, is the size of the
    # purchase data so different from what you'd expect.
    # This isn't necessarily wrong, since the data is subsampled by households,
    # and households can make different numbers of purchases.
    # That's why `tol_frac` needs to be substantial in both cases.
    # But it's surprising, because for subsample = 10,
    # the reality is much less than the expectation.
    assert (set(df.columns)
            == set(Purchase_2_Columns_missing.all_columns()))
    # coicop and 25-broad-categs are each individually missing substantially,
    # but exactly one of them is always present.
    # Direction 1: no row has both present.
    assert len(df[(~pd.isnull(df["coicop"]))
                  & (~pd.isnull(df["25-broad-categs"]))]) == 0
    # Direction 2: no row is missing both.
    # FIX: the original asserted direction 1 twice, in two logically
    # equivalent forms (`no row has both` and `every row misses at least
    # one`), so the "at least one present" half of "exactly one" was never
    # actually verified. This assert completes the invariant.
    assert len(df[(pd.isnull(df["coicop"]))
                  & (pd.isnull(df["25-broad-categs"]))]) == 0
    # Missingness bounds, by column category.
    for c in Purchase_2_Columns_missing.never:
        assert len(df[pd.isnull(df[c])]) == 0
    for c in Purchase_2_Columns_missing.slightly:
        assert (len(df[pd.isnull(df[c])]) / len(df)) < 0.03
    for c in Purchase_2_Columns_missing.very:
        assert (len(df[pd.isnull(df[c])]) / len(df)) < 0.25
    return log
# Column-set bookkeeping: `cols1` (the input columns, bound earlier) plus
# `new_cols` should exactly make up the output's columns.
cols2 = set(out.columns)
new_cols = {
    "estrato",
    "recently bought this house",
    "region-1",
    "region-2",
    "age-decile",
    "income-decile",
    "IT", "IC", "ICM", "ICMD",
    "GT", "GC", "GCM",
    "female head",
}
assert util.unique(out.columns)
assert util.unique(new_cols)  # trivially true for a set, kept as a guard
assert cols1 & new_cols == set()
assert cols1 | new_cols == cols2
assert cols2 - cols1 == new_cols

assert len(in_rows) == len(out)
assert util.near(len(out),
                 misc.num_people / com.subsample,
                 tol_frac=1 / 5)

# Every cell of these columns must lie in the stated range ...
per_cell_spec = {"age-decile":    cl.InRange(0, 9),
                 "income-decile": cl.InRange(0, 9),
                 "female head":   cl.InRange(0, 1)}
# ... and each column as a whole must cover the stated range.
per_column_spec = {"age-decile":    cl.CoversRange(0, 9),
                   "income-decile": cl.CoversRange(0, 9),
                   "female head":   cl.CoversRange(0, 1)}
for col, checker in per_cell_spec.items():
    assert checker.test(out[col])
def test_mk_solidaridad():
    """Solidarity-fund contribution: zero below ~4 minimum wages, then a
    rate ramping from 1% to 2% of the contribution base (0.4 factor for
    contractors, 0.7 for employees), capped at a 25-minimum-wage base."""
    contractor_cases = [
        (0, 0),
        (3 * min_wage, 0),
        (5 * min_wage, 0.01 * 0.4 * 5 * min_wage),
        (15 * min_wage, 0.01 * 0.4 * 15 * min_wage),
        (16.5 * min_wage, 0.012 * 0.4 * 16.5 * min_wage),
        (17.5 * min_wage, 0.014 * 0.4 * 17.5 * min_wage),
        (18.5 * min_wage, 0.016 * 0.4 * 18.5 * min_wage),
        (19.5 * min_wage, 0.018 * 0.4 * 19.5 * min_wage),
        (21 * min_wage, 0.02 * 0.4 * 21 * min_wage),
        (100 * min_wage, 0.02 * 25 * min_wage),
    ]
    for income, expected in contractor_cases:
        assert near(sf.mk_solidaridad(contractor, income), expected)

    # NOTE(review): the 5-minimum-wage employee case applies no 0.7 base
    # factor while the higher cases do — presumably the 70% rule only kicks
    # in above some income; confirm against `sf.mk_solidaridad`.
    employee_cases = [
        (0, 0),
        (3 * min_wage, 0),
        (5 * min_wage, 0.01 * 5 * min_wage),
        (13.1 * min_wage, 0.01 * 0.7 * 13.1 * min_wage),
        (16.5 * min_wage, 0.012 * 0.7 * 16.5 * min_wage),
        (17.5 * min_wage, 0.014 * 0.7 * 17.5 * min_wage),
        (18.5 * min_wage, 0.016 * 0.7 * 18.5 * min_wage),
        (19.5 * min_wage, 0.018 * 0.7 * 19.5 * min_wage),
        (21 * min_wage, 0.02 * 0.7 * 21 * min_wage),
        (100 * min_wage, 0.02 * 25 * min_wage),
    ]
    for income, expected in employee_cases:
        assert near(sf.mk_solidaridad(employee, income), expected)
( "value, non-purchase", [ cla.MeanBounds (1e6,1e7), cla.CoversRange (0 ,1e6), cla.InRange (0 ,3.3e9), cla.MissingAtMost (0) ] ), ( "value, purchase", [ cla.MeanBounds (1e6 ,5e6), cla.CoversRange (1e2 ,4e7), # TODO ? This minimum is nuts. cla.InRange (0 ,2e8), cla.MissingAtMost (0) ] ), ( "value, spending", [ cla.MeanBounds (1e6 ,5e6), cla.CoversRange (1e2 ,4e7), # TODO ? This minimum is nuts. cla.InRange (0 ,2e8), cla.MissingAtMost(0) ] ) ]: for t in ts: assert t.test( sums[c] ) assert sums["household"].is_unique assert util.near( len(sums), num_households / com.subsample, tol_frac = 1/5 ) oio.test_write( com.subsample, "build_purchase_sums", "It worked." )
import python.build.purchases.articulos as articulos
import python.build.purchases.capitulo_c as capitulo_c

# Gather every purchase file into one subsampled table.
purchases = com.collect_files(
    (articulos.files
     # + medios.files
     # The tax only applies if the purchase is more than 880 million pesos,
     # and the data only records purchases of a second home.
     + capitulo_c.files
     + nice_purchases.files),
    subsample=com.subsample)

# PITFALL: This differs from the usual idiom which separates testing
# from production. That's because the only thing tested here is
# the number of rows; reading the entire data set into memory again
# for such a simple test seems unworth the added execution time.
assert util.near(
    len(purchases),
    misc.num_purchases / com.subsample,
    tol_frac=(1 / 20 if com.subsample != 10 else 1 / 2))
# TODO | BUG? Why is the previous conditional necessary? That is, why,
# in the special case of subsample = 1/10, is the size of the
# purchase data so different from what you'd expect.
# This isn't necessarily wrong, since the data is subsampled by households,
# and households can make different numbers of purchases.
# That's why `tol_frac` needs to be substantial in both cases.
# But it's surprising, because for subsample = 10,
# the reality is much less than the expectation.

oio.saveStage(com.subsample, purchases, 'purchases_0')
# TODO : extend to all the old variables
def test_means(ppl: pd.DataFrame) -> None:
    """Check that a few column means fall within plausible bounds."""
    bounds = {
        "used savings": (0.005, 0.05),
        "empleado": (0.20, 0.6),
        "desempleado": (0.03, 0.12),
        "in labor force": (0.25, 0.6),
    }
    for col, (lo, hi) in bounds.items():
        assert lo <= ppl[col].mean() <= hi


if True:  # run tests
    log = "starting\n"

    # unit tests
    test_count_num_matches_in_space_separated_list()

    # integration tests
    ppl = oio.readStage(com.subsample, 'people_1')
    ppl["edu"] = util.interpretCategorical(ppl["edu"],
                                           files.edu_key.values())
    test_ranges(ppl)
    test_upper_bound_on_fraction_missing(ppl)
    test_means(ppl)
    assert util.near(len(ppl),
                     num_people / com.subsample,
                     tol_frac=1 / 5)
    oio.test_write(com.subsample, "people_main", log)