Ejemplo n.º 1
0
def test_output(df):
    """Validate the purchases frame `df` and return a log of the checks run.

    Asserts, in order: column names are unique; the row count is near
    `num_purchases_surviving / com.subsample`; every column listed in the
    spec passes its range check; the spec names exactly the columns of
    `df`; and missing, non-positive, or frequency-"never" purchases are
    both rare and low-valued.

    Returns:
        The accumulated log string (only reached if every check passes).

    Raises:
        AssertionError: on the first check that fails.
    """
    log = "test_output\n"
    assert unique(df.columns)

    # TODO | BUG? Why does subsample = 10 need its own, much looser tolerance?
    # The data is subsampled by household, and households make different
    # numbers of purchases, so a mismatch with the expected size is not
    # necessarily wrong -- which is why `tol_frac` is substantial in both
    # cases. Still, it is surprising: for subsample = 10 the real size is
    # much smaller than the expectation.
    if com.subsample != 10:
        tolerance = 1 / 20
    else:
        tolerance = 0.6
    expected_rows = num_purchases_surviving / com.subsample
    assert near(len(df), expected_rows, tol_frac=tolerance)

    spec = {
        "where-got": cla.InRange(1, 26),
        "weight": cla.InRange(0, 1e4),
        "value": cla.InRange(0, 1e9),
        "quantity": cla.InRange(0, 1e8),
        "is-purchase": cla.InRange(0, 1),
        "household": cla.InRange(1, 1e7),
        "per month": cla.InRange(1, 11),
        "coicop": cla.InRange(1, 1e8),
        "25-broad-categs": cla.InRange(1, 25),
    }
    for column, check in spec.items():
        log += "  " + column + "\n"
        assert check.test(df[column])

    # NOTE(review): from here on the log messages carry no trailing newline,
    # so they run together in the returned string.
    log += "Specs cover all column names."
    assert set(df.columns) == set(spec)

    n_rows = len(df)

    log += "Very few missing quantity values."
    missing_quantity = df[pd.isnull(df["quantity"])]
    assert len(missing_quantity) / n_rows < 1e-5

    # NOTE(review): the message says "negative" but zero quantities are
    # counted too (the comparison is `<= 0`).
    log += "Very few negative quantity values."
    nonpositive_quantity = df[df["quantity"] <= 0]
    assert len(nonpositive_quantity) / n_rows < 1e-5

    log += "Negative quantity purchases are for very little money."
    negative_quantity = df[df["quantity"] < 0]
    assert (negative_quantity["value"] < 1e4).all()

    log += "Very few purchases with a frequency of \"never\"."
    never_bought = df[df["per month"] > 10]
    assert len(never_bought) / n_rows < 1e-5

    log += "Those few frequency=\"never\" purchases are for very little money."
    assert (never_bought["value"] < 1e4).all()

    return log
Ejemplo n.º 2
0
def test_ranges(bs: pd.DataFrame) -> None:
    """Assert that two columns of `bs` satisfy their value properties.

    Each column is run through four checks, in the order listed: allowed
    values, range coverage, bounds on the mean, and the tolerated share of
    missing values.

    Raises:
        AssertionError: on the first property that does not hold.
    """
    checks_by_column = {
        "recently bought this house": [
            cla.InSet({True, False}),
            cla.CoversRange(0, 1),
            cla.MeanBounds(0, 0.01),
            cla.MissingAtMost(0),
        ],
        "estrato": [
            cla.InRange(0, 6),
            cla.CoversRange(0, 3),
            cla.MeanBounds(1.5, 2.5),
            cla.MissingAtMost(0.02),
        ],
    }
    for column, checks in checks_by_column.items():
        for check in checks:
            assert check.test(bs[column])
Ejemplo n.º 3
0
def test_Property_subclasses():
    """Exercise MissingAtMost, InRange, InSet and CoversRange on tiny series.

    Raises:
        AssertionError: if any property returns an unexpected verdict.
    """
    # One missing value out of three is accepted at a 0.5 limit; two are not.
    one_of_three_missing = pd.Series([1, np.nan, 3])
    two_of_three_missing = pd.Series([1, np.nan, np.nan])
    assert cla.MissingAtMost(0.5).test(one_of_three_missing)
    assert not cla.MissingAtMost(0.5).test(two_of_three_missing)

    # InRange(0, 1) and InSet({0, 0.5, 1}) are expected to agree on each of
    # these single-value series; a lone NaN is expected to pass both.
    expectations = [
        (-1, False),
        (0, True),
        (0.5, True),
        (1, True),
        (2, False),
        (np.nan, True),
    ]
    for value, expected in expectations:
        assert cla.InRange(0, 1).test(pd.Series([value])) == expected
        assert cla.InSet({0, 0.5, 1}).test(pd.Series([value])) == expected

    # NOTE(review): the first two sub-checks are identical; a different
    # input was presumably intended for one of them -- confirm.
    endpoints_first = cla.CoversRange(0, 10).test(pd.Series([0, 10]))
    endpoints_second = cla.CoversRange(0, 10).test(pd.Series([0, 10]))
    interior_only = cla.CoversRange(0, 10).test(pd.Series([1, 9]))
    assert endpoints_first & endpoints_second & (not interior_only)
Ejemplo n.º 4
0
    "age-decile", "income-decile", "IT", "IC", "ICM", "ICMD", "GT", "GC",
    "GCM", "female head"
}

# Neither the output columns nor the newly added ones may repeat a name.
assert util.unique(out.columns)
assert util.unique(new_cols)

# `new_cols` must be exactly what separates `cols2` from `cols1`.
assert (cols1 & new_cols) == set()
assert (cols1 | new_cols) == cols2
assert (cols2 - cols1) == new_cols

# Row count is unchanged from the input and close to the expected size.
assert len(in_rows) == len(out)
assert util.near(len(out), misc.num_people / com.subsample, tol_frac=1 / 5)

# InRange checks for each of the three columns.
per_cell_spec = {
    "age-decile": cl.InRange(0, 9),
    "income-decile": cl.InRange(0, 9),
    "female head": cl.InRange(0, 1),
}

# CoversRange checks for the same columns and the same bounds.
per_column_spec = {
    "age-decile": cl.CoversRange(0, 9),
    "income-decile": cl.CoversRange(0, 9),
    "female head": cl.CoversRange(0, 1),
}

# Run the InRange checks first, then the CoversRange checks.
for k, v in [*per_cell_spec.items(), *per_column_spec.items()]:
    assert v.test(out[k])
Ejemplo n.º 5
0
def test_ranges(df):
    """Check the columns of `df` against InRange and CoversRange specs.

    The InRange specs are built and checked first, then the CoversRange
    specs.

    Returns:
        The log string "test_ranges()\\n" (nothing else is appended to it).

    Raises:
        AssertionError: on the first column that fails its check.
    """
    log = "test_ranges()\n"

    def _assert_all(spec):
        # Apply every property in `spec` to the column it is keyed by.
        for column, check in spec.items():
            assert check.test(df[column])

    cell_bounds = {
        "25-broad-categs": cl.InRange(1, 25),
        "big-hog": cl.InRange(0, 1),
        "coicop": cl.InRange(1e6, 2e7),

        # PITFALL: "freq-code"=11 <=> the purchase is never made.
        # This corresponds to a "per month" value of np.nan.
        "freq-code": cl.InRange(0, 10),
        "household": cl.InRange(0, 1e6),
        "is-purchase": cl.InRange(0, 1),
        "per month": cl.InRange(1 / 36 - 0.001, 31),
        "quantity": cl.InRange(0, 1e8),
        "value": cl.InRange(0, 3e9),

        # The special motorcycle tax, abusively lumped into the VAT table,
        # means the max "vat" is 0.27 rather than 0.19.
        "vat": cl.InRange(0, 0.3),
        "vat frac": cl.InRange(0, 0.3 / 1.3),
    }
    _assert_all(cell_bounds)

    column_coverage = {
        "household": cl.CoversRange(2e5, 6e5),
        "per month": cl.CoversRange(0.05, 30),
        "quantity": cl.CoversRange(1, 100),
        "value": cl.CoversRange(3, 1e6),
        "weight": cl.CoversRange(10, 1000),
        "where-got": cl.CoversRange(1, 25),

        # The special motorcycle tax, abusively lumped into the VAT table,
        # means the max "vat" is 0.27 rather than 0.19.
        # *However*, in the smaller samples we cannot be sure the whole
        # range is covered: there might be no motorcycle purchases.
        # That at least some purchase incurs a VAT of 0.19, though,
        # is a safe bet.
        "vat frac": cl.CoversRange(0, 0.19 / 1.19),
        "vat paid": cl.CoversRange(0, 1e5),
        "vat": cl.CoversRange(0, 0.19),
    }
    _assert_all(column_coverage)

    return log
Ejemplo n.º 6
0
           "value, tax, predial",
           "value, tax, purchaselike non-predial non-VAT",
           "transactions",
           "value, non-purchase",
           "value, purchase",
           "value, spending",
           "value, consumption",
           "vat paid" } )

if com.subsample < 11: # The data is too sparse to test
                       # the smaller samples this way
  for (c,ts) in [
    ( "transactions",
      [ cla.MeanBounds    ( 50 , 120 ),
        cla.CoversRange   ( 2  , 200 ),
        cla.InRange       ( 1  , 400 ),
        cla.MissingAtMost ( 0 ) ] ),

    ( "value, tax, purchaselike non-VAT",
      [ cla.MeanBounds    (1e4 , 1e5),
        cla.CoversRange   (0   , 2e6),
        cla.InRange       (0   , 1.1e8), # someone pays a huge predial
        cla.MissingAtMost (0) ] ),

    ( "value, tax, predial",
      [ cla.MeanBounds    (1e4 ,1e5),
        cla.CoversRange   (0   ,1e3),
        cla.InRange       (0   ,1.1e8),
        cla.MissingAtMost (0) ] ),

    ( "value, tax, purchaselike non-predial non-VAT",
Ejemplo n.º 7
0
               "vat / income",
               "purchase value / income" ]
  assert ( len( merge.columns ) ==
           len( hh_cols.columns ) +
           len( pur.columns ) - 1 + # omit the key that was merged on
           len( new_cols ) )
  assert len( merge ) == len( hh_rows )

if True:
    # At least one row of `merge` must have "region-1" equal to "SAN ANDRÉS".
    assert         (merge["region-1"] == "SAN ANDRÉS") . any()
    # Among those rows, the largest "vat paid" must be exactly zero.
    # NOTE(review): presumably because San Andrés is VAT-exempt -- confirm.
    assert ( merge[ merge["region-1"] == "SAN ANDRÉS" ]
             ["vat paid"].max() == 0 )

if True:
  for k,v in {
      "vat / purchase value"      : cl.InRange( 0, 0.3 ),
        # The special motorcycle tax, abusively lumped into the VAT table,
        # means the max "vat" is 0.27 rather than 0.19.
      "vat / income"              : cl.InRange( 0, np.inf ),
      "purchase value / income"   : cl.InRange( 0, np.inf )
      }.items():
    assert v.test( merge[k] )
  for k,v in {
      # These bounds could be tighter,
      # but the 1/1000 subsample has a small range.
      "vat / purchase value"       : cl.CoversRange( 0,      0.1    ),
      "vat / income"               : cl.CoversRange( 0,      np.inf ),
      "purchase value / income"    : cl.CoversRange( 0.2,    np.inf )
      }.items():
    assert v.test( merge[k] )
  for k,v in {
Ejemplo n.º 8
0
def test_capitulo_c_data(df: pd.DataFrame):
    """Assert range properties of the "CODE" and VAT-rate columns of `df`.

    "CODE" must pass InRange(1, 25) and CoversRange(1, 25); each of the
    three VAT columns must pass InRange(0, 1).

    Raises:
        AssertionError: on the first failed check.
    """
    within_bounds = cla.InRange(1, 25)
    spans_bounds = cla.CoversRange(1, 25)
    assert within_bounds.test(df["CODE"])
    assert spans_bounds.test(df["CODE"])

    vat_columns = ("vat", "vat, min", "vat, max")
    assert all(cla.InRange(0, 1).test(df[name]) for name in vat_columns)
Ejemplo n.º 9
0
def test_coicop_data(df: pd.DataFrame):
    """Assert range properties of the "coicop" and VAT-rate columns of `df`.

    "coicop" must pass InRange(1e6, 2e7) and CoversRange(2e6, 1e7); each of
    the three VAT columns must pass InRange(0, 1).

    Raises:
        AssertionError: on the first failed check.
    """
    within_bounds = cla.InRange(1e6, 2e7)
    spans_bounds = cla.CoversRange(2e6, 1e7)
    assert within_bounds.test(df["coicop"])
    assert spans_bounds.test(df["coicop"])

    vat_columns = ("vat", "vat, min", "vat, max")
    assert all(cla.InRange(0, 1).test(df[name]) for name in vat_columns)
Ejemplo n.º 10
0
def test_ranges(ppl: pd.DataFrame):
    """Assert that each listed column of `ppl` passes its value check.

    Column names must be unique. Each column named in the mapping below is
    then checked, in the order listed, with an InRange or InSet property.

    Raises:
        AssertionError: on the first failed check.
    """
    assert util.unique(ppl.columns)

    between = cla.InRange

    def unit():
        # A fresh InRange(0, 1) property per column.
        return cla.InRange(0, 1)

    def boolean():
        # A fresh InSet({True, False}) property per column.
        return cla.InSet({True, False})

    column_checks = {
        "household": between(0, 1e7),
        "age": between(0, 120),
        "edu": cla.InSet(set(files.edu_key.values())),
        "female": unit(),
        "household-member": between(1, 50),
        "income, pension": between(0, 3e8),
        "income, cesantia": between(0, 1e8),
        "income, dividend": between(0, 1e8),
        "independiente": unit(),
        "literate": unit(),
        "student": unit(),
        "weight": between(0.001, 1e4),
        "pension, contributing (if not pensioned)": unit(),
        "pension, receiving": unit(),
        "pension, contributor(s) (if not pensioned) = split": unit(),
        "pension, contributor(s) (if not pensioned) = self": unit(),
        "pension, contributor(s) (if not pensioned) = employer": unit(),
        "seguro de riesgos laborales": unit(),
        "income, govt, cash": between(0, 2e7),
        "income, govt, in-kind": between(0, 1e7),
        "income, non-labor (tax def)": between(0, 1e8),
        "income, rental + interest": between(0, 1e9),
        "income, donacion": between(0, 2e7),
        "income, infrequent": between(0, 1e8),
        "income, ganancia ocasional, 10%-taxable": between(0, 1e8),
        "income, ganancia ocasional, 20%-taxable": between(0, 3e7),
        "income, labor, cash": between(0, 3e9),
        "income, labor, in-kind": between(0, 3e7),
        "income, cash": between(0, 3e9),
        "income, in-kind": between(0, 3e7),
        "income": between(0, 3e9),
        "income, govt": between(0, 3e7),
        "income, private": between(0, 2e8),
        "income, labor": between(0, 3e9),
        "income, borrowing": between(0, 1e8),
        "rank, labor income": between(1, 50),
        "empleado": unit(),
        "desempleado": unit(),
        "in labor force": unit(),
        "used savings": boolean(),
        "disabled": boolean(),
        "dependent": boolean(),
        "race, indig": boolean(),
        "race, git|rom": boolean(),
        "race, raizal": boolean(),
        "race, palenq": boolean(),
        "race, neg|mul": boolean(),
        "race, whi|mest": boolean(),
    }
    for column, check in column_checks.items():
        assert check.test(ppl[column])