Beispiel #1
0
    purchases_cap_c = purchases.merge(vat_cap_c,
                                      how="left",
                                      on="25-broad-categs")
    purchases = purchases_coicop.combine_first(purchases_cap_c)

if True:  # Big motorcycles incur a special 8% tax.
    # There is no direct measure of "big", so "expensive" is the proxy:
    # a row qualifies when its COICOP code is "07120101" and its value
    # exceeds 9e6.
    is_motorcycle = purchases["coicop"] == "07120101"
    is_expensive = purchases["value"] > 9e6
    purchases["big-hog"] = 1 * is_motorcycle * is_expensive

    # The flag is not modified below, so the row mask is computed once.
    hogs = purchases["big-hog"] > 0

    # Add the 0.08 surcharge to the rate of the flagged rows ...
    purchases.loc[hogs, "vat"] = purchases.loc[hogs, "vat"] + 0.08

    # ... and recompute, for those rows only, the fraction of the payment
    # that is tax: rate / (1 + rate).
    hog_vat = purchases.loc[hogs, "vat"]
    purchases.loc[hogs, "vat frac"] = hog_vat / (hog_vat + 1)

if False:  # drop anything missing vat
    # Disabled: while the guard is False, rows with a missing "vat" are kept.
    purchases = purchases[purchases["vat"].notnull()]

if True:  # handle freq, value, vat paid
    # Keep the raw frequency code in its own column. It is kept for the sake
    # of drawing a table of purchase frequency, with frequencies spread
    # evenly across the x-axis.
    purchases["freq-code"] = purchases["per month"]

    # Translate each frequency code through `legends.freq`
    # (presumably a code -> purchases-per-month mapping; verify in legends).
    # PITFALL: The result must be assigned back. The chained form
    #   purchases["per month"].replace(legends.freq, inplace=True)
    # operates on a temporary Series: under pandas' Copy-on-Write it leaves
    # `purchases` unchanged, and pandas 2.x warns about it.
    purchases["per month"] = purchases["per month"].replace(legends.freq)

    # Scale the recorded value by the (translated) "per month" factor, then
    # take the tax share of that value.
    purchases["value"] = purchases["per month"] * purchases["value"]
    purchases["vat paid"] = purchases["value"] * purchases["vat frac"]

oio.saveStage(c.subsample, purchases, "purchases_2_vat." + c.strategy_suffix)
                                         "race, palenq": "has-palenq",
                                         "race, whi|mest": "has-whi|mest"
                                     })
        h_max = pd.concat([edu_max, other_max], axis=1)
        del (edu_max, other_max)

if True:  # Assemble the aggregates, then compute a few variables.
    # Put the four aggregate frames side by side, aligned on their index.
    households = pd.concat([h_first, h_sum, h_min, h_max], axis="columns")

    # Expose the index as an ordinary column.
    # (When there are multiple indices, reset_index is the way to do that.)
    households["household"] = households.index

    # Age-based indicators, derived from the youngest / oldest member's age.
    households["has-child"] = households["age-min"].lt(18)
    households["all-elderly"] = households["age-min"].gt(65)
    households["has-elderly"] = households["age-max"].gt(65)
    #
    # PITFALL: Income decile and percentile for persons exist too.
    # They are different.
    for _bins, _name in ((10, "income-decile"), (100, "income-percentile")):
        households[_name] = util.noisyQuantile(_bins, 0, 1,
                                               households["income"])

    # Used in report/households.py to create the trivial partition.
    # TODO ? move to report/households.py
    households["one"] = 1

    households_decile_summary = desc.summarizeQuantiles(
        "income-decile", households)

if True:  # save
    # Both outputs go to the same subsample folder; each file name is a
    # fixed stem followed by the strategy-year suffix.
    for _frame, _stem in (
            (households, "households_1_agg_plus."),
            (households_decile_summary, "households_decile_summary."),
    ):
        oio.saveStage(com.subsample, _frame,
                      _stem + com.strategy_year_suffix)
Beispiel #3
0
    import python.build.output_io as oio
    import python.common.util as util
    import python.common.common as common

if True:  # merge people, buildings
    # PITFALL: For buildings, we always use the full sample (hence the 1).
    buildings = oio.readStage(1, "buildings", dtype={"estrato": "float64"})
    people = oio.readStage(common.subsample, "people_1")

    # Left join on "household": every row of `people` is kept, and the
    # building columns are attached where a match exists.
    people = people.merge(buildings, how="left", on="household")

if True:  # make some new variables
    # Integer bin labels for age; duplicate bin edges are dropped, so there
    # may be fewer than 10 bins.
    people["age-decile"] = pd.qcut(
        people["age"], q=10, labels=False, duplicates="drop")

    # PITFALL: there's a different such variable at the household level
    people["income-decile"] = util.noisyQuantile(10, 0, 1, people["income"])

    # Presumably "household-member" == 1 marks the household head (the
    # target column's name suggests so) -- verify against the data docs.
    is_member_one = people["household-member"] == 1
    people["female head"] = people["female"] * is_member_one

# PITFALL: As noted earlier, the buildings data is always drawn from the full
# sample, whereas the person data is drawn from a subsample.
# Hence the output is written only to the folder for that subsample.
# (This contrasts with the test programs that use the full sample,
# which write evidence that the test passed to every subsample folder.)
oio.saveStage(common.subsample, people, 'people_2_buildings')
Beispiel #4
0
purchases["purchases"] = 1

# When I check purchases[ purchases["coicop"] == x ] for these x,
# the results are consistent with the coicop-vat bridge.
# 11110103, 11110104, 11110105
# 1119807, 1119808, 1119809
# 1180103, 1180201, 1180301


## vat per coicop
# Total spending per COICOP code, next to that code's "vat, min".
# NOTE: despite its name, p_first is the group mean, not the first value.
p_sum = purchases.groupby("coicop")["value"].sum()
p_first = purchases.groupby("coicop")["vat, min"].mean()
p = pd.concat([p_sum, p_first], axis="columns")

oio.saveStage(subsample, p, "vat-and-spending-per-coicop", index=True)


## vat per rate
# Total spending per distinct "vat, min" rate. Within each group the rate
# is constant, so its mean (q_first) is the rate itself.
q_sum = purchases.groupby("vat, min")["value"].sum()
q_first = purchases.groupby("vat, min")["vat, min"].mean()
q = pd.concat([q_sum, q_first], axis="columns")

oio.saveStage(subsample, q, "vat-and-spending-per-vat-rate", index=True)
Beispiel #5
0
# From the raw ENPH person-level data,
# creates a data set that's a little friendlier.

if True:
    import python.build.output_io as oio
    import python.build.people.files as files
    import python.common.common as cl
    import python.common.misc as c

# Gather the raw person-level files for the configured subsample.
_raw_people = cl.collect_files(files.files, subsample=cl.subsample)

# Convert the columns to numbers, except "non-beca sources".
# PITFALL : that column is a space-separated list of ints.
ppl = c.all_columns_to_numbers(_raw_people,
                               skip_columns=["non-beca sources"])
del _raw_people  # only `ppl` is needed from here on

oio.saveStage(cl.subsample, ppl, "people_0")
Beispiel #6
0
    import python.build.purchases.articulos as articulos
    import python.build.purchases.capitulo_c as capitulo_c

# Concatenate the file lists to read, then collect them for this subsample.
purchases = com.collect_files(
    articulos.files
    # + medios.files
    # The tax only applies if the purchase is more than 880 million pesos,
    # and the data only records purchases of a second home.
    + capitulo_c.files
    + nice_purchases.files,
    subsample=com.subsample,
)

# PITFALL: This differs from the usual idiom which separates testing
# from production. That's because the only thing tested here is
# the number of rows; reading the entire data set into memory again
# for such a simple test seems unworth the added execution time.
assert util.near(
    len(purchases),
    misc.num_purchases / com.subsample,
    # Tolerance is 1/2 when subsample == 10, and 1/20 otherwise.
    tol_frac=(1 / 2 if com.subsample == 10 else 1 / 20),
)
# TODO | BUG? Why is the previous conditional necessary? That is, why,
# in the special case of subsample = 1/10, is the size of the
# purchase data so different from what you'd expect?
# This isn't necessarily wrong, since the data is subsampled by households,
# and households can make different numbers of purchases.
# That's why `tol_frac` needs to be substantial in both cases.
# But it's surprising, because for subsample = 10,
# the reality is much less than the expectation.

oio.saveStage(com.subsample, purchases, "purchases_0")
Beispiel #7
0
            ("ICMUG", 0, "ICM", 0),
            ("ICMDUG", 0, "ICMD", 0
             )  # "ingreso corriente monetario disponisble"
            ,
            ("GTUG", 0, "GT", 0),
            ("GCUG", 0, "GC", 0),
            ("GCMUG", 0, "GCM", 0)  # "gasto corriente monetario"
        ])
]

buildings = cl.collect_files(files, subsample=1)  # see PITFALL above

# In each of these columns, replace "," with "." (decimal commas).
for c in ("IT", "IC", "ICM", "ICMD", "GT", "GC", "GCM"):
    _comma_to_dot = cla.Correction.Replace_Substring_In_Column(c, ",", ".")
    buildings = _comma_to_dot.correct(buildings)

if True:  # estrato is strange
    # The column includes the undocumented values 0 and 9.
    #   0 might mean "renter":
    #     https://www.eltiempo.com/archivo/documento/MAM-1757051
    #   9 is assumed to be some kind of error code, so it becomes missing.
    _estrato = buildings["estrato"]
    buildings["estrato"] = _estrato.replace(9, np.nan)

# Turn the numeric code into a boolean: True exactly where the code is 1.
buildings["recently bought this house"] = (
    buildings["recently bought this house"].eq(1))

# The leading 1 is the subsample -- see PITFALL above.
oio.saveStage(1, buildings, "buildings")
Beispiel #8
0
if True:  # Make new variables, esp. create person-level purchase-like
    # "share" is the fraction of purchaselike variables attributed to this
    # household adult:
    #   - household income <= 0: an equal split among the members in the
    #     labor force;
    #   - otherwise: this person's fraction of household income.
    no_household_income = earners["income, household"] <= 0
    equal_split = 1 / earners["members in labor force"]
    income_split = earners["income"] / earners["income, household"]
    earners["share"] = np.where(no_household_income, equal_split,
                                income_split)

    # To define the trivial group in the person-level report.
    earners["one"] = 1

    # Scale each household-level variable down to this person's share.
    for i in defs.household_variables_to_allocate_by_income_share:
        earners[i] = earners[i].mul(earners["share"])

if True:  # more variables
    # Quantiles of personal income: 10 bins, then 100 bins.
    for _bins, _name in ((10, "income-decile"), (100, "income-percentile")):
        earners[_name] = util.noisyQuantile(_bins, 0, 1, earners["income"])

    earners["vat / purchase value"] = (
        earners["vat paid"].div(earners["value, purchase"]))
    # PITFALL: While the maximum value of this looks absurd (103),
    # it's not. The 95th percentile is 0.3. The outliers are so high because
    # people can spend borrowed money.
    earners["vat / income"] = earners["vat paid"].div(earners["income"])
    earners["purchase value / income"] = (
        earners["value, purchase"].div(earners["income"]))

if True:  # save
    oio.saveStage(
        com.subsample,
        earners,
        "people_4_post_households." + com.strategy_year_suffix,
    )
Beispiel #9
0
  import sys
  import pandas                    as pd
  #
  import python.build.ss_functions as ss
  import python.build.output_io    as oio
  import python.common.util        as util
  import python.common.common      as com
  #
  import python.build.people_3_income_taxish_functions as f4
  if   com.regime_year == 2016:
      import python.regime.r2016 as regime
  elif com.regime_year == 2018:
      import python.regime.r2018 as regime
  else:
      import python.regime.r2019 as regime


ppl = oio.readStage(com.subsample, "people_2_buildings")

# Apply the three transformations in sequence; each takes the frame and
# returns the frame used by the next step.
for _step in (
        ss.mk_ss_contribs,
        f4.insert_has_dependent_column,
        regime.income_taxes,
):
    ppl = _step(ppl)

oio.saveStage(com.subsample, ppl,
              "people_3_income_taxish." + com.strategy_year_suffix)
Beispiel #10
0
    vat_columns = [col for col in purchases.columns if vat_regex.match(col)]
    # This could change -- that's why I use a regexp -- but currently,
    # vat_columns is equal to this:
    #   [ "vat", "vat frac", "vat paid"]
    #
    for col in vat_columns:
        purchases[col] = ((purchases["is-purchase"] > 0) * purchases[col])

# One per row, so that summing it below counts transactions.
purchases["transactions"] = 1

# Sum these columns within each household, then turn the "household" index
# back into an ordinary column.
_columns_to_sum = [
    "value, purchase",
    "value, non-purchase",
    "transactions",
    "vat paid",
    "value, tax, predial",
    "value, tax, purchaselike non-predial non-VAT",
]
purchase_sums = (purchases.groupby(["household"])[_columns_to_sum]
                 .sum()
                 .reset_index(level=["household"]))

if True:  # It's faster to compute these columns post-aggregation.
    # All purchaselike taxes other than VAT: predial plus the rest.
    purchase_sums["value, tax, purchaselike non-VAT"] = (
        purchase_sums["value, tax, predial"].add(
            purchase_sums["value, tax, purchaselike non-predial non-VAT"]))

    # Spending = taxes and purchases, but no gifts.
    # PITFALL: Includes VAT (it's part of "value, purchase").
    purchase_sums["value, spending"] = (
        purchase_sums["value, tax, purchaselike non-VAT"].add(
            purchase_sums["value, purchase"]))

    # Consumption = purchases and gifts, but no taxes (except VAT).
    # PITFALL: Includes VAT (it's part of "value, purchase").
    purchase_sums["value, consumption"] = (
        purchase_sums["value, non-purchase"].add(
            purchase_sums["value, purchase"]))

oio.saveStage(c.subsample, purchase_sums,
              "purchase_sums." + c.strategy_suffix)
Beispiel #11
0
    # results in the fraction of that payment attributable to the vat.
    # This is because reported expenditures are post-tax.
    # For instance, if the VAT were 20%, then (0.2 / 1.2) is that fraction.
    # TODO: Test (automatically).
    data["vat"] = (data[vat_components].sum(axis="columns"))
    data["vat frac"] = (data["vat"] / (data["vat"] + 1))
    return data


def go(data: pd.DataFrame) -> pd.DataFrame:
    """Apply the user's VAT preferences, total the VAT, and drop the inputs.

    Runs `incorporate_user_vat_prefs` and then `compute_total_vat` on
    `data`, and returns the result without the `vat_components` columns and
    without the "group" column: once they have been summed, the components
    are of no interest.
    """
    with_prefs = incorporate_user_vat_prefs(data)
    totaled = compute_total_vat(with_prefs)
    return totaled.drop(columns=vat_components + ["group"])


vat_coicop = go(vat_coicop)
vat_cap_c = go(vat_cap_c)

if True:  # save
    # First the full tables ...
    oio.saveStage(c.subsample, vat_coicop, "vat_coicop." + c.strategy_suffix)
    oio.saveStage(c.subsample, vat_cap_c, "vat_cap_c." + c.strategy_suffix)
    #
    # ... then the "brief" versions, without the free-text columns.
    vat_coicop = vat_coicop.drop(["description", "Notes"], axis="columns")
    vat_cap_c = vat_cap_c.drop(["description"], axis="columns")
    #
    oio.saveStage(
        c.subsample, vat_coicop, "vat_coicop_brief." + c.strategy_suffix)
    oio.saveStage(
        c.subsample, vat_cap_c, "vat_cap_c_brief." + c.strategy_suffix)
Beispiel #12
0
purchases = com.all_columns_to_numbers(purchases)
purchases = defs.drop_if_coicop_or_value_invalid(purchases)
purchases = defs.drop_absurdly_big_expenditures(purchases)

# Drop every row for which the predicate holds on the named column.
# Each correction is built and applied in turn, in this order.
for _column, _is_bad in (
        ("value", lambda v: v <= 0),
        ("per month", lambda x: x == 11),  # no "never" frequencies
        ("quantity", lambda x: x <= 0),  # no non-positive quantities
):
    purchases = (Correction.Drop_Row_If_Column_Satisfies_Predicate(
        _column, _is_bad).correct(purchases))

# These only make sense once the relevant columns are numbers.
# Recode how-got: 1 -> is-purchase = 1, nan -> nan, otherwise -> 0.
for c in (
        Correction.Apply_Function_To_Column(
            "how-got",
            # PITFALL: x >= 0 yields False for NaN, so NaN falls through
            # to the final branch and stays NaN.
            lambda x: 1 if x == 1 else (0 if x >= 0 else np.nan),
        ),
        Correction.Rename_Column("how-got", "is-purchase"),
        # Rows with a non-positive quantity are dropped.
        Correction.Drop_Row_If_Column_Satisfies_Predicate(
            "quantity", lambda x: x <= 0),
):
    purchases = c.correct(purchases)

oio.saveStage(cl.subsample, purchases, "purchases_1")
                                     lambda x: str(gv) + ": " + defs.maybeFill(
                                         gv, str(x)), t.index))))
            varSummaries.append(t)
        groupSummaries.append(pd.concat(varSummaries, axis=1))
    summaryDict[unit] = pd.concat(groupSummaries, axis=0)

    ret_tmi = pd.concat(list(summaryDict.values()), axis=0).transpose()
    ret = ret_tmi.loc[restrictedVars]
    ret_tmi.reset_index(inplace=True)
    ret_tmi = ret_tmi.rename(columns={"index": "measure"})
    ret.reset_index(inplace=True)
    ret = ret.rename(columns={"index": "measure"})

    return (ret, ret_tmi)


# For each unit of observation, build both summary frames and write each
# one twice: via saveStage and via saveStage_excel, under the same name.
for (unit, df, variables, restrictedVars) in [
    ("earners", earners, defs.earnerVars, defs.earnerRestrictedVars),
    ("households", households, defs.householdVars,
     defs.householdRestrictedVars),
]:
    (ret, ret_tmi) = make_summary_frame(unit, df, variables, restrictedVars)
    # The "_tmi" frame is written first, then the restricted one.
    for _frame, _stem in ((ret_tmi, "report_" + unit + "_tmi."),
                          (ret, "report_" + unit + ".")):
        _name = _stem + com.strategy_year_suffix
        oio.saveStage(com.subsample, _frame, _name)
        oio.saveStage_excel(com.subsample, _frame, _name)
Beispiel #14
0

if True: # merge purchase data into person data
  # PITFALL: The unit of observation in all these data sets is a household.
  hh = oio.readStage(com.subsample,
                     "households_1_agg_plus." + com.strategy_year_suffix)
  pur = oio.readStage(com.subsample,
                      "purchase_sums." + com.strategy_suffix)
  # Left join: every row of `hh` is kept, matched or not.
  merge = hh.merge(pur, how="left", on=["household"])

if True: # In San Andrés there is no VAT.
  # Zero out the VAT paid on every row whose region is San Andrés.
  in_san_andres = merge["region-1"] == "SAN ANDRÉS"
  merge.loc[in_san_andres, "vat paid"] = 0

if True: # create a few more variables
  # Three ratios among VAT paid, purchase value and income.
  merge["vat / purchase value"] = (
    merge["vat paid"].div(merge["value, purchase"]))
  merge["vat / income"] = merge["vat paid"].div(merge["income"])
  merge["purchase value / income"] = (
    merge["value, purchase"].div(merge["income"]))

if True: # save
  oio.saveStage(com.subsample, merge,
                "households_2_purchases." + com.strategy_year_suffix)