# Stage "purchases_2_vat": attach VAT rates to each purchase and compute
# the VAT actually paid per (monthly-normalized) purchase.
purchases_cap_c = purchases.merge(vat_cap_c, how="left", on="25-broad-categs")
purchases = purchases_coicop.combine_first(purchases_cap_c)

if True:  # Big motorcycles incur a special 8% tax.
    # We proxy for "big" with "expensive".
    purchases["big-hog"] = (1 * (purchases["coicop"] == "07120101") *
                            (purchases["value"] > (9e6)))
    # Hoist the row mask instead of recomputing it five times.
    big = purchases["big-hog"] > 0
    purchases.loc[big, "vat"] = purchases.loc[big, "vat"] + 0.08
    purchases.loc[big, "vat frac"] = (
        purchases.loc[big, "vat"] / (purchases.loc[big, "vat"] + 1))

if False:  # drop anything missing vat (deliberately disabled)
    purchases = purchases[~purchases["vat"].isnull()]

if True:  # handle freq, value, vat paid
    # Keep the raw frequency code for the sake of drawing a table of
    # purchase frequency, with frequencies spread evenly across the x-axis.
    purchases["freq-code"] = purchases["per month"]
    # FIX: the old form,
    #     purchases["per month"].replace(legends.freq, inplace=True)
    # relied on chained-indexing mutation, which has no effect under pandas
    # copy-on-write and was already flagged by a PITFALL comment.
    # Assign the replaced column back explicitly instead.
    purchases["per month"] = purchases["per month"].replace(legends.freq)
    # Normalize value to a per-month figure, then compute the VAT paid.
    purchases["value"] = purchases["per month"] * purchases["value"]
    purchases["vat paid"] = purchases["value"] * purchases["vat frac"]

oio.saveStage(c.subsample, purchases, "purchases_2_vat." + c.strategy_suffix)
"race, palenq": "has-palenq",
    "race, whi|mest": "has-whi|mest"
})
# NOTE(review): the dict tail above closes a rename whose start lies outside
# this chunk -- presumably renaming person-level max columns; confirm upstream.
h_max = pd.concat([edu_max, other_max], axis=1)
del (edu_max, other_max)  # only the combined h_max is used from here on

if True:  # Assemble the aggregates, then compute a few variables.
    # One row per household: first/sum/min/max aggregates side by side.
    households = pd.concat([h_first, h_sum, h_min, h_max], axis=1)
    households["household"] = households.index
    # when there are multiple indices, reset_index is the way to do that
    households["has-child"] = households["age-min"] < 18
    # "all-elderly": even the youngest member is over 65.
    households["all-elderly"] = households["age-min"] > 65
    households["has-elderly"] = households["age-max"] > 65
    #
    # PITFALL: Income decile and percentile for persons exist too. They are different.
    households["income-decile"] = (util.noisyQuantile(10, 0, 1,
                                                      households["income"]))
    households["income-percentile"] = (util.noisyQuantile(
        100, 0, 1, households["income"]))
    households[
        "one"] = 1  # used in report/households.py to create the trivial partition.
    # TODO ? move to report/households.py
    households_decile_summary = desc.summarizeQuantiles(
        "income-decile", households)

if True:  # save
    oio.saveStage(com.subsample, households,
                  "households_1_agg_plus." + com.strategy_year_suffix)
    oio.saveStage(com.subsample, households_decile_summary,
                  "households_decile_summary." + com.strategy_year_suffix)
import python.build.output_io as oio
import python.common.util as util
import python.common.common as common

# Stage "people_2_buildings": attach building-level attributes to people.
# PITFALL: buildings always come from the full sample (subsample = 1),
# whereas people are drawn from `common.subsample`.
bldgs = oio.readStage(1, 'buildings', dtype={"estrato": 'float64'})
ppl = oio.readStage(common.subsample, 'people_1')
ppl = ppl.merge(bldgs, how="left", on="household")

# Derived person-level variables.
ppl["age-decile"] = pd.qcut(ppl["age"], 10, labels=False, duplicates='drop')
# PITFALL: a different income-decile variable exists at the household level.
ppl["income-decile"] = util.noisyQuantile(10, 0, 1, ppl["income"])
ppl["female head"] = ppl["female"] * (ppl["household-member"] == 1)

# Because the person data is subsampled, the merged output is written only
# to that subsample's folder. (This contrasts with the test programs that
# use the full sample, which write evidence that the test passed to every
# subsample folder.)
oio.saveStage(common.subsample, ppl, 'people_2_buildings')
purchases["purchases"] = 1

# Spot checks of purchases[purchases["coicop"] == x] for the x below
# were consistent with the coicop-vat bridge:
#   11110103, 11110104, 11110105
#   1119807, 1119808, 1119809
#   1180103, 1180201, 1180301

# Total spending and mean minimum VAT rate, per coicop code.
by_coicop = purchases.groupby('coicop')
spending_per_coicop = by_coicop["value"].sum()
rate_per_coicop = by_coicop["vat, min"].mean()
oio.saveStage(subsample,
              pd.concat([spending_per_coicop, rate_per_coicop], axis=1),
              "vat-and-spending-per-coicop",
              index=True)

# The same two measures, per VAT rate.
by_rate = purchases.groupby('vat, min')
spending_per_rate = by_rate["value"].sum()
rate_per_rate = by_rate["vat, min"].mean()
oio.saveStage(subsample,
              pd.concat([spending_per_rate, rate_per_rate], axis=1),
              "vat-and-spending-per-vat-rate",
              index=True)
# Stage "people_0": from the raw ENPH person-level data, create a data set
# that's a little friendlier (all columns coerced to numbers).

import python.build.output_io as oio
import python.build.people.files as files
import python.common.common as cl
import python.common.misc as c

raw = cl.collect_files(files.files, subsample=cl.subsample)
people = c.all_columns_to_numbers(
    raw,
    # PITFALL: "non-beca sources" holds a space-separated list of ints,
    # so it must not be coerced to a single number.
    skip_columns=["non-beca sources"])
oio.saveStage(cl.subsample, people, 'people_0')
import python.build.purchases.articulos as articulos
import python.build.purchases.capitulo_c as capitulo_c

# Stage "purchases_0": gather the raw purchase files into one data set.
purchases = com.collect_files(
    (articulos.files
     # + medios.files
     # The tax only applies if the purchase is more than 880 million pesos,
     # and the data only records purchases of a second home.
     + capitulo_c.files + nice_purchases.files),
    subsample=com.subsample)

# Sanity-check the row count against the expected total.
# PITFALL: This differs from the usual idiom which separates testing
# from production. That's because the only thing tested here is
# the number of rows; reading the entire data set into memory again
# for such a simple test seems unworth the added execution time.
assert util.near(
    len(purchases),
    misc.num_purchases / com.subsample,
    # FIX: `com.subsample != 10` replaces the un-idiomatic
    # `not com.subsample == 10`; behavior is identical.
    tol_frac=(1 / 20 if com.subsample != 10 else 1 / 2))
# TODO | BUG? Why is the previous conditional necessary? That is, why,
# in the special case of subsample = 1/10, is the size of the
# purchase data so different from what you'd expect?
# This isn't necessarily wrong, since the data is subsampled by households,
# and households can make different numbers of purchases.
# That's why `tol_frac` needs to be substantial in both cases.
# But it's surprising, because for subsample = 10,
# the reality is much less than the expectation.

oio.saveStage(com.subsample, purchases, 'purchases_0')
("ICMUG", 0, "ICM", 0), ("ICMDUG", 0, "ICMD", 0 ) # "ingreso corriente monetario disponisble" , ("GTUG", 0, "GT", 0), ("GCUG", 0, "GC", 0), ("GCMUG", 0, "GCM", 0) # "gasto corriente monetario" ]) ] buildings = cl.collect_files(files, subsample=1) # see PITFALL above for c in ["IT", "IC", "ICM", "ICMD", "GT", "GC", "GCM"]: buildings = (cla.Correction.Replace_Substring_In_Column( c, ",", ".").correct(buildings)) if True: # estrato is strange # It includes undocumented values 0 and 9. # 0 might mean "renter": # https://www.eltiempo.com/archivo/documento/MAM-1757051 # I'm assuming 9 is some kind of error code. buildings["estrato"] = buildings["estrato"].replace(9, np.nan) buildings["recently bought this house"] = ( buildings["recently bought this house"] == 1) oio.saveStage( 1 # see PITFALL above , buildings, 'buildings')
# Stage "people_4_post_households": person-level variables derived after
# the household merge, chiefly the allocation of household purchaselike
# quantities to individual adults.

# "share" = fraction of household purchaselike variables attributed to this
# adult: income share when household income is positive, otherwise an even
# split among household members in the labor force.
nonpositive_hh_income = earners["income, household"] <= 0
earners["share"] = np.where(
    nonpositive_hh_income,
    1 / earners["members in labor force"],
    earners["income"] / earners["income, household"])
earners["one"] = 1  # the trivial group in the person-level report
for var in defs.household_variables_to_allocate_by_income_share:
    earners[var] *= earners["share"]

# Quantiles and ratio variables.
earners["income-decile"] = util.noisyQuantile(10, 0, 1, earners["income"])
earners["income-percentile"] = util.noisyQuantile(100, 0, 1,
                                                  earners["income"])
earners["vat / purchase value"] = (earners["vat paid"] /
                                   earners["value, purchase"])
# PITFALL: the maximum of "vat / income" looks absurd (103) but is real;
# the 95th percentile is 0.3, and the outliers are so high because people
# can spend borrowed money.
earners["vat / income"] = earners["vat paid"] / earners["income"]
earners["purchase value / income"] = (earners["value, purchase"] /
                                      earners["income"])

oio.saveStage(com.subsample, earners,
              "people_4_post_households." + com.strategy_year_suffix)
import sys
import pandas as pd
# FIX: these two imports were commented out, but `ss` and `f4` are called
# below -- running the stage raised NameError. Restore them.
import python.build.ss_functions as ss
import python.build.output_io as oio
import python.common.util as util
import python.common.common as com
import python.build.people_3_income_taxish_functions as f4

# Select the tax-regime implementation by configured year.
if com.regime_year == 2016:
    import python.regime.r2016 as regime
elif com.regime_year == 2018:
    import python.regime.r2018 as regime
else:
    import python.regime.r2019 as regime

ppl = oio.readStage(com.subsample, "people_2_buildings")
ppl = ss.mk_ss_contribs(ppl)               # social-security contributions
ppl = f4.insert_has_dependent_column(ppl)  # dependents flag for the regime
ppl = regime.income_taxes(ppl)
oio.saveStage(com.subsample, ppl,
              'people_3_income_taxish.' + com.strategy_year_suffix)
# Stage "purchase_sums": zero out VAT columns on non-purchase rows, then
# aggregate purchase-like quantities to the household level.
vat_columns = [col for col in purchases.columns if vat_regex.match(col)]
# This could change -- that's why I use a regexp -- but currently,
# vat_columns is equal to this: ["vat", "vat frac", "vat paid"]
is_purchase = purchases["is-purchase"] > 0
for vc in vat_columns:
    purchases[vc] = is_purchase * purchases[vc]
purchases["transactions"] = 1  # summed per household just below

summed_columns = [
    "value, purchase", "value, non-purchase", "transactions", "vat paid",
    "value, tax, predial", "value, tax, purchaselike non-predial non-VAT"
]
purchase_sums = (purchases
                 .groupby(["household"])[summed_columns]
                 .agg("sum")
                 .reset_index(level=["household"]))

# These columns are cheaper to compute after aggregation.
purchase_sums["value, tax, purchaselike non-VAT"] = (
    purchase_sums["value, tax, predial"] +
    purchase_sums["value, tax, purchaselike non-predial non-VAT"])
# Spending: taxes and purchases, but no gifts.
# PITFALL: Includes VAT (it's part of "value, purchase").
purchase_sums["value, spending"] = (
    purchase_sums["value, tax, purchaselike non-VAT"] +
    purchase_sums["value, purchase"])
# Consumption: purchases and gifts, but no taxes (except VAT).
# PITFALL: Includes VAT (it's part of "value, purchase").
purchase_sums["value, consumption"] = (
    purchase_sums["value, non-purchase"] +
    purchase_sums["value, purchase"])

oio.saveStage(c.subsample, purchase_sums, "purchase_sums." + c.strategy_suffix)
# results in the fraction of that payment attributable to the vat. # This is because reported expenditures are post-tax. # For instance, if the VAT were 20%, then (0.2 / 1.2) is that fraction. # TODO: Test (automatically). data["vat"] = (data[vat_components].sum(axis="columns")) data["vat frac"] = (data["vat"] / (data["vat"] + 1)) return data def go(data: pd.DataFrame) -> pd.DataFrame: return (compute_total_vat( incorporate_user_vat_prefs(data) ).drop( # Once they have been summed, the components are of no interest. columns=vat_components + ["group"])) vat_coicop = go(vat_coicop) vat_cap_c = go(vat_cap_c) if True: # save oio.saveStage(c.subsample, vat_coicop, 'vat_coicop.' + c.strategy_suffix) oio.saveStage(c.subsample, vat_cap_c, 'vat_cap_c.' + c.strategy_suffix) # vat_coicop = vat_coicop.drop(columns=["description", "Notes"]) vat_cap_c = vat_cap_c.drop(columns=["description"]) # oio.saveStage(c.subsample, vat_coicop, 'vat_coicop_brief.' + c.strategy_suffix) oio.saveStage(c.subsample, vat_cap_c, 'vat_cap_c_brief.' + c.strategy_suffix)
# Stage "purchases_1": coerce columns to numbers, then apply row-level
# cleaning corrections to the purchase data.
purchases = com.all_columns_to_numbers(purchases)
purchases = defs.drop_if_coicop_or_value_invalid(purchases)
purchases = defs.drop_absurdly_big_expenditures(purchases)
purchases = (Correction.Drop_Row_If_Column_Satisfies_Predicate(
    "value", lambda v: v <= 0).correct(purchases))
purchases = (
    Correction.  # no "never" frequencies
    Drop_Row_If_Column_Satisfies_Predicate(
        "per month", lambda x: x == 11).correct(purchases))
purchases = (
    Correction.  # no non-positive quantities
    Drop_Row_If_Column_Satisfies_Predicate(
        "quantity", lambda x: x <= 0).correct(purchases))

# These only make sense once the relevant columns are numbers.
# NOTE(review): the final quantity filter in the list below repeats the one
# already applied above. Filtering twice is idempotent on the data, but
# confirm whether Correction.correct has side effects (e.g. recording the
# corrections applied) before removing the duplicate.
for c in (  # how-got=1 -> is-purchase=1, nan -> nan, otherwise -> 0
    [
        Correction.Apply_Function_To_Column(
            "how-got",
            lambda x: 1 if x == 1 else
            # PITFALL: x >= 0 yields False for NaN
            (0 if x >= 0 else np.nan)),
        Correction.Rename_Column("how-got", "is-purchase"),
        Correction.Drop_Row_If_Column_Satisfies_Predicate(
            "quantity", lambda x: x <= 0)
    ]):
    purchases = c.correct(purchases)
oio.saveStage(cl.subsample, purchases, 'purchases_1')
                lambda x: str(gv) + ": " + defs.maybeFill(gv, str(x)),
                t.index))))
        # NOTE(review): this is the tail of make_summary_frame; its `def`
        # and the loops' headers lie outside this chunk, so the nesting
        # shown here is reconstructed -- confirm against the full file.
        varSummaries.append(t)
    groupSummaries.append(pd.concat(varSummaries, axis=1))
    summaryDict[unit] = pd.concat(groupSummaries, axis=0)
    # Stack all group summaries, then transpose so measures become rows.
    # "tmi" presumably = "too much information": the unrestricted table.
    ret_tmi = pd.concat(list(summaryDict.values()), axis=0).transpose()
    ret = ret_tmi.loc[restrictedVars]  # the short report
    ret_tmi.reset_index(inplace=True)
    ret_tmi = ret_tmi.rename(columns={"index": "measure"})
    ret.reset_index(inplace=True)
    ret = ret.rename(columns={"index": "measure"})
    return (ret, ret_tmi)


# Build and save both the full ("tmi") and restricted reports for each unit,
# in both the default format and Excel.
for (unit, df, variables, restrictedVars) in [
    ("earners", earners, defs.earnerVars, defs.earnerRestrictedVars),
    ("households", households, defs.householdVars,
     defs.householdRestrictedVars)
]:
    (ret, ret_tmi) = make_summary_frame(unit, df, variables, restrictedVars)
    oio.saveStage(com.subsample, ret_tmi,
                  "report_" + unit + "_tmi." + com.strategy_year_suffix)
    oio.saveStage_excel(com.subsample, ret_tmi,
                        "report_" + unit + "_tmi." + com.strategy_year_suffix)
    oio.saveStage(com.subsample, ret,
                  "report_" + unit + "." + com.strategy_year_suffix)
    oio.saveStage_excel(com.subsample, ret,
                        "report_" + unit + "." + com.strategy_year_suffix)
# Stage "households_2_purchases": merge the purchase aggregates into the
# household data and derive a few ratios.
# PITFALL: The unit of observation in all these data sets is a household.
hh = oio.readStage(com.subsample,
                   "households_1_agg_plus." + com.strategy_year_suffix)
pur = oio.readStage(com.subsample,
                    "purchase_sums." + com.strategy_suffix)
households = hh.merge(pur, how="left", on=["household"])

# In San Andrés there is no VAT.
in_san_andres = households["region-1"] == "SAN ANDRÉS"
households.loc[in_san_andres, "vat paid"] = 0

# Ratio variables.
households["vat / purchase value"] = (households["vat paid"] /
                                      households["value, purchase"])
households["vat / income"] = households["vat paid"] / households["income"]
households["purchase value / income"] = (households["value, purchase"] /
                                         households["income"])

oio.saveStage(com.subsample, households,
              "households_2_purchases." + com.strategy_year_suffix)