Ejemplo n.º 1
0
def test_vat_file( filename
                 , code_column_name
                 , fraction_that_should_be_non_null ):
  """Check the columns, types, ranges and nullity of one VAT file.

  Reads the stage named `filename + "." + cl.strategy_suffix` for
  `cl.subsample`, then asserts, in order, that:
    - `unique( df.columns )` is truthy,
    - "vat" lies in [0, 0.3),
    - the columns are exactly `code_column_name`, "vat" and "vat frac",
    - the code column is int64 and every other column is float64,
    - the non-null share of "vat" exceeds
      `fraction_that_should_be_non_null - tolerance`
      (`tolerance` is read from the enclosing module),
    - "vat" and "vat frac" have equally many non-null values,
    - every column other than those two has no missing values.

  Raises AssertionError at the first expectation that fails.
  """

  def count_present( series ):
    # Number of non-missing entries in the column.
    return len( series[ pd.notnull( series ) ] )

  stage_name = filename + "." + cl.strategy_suffix
  df = oio.readStage( cl.subsample, stage_name )

  assert unique( df.columns )

  # The special motorcycle tax, abusively lumped into the VAT table,
  # means the max "vat" is 0.27 rather than 0.19.
  assert df["vat"].min() >= 0
  assert df["vat"].max() < 0.3

  assert set( df.columns ) == { code_column_name, 'vat', 'vat frac' }

  for name in df.columns:
    expected_dtype = "int64" if name == code_column_name else "float64"
    assert df[name].dtype == expected_dtype

  # The "vat" and "vat frac" columns might have a few missing values.
  # The others should have none.
  n_vat = count_present( df["vat"] )
  assert n_vat / len( df ) > fraction_that_should_be_non_null - tolerance
  assert n_vat == count_present( df["vat frac"] )

  rest = df.drop( columns = ["vat", "vat frac"] )
  for name in rest.columns:
    assert not pd.isnull( rest[name] ).any()
Ejemplo n.º 2
0
if True:
    import pandas as pd
    import python.build.output_io as oio
    import python.common.common as com

# Per-household sums of "used savings"; the constant column "one"
# sums to the number of person rows in each household.
ppl = oio.readStage(com.subsample, 'people_1')
dp = ppl[["household", "used savings"]].assign(one=1)
dh = dp.groupby("household").sum()
Ejemplo n.º 3
0
if True:
    import os
    import pandas as pd
    import sys
    from typing import List, Tuple
    #
    import python.build.output_io as oio
    import python.common.common as com
    import python.common.describe as desc
    import python.common.misc as c
    import python.draw.util as draw
    import python.report.defs as defs

if True:  # load data
    # Both stages carry the same strategy-and-year suffix.
    households, earners = (
        oio.readStage(com.subsample, stage + com.strategy_year_suffix)
        for stage in ("households_2_purchases.",
                      "people_4_post_households."))

if True:  # Create a few columns missing in the input data.
    # TODO ? Move upstream.
    for df in [households, earners]:
        labor_income = df["income, labor"]
        cesantia_income = df["income, cesantia"]
        df["income, labor + cesantia"] = labor_income + cesantia_income

        # Inclusive on both ends: 90 <= percentile <= 97.
        df["income-percentile-in[90,97]"] = (
            df["income-percentile"].between(90, 97))

        df["income < min wage"] = df["income"] < c.min_wage
Ejemplo n.º 4
0
# Merge the building data into the person-level data.

if True:
    import sys
    import pandas as pd
    #
    import python.build.output_io as oio
    import python.common.util as util
    import python.common.common as common

if True:  # merge people, buildings
    # PITFALL: For buildings, we always use the full sample,
    # hence the literal 1 rather than common.subsample.
    buildings = oio.readStage(1, 'buildings', dtype={"estrato": 'float64'})
    # Left merge: every person row is kept.
    people = oio.readStage(common.subsample, 'people_1').merge(
        buildings, how="left", on="household")

if True:  # make some new variables
    people["age-decile"] = pd.qcut(
        people["age"], q=10, labels=False, duplicates='drop')

    # PITFALL: there's a different such variable at the household level
    people["income-decile"] = util.noisyQuantile(10, 0, 1, people["income"])

    # "female" multiplied by the boolean "is household member number 1".
    is_first_member = people["household-member"] == 1
    people["female head"] = people["female"] * is_first_member

# PITFALL: As noted earlier, the buildings data is always drawn from the full
Ejemplo n.º 5
0
        from matplotlib.ticker import EngFormatter
    if True:  # more imports
        import sys
        import os
        import numpy as np
        from functools import reduce
        #
        import python.common.util as util
        import python.draw.util as draw
        import python.build.output_io as oio
        import python.build.common as c

# Output directory for the pictures of this subsample and strategy.
vat_pics_dir = ("output/vat/pics/recip-" + str(c.subsample) + "/" +
                c.strategy_suffix + "/")
# exist_ok=True creates the directory if needed and is a no-op if it is
# already there, without the race of a separate exists-then-create check.
os.makedirs(vat_pics_dir, exist_ok=True)

# Household-level data and its per-decile summary, for the same
# subsample and strategy.
households = oio.readStage(c.subsample, 'households.' + c.strategy_suffix)
households_decile_summary = oio.readStage(
    c.subsample, 'households_decile_summary.' + c.strategy_suffix)

if True:  # single series
    plt.close()
    draw.single_cdf(households["members"], "Household size", xmax=10)
    draw.savefig(vat_pics_dir + "households", "size")

    plt.close()
    draw.single_cdf(households["transactions"],
                    "Transactions per month",
                    xmax=150)
    draw.savefig(vat_pics_dir + "households", "transactions-per-month")

    plt.close()
Ejemplo n.º 6
0
# This creates a single key from a collection of keys,
# so that a dataset can be compared to a previous one using csv-diff, e.g.:
#   csv-diff old.csv new.csv --key="id"

if True:
    import sys
    import pandas as pd
    #
    import python.build.output_io as oio
    import python.common.common as cl

p4 = oio.readStage(
    cl.subsample, "people_3_income_taxish." + cl.strategy_year_suffix)

# Single string key "<household>:<household-member>" for each row.
household_str = p4["household"].astype(str)
member_str = p4["household-member"].astype(str)
p4["id"] = household_str + ":" + member_str

# Written to the name the csv-diff example in the header expects.
p4.to_csv("old.csv")
Ejemplo n.º 7
0
if True:
  import pandas as pd
  import numpy as np
  #
  import python.build.classes   as cl
  import python.build.output_io as oio
  import python.common.common   as com
  import python.common.util     as util


if True:
  # The same input stage is read twice: once for a single row
  # (hh_cols) and once for only its "household" column (hh_rows).
  _agg_plus = "households_1_agg_plus." + com.strategy_year_suffix
  hh_cols = oio.readStage( com.subsample, _agg_plus, nrows = 1 )
  hh_rows = oio.readStage( com.subsample, _agg_plus,
                           usecols = ["household"] )

  pur = oio.readStage( com.subsample,
                       "purchase_sums." + com.strategy_suffix )
  merge = oio.readStage( com.subsample,
                         "households_2_purchases." + com.strategy_year_suffix )

if True: # See people_2_buildings_test for how to use these definitions.
  assert util.unique( merge.columns )
  new_cols = [ "vat / purchase value",
               "vat / income",
               "purchase value / income" ]
Ejemplo n.º 8
0
      # enable the previous line if calling from the (non-gui) shell
    import matplotlib.pyplot as plt
    from matplotlib.ticker import EngFormatter
  if True: # more imports
    import sys
    import os
    import numpy as np
    #
    import python.build.output_io as oio
    import python.draw.util as draw
    import python.build.common as c


# Output directory for the pictures of this subsample and strategy.
vat_pics_dir = "output/vat/pics/recip-" + str(c.subsample) + "/" + c.strategy_suffix + "/"
# exist_ok=True creates the directory if needed and is a no-op if it is
# already there, without the race of a separate exists-then-create check.
os.makedirs(vat_pics_dir, exist_ok=True)
purchases = oio.readStage( c.subsample, 'purchases_2_vat.' + c.strategy_suffix )


if True: # purchase quantity, logx and linear
  quantity = purchases["quantity"]
  cdf_title = "CDF of quantity per purchase"

  # Linear x axis from 1 to 1000, ticks formatted by EngFormatter.
  plt.close()
  draw.single_cdf( quantity, cdf_title, xmin = 1, xmax = 1e3 )
  plt.gca().xaxis.set_major_formatter( EngFormatter( places = 2 ) )
  draw.savefig( vat_pics_dir + "purchases" , "quantity" )

  # Same series with logx = True and no upper bound.
  plt.close()
  draw.single_cdf( quantity, cdf_title, xmin = 1, logx = True )
  draw.savefig( vat_pics_dir + "purchases/logx" , "quantity" )

Ejemplo n.º 9
0
# people["dependent"]
# r2018 . income_taxes( ppl )
#
#
# ### how to test
# Restrict the file to dependents.
# Mark everyone as having a dependent.
# Compute everyone's taxes.
# Restrict to people with "tax, income" > 0.
# Is the set empty?

if True:
    import pandas as pd
    #
    import python.common.common as com
    import python.build.output_io as oio
    import python.regime.r2018 as reg
    import python.build.ss_functions as ss

ppl = oio.readStage(
    com.subsample, "people_3_purchases." + com.strategy_suffix)

if True:
    # Restrict to dependents, set every "claims dependent" flag to False,
    # and compute income taxes under the 2018 regime module.
    ppl = ppl[ppl["dependent"]]
    ppl = ss.mk_ss_contribs(ppl)
    ppl["claims dependent (labor income tax)"] = False
    ppl = reg.income_taxes(ppl)
    pays_income_tax = ppl["tax, income"] > 0
    len(ppl[pays_income_tax])  # shown only in an interactive session

# Dependents who nonetheless owe income tax.
rich_deps = ppl[pays_income_tax].copy()
rich_deps["tax, income"].describe()
Ejemplo n.º 10
0
if True:
    import pandas as pd
    import numpy as np
    #
    import python.common.util as util
    import python.build.output_io as oio
    import python.common.common as com
    import python.build.people_4_post_households_defs as defs

if True:  # input
    hs = oio.readStage(
        com.subsample, "households_2_purchases." + com.strategy_year_suffix)
    ps = oio.readStage(
        com.subsample, 'people_3_income_taxish.' + com.strategy_year_suffix)

if True:  # Prepare to merge.
    # Both inputs have an "income" column; rename the household one.
    hs = hs.rename(columns={"income": "income, household"})

if True:  # Merge people and households.
    m = ps.merge(hs[defs.columns_to_pull_from_hs], on="household")
    # An earner is an adult (18+) in the labor force,
    # or anyone with positive income.
    adult_in_labor_force = (m["in labor force"] == 1) & (m["age"] >= 18)
    has_income = m["income"] > 0
    earners = m[adult_in_labor_force | has_income]
    del m

if True:  # Make new variables, esp. create person-level purchase-like
    earners["share"] = np.where(  # The fraction of purchaselike variables
        # attributed to this household adult.
        earners["income, household"] <= 0,  # the condition
Ejemplo n.º 11
0
import sys
import pandas as pd

import python.common.util as util
import python.build.output_io as oio
from python.build.people.files import edu_key
import python.common.misc as c
# NOTE(review): the next import rebinds `c`, so the `python.common.misc`
# alias on the line above is shadowed and every `c.` below refers to
# `python.common.common`. Confirm which module each use intends.
import python.common.common as c

# Load three stages for the configured subsample.
people = oio.readStage(c.subsample, "people_3_purchases")
households = oio.readStage(c.subsample, "households")
purchase_sums = oio.readStage(c.subsample, "purchase_sums")

# Disabled (never runs): a direct pd.Categorical construction of "edu",
# ordered by the values of edu_key. Presumably superseded by the
# util.interpretCategorical call below -- TODO confirm before deleting.
if False:
    people["edu"] = pd.Categorical(people["edu"],
                                   categories=list(edu_key.values()),
                                   ordered=True)

# Reinterpret "edu" using the values of edu_key as its categories.
people["edu"] = util.interpretCategorical(people["edu"], edu_key.values())
Ejemplo n.º 12
0
if True:
    import numpy as np
    from itertools import chain
    #
    from python.build.classes import Correction
    import python.build.output_io as oio
    import python.build.purchases.correct_defs as defs
    import python.common.common as cl
    import python.common.misc as com
    #
    # input files
    import python.build.purchases.nice_purchases as nice_purchases
    import python.build.purchases.articulos as articulos
    import python.build.purchases.capitulo_c as capitulo_c

purchases = oio.readStage(cl.subsample, 'purchases_0')

for c in (
        # PITFALL: Any correction reliant on a column's being a number
        # cannot be trusted to work here. Put it later in the program,
        # after running `all_columns_to_numbers`.
    [
        Correction.Replace_Substring_In_Column("quantity", ",", "."),
        Correction.Replace_Missing_Values("quantity", 1),
        Correction.Replace_Missing_Values("per month", 1),
        Correction.Change_Column_Type("coicop", str),
        Correction.Replace_Entirely_If_Substring_Is_In_Column(
            "coicop", "inv", np.nan)
    ] + list(
        chain.from_iterable([
            # chain.from_iterable concatenates its argument's members
# In a previous incarnation of tax.co,
# the ORDEN variable was assumed to mean the same thing in the purchase data that it means in the person data:
# a unique-within-household identifier of persons.
# This code explores the effect that has on estimated household spending.

if True:
    import numpy as np
    import pandas as pd
    #
    import python.build.classes as cla
    import python.build.purchases.legends as legends
    import python.build.output_io as oio
    import python.common.common as com
    import python.common.util as util

# The last purchases-level data set.
pur = oio.readStage(com.subsample, "purchases_2_vat." + com.strategy_suffix)

# The first person-level data set; only its two key columns are needed.
ppl = oio.readStage(
    com.subsample, 'people_1', usecols=["household", "household-member"])

# Per household: the largest member number found in the person data.
hh = ppl.groupby("household").agg({"household-member": "max"})
hh = hh.reset_index()
hh = hh.rename(columns={"household-member": "max member"})

# Per household: the largest member number found in the purchase data,
# and the number of purchase rows (the sum of a column of ones).
pur["n purchases"] = 1
hh_pur = pur.groupby("household").agg({"household-member": "max",
                                       "n purchases": "sum"})
hh_pur = hh_pur.reset_index()
hh_pur = hh_pur.rename(columns={"household-member": "max orden"})
Ejemplo n.º 14
0
    for (c, t) in [("recently bought this house", cla.InSet({True, False})),
                   ("recently bought this house", cla.CoversRange(0, 1)),
                   ("recently bought this house", cla.MeanBounds(0, 0.01)),
                   ("recently bought this house", cla.MissingAtMost(0)),
                   ("estrato", cla.InRange(0, 6)),
                   ("estrato", cla.CoversRange(0, 3)),
                   ("estrato", cla.MeanBounds(1.5, 2.5)),
                   ("estrato", cla.MissingAtMost(0.02))]:
        assert t.test(bs[c])


if True:  # run tests
    log = "starting\n"

    # PITFALL: For buildings, we always use the full sample (hence the 1).
    # "estrato" is forced to float64 because, if subsample is so small
    # that there are no missing values, it will by default be read
    # as "int64".
    bs = oio.readStage(1, 'buildings', dtype={"estrato": 'float64'})

    for check in (test_types, test_nullity, test_ranges):
        check(bs)
    assert unique(bs.columns)

    # PITFALL: Looping over subsample sizes because this program
    # always uses the full sample.
    # If it works, it works for all subsamples.
    for ss in com.valid_subsamples:
        oio.test_write(ss, "build_buildings", log)
Ejemplo n.º 15
0
  import sys
  import pandas                    as pd
  #
  import python.build.ss_functions as ss
  import python.build.output_io    as oio
  import python.common.util        as util
  import python.common.common      as com
  #
  import python.build.people_3_income_taxish_functions as f4
  if   com.regime_year == 2016:
      import python.regime.r2016 as regime
  elif com.regime_year == 2018:
      import python.regime.r2018 as regime
  else:
      import python.regime.r2019 as regime


ppl = oio.readStage( com.subsample, "people_2_buildings" )

# Each step takes the person-level frame and returns its replacement.
# They run in this order; `regime` is whichever module got bound to
# that name by the imports.
for step in ( ss.mk_ss_contribs
            , f4.insert_has_dependent_column
            , regime.income_taxes ):
  ppl = step( ppl )

oio.saveStage( com.subsample
             , ppl
             , 'people_3_income_taxish.' + com.strategy_year_suffix )
Ejemplo n.º 16
0
    # But it's surprising, because for subsample = 10,
    # the reality is much less than the expectation.

    assert (set(df.columns) == set(Purchase_2_Columns_missing.all_columns()))

    # coicop and 25-broad-categs are each individually missing substantially,
    # but exactly one of them is always present
    assert len(df[(~pd.isnull(df["coicop"]))
                  & (~pd.isnull(df["25-broad-categs"]))]) == 0
    assert len(df[(pd.isnull(df["coicop"])) |
                  (pd.isnull(df["25-broad-categs"]))]) == len(df)

    for c in Purchase_2_Columns_missing.never:
        assert (len(df[pd.isnull(df[c])]) == 0)

    for c in Purchase_2_Columns_missing.slightly:
        assert ((len(df[pd.isnull(df[c])]) / len(df)) < 0.03)

    for c in Purchase_2_Columns_missing.very:
        assert ((len(df[pd.isnull(df[c])]) / len(df)) < 0.25)

    return log


if True:  # IO
    ps = oio.readStage(
        com.subsample, "purchases_2_vat." + com.strategy_suffix)

    # Each test returns text, which is appended to the log in order.
    log = "starting\n"
    for run_test in (test_ranges, test_output):
        log += run_test(ps)

    oio.test_write(com.subsample, "build_purchases_2_vat", log)
Ejemplo n.º 17
0
# TODO: divide into sub-modules

if True:
    import numpy as np
    import pandas as pd
    import re as regex
    #
    import python.build.classes as cla
    import python.build.output_io as oio
    import python.build.people.main_defs as defs
    from python.build.people.empleados import generar_empleados
    import python.build.people.files as files
    import python.common.common as cl
    import python.common.misc as c

ppl = oio.readStage(cl.subsample, 'people_0')

# Drop non-members of household: rows whose "relationship" is 6, 7 or 8.
non_members = ppl["relationship"].isin([6, 7, 8])
ppl = ppl.drop(index=ppl[non_members].index)

if True:  # make independiente a 0 or a 1
    # Codes 4 and 5 become 1; everything else becomes 0.
    ppl["independiente"] = ppl["independiente"].apply(
        lambda code: 1 if code in (4, 5) else 0)

if True:  # remap some boolean integers
    for cn in (
        ["female"] +  # originally 1=male, 2=female
        [included for (_, included) in files.inclusion_pairs]
            # Originally, 1 = included, 2 = omitted.
            # Now 0 = included, 1 = omitted.
    ):
Ejemplo n.º 18
0
import pandas as pd
import numpy as np

import python.build.output_io as oio


subsample = 10
purchases = oio.readStage( subsample, "purchases_2_vat" )

purchases["purchases"] = 1

# When I check purchases[ purchases["coicop"] == x ] for these x,
# the results are consistent with the coicop-vat bridge.
# 11110103, 11110104, 11110105
# 1119807, 1119808, 1119809
# 1180103, 1180201, 1180301


## vat per coicop
by_coicop = purchases.groupby( 'coicop' )
p_sum = by_coicop[ "value" ].sum()
# NOTE: despite its name, p_first holds the per-coicop mean of "vat, min".
p_first = by_coicop[ "vat, min" ].mean()
p = pd.concat( [p_sum, p_first], axis = "columns" )

# index = True is passed so the "coicop" group index is written too.
oio.saveStage( subsample, p, "vat-and-spending-per-coicop", index = True )


## vat per rate
q_sum = purchases.groupby( 'vat, min' )[ "value" ] . agg( 'sum' )
Ejemplo n.º 19
0
    log += "Very few missing quantity values."
    assert ((1e-5) > (len(df[pd.isnull(df["quantity"])]) / len(df)))

    log += "Very few negative quantity values."
    assert ((1e-5) > (len(df[df["quantity"] <= 0]) / len(df)))

    log += "Negative quantity purchases are for very little money."
    assert (df[df["quantity"] < 0]["value"] < 1e4).all()

    log += "Very few purchases with a frequency of \"never\"."
    assert ((1e-5) > (len(df[df["per month"] > 10]) / len(df)))

    log += "Those few frequency=\"never\" purchases are for very little money."
    assert (df[df["per month"] > 10]["value"] < 1e4).all()

    return log


if True:  # run the tests
    log = "starting\n"

    # unit tests: each takes no arguments and returns text for the log
    for unit_test in (test_drop_if_coicop_or_value_invalid,
                      test_drop_absurdly_big_expenditures):
        log += unit_test()

    # integration test, run against the stage as written to disk
    df = oio.readStage(com.subsample, 'purchases_1')
    log += test_output(df)

    oio.test_write(com.subsample, "purchases_correct", log)
Ejemplo n.º 20
0
        ("has-child", cla.MeanBounds(0.4, 0.8)),
        ("has-elderly", cla.MeanBounds(0.1, 0.3)),
        ("used savings", cla.MeanBounds(0.03, 0.12)),  # PITFALL:
            # Bigger than the mean from the people data,
            # because it varies within household.
        ("recently bought this house", cla.MeanBounds(0, 0.01)),
        ("female head", cla.MeanBounds(0.25, 0.55)),
        ("seguro de riesgos laborales", cla.MeanBounds(0.3, 0.6))
    ]:
        assert test.test(hh[c])


if True:  # IO
    log = "starting\n"
    #
    hh = oio.readStage(com.subsample,
                       "households_1_agg_plus." + com.strategy_year_suffix)
    ppl = oio.readStage(com.subsample,
                        "people_3_income_taxish." + com.strategy_year_suffix)
    hh["edu-max"] = util.interpretCategorical(hh["edu-max"], edu_key.values())
    ppl["edu"] = util.interpretCategorical(ppl["edu"], edu_key.values())

    test_const_within_group(
        # TODO ? move this test to the tests of person data
        gs=["household"],
        cs=defs.cols_const_within_hh,
        d=hh)
    test_indices(hh=hh, ppl=ppl)
    test_income_ranks(hh=hh, ppl=ppl)
    test_sums(hh=hh, ppl=ppl)
    test_bools(hh=hh, ppl=ppl)
    com_tests.test_quantiles(df=hh)
Ejemplo n.º 21
0
        import sys
        import os
        from functools import reduce
        #
        import pandas as pd
        import numpy as np
        #
        import python.common.util as util
        import python.draw.util as draw
        import python.build.output_io as oio
        import python.build.common as c

# Output directory for the pictures of this subsample and strategy.
vat_pics_dir = "output/vat/pics/recip-" + str(
    c.subsample) + "/" + c.strategy_suffix + "/"
# exist_ok=True creates the directory if needed and is a no-op if it is
# already there, without the race of a separate exists-then-create check.
os.makedirs(vat_pics_dir, exist_ok=True)
people = oio.readStage(c.subsample, 'people_3_purchases.' + c.strategy_suffix)

# Education labels in ascending order; the embedded newlines are part of
# the labels themselves.
edu_key = {
    1: "Ninguno",
    2: "Preescolar",
    3: "Basica\n Primaria",
    4: "Basica\n Secundaria",
    5: "Media",
    6: "Superior o\n Universitaria",
    9: "No sabe,\n no informa"
}
# Ordered categorical whose category order follows edu_key's values.
people["edu"] = pd.Categorical(people["edu"],
                               categories=list(edu_key.values()),
                               ordered=True)

if True:  # single series
Ejemplo n.º 22
0
# The only thing to check is the increase in the set of columns.
# (Could check length, but a left merge cannot change that.)

if True:
    import sys
    import pandas as pd
    #
    import python.build.classes as cl
    import python.build.output_io as oio
    import python.common.common as com
    import python.common.misc as misc
    import python.common.util as util

in_cols = oio.readStage(com.subsample, "people_1", nrows=1)
in_rows = oio.readStage(com.subsample, "people_1", usecols=["household"])
out = oio.readStage(com.subsample, 'people_2_buildings')

cols1, cols2 = set(in_cols.columns), set(out.columns)

# The columns the merge is expected to add.
new_cols = {
    "estrato",
    'recently bought this house',
    "region-1",
    "region-2",
    "age-decile",
    "income-decile",
    "IT",
    "IC",
    "ICM",
    "ICMD",
    "GT",
    "GC",
    "GCM",
    "female head",
}

assert util.unique(out.columns)
assert util.unique(new_cols)

# The output's columns are the input's plus exactly the new ones,
# and none of the new names was already present in the input.
assert cols1.isdisjoint(new_cols)
assert cols1 | new_cols == cols2
assert cols2 - cols1 == new_cols
Ejemplo n.º 23
0
    import pandas as pd
    from itertools import chain
    #
    import python.build.output_io as oio
    import python.common.common as cl
    import python.common.misc as c
    import python.common.describe as desc
    import python.draw.util as draw

if cl.regime_year == 2016:
    import python.regime.r2016 as regime
else:
    import python.regime.r2018 as regime

if True:  # Get, prepare the data
    hh = oio.readStage(
        cl.subsample, "households_2_purchases." + cl.strategy_year_suffix)
    # Inclusive on both ends: 90 <= percentile <= 97.
    hh["income-percentile-in[90,97]"] = hh["income-percentile"].between(90, 97)
    hh["income < min wage"] = hh["income"] < c.min_wage

if True:  # Sum the ss tax components, keep sum, drop components.
    ss_tax_components = [
        "tax, ss, pension",
        "tax, ss, pension, employer",
        "tax, ss, salud",
        "tax, ss, salud, employer",
        "tax, ss, solidaridad",
        "tax, ss, parafiscales",
        "tax, ss, cajas de compensacion",
    ]
    # Row-wise total across the component columns.
    hh["tax, ss"] = hh[ss_tax_components].sum(axis=1)
    hh = hh.drop(columns=ss_tax_components)

if True:  # Narrow the set of columns
    basicVars = ["household", "weight"]
Ejemplo n.º 24
0
# TODO : automate these tests.

import numpy                     as np
import pandas                    as pd

import python.build.output_io    as oio
import python.common.misc        as c
import python.common.common     as cl


people = oio.readStage(
  cl.subsample, 'people_3_income_taxish.' + cl.strategy_suffix )

# Shorter column names, for interactive use.
ppl = people.rename( columns = { "relative, child"     : "child"
                               , "relative, non-child" : "rel"
                               , "dependent"           : "dep"
                               , "disabled"            : "disab"
                               , "income, labor"       : "labor" } )

# These should all have a mean of 1
# (the values are only displayed when run interactively).
ppl.loc[ (ppl["student"] == 1) & (ppl["age"] < 24)
       , "dep" ].mean()

ppl.loc[ (ppl["child"] == 1) & (ppl["age"] < 19)
       , "dep" ].mean()

ppl.loc[ (ppl["rel"] == 1) & (ppl["labor"] < (260*c.uvt))
       , "dep" ].mean()

ppl["dep"][
Ejemplo n.º 25
0
# Beyond the shape of the data, there's nothing to test.

if True:
  import pandas as pd
  #
  import python.build.classes as cla
  import python.build.output_io as oio
  import python.common.common as com
  from   python.common.misc import num_households
  import python.common.util as util


sums = oio.readStage( com.subsample,
                      "purchase_sums." + com.strategy_suffix )

# The exact set of columns the stage must have.
expected_columns = {
  "household",
  "value, tax, purchaselike non-VAT",
  "value, tax, predial",
  "value, tax, purchaselike non-predial non-VAT",
  "transactions",
  "value, non-purchase",
  "value, purchase",
  "value, spending",
  "value, consumption",
  "vat paid",
}

assert util.unique( sums.columns )
assert set( sums.columns ) == expected_columns

if com.subsample < 11: # The data is too sparse to test
                       # the smaller samples this way
Ejemplo n.º 26
0
    import python.build.classes as cla
    import python.build.purchases.legends as legends
    import python.build.output_io as oio
    import python.common.common as c
    import python.common.util as util

if True:  # input files
    purchases = oio.readStage(
        # Data is too big unless we down-cast the numbers
        # from 64-bit to 32-bit.
        c.subsample,
        "purchases_1",
        dtype={
            "25-broad-categs": "float32",
            "coicop": "float32",
            "per month": "float32",
            "household": "int32",
            "household-member": "int32",
            "is-purchase": "float32",
            "quantity": "float32",
            "value": "float32",
            "weight": "float32",
            "where-got": "float32"
        })

    vat_cap_c = (oio.readStage(c.subsample,
                               "vat_cap_c_brief." + c.strategy_suffix,
                               dtype={
                                   "25-broad-categs": "int32",
                                   "vat": "float32",
                                   "vat frac": "float32"
Ejemplo n.º 27
0
# Incorporate sums of purchases into households.
# Compute some more variables.

if True:
  import pandas as pd
  import numpy as np
  #
  import python.common.util as util
  import python.build.output_io as oio
  import python.common.common as com


if True: # merge purchase data into person data
  # PITFALL: The unit of observation in all these data sets is a household.
  hh = oio.readStage( com.subsample,
                      "households_1_agg_plus." + com.strategy_year_suffix )
  pur = oio.readStage( com.subsample,
                       "purchase_sums." + com.strategy_suffix )
  # Left merge: every row of hh is kept.
  merge = hh.merge( pur, how = "left", on = "household" )

if True: # In San Andrés there is no VAT.
  in_san_andres = merge["region-1"] == "SAN ANDRÉS"
  merge.loc[ in_san_andres, "vat paid" ] = 0

if True: # create a few more variables
  merge["vat / purchase value" ] = (
    merge["vat paid"]        / merge["value, purchase" ] )
  merge["vat / income"] = (