def parse_file(fn):
    """Yield one parsed earmark record per data row of the spreadsheet *fn*.

    The sheet is loaded with ``xls2list.xls2list`` and its first three rows
    are skipped.  Every remaining row is passed through ``parse_row``.
    Iteration ends at the first row whose parsed ``description`` is empty,
    since every real earmark carries a description.

    The ``id`` of each yielded record is overwritten with its 1-based
    position among the data rows, because the ids found in the sheet are
    not unique.
    """
    rows = xls2list.xls2list(fn)[3:]
    for offset, raw in enumerate(rows):
        record = parse_row(raw)
        if not record.description:
            # Ran past the last earmark; nothing useful follows.
            return
        # Number from 1 rather than 0.
        record.id = offset + 1
        yield record
def parse_file(fn):
    """Yield one parsed earmark record per data row of the spreadsheet *fn*.

    The sheet is loaded with ``xls2list.xls2list`` and its first three rows
    are skipped.  Every remaining row is passed through ``parse_row``.
    Iteration ends at the first row whose parsed ``description`` is empty,
    since every real earmark carries a description.

    The ``id`` of each yielded record is overwritten with its 1-based
    position among the data rows, because the ids found in the sheet are
    not unique.
    """
    rows = xls2list.xls2list(fn)[3:]
    for offset, raw in enumerate(rows):
        record = parse_row(raw)
        if not record.description:
            # Ran past the last earmark; nothing useful follows.
            return
        # Number from 1 rather than 0.
        record.id = offset + 1
        yield record
def parse_state(state):
    """Yield one record per location found in the SOI spreadsheet for *state*.

    The sheet at ``SOI_PATH % state`` is read with ``xls2list.xls2list``.
    Rows before index 11 are treated as headers.  From there the sheet is
    walked in strides of 8 rows; the first 7 rows of each stride form one
    location's bundle.  Column 0 of the bundle's first row is the location
    and every row of the bundle describes one bracket.

    Each yielded ``web.storage`` carries:

    * ``loc`` -- the location cell; a float cell is rendered as a
      zero-padded 5-digit string.
    * ``brackets`` -- one ``web.storage`` per bundle row with
      ``bracket_low``, the values read from fixed columns (``n_filers``,
      ``agi``, ``tot_tax``, ``n_dependents``, ``n_eitc``, ``tot_eitc``,
      ``tot_charity``, ``n_prepared``) and whichever derived ratios could
      be computed from them.
    * ``gini`` -- the result of ``gini_est(brackets)``; left unset when
      that call raises ``MissingData``.

    Scanning stops at the first bundle whose location cell is ``None``.
    Bundles whose location is "MISSOURI" are skipped (duplicated data).
    """
    # A ratio is skipped when an operand is None (TypeError) or the
    # denominator is zero.
    uncomputable = (TypeError, ZeroDivisionError)

    # (derived attribute, numerator attribute, denominator attribute),
    # evaluated in this order for every bracket.
    ratios = (
        ('pct_prepared', 'n_prepared', 'n_filers'),
        ('pct_charity', 'tot_charity', 'agi'),
        ('avg_eitc', 'tot_eitc', 'n_eitc'),
        ('pct_eitc', 'n_eitc', 'n_filers'),
        ('avg_dependents', 'n_dependents', 'n_filers'),
        ('avg_taxburden', 'tot_tax', 'agi'),
        ('avg_income', 'agi', 'n_filers'),
    )
    # Ratios that get a fallback value, instead of staying unset, when the
    # denominator is zero.
    zero_defaults = {'avg_eitc': 0}

    def fixnum(x, multiply=1):
        """Return ``x * multiply``, or None for a text cell containing '*'."""
        # NOTE(review): '*' presumably flags a suppressed value -- confirm.
        if isinstance(x, unicode) and '*' in x:
            return None
        return x * multiply

    def bracket_floor(label):
        """Convert a bracket label cell into the bracket's lower bound.

        Float cells and the 'Total' row map to None, "Under $10,000" maps
        to 0, and any other label yields the digits of its first
        whitespace-separated token (e.g. "$10,000" -> 10000).
        """
        if isinstance(label, float):
            return None
        if isinstance(label, unicode) and label.strip() == 'Total':
            return None
        if label.strip() == "Under $10,000":
            return 0
        return int(''.join(ch for ch in label.split()[0] if ch.isdigit()))

    stats = xls2list.xls2list(SOI_PATH % state)

    # Start at row 11 (everything above is headers) and advance 8 rows per
    # location; a bundle is only read while loc + 7 < len(stats).
    for loc in range(11, len(stats) - 7, 8):
        bundle = stats[loc:loc + 7]

        place = bundle[0][0]
        if place is None:
            break
        if isinstance(place, float):
            place = str(int(place)).zfill(5)
        if place.strip() == "MISSOURI":
            continue  # duped data

        out = web.storage()
        out.loc = place
        out.brackets = []

        for line in bundle:
            # The 1000 multipliers presumably convert amounts reported in
            # thousands -- TODO confirm against the sheet layout.
            br = web.storage(
                bracket_low=bracket_floor(line[0]),
                n_filers=fixnum(line[1]),
                agi=fixnum(line[4], 1000),
                tot_tax=fixnum(line[35], 1000),
                n_dependents=fixnum(line[3]),
                n_eitc=fixnum(line[36]),
                tot_eitc=fixnum(line[37], 1000),
                tot_charity=fixnum(line[26], 1000),
                n_prepared=fixnum(line[38]),
            )

            for target, top, bottom in ratios:
                try:
                    value = float(getattr(br, top)) / getattr(br, bottom)
                except uncomputable as exc:
                    # Only a zero denominator may fall back to a default;
                    # a missing operand always leaves the ratio unset.
                    if (isinstance(exc, ZeroDivisionError)
                            and target in zero_defaults):
                        setattr(br, target, zero_defaults[target])
                else:
                    setattr(br, target, value)

            out.brackets.append(br)

        try:
            out.gini = gini_est(out.brackets)
        except MissingData:
            pass
        yield out
def parse_state(state):
    """Yield one record per location found in the SOI spreadsheet for *state*.

    The sheet at ``SOI_PATH % state`` is read with ``xls2list.xls2list``.
    Rows before index 11 are treated as headers.  From there the sheet is
    walked in strides of 8 rows; the first 7 rows of each stride form one
    location's bundle.  Column 0 of the bundle's first row is the location
    and every row of the bundle describes one bracket.

    Each yielded ``web.storage`` carries:

    * ``loc`` -- the location cell; a float cell is rendered as a
      zero-padded 5-digit string.
    * ``brackets`` -- one ``web.storage`` per bundle row with
      ``bracket_low``, the values read from fixed columns (``n_filers``,
      ``agi``, ``tot_tax``, ``n_dependents``, ``n_eitc``, ``tot_eitc``,
      ``tot_charity``, ``n_prepared``) and whichever derived ratios could
      be computed from them.
    * ``gini`` -- the result of ``gini_est(brackets)``; left unset when
      that call raises ``MissingData``.

    Scanning stops at the first bundle whose location cell is ``None``.
    Bundles whose location is "MISSOURI" are skipped (duplicated data).
    """
    # A ratio is skipped when an operand is None (TypeError) or the
    # denominator is zero.
    uncomputable = (TypeError, ZeroDivisionError)

    # (derived attribute, numerator attribute, denominator attribute),
    # evaluated in this order for every bracket.
    ratios = (
        ('pct_prepared', 'n_prepared', 'n_filers'),
        ('pct_charity', 'tot_charity', 'agi'),
        ('avg_eitc', 'tot_eitc', 'n_eitc'),
        ('pct_eitc', 'n_eitc', 'n_filers'),
        ('avg_dependents', 'n_dependents', 'n_filers'),
        ('avg_taxburden', 'tot_tax', 'agi'),
        ('avg_income', 'agi', 'n_filers'),
    )
    # Ratios that get a fallback value, instead of staying unset, when the
    # denominator is zero.
    zero_defaults = {'avg_eitc': 0}

    def fixnum(x, multiply=1):
        """Return ``x * multiply``, or None for a text cell containing '*'."""
        # NOTE(review): '*' presumably flags a suppressed value -- confirm.
        if isinstance(x, unicode) and '*' in x:
            return None
        return x * multiply

    def bracket_floor(label):
        """Convert a bracket label cell into the bracket's lower bound.

        Float cells and the 'Total' row map to None, "Under $10,000" maps
        to 0, and any other label yields the digits of its first
        whitespace-separated token (e.g. "$10,000" -> 10000).
        """
        if isinstance(label, float):
            return None
        if isinstance(label, unicode) and label.strip() == 'Total':
            return None
        if label.strip() == "Under $10,000":
            return 0
        return int(''.join(ch for ch in label.split()[0] if ch.isdigit()))

    stats = xls2list.xls2list(SOI_PATH % state)

    # Start at row 11 (everything above is headers) and advance 8 rows per
    # location; a bundle is only read while loc + 7 < len(stats).
    for loc in range(11, len(stats) - 7, 8):
        bundle = stats[loc:loc + 7]

        place = bundle[0][0]
        if place is None:
            break
        if isinstance(place, float):
            place = str(int(place)).zfill(5)
        if place.strip() == "MISSOURI":
            continue  # duped data

        out = web.storage()
        out.loc = place
        out.brackets = []

        for line in bundle:
            # The 1000 multipliers presumably convert amounts reported in
            # thousands -- TODO confirm against the sheet layout.
            br = web.storage(
                bracket_low=bracket_floor(line[0]),
                n_filers=fixnum(line[1]),
                agi=fixnum(line[4], 1000),
                tot_tax=fixnum(line[35], 1000),
                n_dependents=fixnum(line[3]),
                n_eitc=fixnum(line[36]),
                tot_eitc=fixnum(line[37], 1000),
                tot_charity=fixnum(line[26], 1000),
                n_prepared=fixnum(line[38]),
            )

            for target, top, bottom in ratios:
                try:
                    value = float(getattr(br, top)) / getattr(br, bottom)
                except uncomputable as exc:
                    # Only a zero denominator may fall back to a default;
                    # a missing operand always leaves the ratio unset.
                    if (isinstance(exc, ZeroDivisionError)
                            and target in zero_defaults):
                        setattr(br, target, zero_defaults[target])
                else:
                    setattr(br, target, value)

            out.brackets.append(br)

        try:
            out.gini = gini_est(out.brackets)
        except MissingData:
            pass
        yield out