def load_soi_farm_prop(data_tree=None,
                       blue_tree=None, blueprint=None,
                       from_out=False, out_path=_FARM_PROP_OUT_PATH):
    """ This function loads the soi farm proprietorship data.

    A "Land" and an "FA" column are added to every industry under the
    _FARM_DF_NM dataframe, filled in for NAICS codes 111 and 112, and then
    spread through the tree with naics.pop_back/naics.pop_forward.

    :param data_tree: The NAICS tree to read the data into. If omitted, a
           new tree is generated on every call (a default built in the
           signature would be shared, and mutated, across calls).
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file.
    :returns: The populated NAICS tree.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Load Farm Proprietorship data:
    farm_data = pd.read_csv(_FARM_IN_PATH)
    new_farm_cols = ["Land", "FA"]
    # Add an empty farm dataframe to every industry in the tree:
    data_tree.append_all(df_nm=_FARM_DF_NM, df_cols=new_farm_cols)
    # Multiplier applied to each industry's share of land:
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) *
                 (float(farm_data["A_sp"][0]) / farm_data["A_p"][0]))
    total = farm_data["R_p"][0] + farm_data["Q_p"][0]
    total_pa = 0
    cur_codes = [111, 112]
    proportions = naics.get_proportions(cur_codes, data_tree, _AST_PRT_DF_NM,
                                        [_LAND_COL_NM, _DEPR_COL_NM])
    # Total partnership land plus depreciable assets over the farm codes:
    for ind_code in cur_codes:
        cur_ind = naics.find_naics(data_tree, ind_code)
        cur_df = cur_ind.data.dfs[_AST_PRT_DF_NM]
        total_pa += (cur_df[_LAND_COL_NM][0] + cur_df[_DEPR_COL_NM][0])
    # Fill in land and fixed assets for each farm industry:
    for i, ind_code in enumerate(cur_codes):
        cur_ind = naics.find_naics(data_tree, ind_code)
        farm_df = cur_ind.data.dfs[_FARM_DF_NM]
        farm_df["Land"][0] = (
            land_mult * cur_ind.data.dfs[_AST_PRT_DF_NM][_LAND_COL_NM][0] /
            total_pa)
        # FA is the industry's share of the total, net of its land:
        farm_df["FA"][0] = ((proportions.iloc[1, i] * total) -
                            farm_df["Land"][0])
    # Default blueprint is the total corporation dataframe, when present:
    if (blueprint is None and
            _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_FARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_FARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
def read_land(asset_tree):
    """ Builds a NAICS tree holding land data under the "Land" dataframe.

    :param asset_tree: A NAICS tree whose industries carry a "LAND"
           dataframe; it is used as the blueprint for populating forward.
    :returns: A new NAICS tree with "All", "Corp" and "Non-Corp" columns.
              The tree is returned unpopulated when the corporate and
              non-corporate "LAND" entries of the first industry sum to 0.
    """
    # Read the land data and scale it by the input file factor:
    scaled_land = _LAND_IN_FILE_FCTR * pd.read_csv(_LAND_IN_PATH)
    # Initializing NAICS tree for the land data:
    land_tree = naics.generate_tree()
    land_tree.append_all(df_nm="Land", df_cols=["All", "Corp", "Non-Corp"])

    def _root_land(col_nm):
        # First entry of a "LAND" column for the first enumerated industry.
        return asset_tree.enum_inds[0].data.dfs["LAND"][col_nm][0]

    # Totals for the corporate and non-corporate tax categories:
    corp_total = sum((_root_land(nm) for nm in _CORP_NMS), 0.0)
    ncorp_total = sum((_root_land(nm) for nm in _NCORP_NMS), 0.0)
    if corp_total + ncorp_total == 0:
        return land_tree
    # Initialize the total industry category (first enumerated industry):
    root_df = land_tree.enum_inds[0].data.dfs["Land"]
    root_df["Corp"][0] = scaled_land["Corporate"][0]
    root_df["Non-Corp"][0] = scaled_land["Non-Corporate"][0]
    root_df["All"][0] = (scaled_land["Corporate"][0] +
                         scaled_land["Non-Corporate"][0])
    # Use the asset_tree to populate the rest:
    naics.pop_back(land_tree, ["Land"])
    naics.pop_forward(land_tree, ["Land"], "LAND", asset_tree)
    return land_tree
def load_type(data_tree=None,
              blue_tree=None, blueprint=None,
              from_out=False, out_path=None):
    """ This function loads the soi partnership data on income by partner
    type.

    :param data_tree: The NAICS tree to read the data into. If omitted, a
           new tree is generated on every call (a default built in the
           signature would be shared, and mutated, across calls).
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward. Defaults to _INC_DF_NM if the tree has it, otherwise
           to _TOT_CORP_DF_NM if the tree has that.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file (default _TYP_OUT_PATH).
    :returns: The populated NAICS tree.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Initializing the output path:
    if out_path is None:
        out_path = _TYP_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Opening data on income by partner type:
    wb = xlrd.open_workbook(_TYP_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Initializing dataframe to hold pertinent type income data:
    typ_df = pd.DataFrame(np.zeros((ws.ncols-1, len(_TYP_IN_ROW_NMS))),
                          columns=_TYP_DF_DICT.values())
    # Extracting the data. For each input row:
    for in_row_nm in _TYP_IN_ROW_NMS:
        df_col_key = _TYP_IN_ROWS_DF_DICT[in_row_nm]
        df_col_nm = _TYP_DF_DICT[df_col_key]
        in_row_nm = in_row_nm.lower()
        # Only the first worksheet row whose label contains the name is used:
        for ws_row_index in range(0, num_rows):
            ws_row_nm = str(ws.cell_value(ws_row_index, 0)).lower()
            if in_row_nm in ws_row_nm:
                typ_df[df_col_nm] = ws.row_values(ws_row_index, 1)
                break
    # Scaling the data to the correct units:
    typ_df = typ_df * _TYP_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    typ_cross = pd.read_csv(_TYP_IN_CROSS_PATH)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=typ_df,
                    cross_df=typ_cross, df_nm=_TYP_DF_NM
                    )
    # Default blueprint is partner income, and, if not, then tot_corps:
    has_inc_df = _INC_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_inc_df:
        blueprint = _INC_DF_NM
    elif blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TYP_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_TYP_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def read_inventories(asset_tree):
    """ Builds a NAICS tree holding inventory data under the "Inventories"
    dataframe.

    Values are taken from the last column of the first sheet of the
    workbook at _INV_IN_PATH, matched row by row against the crosswalk at
    _INV_IN_CROSS_PATH, and split among industries using the "INV"
    dataframes of asset_tree.

    :param asset_tree: A NAICS tree whose industries carry an "INV"
           dataframe.
    :returns: A new NAICS tree with "All", "Corp" and "Non-Corp" columns.
    """
    # Opening the excel file with the inventory data:
    sht0 = xlrd.open_workbook(_INV_IN_PATH).sheet_by_index(0)
    num_rows = sht0.nrows
    last_col = sht0.ncols - 1
    # Find the starting index in worksheet and check it against the header:
    start_pos = naics.search_ws(sht0, 1, 25, True, [0, 0], True)
    header_pos = naics.search_ws(sht0, "line", 20)
    if start_pos[1] != header_pos[1]:
        print("ERROR")
    # Reading in the crosswalk:
    inv_cross = pd.read_csv(_INV_IN_CROSS_PATH)
    num_cross = inv_cross.shape[0]
    # Creating a tree for the inventory data:
    inv_tree = naics.generate_tree()
    inv_tree.append_all(df_nm="Inventories",
                        df_cols=["All", "Corp", "Non-Corp"])
    # One value per crosswalk row; rows are expected in crosswalk order.
    # NOTE(review): nesting reconstructed from collapsed source -- the
    # value is stored only for rows matching the current crosswalk entry.
    inv_data = np.zeros(num_cross)
    matched = 0
    list_col = start_pos[1]
    for row in range(start_pos[0], num_rows):
        if matched >= num_cross:
            break
        row_list = str(sht0.cell_value(row, list_col)).strip()
        row_name = str(sht0.cell_value(row, list_col + 1)).strip()
        if row_list != str(inv_cross["List"][matched]):
            continue
        if row_name != str(inv_cross["Industry"][matched]):
            continue
        matched += 1
        try:
            cur_value = float(sht0.cell_value(row, last_col))
        except ValueError:
            # Non-numeric cell: this crosswalk entry keeps its zero.
            continue
        inv_data[matched - 1] = cur_value
        # Scale by the input file factor:
        inv_data[matched - 1] = _INV_IN_FILE_FCTR * inv_data[matched - 1]
    # Distribute each crosswalk value among the matching industries.
    # NOTE(review): nesting reconstructed -- industries whose "INV" row
    # sums to zero are skipped entirely.
    for i in range(num_cross):
        cur_codes = inv_cross["NAICS"][i].strip().split(".")
        proportions = naics.get_proportions(cur_codes, asset_tree, "INV")
        for j in range(proportions.shape[1]):
            cur_ind = inv_tree.enum_inds[int(proportions.iloc[0, j])]
            prev_ind = asset_tree.enum_inds[int(proportions.iloc[0, j])]
            prev_df = prev_ind.data.dfs["INV"]
            if sum(prev_df.iloc[0, :]) == 0:
                continue
            share = inv_data[i] * proportions.iloc[1, j]
            cur_dfs = (prev_df / sum(prev_df.iloc[0, :])) * share
            inv_df = cur_ind.data.dfs["Inventories"]
            inv_df["All"] += sum(cur_dfs.iloc[0, :])
            for corp_nm in _CORP_NMS:
                inv_df["Corp"] += cur_dfs[corp_nm][0]
            for ncorp_nm in _NCORP_NMS:
                inv_df["Non-Corp"] += cur_dfs[ncorp_nm][0]
    # Populate the rest of the tree using the asset tree as blueprint:
    naics.pop_back(inv_tree, ["Inventories"])
    naics.pop_forward(inv_tree, ["Inventories"], "INV", asset_tree)
    return inv_tree
def load_soi_farm_prop(data_tree=None,
                       blue_tree=None, blueprint=None,
                       from_out=False, out_path=_FARM_PROP_OUT_PATH):
    """ This function loads the soi farm proprietorship data.

    A "Land" and an "FA" column are added to every industry under the
    _FARM_DF_NM dataframe, filled in for NAICS codes 111 and 112, and then
    spread through the tree with naics.pop_back/naics.pop_forward.

    :param data_tree: The NAICS tree to read the data into. If omitted, a
           new tree is generated on every call (a default built in the
           signature would be shared, and mutated, across calls).
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file.
    :returns: The populated NAICS tree.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Load Farm Proprietorship data:
    farm_data = pd.read_csv(_FARM_IN_PATH)
    new_farm_cols = ["Land", "FA"]
    # Add an empty one-row farm dataframe to every industry in the tree:
    for ind in data_tree.enum_inds:
        ind.append_dfs((_FARM_DF_NM,
                        pd.DataFrame(np.zeros((1, len(new_farm_cols))),
                                     columns=new_farm_cols)))
    # Multiplier applied to each industry's share of land:
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) *
                 (float(farm_data["A_sp"][0]) / farm_data["A_p"][0]))
    # NOTE(review): columns are picked by position (first and third) here.
    total = farm_data.iloc[0, 0] + farm_data.iloc[0, 2]
    total_pa = 0
    cur_codes = [111, 112]
    proportions = naics.get_proportions(cur_codes, data_tree, "PA_assets",
                                        ["Land (Net)",
                                         "Depreciable assets (Net)"])
    # Total partnership land plus depreciable assets over the farm codes:
    for ind_code in cur_codes:
        cur_ind = naics.find_naics(data_tree, ind_code)
        cur_df = cur_ind.data.dfs["PA_assets"]
        total_pa += (cur_df["Land (Net)"][0] +
                     cur_df["Depreciable assets (Net)"][0])
    # Fill in land and fixed assets for each farm industry:
    for i, ind_code in enumerate(cur_codes):
        cur_ind = naics.find_naics(data_tree, ind_code)
        farm_df = cur_ind.data.dfs[_FARM_DF_NM]
        farm_df["Land"][0] = (
            land_mult * cur_ind.data.dfs["PA_assets"]["Land (Net)"][0] /
            total_pa)
        # FA is the industry's share of the total, net of its land:
        farm_df["FA"][0] = ((proportions.iloc[1, i] * total) -
                            farm_df["Land"][0])
    # Default blueprint is the total corporation dataframe, when present:
    if (blueprint is None and
            _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_FARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_FARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
def read_inventories(asset_tree):
    """ Builds a NAICS tree holding inventory data under the "Inventories"
    dataframe.

    Values are taken from the last column of the first sheet of the
    workbook at _INV_IN_PATH, matched row by row against the crosswalk at
    _INV_IN_CROSS_PATH, and split among industries using the "INV"
    dataframes of asset_tree.

    :param asset_tree: A NAICS tree whose industries carry an "INV"
           dataframe.
    :returns: A new NAICS tree with "All", "Corp" and "Non-Corp" columns.
    """
    # Opening the excel file with the inventory data:
    sht0 = xlrd.open_workbook(_INV_IN_PATH).sheet_by_index(0)
    num_rows = sht0.nrows
    last_col = sht0.ncols - 1
    # Find the starting index in worksheet and check it against the header:
    start_pos = naics.search_ws(sht0, 1, 25, True, [0, 0], True)
    header_pos = naics.search_ws(sht0, "line", 20)
    if start_pos[1] != header_pos[1]:
        print("ERROR")
    # Reading in the crosswalk:
    inv_cross = pd.read_csv(_INV_IN_CROSS_PATH)
    num_cross = inv_cross.shape[0]
    # Creating a tree for the inventory data:
    inv_tree = naics.generate_tree()
    inv_tree.append_all(df_nm="Inventories",
                        df_cols=["All", "Corp", "Non-Corp"])
    # One value per crosswalk row; rows are expected in crosswalk order.
    # NOTE(review): nesting reconstructed from collapsed source -- the
    # value is stored only for rows matching the current crosswalk entry.
    inv_data = np.zeros(num_cross)
    matched = 0
    list_col = start_pos[1]
    for row in range(start_pos[0], num_rows):
        if matched >= num_cross:
            break
        row_list = str(sht0.cell_value(row, list_col)).strip()
        row_name = str(sht0.cell_value(row, list_col + 1)).strip()
        if row_list != str(inv_cross["List"][matched]):
            continue
        if row_name != str(inv_cross["Industry"][matched]):
            continue
        matched += 1
        try:
            cur_value = float(sht0.cell_value(row, last_col))
        except ValueError:
            # Non-numeric cell: this crosswalk entry keeps its zero.
            continue
        inv_data[matched - 1] = cur_value
        # Scale by the input file factor:
        inv_data[matched - 1] = _INV_IN_FILE_FCTR * inv_data[matched - 1]
    # Distribute each crosswalk value among the matching industries.
    # NOTE(review): nesting reconstructed -- industries whose "INV" row
    # sums to zero are skipped entirely.
    for i in range(num_cross):
        cur_codes = inv_cross["NAICS"][i].strip().split(".")
        proportions = naics.get_proportions(cur_codes, asset_tree, "INV")
        for j in range(proportions.shape[1]):
            cur_ind = inv_tree.enum_inds[int(proportions.iloc[0, j])]
            prev_ind = asset_tree.enum_inds[int(proportions.iloc[0, j])]
            prev_df = prev_ind.data.dfs["INV"]
            if sum(prev_df.iloc[0, :]) == 0:
                continue
            share = inv_data[i] * proportions.iloc[1, j]
            cur_dfs = (prev_df / sum(prev_df.iloc[0, :])) * share
            inv_df = cur_ind.data.dfs["Inventories"]
            inv_df["All"] += sum(cur_dfs.iloc[0, :])
            for corp_nm in _CORP_NMS:
                inv_df["Corp"] += cur_dfs[corp_nm][0]
            for ncorp_nm in _NCORP_NMS:
                inv_df["Non-Corp"] += cur_dfs[ncorp_nm][0]
    # Populate the rest of the tree using the asset tree as blueprint:
    naics.pop_back(inv_tree, ["Inventories"])
    naics.pop_forward(inv_tree, ["Inventories"], "INV", asset_tree)
    return inv_tree
def load_income(data_tree=None,
                blue_tree=None, blueprint=None,
                from_out=False, out_path=None):
    """ This function loads the soi partnership income data.

    :param data_tree: The NAICS tree to read the data into. If omitted, a
           new tree is generated on every call (a default built in the
           signature would be shared, and mutated, across calls).
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward. Defaults to _TOT_CORP_DF_NM if the tree has it.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file (default _INC_OUT_PATH).
    :returns: The populated NAICS tree.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Initializing the output path:
    if out_path is None:
        out_path = _INC_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Opening data on net income/loss:
    wb = xlrd.open_workbook(_INC_IN_PATH)
    ws = wb.sheet_by_index(0)
    start_col = naics.search_ws(ws, _INC_STRT_COL_NM, 20)[1]
    # Initializing dataframe to hold pertinent income/loss data:
    data_df = pd.DataFrame(np.zeros((ws.ncols-start_col, 3)),
                           columns=_INC_PRT_DF_COL_NMS)
    # Extracting the data from the worksheet:
    for row in range(0, ws.nrows):
        # Going through each row of excel file, looking for input rows:
        row_nm = str(ws.cell_value(row, 0)).lower()
        if _INC_NET_INC_ROW_NM in row_nm:
            # Income and loss are read from the two rows that follow; the
            # scan stops here, so depreciation must appear earlier.
            data_df[_INC_NET_INC_COL_NM] = ws.row_values(row+1, start_col)
            data_df[_INC_NET_LOSS_COL_NM] = ws.row_values(row+2, start_col)
            break
        if _INC_DEPR_ROW_NM in row_nm:
            data_df[_INC_DEPR_COL_NM] = ws.row_values(row, start_col)
    # Scaling the data to the correct units:
    data_df = data_df * _INC_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(_INC_IN_CROSS_PATH)
    # Processing the inc/loss data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=data_df,
                    cross_df=pa01cross, df_nm=_INC_DF_NM
                    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_INC_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_INC_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_pa_05_data(data_tree=None, blue_tree=None, blueprint=None):
    """ Loads partnership data by type of partner into the "PA_types"
    dataframe of a NAICS tree.

    The workbook and crosswalk are the entries of prt_dir whose names
    contain "pa05.xls" and "pa05_Crosswalk.csv" respectively.

    :param data_tree: The NAICS tree to read the data into. A new tree is
           generated when None.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe, passed
           through to naics.pop_forward.
    :param blueprint: The key of the dataframe used as a "blueprint" for
           populating forward. Defaults to "PA_inc_loss" if the tree has
           it, otherwise to "tot_corps" if the tree has that.
    :returns: The populated NAICS tree.
    """
    # Defining constant factor (e.g. data is in thousands):
    pa_05_fctr = 10 ** 3
    # Defining constant list of types of partners:
    cols_05 = ["Corporate general partners",
               "Corporate limited partners",
               "Individual general partners",
               "Individual limited partners",
               "Partnership general partners",
               "Partnership limited partners",
               "Tax-exempt organization general partners",
               "Tax-exempt organization limited partners",
               "Nominee and other general partners",
               "Nominee and other limited partners"]
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Locate the input files. os.path.join is used instead of a literal
    # "\\" so that the paths are also valid outside Windows:
    for file_nm in os.listdir(prt_dir):
        if "pa05.xls" in file_nm:
            pa_05_file = os.path.abspath(os.path.join(prt_dir, file_nm))
        elif "pa05_Crosswalk.csv" in file_nm:
            pa_05_cross_file = os.path.abspath(
                os.path.join(prt_dir, file_nm))
    #
    book_05 = xlrd.open_workbook(pa_05_file)
    sheet_05 = book_05.sheet_by_index(0)
    cur_rows = sheet_05.nrows
    # Extracting the relevant data (first matching row per partner type;
    # entries stay None when no row label contains the name):
    data_05 = [None] * len(cols_05)
    for i, col_nm in enumerate(cols_05):
        for row in range(0, cur_rows):
            if col_nm.lower() in str(sheet_05.cell_value(row, 0)).lower():
                data_05[i] = sheet_05.row_values(row, 1)
                break
    # Reformatting the data:
    data_05 = pd.DataFrame(data_05).T
    # Data is in thousands of dollars:
    data_05 = data_05 * pa_05_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa05cross = pd.read_csv(pa_05_cross_file)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=data_05,
                    cross_df=pa05cross, data_cols=cols_05,
                    df_name="PA_types"
                    )
    # Defaults:
    if (blueprint is None and
            "PA_inc_loss" in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = "PA_inc_loss"
    elif (blueprint is None and
            "tot_corps" in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["PA_types"])
    naics.pop_forward(tree=data_tree, df_list=["PA_types"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
def main():
    """ Loads the NAICS tree, reads the NIPA wage data into it, populates
    the wage dataframe through the tree and prints the tree's dataframes
    to output_folder.
    """
    # Build the tree from the NAICS codes file:
    naics_tree = naics.load_naics(naics_codes_file)
    # Read the wage data into the tree:
    read_wages.load_nipa_wages_ind(data_folder, naics_tree)
    # Dataframes to populate through the tree:
    wage_params = [read_wages.WAGES]
    naics.pop_back(naics_tree, wage_params)
    naics.pop_forward(naics_tree, wage_params, None, None, None, True)
    # Write the results out:
    naics.print_tree_dfs(naics_tree, output_folder)
def get_incs():
    """ Generates a NAICS tree, reads the NIPA income and interest data
    into it, calculates business income, populates the income dataframes
    through the tree and prints the tree's dataframes to output_folder.

    :returns: The populated NAICS tree.
    """
    inc_tree = naics.generate_tree()
    # Read in the income data and derive business income:
    read_inc.load_nipa_inc_ind(data_folder, inc_tree)
    read_inc.load_nipa_int_ind(data_folder, inc_tree)
    read_inc.calc_bus_inc(inc_tree)
    # Dataframes to populate through the tree:
    inc_params = [read_inc.BUS_INC, read_inc.INT_INC, read_inc.FIN_INC]
    naics.pop_back(inc_tree, inc_params)
    naics.pop_forward(inc_tree, inc_params)
    # Write the results out:
    naics.print_tree_dfs(inc_tree, output_folder)
    return inc_tree
def load_pa_01_data(data_tree=None, blue_tree=None, blueprint=None):
    """ Loads partnership net income, net loss and depreciation data into
    the "PA_inc_loss" dataframe of a NAICS tree.

    The workbook and crosswalk are the entries of prt_dir whose names
    contain "pa01.xls" and "pa01_Crosswalk.csv" respectively.

    :param data_tree: The NAICS tree to read the data into. A new tree is
           generated when None.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe, passed
           through to naics.pop_forward.
    :param blueprint: The key of the dataframe used as a "blueprint" for
           populating forward. Defaults to "tot_corps" if the tree has it.
    :returns: The populated NAICS tree.
    """
    # Defining constants (data is scaled by a factor of a thousand):
    pa_01_fctr = 10 ** 3
    # The default of None would otherwise reach the tree accesses below.
    # NOTE(review): this guard was commented out; restored to match
    # load_pa_05_data -- confirm load_data_with_cross does not rely on None.
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Names of the files with the partnership data. os.path.join is used
    # instead of a literal "\\" so the paths are valid outside Windows:
    for file_nm in os.listdir(prt_dir):
        if "pa01.xls" in file_nm:
            pa_01_file = os.path.abspath(os.path.join(prt_dir, file_nm))
        elif "pa01_Crosswalk.csv" in file_nm:
            pa_01_cross_file = os.path.abspath(
                os.path.join(prt_dir, file_nm))
    # Inputting data on net income/loss by industry from "**pa01.xls":
    book_01 = xlrd.open_workbook(pa_01_file)
    sheet_01 = book_01.sheet_by_index(0)
    num_rows = sheet_01.nrows
    # The data to be extracted:
    cols_01 = ["Total net income", "Total net loss", "Depreciation"]
    num_cols = sheet_01.ncols
    start_col = naics.search_ws(sheet_01, "All\nindustries", 20)[1]
    data_01 = pd.DataFrame(np.zeros((num_cols-start_col, 3)),
                           columns=cols_01)
    # Extracting the data:
    for i in range(0, num_rows):
        row_nm = str(sheet_01.cell_value(i, 0)).lower()
        if "total net income" in row_nm:
            # Income and loss are read from the two rows that follow; the
            # scan stops here, so depreciation must appear earlier.
            data_01["Total net income"] = sheet_01.row_values(i+1, start_col)
            data_01["Total net loss"] = sheet_01.row_values(i+2, start_col)
            break
        if "depreciation" in row_nm:
            data_01["Depreciation"] = sheet_01.row_values(i, start_col)
    # Scaling the data:
    data_01 = data_01 * pa_01_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(pa_01_cross_file)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=data_01,
                    cross_df=pa01cross, df_name="PA_inc_loss"
                    )
    # Default blueprint is tot_corps:
    if (blueprint is None and
            "tot_corps" in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["PA_inc_loss"])
    naics.pop_forward(tree=data_tree, df_list=["PA_inc_loss"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
def load_soi_farm_prop(data_tree=None, blue_tree=None, blueprint=None):
    """ Loads the soi farm proprietorship data into the "farm_prop"
    dataframe of a NAICS tree.

    A "Land" and an "FA" column are added to every industry, filled in
    for NAICS codes 111 and 112 from "Farm_Data.csv" in prop_dir, and then
    spread through the tree with naics.pop_back/naics.pop_forward.

    :param data_tree: The NAICS tree to read the data into. A new tree is
           generated when None.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe, passed
           through to naics.pop_forward.
    :param blueprint: The key of the dataframe used as a "blueprint" for
           populating forward. Defaults to "tot_corps" if the tree has it.
    :returns: The populated NAICS tree.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Load Farm Proprietorship data. os.path.join is used instead of a
    # literal "\\" so that the path is also valid outside Windows:
    farm_data = pd.read_csv(
        os.path.abspath(os.path.join(prop_dir, "Farm_Data.csv")))
    new_farm_cols = ["Land", "FA"]
    # Add an empty one-row farm dataframe to every industry in the tree:
    for ind in data_tree.enum_inds:
        ind.append_dfs(("farm_prop",
                        pd.DataFrame(np.zeros((1, len(new_farm_cols))),
                                     columns=new_farm_cols)))
    # Multiplier applied to each industry's share of land:
    land_mult = ((farm_data["R_sp"][0] + farm_data["Q_sp"][0]) *
                 (float(farm_data["A_sp"][0]) / farm_data["A_p"][0]))
    # NOTE(review): columns are picked by position (first and third) here.
    total = farm_data.iloc[0, 0] + farm_data.iloc[0, 2]
    total_pa = 0
    cur_codes = [111, 112]
    proportions = naics.get_proportions(cur_codes, data_tree, "PA_assets",
                                        ["Land (Net)",
                                         "Depreciable assets (Net)"])
    # Total partnership land plus depreciable assets over the farm codes:
    for ind_code in cur_codes:
        cur_ind = naics.find_naics(data_tree, ind_code)
        cur_df = cur_ind.data.dfs["PA_assets"]
        total_pa += (cur_df["Land (Net)"][0] +
                     cur_df["Depreciable assets (Net)"][0])
    # Fill in land and fixed assets for each farm industry:
    for i, ind_code in enumerate(cur_codes):
        cur_ind = naics.find_naics(data_tree, ind_code)
        farm_df = cur_ind.data.dfs["farm_prop"]
        farm_df["Land"][0] = (
            land_mult * cur_ind.data.dfs["PA_assets"]["Land (Net)"][0] /
            total_pa)
        # FA is the industry's share of the total, net of its land:
        farm_df["FA"][0] = ((proportions.iloc[1, i] * total) -
                            farm_df["Land"][0])
    # Default blueprint is tot_corps, when present:
    if (blueprint is None and
            "tot_corps" in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["farm_prop"])
    naics.pop_forward(tree=data_tree, df_list=["farm_prop"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
def load_soi_tot_corp(data_tree=None,
                      cols_dict=_DFLT_TOT_CORP_COLS_DICT,
                      blueprint=None, blue_tree=None,
                      from_out=False, output_path=_TOT_CORP_OUT_PATH):
    """ This function pulls the soi total corporation data.

    :param data_tree: The NAICS tree to read the data into. If omitted, a
           new tree is generated on every call (a default built in the
           signature would be shared, and mutated, across calls).
    :param cols_dict: A dictionary mapping dataframe columns to the name
           of the column names in the input file. An empty string marks a
           column that is not reported; it is set to 0.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param output_path: The path of the output file.
    :returns: The populated NAICS tree, or None when the input file cannot
              be read.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=output_path,
                                        tree=data_tree)
        return data_tree
    # Pertinent information:
    num_inds = len(data_tree.enum_inds)  # Number of industries in NAICS tree.
    data_cols = list(cols_dict)  # Dataframe column names.
    # Opening the soi total corporate data file:
    try:
        tot_corp_data = pd.read_csv(_TOT_CORP_IN_PATH).fillna(0)
    except IOError:
        print("IOError: Tot-Corp soi data file not found.")
        return None
    # Initializing dataframes for all NAICS industries:
    data_tree.append_all(df_nm=_TOT_DF_NM, df_cols=data_cols)
    # Reading the total corporation data into the NAICS tree. The search
    # position is kept between codes so each search resumes where the
    # previous one stopped:
    enum_index = 0
    for code_num in np.unique(tot_corp_data[_NAICS_COL_NM]):
        # Find the industry with a code that matches "code_num":
        tot_df = None
        for _ in range(0, num_inds):
            enum_index = (enum_index + 1) % num_inds
            cur_ind = data_tree.enum_inds[enum_index]
            code_df = cur_ind.data.dfs[cst.CODE_DF_NM]
            for j in range(0, code_df.shape[0]):
                if code_df.iloc[j, 0] == code_num:
                    # Industry with the matching code has been found:
                    tot_df = cur_ind.data.dfs[_TOT_DF_NM]
                    break
            # If the matching industry has been found stop searching:
            if tot_df is not None:
                break
        # If no match was found, then ignore data.
        if tot_df is None:
            continue
        # Indicators for if rows in tot_corp_data match current code:
        indicators = (tot_corp_data[_NAICS_COL_NM] == code_num)
        # Calculating the data:
        for col_nm in cols_dict:
            # Some of the data may not be reported:
            if cols_dict[col_nm] == "":
                tot_df[col_nm] = 0
            else:
                # Note: double counting the data in the original dataset.
                col_total = sum(
                    indicators * tot_corp_data[cols_dict[col_nm]]) / 2.0
                # Scale the scalar before storing it; the previous code
                # assigned a whole (scaled) column into a single cell.
                tot_df[col_nm][0] = col_total * _TOT_CORP_IN_FILE_FCTR
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TOT_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_TOT_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_asset(data_tree=None,
               blue_tree=None, blueprint=None,
               from_out=False):
    """ This function loads the soi partnership asset data.

    :param data_tree: The NAICS tree to read the data into. If omitted, a
           new tree is generated on every call (a default built in the
           signature would be shared, and mutated, across calls).
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward. Defaults to _TOT_CORP_DF_NM if the tree has it.
    :param from_out: Whether to read in the data from output (from
           _AST_OUT_PATH).
    :returns: The populated NAICS tree.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output. The keyword is
    # "input_path", as in every other call to naics.load_tree_dfs in this
    # file (this call previously passed "input_file"):
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=_AST_OUT_PATH,
                                        tree=data_tree)
        return data_tree
    # Opening data on depreciable fixed assets, inventories, and land:
    wb = xlrd.open_workbook(_AST_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Columns of the asset dataframe:
    df_cols = _AST_DF_DICT.values()
    # Initializing dataframe to hold pertinent asset data:
    ast_df = pd.DataFrame(np.zeros((ws.ncols-1, len(df_cols))),
                          columns=df_cols)
    # Extracting the data (note that the rows with total data appear
    # first). For each input row:
    for in_row_nm in _AST_IN_ROW_NMS:
        # Column filled from the first matching worksheet row:
        df_net_col_key = _AST_IN_ROWS_DF_NET_DICT[in_row_nm]
        df_net_col_nm = _AST_DF_DICT[df_net_col_key]
        # Column filled from the second matching worksheet row:
        df_inc_col_key = _AST_IN_ROWS_DF_INC_DICT[in_row_nm]
        df_inc_col_nm = _AST_DF_DICT[df_inc_col_key]
        in_row_nm = in_row_nm.lower()
        # Finding the first input row with in_row_nm:
        for in_row1 in range(0, num_rows):
            in_net_row_nm = str(ws.cell_value(in_row1, 0)).lower()
            if in_row_nm not in in_net_row_nm:
                continue
            # Total asset data:
            ast_df[df_net_col_nm] = ws.row_values(in_row1, 1)
            # Finding the second input row with in_row_nm:
            for in_row2 in range(in_row1+1, num_rows):
                in_inc_row_nm = str(ws.cell_value(in_row2, 0)).lower()
                if in_row_nm in in_inc_row_nm:
                    # Asset data for companies with net income:
                    ast_df[df_inc_col_nm] = ws.row_values(in_row2, 1)
                    break
            break
    # Scaling the data to the correct units:
    ast_df = ast_df * _AST_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    ast_cross = pd.read_csv(_AST_IN_CROSS_PATH)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=ast_df,
                    cross_df=ast_cross, df_nm=_AST_DF_NM
                    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_AST_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_AST_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_soi_nonfarm_prop(data_tree=None,
                          blue_tree=None, blueprint=None,
                          from_out=False, out_path=_NFARM_PROP_OUT_PATH):
    """ This function loads the soi nonfarm proprietorship data.

    For each worksheet row whose sector name matches a crosswalk entry,
    the values in the two columns located via _DDCT_COL1 and _DDCT_COL2
    are summed, scaled by _DDCT_FILE_FCTR and added to the matching
    industries in proportion to naics.compare_codes.

    :param data_tree: The NAICS tree to read the data into. If omitted, a
           new tree is generated on every call (a default built in the
           signature would be shared, and mutated, across calls).
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward. Defaults to _TOT_CORP_DF_NM if the tree has it.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file.
    :returns: The populated NAICS tree.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Opening nonfarm proprietor data:
    wb = xlrd.open_workbook(_DDCT_IN_PATH)
    ws = wb.sheet_by_index(0)
    cross = pd.read_csv(_DDCT_IN_CROSS_PATH)
    # Finding the relevant positions in worksheet:
    pos1 = naics.search_ws(ws, _SECTOR_COL, 20, True, [0, 0], True)
    pos2 = naics.search_ws(ws, _DDCT_COL1, 20)
    pos3 = naics.search_ws(ws, _DDCT_COL2, 20,
                           True, np.array(pos2) + np.array([0, 1]))
    # Add an empty nonfarm dataframe to every industry in the tree:
    data_tree.append_all(df_nm=_NFARM_DF_NM, df_cols=[_NFARM_DF_COL_NM])
    # Both searches are cyclic and resume after the previous match; the
    # indices start at the last position so the first probe is index 0.
    num_cross = cross.shape[0]
    num_inds = len(data_tree.enum_inds)
    cross_index = num_cross - 1
    enum_index = num_inds - 1
    for i in range(pos1[0], ws.nrows):
        cur_cell = str(ws.cell_value(i, pos1[1])).lower().strip()
        #
        tot_proportions = 0
        for _ in range(0, num_cross):
            cross_index = (cross_index + 1) % num_cross
            cur_ind_name = str(cross.iloc[cross_index, 0]).lower().strip()
            if cur_cell != cur_ind_name:
                continue
            # Crosswalk rows without NAICS codes are skipped:
            if pd.isnull(cross.iloc[cross_index, 1]):
                continue
            ind_codes = str(cross.iloc[cross_index, 1]).split(".")
            for _ in range(0, num_inds):
                enum_index = (enum_index + 1) % num_inds
                cur_data = data_tree.enum_inds[enum_index].data
                cur_codes = cur_data.dfs[_CODE_DF_NM]
                cur_proportions = naics.compare_codes(ind_codes,
                                                      cur_codes.iloc[:, 0])
                if cur_proportions == 0:
                    continue
                tot_proportions += cur_proportions
                cur_dfs = cur_data.dfs[_NFARM_DF_NM][_NFARM_DF_COL_NM]
                cur_dfs[0] += (_DDCT_FILE_FCTR * cur_proportions *
                               (ws.cell_value(i, pos2[1]) +
                                ws.cell_value(i, pos3[1])))
                # NOTE(review): nesting reconstructed from collapsed
                # source; assumed to stop the industry scan once the whole
                # value has been allocated -- confirm.
                if tot_proportions == 1:
                    break
    # Default blueprint is the total corporation dataframe, when present:
    if (blueprint is None and
            _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = _TOT_CORP_DF_NM
    naics.pop_back(tree=data_tree, df_list=[_NFARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_NFARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree
def load_soi_tot_corp(data_tree=None,
                      cols_dict=_DFLT_TOT_CORP_COLS_DICT,
                      blueprint=None, blue_tree=None,
                      from_out=False, output_path=_TOT_CORP_OUT_PATH):
    """ This function pulls the soi total corporation data.

    :param data_tree: The NAICS tree to read the data into. If omitted, a
           new tree is generated on every call (a default built in the
           signature would be shared, and mutated, across calls).
    :param cols_dict: A dictionary mapping dataframe columns to the name
           of the column names in the input file. An empty string marks a
           column that is not reported; it is set to 0.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param output_path: The path of the output file.
    :returns: The populated NAICS tree, or None when the input file cannot
              be read.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=output_path,
                                        tree=data_tree)
        return data_tree
    # Pertinent information:
    num_inds = len(data_tree.enum_inds)  # Number of industries in NAICS tree.
    data_cols = list(cols_dict)  # Dataframe column names.
    # Opening the soi total corporate data file:
    try:
        tot_corp_data = pd.read_csv(_TOT_CORP_IN_PATH).fillna(0)
    except IOError:
        print("IOError: Tot-Corp soi data file not found.")
        return None
    # Initializing dataframes for all NAICS industries:
    data_tree.append_all(df_nm=_TOT_DF_NM, df_cols=data_cols)
    # Reading the total corporation data into the NAICS tree. The search
    # position is kept between codes so each search resumes where the
    # previous one stopped:
    enum_index = 0
    for code_num in np.unique(tot_corp_data[_NAICS_COL_NM]):
        # Find the industry with a code that matches "code_num":
        tot_df = None
        for _ in range(0, num_inds):
            enum_index = (enum_index + 1) % num_inds
            cur_ind = data_tree.enum_inds[enum_index]
            code_df = cur_ind.data.dfs[cst.CODE_DF_NM]
            for j in range(0, code_df.shape[0]):
                if code_df.iloc[j, 0] == code_num:
                    # Industry with the matching code has been found:
                    tot_df = cur_ind.data.dfs[_TOT_DF_NM]
                    break
            # If the matching industry has been found stop searching:
            if tot_df is not None:
                break
        # If no match was found, then ignore data.
        if tot_df is None:
            continue
        # Indicators for if rows in tot_corp_data match current code:
        indicators = (tot_corp_data[_NAICS_COL_NM] == code_num)
        # Calculating the data:
        for col_nm in cols_dict:
            # Some of the data may not be reported:
            if cols_dict[col_nm] == "":
                tot_df[col_nm] = 0
            else:
                # Note: double counting the data in the original dataset.
                col_total = sum(
                    indicators * tot_corp_data[cols_dict[col_nm]]) / 2.0
                # Scale the scalar before storing it; the previous code
                # assigned a whole (scaled) column into a single cell.
                tot_df[col_nm][0] = col_total * _TOT_CORP_IN_FILE_FCTR
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TOT_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_TOT_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
# Reading in the SOI Tax Stats-Corporation Data: naics.load_soi_corporate_data(data_tree, data_folder) # Reading in the SOI Tax Stats-Partnership Data: naics.load_soi_partner_data(data_tree, data_folder) # Reading in the SOI Tax Stats-Proprietorship Data: naics.load_soi_proprietor_data(data_tree, data_folder) ''' Many industries are not listed in the SOI datasets. The data for these missing industries are interpolated. ''' # Get a list of the names of all the pd dfs besides the list of codes: cur_names = data_tree.enum_inds[0].data.dfs.keys() cur_names.remove("Codes:") # Populate missing industry data backwards throught the tree: naics.pop_back(data_tree, cur_names) # Populate the missing total corporate data forwards through the tree: naics.pop_forward(data_tree, ["tot_corps"]) # Populate other missing data using tot_corps as a "blueprint": cur_names = ["c_corps", "s_corps", "PA_inc/loss", "PA_assets", "soi_prop"] naics.pop_forward(data_tree, cur_names, "tot_corps") # Populate pa05 using pa01: naics.pop_forward(data_tree, ["PA_types"], "PA_inc/loss") # naics.pop_back(data_tree, ["farm_prop"]) naics.pop_forward(data_tree, ["farm_prop"], "tot_corps") #Create an output tree containing only the final data on FA, INV, and LAND. output_tree = naics.summary_tree(data_tree, data_folder) # Create a tree with all the FA's broken down by type of asset:
def read_bea(asset_tree):
    """Read BEA fixed-asset data into a new NAICS tree, by asset type.

    Opens the workbook at ``_BEA_ASSET_PATH``, cross-references its
    worksheets with the "readme" sheet and the NAICS crosswalk csv, and
    apportions each BEA industry's assets to NAICS industries using the
    "FA" dataframes of ``asset_tree``.

    :param asset_tree: A NAICS tree whose industries hold an "FA"
           dataframe. It supplies the proportions and serves as the
           blueprint tree when populating forward.
    :returns: A newly generated NAICS tree whose industries hold "All",
           "Corp" and "Non-Corp" dataframes (one column per asset), or
           None if the layout of the readme sheet is not recognized.
    """
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(_BEA_ASSET_PATH)
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet (falling back on the first sheet):
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if sht_pos == [-1, -1]:
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False,
                                  [0, 0], True)
        # The failure check has to precede the column shift below;
        # otherwise a failed search becomes [-1, -2] and goes unnoticed.
        if sht_pos == [-1, -1]:
            print("Error in reading BEA fixed asset \"readme\" sheet.")
            return None
        # The industry titles sit one column left of the BEA codes:
        sht_pos[1] = sht_pos[1] - 1
    first_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Making a list of BEA codes based on the names of the worksheets
    # (the first worksheet is skipped):
    bea_codes1 = np.zeros(num_shts - 1, dtype=object)
    for index in range(1, num_shts):
        bea_codes1[index - 1] = str(sht_names[index])
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.join(_BEA_DIR, "detailnonres_list.csv")
    asset_list = pd.read_csv(list_file)
    for i in range(0, asset_list.shape[0]):
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].replace("\xa0", " ")
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].strip()
    # Reading in the corresponding naics codes:
    naics_file = os.path.join(_BEA_DIR, "detailnonres_naics.csv")
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"]
    for i in range(0, naics_cross.shape[0]):
        naics_inds[i] = naics_inds[i].replace("\xa0", " ").strip()
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry", "BEA Code", "NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts - 2, 3), dtype=object),
                             columns=chart_cols)
    # NOTE(review): the writes through these column objects (and through
    # bea_chart["BEA Code"][i] below) rely on pandas returning views of
    # the chart's columns -- confirm for the pandas version in use.
    bea_inds = bea_chart["Industry"]
    bea_naics = bea_chart["NAICS Code"]
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the crosswalk:
    naics_counter = 0
    for i in range(0, num_shts - 2):
        sheet_code = str(bea_codes1[i])
        for cur_row in range(first_row, bea_readme.nrows):
            raw_code = bea_readme.cell_value(cur_row, cur_col + 1)
            bea_code = unicode(raw_code).encode('utf8')
            if sheet_code != bea_code:
                # They may still match except one has ".0" at the end:
                bea_code = str(raw_code)[:-2]
                if sheet_code != bea_code:
                    continue
            bea_ind = unicode(bea_readme.cell_value(
                cur_row, cur_col)).encode('utf8')
            bea_ind = bea_ind.replace('\xa0', ' ').strip()
            bea_inds[i] = bea_ind
            bea_chart["BEA Code"][i] = bea_code
            # Rotating search so consecutive lookups resume where the
            # previous one stopped:
            for k in range(0, num_naics):
                naics_counter = (naics_counter + 1) % num_naics
                if naics_inds[naics_counter] == bea_inds[i]:
                    bea_naics[i] = naics_cross["NAICS"][naics_counter]
                    break
            # Only the first readme row matching this worksheet is used:
            break
    # Initializing the table of assets:
    bea_table = pd.DataFrame(
        np.zeros((asset_list.shape[0], bea_chart.shape[0])),
        columns=bea_chart["BEA Code"])
    # For each industry, reading the last column of its worksheet:
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        # Normalizing the sheet's asset labels once per sheet:
        sheet_labels = []
        for k in range(sht_pos[0] + 2, cur_sht.nrows):
            cur_cell = unicode(cur_sht.cell_value(
                k, sht_pos[1] + 1)).encode('utf8')
            sheet_labels.append((k, cur_cell.replace("\xa0", " ").strip()))
        for j in range(0, len(asset_list)):
            cur_asset = asset_list.iloc[j, 0]
            # No break on a match: the last matching row wins.
            for k, cur_cell in sheet_labels:
                if cur_asset == cur_cell:
                    bea_table[i][j] = float(
                        cur_sht.cell_value(k, cur_sht.ncols - 1))
    # The dollar amounts are in millions:
    # NOTE(review): convert_objects is unavailable in recent pandas
    # releases -- confirm against the pinned pandas version.
    bea_table = bea_table.convert_objects(convert_numeric=True).fillna(0)
    bea_table = bea_table * _BEA_IN_FILE_FCTR
    # Initialize tree for assets data:
    fixed_asset_tree = naics.generate_tree()
    for i in range(0, len(fixed_asset_tree.enum_inds)):
        for df_nm in ("All", "Corp", "Non-Corp"):
            fixed_asset_tree.enum_inds[i].data.append(
                (df_nm, pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                     columns=asset_list.iloc[:, 0])))
    # Fill in data from BEA's fixed asset table:
    num_inds = len(fixed_asset_tree.enum_inds)
    enum_index = len(asset_tree.enum_inds) - 1
    for i in range(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA").iloc[1, :]
        corp_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA", _CORP_NMS).iloc[1, :]
        non_corp_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA", _NCORP_NMS).iloc[1, :]
        for code_index in range(0, len(cur_codes)):
            for j in range(0, num_inds):
                enum_index = (enum_index + 1) % num_inds
                out_dfs = asset_tree.enum_inds[enum_index].data.dfs
                fa_total = sum(out_dfs["FA"].iloc[0, :])
                # Industries without fixed assets are skipped:
                if fa_total == 0:
                    continue
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in _CORP_NMS:
                    corp_ratio += out_dfs["FA"][category][0] / fa_total
                for category in _NCORP_NMS:
                    non_corp_ratio += out_dfs["FA"][category][0] / fa_total
                cur_data = fixed_asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:, 0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if share == 0:
                    continue
                num_assets = fixed_asset_tree.enum_inds[0].data.dfs[
                    "All"].shape[1]
                for k in range(0, num_assets):
                    cur_data.dfs["All"].iloc[0, k] = (
                        bea_table.iloc[k, i] * all_ratio *
                        all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * corp_ratio *
                        corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * non_corp_ratio *
                        non_corp_proportions[code_index])
                # Move on to the next code once an industry is filled:
                break
            if tot_share == 1:
                break
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(fixed_asset_tree, ["All", "Corp", "Non-Corp"])
    naics.pop_forward(tree=fixed_asset_tree, df_list=["All"],
                      blueprint="FA", blue_tree=asset_tree)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_CORP_NMS)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Non-Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_NCORP_NMS)
    return fixed_asset_tree
def load_soi_nonfarm_prop(data_tree=None,
                          blue_tree=None, blueprint=None,
                          from_out=False, out_path=_NFARM_PROP_OUT_PATH):
    """ This function loads the soi nonfarm proprietorship data:

    :param data_tree: The NAICS tree to read the data into. A new tree
           is generated when omitted.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file.
    :returns: The NAICS tree with the nonfarm proprietorship dataframe.
    """
    # A default tree built in the signature would be created once, at
    # definition time, and then shared (and mutated) by every call.
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Opening nonfarm proprietor data:
    wb = xlrd.open_workbook(_DDCT_IN_PATH)
    ws = wb.sheet_by_index(0)
    cross = pd.read_csv(_DDCT_IN_CROSS_PATH)
    # Finding the relevant positions in worksheet:
    pos1 = naics.search_ws(ws, _SECTOR_COL, 20, True, [0, 0], True)
    pos2 = naics.search_ws(ws, _DDCT_COL1, 20)
    pos3 = naics.search_ws(ws, _DDCT_COL2, 20, True,
                           np.array(pos2) + np.array([0, 1]))
    # Initializing the nonfarm dataframe for every industry:
    data_tree.append_all(df_nm=_NFARM_DF_NM, df_cols=[_NFARM_DF_COL_NM])
    # Both cursors rotate, so each search resumes after the last match:
    num_cross = cross.shape[0]
    num_inds = len(data_tree.enum_inds)
    cross_index = num_cross - 1
    enum_index = num_inds - 1
    for i in range(pos1[0], ws.nrows):
        cur_cell = str(ws.cell_value(i, pos1[1])).lower().strip()
        # Share of this worksheet row allocated to industries so far:
        tot_proportions = 0
        for j in range(0, num_cross):
            cross_index = (cross_index + 1) % num_cross
            cur_ind_name = str(cross.iloc[cross_index, 0]).lower().strip()
            if cur_cell != cur_ind_name:
                continue
            # Crosswalk rows without NAICS codes are skipped:
            if pd.isnull(cross.iloc[cross_index, 1]):
                continue
            ind_codes = str(cross.iloc[cross_index, 1]).split(".")
            row_total = ws.cell_value(i, pos2[1]) + ws.cell_value(i, pos3[1])
            for k in range(0, num_inds):
                enum_index = (enum_index + 1) % num_inds
                cur_data = data_tree.enum_inds[enum_index].data
                cur_codes = cur_data.dfs[_CODE_DF_NM]
                cur_proportions = naics.compare_codes(
                    ind_codes, cur_codes.iloc[:, 0])
                if cur_proportions == 0:
                    continue
                tot_proportions += cur_proportions
                # NOTE(review): writing through the column object relies
                # on pandas returning a view of the stored dataframe.
                cur_dfs = cur_data.dfs[_NFARM_DF_NM][_NFARM_DF_COL_NM]
                cur_dfs[0] += (_DDCT_FILE_FCTR * cur_proportions * row_total)
            # NOTE(review): the nesting level of this check was rebuilt
            # from whitespace-stripped source -- confirm it belongs to
            # the crosswalk loop rather than the industry loop.
            if tot_proportions == 1:
                break
    # Default blueprint is the total-corporation dataframe, if present:
    if (blueprint is None and
            _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_NFARM_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_NFARM_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def init_depr_rates(data_tree=None, get_econ=False,
                    get_tax_est=False, get_tax_150=False,
                    get_tax_200=False, get_tax_sl=False,
                    get_tax_ads=False, soi_from_out=False,
                    output_data=False):
    """Build the economic and tax depreciation rate trees.

    :param data_tree: The NAICS tree handed to
           ``calc_assets.summary_tree``. A new tree is generated when
           omitted.
    :param get_econ, get_tax_est, get_tax_150, get_tax_200, get_tax_sl,
           get_tax_ads, output_data: Accepted but not referenced by this
           function's body.
    :param soi_from_out: Whether the SOI loaders read previously written
           output; when False they are asked to output their data.
    :returns: A dictionary with the keys "Econ" and "Tax" holding the
           respective depreciation rate trees.
    """
    # A default tree built in the signature would be created once, at
    # definition time, and then shared by every call.
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Reading in the SOI Tax Stats-Corporation data:
    soi_tree = naics.generate_tree()
    soi_tree = read_soi.load_corporate(soi_tree=soi_tree,
                                       from_out=soi_from_out,
                                       output_data=(not soi_from_out))
    # Reading in the SOI Tax Stats-Partnership data:
    soi_tree = read_soi.load_partner(soi_tree=soi_tree,
                                     from_out=soi_from_out,
                                     output_data=(not soi_from_out))
    # Reading in the SOI Tax Stats-Proprietorship data:
    soi_tree = read_soi.load_soi_proprietorship(
        soi_tree=soi_tree, from_out=soi_from_out,
        output_data=(not soi_from_out))
    # NOTE(review): soi_tree is not used below; the summary is built from
    # data_tree. The original also carried a disabled sequence of
    # pop_back/pop_forward calls on data_tree (tot_corps, c_corps,
    # s_corps, PA_inc_loss, PA_assets, PA_types, soi_prop, farm_prop) for
    # interpolating industries missing from the SOI datasets -- confirm
    # whether soi_tree was meant to be summarized instead.
    # Create an output tree containing only the final data on FA, INV,
    # and LAND.
    output_tree = calc_assets.summary_tree(data_tree, _DATA_DIR)
    # Create a tree with all the FA's broken down by type of asset:
    asset_tree = read_bea.read_bea(output_tree, _DATA_DIR)
    naics.pop_back(asset_tree, ["All", "Corp", "Non-Corp"])
    # Columns of the "FA" blueprint counted as corporate / non-corporate:
    corp_types = ["C Corporations",
                  "Corporate general partners",
                  "Corporate limited partners"]
    non_corp_types = ["S Corporations",
                      "Individual general partners",
                      "Individual limited partners",
                      "Partnership general partners",
                      "Partnership limited partners",
                      "Tax-exempt organization general partners",
                      "Tax-exempt organization limited partners",
                      "Nominee and other general partners",
                      "Nominee and other limited partners",
                      "Sole Proprietors"]
    naics.pop_forward(asset_tree, ["All"], "FA", output_tree)
    naics.pop_forward(asset_tree, ["Corp"], "FA", output_tree, corp_types)
    naics.pop_forward(asset_tree, ["Non-Corp"], "FA", output_tree,
                      non_corp_types)
    # Inventories:
    inv_tree = read_inv.read_inventories(output_tree, _DATA_DIR)
    naics.pop_back(inv_tree, ["Inventories"])
    naics.pop_forward(inv_tree, ["Inventories"])
    # Land:
    land_tree = read_land.read_land(output_tree, _DATA_DIR)
    naics.pop_back(land_tree, ["Land"])
    naics.pop_forward(land_tree, ["Land"], "LAND", output_tree)
    # Depreciation rates:
    econ_depr_tree = calc_rates.calc_depr_rates(asset_tree, inv_tree,
                                                land_tree, _DATA_DIR)
    tax_depr_tree = calc_rates.calc_tax_depr_rates(asset_tree, inv_tree,
                                                   land_tree, _DATA_DIR)
    naics.pop_rates(tax_depr_tree)
    return {"Econ": econ_depr_tree, "Tax": tax_depr_tree}
def load_soi_tot_corp(data_tree=None, cols_dict=None,
                      blue_tree=None, blueprint=None):
    """Load the SOI total-corporation data into a NAICS tree.

    The input is the file in ``corp_dir`` whose name, after its first
    four characters, is "sb1.csv". For every industry code in the file,
    the selected columns are summed over the rows with that code and
    stored in a one-row "tot_corps" dataframe on the first industry of
    the tree that lists the code.

    :param data_tree: The NAICS tree to read the data into. A new tree
           is generated when omitted.
    :param cols_dict: A dictionary mapping dataframe column names to the
           column names in the input file. Defaults to a fixed set of
           balance-sheet columns.
    :param blue_tree: Passed to ``naics.pop_forward`` as ``blue_tree``.
    :param blueprint: Passed to ``naics.pop_forward`` as ``blueprint``.
    :returns: The populated tree, or None if the input file could not
           be read.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # The aggregate 1120 filings data for all corporations:
    tot_corp_file = ""
    for file_nm in os.listdir(corp_dir):
        if file_nm[4:] == "sb1.csv":
            # os.path.join keeps this working off Windows as well:
            tot_corp_file = os.path.abspath(os.path.join(corp_dir, file_nm))
            break
    try:
        tot_corp_data = pd.read_csv(tot_corp_file).fillna(0)
    except IOError:
        print("IOError: Could not find tot-corp soi data file.")
        return None
    # Listing the relevant columns that are being extracted from the dataset:
    if cols_dict is None:
        # Default:
        cols_dict = {"Depreciable Assets": "DPRCBL_ASSTS",
                     "Accumulated Depreciation": "ACCUM_DPR",
                     "Land": "LAND",
                     "Inventories": "INVNTRY",
                     "Interest Paid": "INTRST_PD",
                     "Capital Stock": "CAP_STCK",
                     "Additional paid-in Capital": "PD_CAP_SRPLS",
                     "Earnings (rtnd appr)": "RTND_ERNGS_APPR",
                     "Earnings (rtnd unappr.)": "COMP_RTND_ERNGS_UNAPPR",
                     "Cost of Treasury Stock": "CST_TRSRY_STCK"}
    data_cols = list(cols_dict)
    # Initializing data on all corporations:
    for ind in data_tree.enum_inds:
        ind.append_dfs(("tot_corps",
                        pd.DataFrame(np.zeros((1, len(data_cols))),
                                     columns=data_cols)))
    # Loading total-corporation data:
    for code_num in np.unique(tot_corp_data["INDY_CD"]):
        # Find the first industry with a code that matches "code_num".
        # (The original advanced a rotating cursor here but never read
        # it; the scan always started at the first industry.)
        cur_df = None
        for ind in data_tree.enum_inds:
            codes_df = ind.data.dfs["Codes:"]
            if any(codes_df.iloc[j, 0] == code_num
                   for j in range(0, codes_df.shape[0])):
                cur_df = ind.data.dfs["tot_corps"]
                break
        # If no match was found, then ignore data.
        if cur_df is None:
            continue
        # Indicators for if rows in tot_corp_data match current industry code:
        indicators = (tot_corp_data["INDY_CD"] == code_num)
        # Filling in every column in the dataframe. A single .loc write
        # replaces the chained df[col][0] assignment, which may only
        # modify a temporary copy.
        for col_nm in cols_dict:
            cur_df.loc[0, col_nm] = sum(
                indicators * tot_corp_data[cols_dict[col_nm]])
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=["tot_corps"])
    naics.pop_forward(tree=data_tree, df_list=["tot_corps"],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_soi_prop_data(data_tree=None, blue_tree=None, blueprint=None):
    """Load the SOI nonfarm proprietorship depreciation deductions.

    The worksheet ("??sp01br.xls") and its crosswalk
    ("??sp01br_Crosswalk.csv") are looked up in ``prop_dir``. For every
    worksheet row whose sector name appears in the crosswalk, the sum of
    its two "Depreciation deduction" columns is scaled and added to the
    "soi_prop" dataframe of the matching industries.

    :param data_tree: The NAICS tree to read the data into. A new tree
           is generated when omitted.
    :param blue_tree: Passed to ``naics.pop_forward`` as ``blue_tree``.
    :param blueprint: Passed to ``naics.pop_forward`` as ``blueprint``;
           defaults to "tot_corps" when the tree has that dataframe.
    :returns: The populated NAICS tree.
    :raises IOError: If either input file is missing from ``prop_dir``.
    """
    # Scaling factor applied to the amounts in the worksheet:
    prop_fctr = 10**3
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Finding the "\**sp01br" file in the proprietorships folder (if
    # several files match, the last one listed is used):
    sp01br_file = None
    sp01br_cross_file = None
    for file_nm in os.listdir(prop_dir):
        if file_nm[2:] == "sp01br.xls":
            sp01br_file = os.path.abspath(os.path.join(prop_dir, file_nm))
        if file_nm[2:] == "sp01br_Crosswalk.csv":
            sp01br_cross_file = os.path.abspath(
                os.path.join(prop_dir, file_nm))
    # Failing with a clear message instead of an unbound-variable error:
    if sp01br_file is None or sp01br_cross_file is None:
        raise IOError("Could not find the sp01br proprietorship data "
                      "and/or crosswalk file in " + str(prop_dir))
    # Opening nonfarm proprietor data:
    cur_wb = xlrd.open_workbook(sp01br_file)
    cur_ws = cur_wb.sheet_by_index(0)
    cur_cross = pd.read_csv(sp01br_cross_file)
    # Finding the relevant positions in worksheet:
    pos1 = naics.search_ws(cur_ws, "Industrial sector", 20, True,
                           [0, 0], True)
    pos2 = naics.search_ws(cur_ws, "Depreciation\ndeduction", 20)
    pos3 = naics.search_ws(cur_ws, "Depreciation\ndeduction", 20, True,
                           np.array(pos2) + np.array([0, 1]))
    # Initializing the proprietorship dataframe for every industry:
    for ind in data_tree.enum_inds:
        ind.append_dfs(("soi_prop", pd.DataFrame(np.zeros((1, 1)),
                                                 columns=["Depr Deductions"])))
    # Both cursors rotate, so each search resumes after the last match:
    num_cross = cur_cross.shape[0]
    num_inds = len(data_tree.enum_inds)
    cross_index = num_cross - 1
    enum_index = num_inds - 1
    for i in range(pos1[0], cur_ws.nrows):
        cur_cell = str(cur_ws.cell_value(i, pos1[1])).lower().strip()
        # Share of this worksheet row allocated to industries so far:
        tot_proportions = 0
        for j in range(0, num_cross):
            cross_index = (cross_index + 1) % num_cross
            cur_ind_name = str(cur_cross.iloc[cross_index, 0]).lower().strip()
            if cur_cell != cur_ind_name:
                continue
            # Crosswalk rows without NAICS codes are skipped:
            if pd.isnull(cur_cross.iloc[cross_index, 1]):
                continue
            ind_codes = str(cur_cross.iloc[cross_index, 1]).split(".")
            row_total = (cur_ws.cell_value(i, pos2[1]) +
                         cur_ws.cell_value(i, pos3[1]))
            for k in range(0, num_inds):
                enum_index = (enum_index + 1) % num_inds
                cur_data = data_tree.enum_inds[enum_index].data
                cur_codes = cur_data.dfs["Codes:"]
                cur_proportions = naics.compare_codes(ind_codes,
                                                      cur_codes.iloc[:, 0])
                if cur_proportions == 0:
                    continue
                tot_proportions += cur_proportions
                # A single .loc update replaces the chained
                # df[col][0] += ..., which may only modify a copy.
                cur_data.dfs["soi_prop"].loc[0, "Depr Deductions"] += (
                    prop_fctr * cur_proportions * row_total)
            # NOTE(review): the nesting level of this check was rebuilt
            # from whitespace-stripped source -- confirm it belongs to
            # the crosswalk loop rather than the industry loop.
            if tot_proportions == 1:
                break
    # Default:
    if (blueprint is None and
            "tot_corps" in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = "tot_corps"
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=["soi_prop"])
    naics.pop_forward(tree=data_tree, df_list=["soi_prop"],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def read_bea(asset_tree):
    """Read BEA fixed-asset data into a new NAICS tree, by asset type.

    Opens the workbook at ``_BEA_ASSET_PATH``, cross-references its
    worksheets with the "readme" sheet and the NAICS crosswalk csv, and
    apportions each BEA industry's assets to NAICS industries using the
    "FA" dataframes of ``asset_tree``.

    :param asset_tree: A NAICS tree whose industries hold an "FA"
           dataframe. It supplies the proportions and serves as the
           blueprint tree when populating forward.
    :returns: A newly generated NAICS tree whose industries hold "All",
           "Corp" and "Non-Corp" dataframes (one column per asset), or
           None if the layout of the readme sheet is not recognized.
    """
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(_BEA_ASSET_PATH)
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet (falling back on the first sheet):
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if sht_pos == [-1, -1]:
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False,
                                  [0, 0], True)
        # The failure check has to precede the column shift below;
        # otherwise a failed search becomes [-1, -2] and goes unnoticed.
        if sht_pos == [-1, -1]:
            print("Error in reading BEA fixed asset \"readme\" sheet.")
            return None
        # The industry titles sit one column left of the BEA codes:
        sht_pos[1] = sht_pos[1] - 1
    first_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # Making a list of BEA codes based on the names of the worksheets
    # (the first worksheet is skipped):
    bea_codes1 = np.zeros(num_shts - 1, dtype=object)
    for index in range(1, num_shts):
        bea_codes1[index - 1] = str(sht_names[index])
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.join(_BEA_DIR, "detailnonres_list.csv")
    asset_list = pd.read_csv(list_file)
    for i in range(0, asset_list.shape[0]):
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].replace("\xa0", " ")
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].strip()
    # Reading in the corresponding naics codes:
    naics_file = os.path.join(_BEA_DIR, "detailnonres_naics.csv")
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"]
    for i in range(0, naics_cross.shape[0]):
        naics_inds[i] = naics_inds[i].replace("\xa0", " ").strip()
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    chart_cols = ["Industry", "BEA Code", "NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts - 2, 3), dtype=object),
                             columns=chart_cols)
    # NOTE(review): the writes through these column objects (and through
    # bea_chart["BEA Code"][i] below) rely on pandas returning views of
    # the chart's columns -- confirm for the pandas version in use.
    bea_inds = bea_chart["Industry"]
    bea_naics = bea_chart["NAICS Code"]
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the
    # crosswalk. The readme is walked once; i is the chart row being
    # filled and only advances once a NAICS code has been found for it.
    naics_counter = 0
    i = 0
    for cur_row in range(first_row, bea_readme.nrows):
        # Stop once every chart row is filled, so i cannot index past
        # the end of the chart:
        if i >= bea_chart.shape[0]:
            break
        sheet_code = str(bea_codes1[i])
        raw_code = bea_readme.cell_value(cur_row, cur_col + 1)
        bea_code = unicode(raw_code).encode('utf8')
        if sheet_code != bea_code:
            # They may still match except one has ".0" at the end:
            bea_code = str(raw_code)[:-2]
            if sheet_code != bea_code:
                continue
        bea_ind = unicode(bea_readme.cell_value(
            cur_row, cur_col)).encode('utf8')
        bea_ind = bea_ind.replace('\xa0', ' ').strip()
        bea_ind = bea_ind.replace('\xc2', '').strip()
        bea_inds[i] = bea_ind
        bea_chart["BEA Code"][i] = bea_code
        # Rotating search so consecutive lookups resume where the
        # previous one stopped:
        for k in range(0, num_naics):
            naics_counter = (naics_counter + 1) % num_naics
            if naics_inds[naics_counter] == bea_inds[i]:
                bea_naics[i] = naics_cross["NAICS"][naics_counter]
                i += 1
                break
    # Initializing the table of assets:
    bea_table = pd.DataFrame(
        np.zeros((asset_list.shape[0], bea_chart.shape[0])),
        columns=bea_chart["BEA Code"])
    # For each industry, reading the last column of its worksheet:
    for i in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(i)
        sht_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        # Normalizing the sheet's asset labels once per sheet:
        sheet_labels = []
        for k in range(sht_pos[0] + 2, cur_sht.nrows):
            cur_cell = unicode(cur_sht.cell_value(
                k, sht_pos[1] + 1)).encode('utf8')
            sheet_labels.append((k, cur_cell.replace("\xa0", " ").strip()))
        for j in range(0, len(asset_list)):
            cur_asset = asset_list.iloc[j, 0]
            # No break on a match: the last matching row wins.
            for k, cur_cell in sheet_labels:
                if cur_asset == cur_cell:
                    bea_table[i][j] = float(
                        cur_sht.cell_value(k, cur_sht.ncols - 1))
    # The dollar amounts are in millions:
    # NOTE(review): convert_objects is unavailable in recent pandas
    # releases -- confirm against the pinned pandas version.
    bea_table = bea_table.convert_objects(convert_numeric=True).fillna(0)
    bea_table = bea_table * _BEA_IN_FILE_FCTR
    # Initialize tree for assets data:
    fixed_asset_tree = naics.generate_tree()
    for i in range(0, len(fixed_asset_tree.enum_inds)):
        for df_nm in ("All", "Corp", "Non-Corp"):
            fixed_asset_tree.enum_inds[i].data.append(
                (df_nm, pd.DataFrame(np.zeros((1, asset_list.shape[0])),
                                     columns=asset_list.iloc[:, 0])))
    # Fill in data from BEA's fixed asset table:
    num_inds = len(fixed_asset_tree.enum_inds)
    enum_index = len(asset_tree.enum_inds) - 1
    for i in range(0, bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA").iloc[1, :]
        corp_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA", _CORP_NMS).iloc[1, :]
        non_corp_proportions = naics.get_proportions(
            cur_codes, asset_tree, "FA", _NCORP_NMS).iloc[1, :]
        for code_index in range(0, len(cur_codes)):
            for j in range(0, num_inds):
                enum_index = (enum_index + 1) % num_inds
                out_dfs = asset_tree.enum_inds[enum_index].data.dfs
                fa_total = sum(out_dfs["FA"].iloc[0, :])
                # Industries without fixed assets are skipped:
                if fa_total == 0:
                    continue
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in _CORP_NMS:
                    corp_ratio += out_dfs["FA"][category][0] / fa_total
                for category in _NCORP_NMS:
                    non_corp_ratio += out_dfs["FA"][category][0] / fa_total
                cur_data = fixed_asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:, 0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if share == 0:
                    continue
                num_assets = fixed_asset_tree.enum_inds[0].data.dfs[
                    "All"].shape[1]
                for k in range(0, num_assets):
                    cur_data.dfs["All"].iloc[0, k] = (
                        bea_table.iloc[k, i] * all_ratio *
                        all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * corp_ratio *
                        corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0, k] = (
                        bea_table.iloc[k, i] * non_corp_ratio *
                        non_corp_proportions[code_index])
                # Move on to the next code once an industry is filled:
                break
            if tot_share == 1:
                break
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(fixed_asset_tree, ["All", "Corp", "Non-Corp"])
    naics.pop_forward(tree=fixed_asset_tree, df_list=["All"],
                      blueprint="FA", blue_tree=asset_tree)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_CORP_NMS)
    naics.pop_forward(tree=fixed_asset_tree, df_list=["Non-Corp"],
                      blueprint="FA", blue_tree=asset_tree,
                      sub_print=_NCORP_NMS)
    return fixed_asset_tree
def load_pa_03_data(data_tree=None, blue_tree=None, blueprint=None):
    """Load the SOI partnership asset data (the "pa03" table).

    The worksheet and its crosswalk are the files in ``prt_dir`` whose
    names contain "pa03.xls" and "pa03_Crosswalk.csv". For each of
    depreciable assets, accumulated depreciation, inventories and land,
    the first worksheet row naming it is read as the figure for all
    partnerships and the next such row as the figure for partnerships
    with income.

    :param data_tree: The NAICS tree to read the data into. A new tree
           is generated when omitted.
    :param blue_tree: Passed to ``naics.pop_forward`` as ``blue_tree``.
    :param blueprint: Passed to ``naics.pop_forward`` as ``blueprint``;
           defaults to "tot_corps" when the tree has that dataframe.
    :returns: The tree returned by ``naics.load_data_with_cross``, after
           populating the "PA_assets" dataframes back and forward.
    :raises IOError: If either input file is missing from ``prt_dir``.
    """
    # Defining constants:
    pa_03_fctr = 10 ** 3
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Locating the input files (if several files match, the last one
    # listed is used):
    pa_03_file = None
    pa_03_cross_file = None
    for file_nm in os.listdir(prt_dir):
        if "pa03.xls" in file_nm:
            pa_03_file = os.path.abspath(os.path.join(prt_dir, file_nm))
        elif "pa03_Crosswalk.csv" in file_nm:
            pa_03_cross_file = os.path.abspath(
                os.path.join(prt_dir, file_nm))
    # Failing with a clear message instead of an unbound-variable error:
    if pa_03_file is None or pa_03_cross_file is None:
        raise IOError("Could not find the pa03 partnership data and/or "
                      "crosswalk file in " + str(prt_dir))
    # Inputting data on depreciable fixed assets, inventories, and land:
    book_03 = xlrd.open_workbook(pa_03_file)
    sheet_03 = book_03.sheet_by_index(0)
    # Finding the relevant details about the table, e.g. dimensions:
    cur_rows = sheet_03.nrows
    # The following categories of data to be extracted:
    cols_03 = ["Depreciable assets (Net)",
               "Accumulated depreciation (Net)",
               "Inventories (Net)", "Land (Net)",
               "Depreciable assets (Income)",
               "Accumulated depreciation (Income)",
               "Inventories (Income)", "Land (Income)"]
    # The more general column names that are used in the input file:
    gen_03 = ["Depreciable assets", "Accumulated depreciation",
              "Inventories", "Land"]
    # The data to be extracted on partnerships as a whole:
    tot_data_03 = [None] * len(gen_03)
    # The data to be extracted on partnerships with income:
    inc_data_03 = [None] * len(gen_03)
    # Extracting the data (note that the rows with total data appear first):
    for i, gen_nm in enumerate(gen_03):
        row_key = gen_nm.lower()
        for row1 in range(0, cur_rows):
            if row_key not in str(sheet_03.cell_value(row1, 0)).lower():
                continue
            tot_data_03[i] = sheet_03.row_values(row1, 1)
            # The next row with the same label holds the income data:
            for row2 in range(row1 + 1, cur_rows):
                cur_cell = str(sheet_03.cell_value(row2, 0)).lower()
                if row_key in cur_cell:
                    inc_data_03[i] = sheet_03.row_values(row2, 1)
                    break
            break
    # Reformatting the data:
    data_03 = pd.concat([pd.DataFrame(tot_data_03).T,
                         pd.DataFrame(inc_data_03).T], axis=1)
    # Data is in the thousands:
    data_03 = data_03 * pa_03_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa03cross = pd.read_csv(pa_03_cross_file)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=data_03,
        cross_df=pa03cross, data_cols=cols_03,
        df_name="PA_assets"
    )
    # Default blueprint is tot_corps:
    if (blueprint is None and
            "tot_corps" in data_tree.enum_inds[0].data.dfs.keys()):
        blueprint = "tot_corps"
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=["PA_assets"])
    naics.pop_forward(tree=data_tree, df_list=["PA_assets"],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_asset(data_tree=None,
               blue_tree=None, blueprint=None,
               from_out=False, out_path=None):
    """ This function loads the soi partnership asset data.

    :param data_tree: The NAICS tree to read the data into. A new tree
           is generated when omitted.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param out_path: The path of the output file to read when from_out
           is set. Defaults to ``_AST_OUT_PATH``.
    :returns: The NAICS tree with the partnership asset dataframe.
    """
    # A default tree built in the signature would be created once, at
    # definition time, and then shared (and mutated) by every call.
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Initializing the output path:
    if out_path is None:
        out_path = _AST_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, tree=data_tree)
        return data_tree
    # Opening data on depreciable fixed assets, inventories, and land:
    wb = xlrd.open_workbook(_AST_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Columns of the asset dataframe:
    df_cols = list(_AST_DF_DICT.values())
    # Initializing dataframe to hold pertinent asset data (one row per
    # worksheet column after the label column):
    ast_df = pd.DataFrame(np.zeros((ws.ncols - 1, len(df_cols))),
                          columns=df_cols)
    # Extracting the data (note that the rows with total data appear
    # first). For each input row:
    for in_row_nm in _AST_IN_ROW_NMS:
        # Dataframe column for the asset totals of all partnerships:
        df_net_col_nm = _AST_DF_DICT[_AST_IN_ROWS_DF_NET_DICT[in_row_nm]]
        # Dataframe column for the assets of net income partnerships:
        df_inc_col_nm = _AST_DF_DICT[_AST_IN_ROWS_DF_INC_DICT[in_row_nm]]
        row_key = in_row_nm.lower()
        # Finding the first input row with in_row_nm:
        for in_row1 in range(0, num_rows):
            if row_key not in str(ws.cell_value(in_row1, 0)).lower():
                continue
            # Total asset data:
            ast_df[df_net_col_nm] = ws.row_values(in_row1, 1)
            # Finding the second input row with in_row_nm:
            for in_row2 in range(in_row1 + 1, num_rows):
                if row_key in str(ws.cell_value(in_row2, 0)).lower():
                    # Asset data for companies with net income:
                    ast_df[df_inc_col_nm] = ws.row_values(in_row2, 1)
                    break
            break
    # Scaling the data to the correct units:
    ast_df = ast_df * _AST_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    ast_cross = pd.read_csv(_AST_IN_CROSS_PATH)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=ast_df,
        cross_df=ast_cross, df_nm=_AST_DF_NM
    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_AST_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_AST_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
data_tree = naics.load_naics(data_folder + "\\2012_NAICS_Codes.csv") # Reading in the SOI Tax Stats-Corporation Data: naics.load_soi_corporate_data(data_tree, data_folder) # Reading in the SOI Tax Stats-Partnership Data: naics.load_soi_partner_data(data_tree, data_folder) # Reading in the SOI Tax Stats-Proprietorship Data: naics.load_soi_proprietor_data(data_tree, data_folder) ''' Many industries are not listed in the SOI datasets. The data for these missing industries are interpolated. ''' # Get a list of the names of all the pd dfs besides the list of codes: cur_names = data_tree.enum_inds[0].data.dfs.keys() cur_names.remove("Codes:") # Populate missing industry data backwards throught the tree: naics.pop_back(data_tree, cur_names) # Populate the missing total corporate data forwards through the tree: naics.pop_forward(data_tree, ["tot_corps"]) # Populate other missing data using tot_corps as a "blueprint": cur_names = ["c_corps", "s_corps", "PA_inc/loss", "PA_assets", "soi_prop"] naics.pop_forward(data_tree, cur_names, "tot_corps") # Populate pa05 using pa01: naics.pop_forward(data_tree, ["PA_types"], "PA_inc/loss") # naics.pop_back(data_tree, ["farm_prop"]) naics.pop_forward(data_tree, ["farm_prop"], "tot_corps") #Create an output tree containing only the final data on FA, INV, and LAND. output_tree = naics.summary_tree(data_tree, data_folder) # Create a tree with all the FA's broken down by type of asset: