def load_type(data_tree=None, blue_tree=None, blueprint=None,
              from_out=False, out_path=None):
    """Load the SOI partnership income-by-partner-type data.

    :param data_tree: The NAICS tree to read the data into. When omitted, a
           new tree is built with ``naics.generate_tree()`` on every call
           (a default evaluated in the signature would be created once and
           then shared, and mutated, by all later calls).
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward. When omitted, ``_INC_DF_NM`` is used if the tree has
           that dataframe, otherwise ``_TOT_CORP_DF_NM`` if present.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param out_path: Path handed to ``naics.load_tree_dfs`` when
           ``from_out`` is true. Defaults to ``_TYP_OUT_PATH``.
    :returns: The tree produced by the ``naics`` helpers.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Initializing the output path:
    if out_path is None:
        out_path = _TYP_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)

    # Opening data on income by partner type:
    wb = xlrd.open_workbook(_TYP_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows

    # Initializing dataframe to hold pertinent type income data. One
    # dataframe row per worksheet column after the label column.
    typ_df = pd.DataFrame(
        np.zeros((ws.ncols - 1, len(_TYP_IN_ROW_NMS))),
        columns=list(_TYP_DF_DICT.values())
    )

    # Extracting the data. For each input row name, copy the first worksheet
    # row whose label (column 0) contains it, case-insensitively:
    for in_row_nm in _TYP_IN_ROW_NMS:
        df_col_key = _TYP_IN_ROWS_DF_DICT[in_row_nm]
        df_col_nm = _TYP_DF_DICT[df_col_key]
        target = in_row_nm.lower()
        for ws_row_index in range(num_rows):
            ws_row_nm = str(ws.cell_value(ws_row_index, 0)).lower()
            if target in ws_row_nm:
                typ_df[df_col_nm] = ws.row_values(ws_row_index, 1)
                break

    # Scaling the data to the correct units:
    typ_df = typ_df * _TYP_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    typ_cross = pd.read_csv(_TYP_IN_CROSS_PATH)
    # Processing the partner-type data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=typ_df,
        cross_df=typ_cross, df_nm=_TYP_DF_NM
    )

    # Default blueprint is partner income, and, if not, then tot_corps:
    if blueprint is None:
        df_keys = data_tree.enum_inds[0].data.dfs.keys()
        if _INC_DF_NM in df_keys:
            blueprint = _INC_DF_NM
        elif _TOT_CORP_DF_NM in df_keys:
            blueprint = _TOT_CORP_DF_NM

    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TYP_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_TYP_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_pa_05_data(data_tree=None, blue_tree=None, blueprint=None):
    """Load the "pa05" partner-type worksheet into a NAICS tree.

    The worksheet and its crosswalk are located by scanning ``prt_dir`` for
    file names containing ``"pa05.xls"`` and ``"pa05_Crosswalk.csv"``.

    :param data_tree: The NAICS tree to read the data into. A new tree is
           generated with ``naics.generate_tree()`` when omitted.
    :param blue_tree: Forwarded to ``naics.pop_forward``.
    :param blueprint: Key of the dataframe used as the blueprint by
           ``naics.pop_forward``. When omitted, ``"PA_inc_loss"`` is used if
           the tree has it, otherwise ``"tot_corps"`` if present.
    :returns: The tree with the ``"PA_types"`` dataframe loaded.
    :raises IOError: If either input file cannot be found in ``prt_dir``.
    """
    # Defining constant factor (e.g. data is in thousands):
    pa_05_fctr = 10 ** 3
    # Defining constant list of types of partners:
    cols_05 = ["Corporate general partners",
               "Corporate limited partners",
               "Individual general partners",
               "Individual limited partners",
               "Partnership general partners",
               "Partnership limited partners",
               "Tax-exempt organization general partners",
               "Tax-exempt organization limited partners",
               "Nominee and other general partners",
               "Nominee and other limited partners"]
    if data_tree is None:
        data_tree = naics.generate_tree()

    # Locating the input files. os.path.join keeps the path valid on every
    # platform (a literal backslash is only a separator on Windows). If
    # several names match, the last one listed wins.
    pa_05_file = None
    pa_05_cross_file = None
    for file_nm in os.listdir(prt_dir):
        if "pa05.xls" in file_nm:
            pa_05_file = os.path.abspath(os.path.join(prt_dir, file_nm))
        elif "pa05_Crosswalk.csv" in file_nm:
            pa_05_cross_file = os.path.abspath(os.path.join(prt_dir, file_nm))
    if pa_05_file is None:
        raise IOError("No file containing 'pa05.xls' found in %s" % prt_dir)
    if pa_05_cross_file is None:
        raise IOError(
            "No file containing 'pa05_Crosswalk.csv' found in %s" % prt_dir
        )

    book_05 = xlrd.open_workbook(pa_05_file)
    sheet_05 = book_05.sheet_by_index(0)
    cur_rows = sheet_05.nrows

    # Extracting the relevant data: for each partner type, take the first
    # worksheet row whose label (column 0) contains it, case-insensitively.
    # Entries stay None when no row matches.
    data_05 = [None] * len(cols_05)
    for idx, col_nm in enumerate(cols_05):
        target = col_nm.lower()
        for row in range(cur_rows):
            if target in str(sheet_05.cell_value(row, 0)).lower():
                data_05[idx] = sheet_05.row_values(row, 1)
                break

    # Reformatting the data so that each partner type is a column:
    data_05 = pd.DataFrame(data_05).T
    # Data is in thousands of dollars:
    data_05 = data_05 * pa_05_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa05cross = pd.read_csv(pa_05_cross_file)
    # Processing the partner-type data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=data_05,
        cross_df=pa05cross, data_cols=cols_05,
        df_name="PA_types"
    )

    # Defaults:
    if blueprint is None:
        df_keys = data_tree.enum_inds[0].data.dfs.keys()
        if "PA_inc_loss" in df_keys:
            blueprint = "PA_inc_loss"
        elif "tot_corps" in df_keys:
            blueprint = "tot_corps"

    naics.pop_back(tree=data_tree, df_list=["PA_types"])
    naics.pop_forward(tree=data_tree, df_list=["PA_types"],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_income(data_tree=None, blue_tree=None, blueprint=None,
                from_out=False, out_path=None):
    """Load the SOI partnership income data.

    :param data_tree: The NAICS tree to read the data into. When omitted, a
           new tree is built with ``naics.generate_tree()`` on every call
           (a default evaluated in the signature would be created once and
           then shared, and mutated, by all later calls).
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward. When omitted, ``_TOT_CORP_DF_NM`` is used if the tree
           has that dataframe.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param out_path: Path handed to ``naics.load_tree_dfs`` when
           ``from_out`` is true. Defaults to ``_INC_OUT_PATH``.
    :returns: The tree produced by the ``naics`` helpers.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Initializing the output path:
    if out_path is None:
        out_path = _INC_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)

    # Opening data on net income/loss:
    wb = xlrd.open_workbook(_INC_IN_PATH)
    ws = wb.sheet_by_index(0)
    start_col = naics.search_ws(ws, _INC_STRT_COL_NM, 20)[1]

    # Initializing dataframe to hold pertinent income/loss data:
    data_df = pd.DataFrame(np.zeros((ws.ncols - start_col, 3)),
                           columns=_INC_PRT_DF_COL_NMS)

    # Extracting the data from the worksheet. The scan stops at the net
    # income row, so a depreciation row is only picked up when it appears
    # before that row.
    for row in range(ws.nrows):
        row_label = str(ws.cell_value(row, 0)).lower()
        if _INC_NET_INC_ROW_NM in row_label:
            # The income and loss figures are read from the two rows
            # directly below the matched row:
            data_df[_INC_NET_INC_COL_NM] = ws.row_values(row + 1, start_col)
            data_df[_INC_NET_LOSS_COL_NM] = ws.row_values(row + 2, start_col)
            break
        if _INC_DEPR_ROW_NM in row_label:
            data_df[_INC_DEPR_COL_NM] = ws.row_values(row, start_col)

    # Scaling the data to the correct units:
    data_df = data_df * _INC_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(_INC_IN_CROSS_PATH)
    # Processing the inc/loss data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=data_df,
        cross_df=pa01cross, df_nm=_INC_DF_NM
    )

    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM

    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_INC_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_INC_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_pa_01_data(data_tree=None, blue_tree=None, blueprint=None):
    """Load the "pa01" net income/loss worksheet into a NAICS tree.

    The worksheet and its crosswalk are located by scanning ``prt_dir`` for
    file names containing ``"pa01.xls"`` and ``"pa01_Crosswalk.csv"``.

    :param data_tree: The NAICS tree to read the data into. It is forwarded
           unchanged to ``naics.load_data_with_cross``.
    :param blue_tree: Forwarded to ``naics.pop_forward``.
    :param blueprint: Key of the dataframe used as the blueprint by
           ``naics.pop_forward``. When omitted, ``"tot_corps"`` is used if
           the tree has that dataframe.
    :returns: The tree with the ``"PA_inc_loss"`` dataframe loaded.
    :raises IOError: If either input file cannot be found in ``prt_dir``.
    """
    # Defining constants (the loaded values are multiplied by this factor):
    pa_01_fctr = 10 ** 3
    # NOTE(review): unlike load_pa_03_data and load_pa_05_data, no tree is
    # generated here when data_tree is None; the original guard was disabled.
    # Confirm that naics.load_data_with_cross accepts data_tree=None.

    # Names of the files with the partnership data. os.path.join keeps the
    # path valid on every platform (a literal backslash is only a separator
    # on Windows). If several names match, the last one listed wins.
    pa_01_file = None
    pa_01_cross_file = None
    for file_nm in os.listdir(prt_dir):
        if "pa01.xls" in file_nm:
            pa_01_file = os.path.abspath(os.path.join(prt_dir, file_nm))
        elif "pa01_Crosswalk.csv" in file_nm:
            pa_01_cross_file = os.path.abspath(os.path.join(prt_dir, file_nm))
    if pa_01_file is None:
        raise IOError("No file containing 'pa01.xls' found in %s" % prt_dir)
    if pa_01_cross_file is None:
        raise IOError(
            "No file containing 'pa01_Crosswalk.csv' found in %s" % prt_dir
        )

    # Inputting data on net income/loss by industry from "**pa01.xls":
    book_01 = xlrd.open_workbook(pa_01_file)
    sheet_01 = book_01.sheet_by_index(0)
    num_rows = sheet_01.nrows
    # The data to be extracted:
    cols_01 = ["Total net income", "Total net loss", "Depreciation"]
    num_cols = sheet_01.ncols
    start_col = naics.search_ws(sheet_01, "All\nindustries", 20)[1]
    data_01 = pd.DataFrame(np.zeros((num_cols - start_col, 3)),
                           columns=cols_01)

    # Extracting the data. The scan stops at the "total net income" row, so
    # a depreciation row is only picked up when it appears before that row.
    for row in range(num_rows):
        row_label = str(sheet_01.cell_value(row, 0)).lower()
        if "total net income" in row_label:
            # The income and loss figures are read from the two rows
            # directly below the matched row:
            data_01["Total net income"] = sheet_01.row_values(row + 1,
                                                              start_col)
            data_01["Total net loss"] = sheet_01.row_values(row + 2,
                                                            start_col)
            break
        if "depreciation" in row_label:
            data_01["Depreciation"] = sheet_01.row_values(row, start_col)

    # Scaling the data by the constant factor:
    data_01 = data_01 * pa_01_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(pa_01_cross_file)
    # Processing the inc/loss data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=data_01,
        cross_df=pa01cross, df_name="PA_inc_loss"
    )

    # Default blueprint is tot_corps:
    has_tot_df = "tot_corps" in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = "tot_corps"

    naics.pop_back(tree=data_tree, df_list=["PA_inc_loss"])
    naics.pop_forward(tree=data_tree, df_list=["PA_inc_loss"],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_asset(data_tree=None, blue_tree=None, blueprint=None,
               from_out=False):
    """Load the SOI partnership asset data.

    NOTE(review): a later definition of ``load_asset`` in this file (the one
    that also accepts ``out_path``) rebinds the name, so this version is not
    the one callers get. Consider removing it.

    :param data_tree: The NAICS tree to read the data into. When omitted, a
           new tree is built with ``naics.generate_tree()`` on every call
           (a default evaluated in the signature would be created once and
           then shared, and mutated, by all later calls).
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward. When omitted, ``_TOT_CORP_DF_NM`` is used if the tree
           has that dataframe.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param from_out: Whether to read in the data from output
           (``_AST_OUT_PATH``).
    :returns: The tree produced by the ``naics`` helpers.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # If from_out, load the data tree from output:
    if from_out:
        # NOTE(review): the other loaders in this file call load_tree_dfs
        # with input_path=...; confirm which keyword the helper accepts.
        return naics.load_tree_dfs(input_file=_AST_OUT_PATH, tree=data_tree)

    # Opening data on depreciable fixed assets, inventories, and land:
    wb = xlrd.open_workbook(_AST_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Columns of the asset dataframe:
    df_cols = list(_AST_DF_DICT.values())
    # Initializing dataframe to hold pertinent asset data:
    ast_df = pd.DataFrame(np.zeros((ws.ncols - 1, len(df_cols))),
                          columns=df_cols)

    # Extracting the data (note that the rows with total data appear first).
    # For each input row name, the first matching worksheet row fills the
    # "net" column and the next matching row after it fills the "inc" column.
    for in_row_nm in _AST_IN_ROW_NMS:
        # Dataframe column for the first match (total asset data):
        df_net_col_key = _AST_IN_ROWS_DF_NET_DICT[in_row_nm]
        df_net_col_nm = _AST_DF_DICT[df_net_col_key]
        # Dataframe column for the second match (net income partnerships):
        df_inc_col_key = _AST_IN_ROWS_DF_INC_DICT[in_row_nm]
        df_inc_col_nm = _AST_DF_DICT[df_inc_col_key]
        target = in_row_nm.lower()
        # Finding the first input row with in_row_nm:
        for in_row1 in range(num_rows):
            if target not in str(ws.cell_value(in_row1, 0)).lower():
                continue
            # Total asset data:
            ast_df[df_net_col_nm] = ws.row_values(in_row1, 1)
            # Finding the second input row with in_row_nm:
            for in_row2 in range(in_row1 + 1, num_rows):
                if target in str(ws.cell_value(in_row2, 0)).lower():
                    # Asset data for companies with net income:
                    ast_df[df_inc_col_nm] = ws.row_values(in_row2, 1)
                    break
            break

    # Scaling the data to the correct units:
    ast_df = ast_df * _AST_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    ast_cross = pd.read_csv(_AST_IN_CROSS_PATH)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=ast_df,
        cross_df=ast_cross, df_nm=_AST_DF_NM
    )

    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM

    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_AST_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_AST_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_pa_03_data(data_tree=None, blue_tree=None, blueprint=None):
    """Load the "pa03" asset worksheet into a NAICS tree.

    The worksheet and its crosswalk are located by scanning ``prt_dir`` for
    file names containing ``"pa03.xls"`` and ``"pa03_Crosswalk.csv"``.

    :param data_tree: The NAICS tree to read the data into. A new tree is
           generated with ``naics.generate_tree()`` when omitted.
    :param blue_tree: Forwarded to ``naics.pop_forward``.
    :param blueprint: Key of the dataframe used as the blueprint by
           ``naics.pop_forward``. When omitted, ``"tot_corps"`` is used if
           the tree has that dataframe.
    :returns: The tree with the ``"PA_assets"`` dataframe loaded.
    :raises IOError: If either input file cannot be found in ``prt_dir``.
    """
    # Defining constants (data is in the thousands):
    pa_03_fctr = 10 ** 3
    if data_tree is None:
        data_tree = naics.generate_tree()

    # Locating the input files. os.path.join keeps the path valid on every
    # platform (a literal backslash is only a separator on Windows). If
    # several names match, the last one listed wins.
    pa_03_file = None
    pa_03_cross_file = None
    for file_nm in os.listdir(prt_dir):
        if "pa03.xls" in file_nm:
            pa_03_file = os.path.abspath(os.path.join(prt_dir, file_nm))
        elif "pa03_Crosswalk.csv" in file_nm:
            pa_03_cross_file = os.path.abspath(os.path.join(prt_dir, file_nm))
    if pa_03_file is None:
        raise IOError("No file containing 'pa03.xls' found in %s" % prt_dir)
    if pa_03_cross_file is None:
        raise IOError(
            "No file containing 'pa03_Crosswalk.csv' found in %s" % prt_dir
        )

    # Inputting data on depreciable fixed assets, inventories, and land:
    book_03 = xlrd.open_workbook(pa_03_file)
    sheet_03 = book_03.sheet_by_index(0)
    # Finding the relevant details about the table, e.g. dimensions:
    cur_rows = sheet_03.nrows
    # The following categories of data to be extracted:
    cols_03 = ["Depreciable assets (Net)",
               "Accumulated depreciation (Net)",
               "Inventories (Net)",
               "Land (Net)",
               "Depreciable assets (Income)",
               "Accumulated depreciation (Income)",
               "Inventories (Income)",
               "Land (Income)"]
    # The more general column names that are used in the input file:
    gen_03 = ["Depreciable assets", "Accumulated depreciation",
              "Inventories", "Land"]
    # The data to be extracted on partnerships as a whole:
    tot_data_03 = [None] * len(gen_03)
    # The data to be extracted on partnerships with income:
    inc_data_03 = [None] * len(gen_03)

    # Extracting the data (note that the rows with total data appear first):
    # the first matching row is the total, the next matching row after it is
    # the with-income figure. Entries stay None when no row matches.
    for idx, gen_nm in enumerate(gen_03):
        target = gen_nm.lower()
        for row1 in range(cur_rows):
            if target not in str(sheet_03.cell_value(row1, 0)).lower():
                continue
            tot_data_03[idx] = sheet_03.row_values(row1, 1)
            for row2 in range(row1 + 1, cur_rows):
                if target in str(sheet_03.cell_value(row2, 0)).lower():
                    inc_data_03[idx] = sheet_03.row_values(row2, 1)
                    break
            break

    # Reformatting the data: totals first, then with-income, side by side.
    data_03 = pd.concat([pd.DataFrame(tot_data_03).T,
                         pd.DataFrame(inc_data_03).T], axis=1)
    # Data is in the thousands:
    data_03 = data_03 * pa_03_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa03cross = pd.read_csv(pa_03_cross_file)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=data_03,
        cross_df=pa03cross, data_cols=cols_03,
        df_name="PA_assets"
    )

    # Default blueprint is tot_corps:
    has_tot_df = "tot_corps" in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = "tot_corps"

    naics.pop_back(tree=data_tree, df_list=["PA_assets"])
    naics.pop_forward(tree=data_tree, df_list=["PA_assets"],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree
def load_asset(data_tree=None, blue_tree=None, blueprint=None,
               from_out=False, out_path=None):
    """Load the SOI partnership asset data.

    :param data_tree: The NAICS tree to read the data into. When omitted, a
           new tree is built with ``naics.generate_tree()`` on every call
           (a default evaluated in the signature would be created once and
           then shared, and mutated, by all later calls).
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes
           forward. When omitted, ``_TOT_CORP_DF_NM`` is used if the tree
           has that dataframe.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The
           default is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    :param out_path: Path handed to ``naics.load_tree_dfs`` when
           ``from_out`` is true. Defaults to ``_AST_OUT_PATH``.
    :returns: The tree produced by the ``naics`` helpers.
    """
    if data_tree is None:
        data_tree = naics.generate_tree()
    # Initializing the output path:
    if out_path is None:
        out_path = _AST_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        return naics.load_tree_dfs(input_path=out_path, tree=data_tree)

    # Opening data on depreciable fixed assets, inventories, and land:
    wb = xlrd.open_workbook(_AST_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Columns of the asset dataframe:
    df_cols = list(_AST_DF_DICT.values())
    # Initializing dataframe to hold pertinent asset data:
    ast_df = pd.DataFrame(np.zeros((ws.ncols - 1, len(df_cols))),
                          columns=df_cols)

    # Extracting the data (note that the rows with total data appear first).
    # For each input row name, the first matching worksheet row fills the
    # "net" column and the next matching row after it fills the "inc" column.
    for in_row_nm in _AST_IN_ROW_NMS:
        # Dataframe column for the first match (total asset data):
        df_net_col_key = _AST_IN_ROWS_DF_NET_DICT[in_row_nm]
        df_net_col_nm = _AST_DF_DICT[df_net_col_key]
        # Dataframe column for the second match (net income partnerships):
        df_inc_col_key = _AST_IN_ROWS_DF_INC_DICT[in_row_nm]
        df_inc_col_nm = _AST_DF_DICT[df_inc_col_key]
        target = in_row_nm.lower()
        # Finding the first input row with in_row_nm:
        for in_row1 in range(num_rows):
            if target not in str(ws.cell_value(in_row1, 0)).lower():
                continue
            # Total asset data:
            ast_df[df_net_col_nm] = ws.row_values(in_row1, 1)
            # Finding the second input row with in_row_nm:
            for in_row2 in range(in_row1 + 1, num_rows):
                if target in str(ws.cell_value(in_row2, 0)).lower():
                    # Asset data for companies with net income:
                    ast_df[df_inc_col_nm] = ws.row_values(in_row2, 1)
                    break
            break

    # Scaling the data to the correct units:
    ast_df = ast_df * _AST_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    ast_cross = pd.read_csv(_AST_IN_CROSS_PATH)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
        data_tree=data_tree, data_df=ast_df,
        cross_df=ast_cross, df_nm=_AST_DF_NM
    )

    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint is None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM

    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_AST_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_AST_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree