Python load_data_with_cross Examples, naics_processing.load_data_with_cross Python Examples

Example #1

0

Show file

def load_type(data_tree=naics.generate_tree(),
               blue_tree = None, blueprint = None,
               from_out=False, out_path=None):
    """ This function loads the soi partnership asset data.
    
    :param data_tree: The NAICS tree to read the data into.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    """
    # Initializing the output path:
    if out_path == None:
        out_path = _TYP_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path,
                                        tree=data_tree)
        return data_tree
    # Opening data on income by partner type:
    wb = xlrd.open_workbook(_TYP_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Initializing dataframe to hold pertinent type income data:
    typ_df = pd.DataFrame(np.zeros((ws.ncols-1, len(_TYP_IN_ROW_NMS))),
                          columns=_TYP_DF_DICT.values())
    # Extracting the data. For each input row:
    for in_row_nm in _TYP_IN_ROW_NMS:
        
        df_col_key = _TYP_IN_ROWS_DF_DICT[in_row_nm]
        df_col_nm = _TYP_DF_DICT[df_col_key]
        in_row_nm = in_row_nm.lower()
        for ws_row_index in xrange(0, num_rows):
            ws_row_nm = str(ws.cell_value(ws_row_index,0)).lower()
            if(in_row_nm in ws_row_nm):
                typ_df[df_col_nm] = ws.row_values(ws_row_index,1)
                break
    # Scaling the data to the correct units:
    typ_df = typ_df * _TYP_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    typ_cross = pd.read_csv(_TYP_IN_CROSS_PATH)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=typ_df,
                    cross_df=typ_cross, df_nm=_TYP_DF_NM
                    )
    # Default blueprint is partner income, and, if not, then tot_corps:
    has_inc_df = _INC_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint == None and has_inc_df:
        blueprint = _INC_DF_NM
    elif blueprint == None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_TYP_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_TYP_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree

Example #2

0

Show file

File: pull_soi.py Project: evan-magnusson/dynamic

def load_pa_05_data(data_tree = None, blue_tree = None, blueprint = None):
    # Defining constant factor (e.g. data is in thousands):
    pa_05_fctr = 10 ** 3
    # Defining constant list of types of partners:
    cols_05 = ["Corporate general partners", 
               "Corporate limited partners",
               "Individual general partners",
               "Individual limited partners",
               "Partnership general partners",
               "Partnership limited partners",
               "Tax-exempt organization general partners",
               "Tax-exempt organization limited partners",
               "Nominee and other general partners", 
               "Nominee and other limited partners"]
    if data_tree == None:
        data_tree = naics.generate_tree()
    #
    for i in os.listdir(prt_dir):
        if("pa05.xls" in i):
            pa_05_file = os.path.abspath(prt_dir + "\\" + i)
        elif("pa05_Crosswalk.csv" in i):
            pa_05_cross_file = os.path.abspath(prt_dir + "\\" + i)
    #
    book_05 = xlrd.open_workbook(pa_05_file)
    sheet_05 = book_05.sheet_by_index(0)
    cur_rows = sheet_05.nrows
    # Extracting the relevant data:
    data_05 = [None]*len(cols_05)
    for i in xrange(0, len(cols_05)):
        for row in xrange(0, cur_rows):
            if(cols_05[i].lower() in str(sheet_05.cell_value(row,0)).lower()):
                data_05[i] = sheet_05.row_values(row,1)
                break
    # Reformatting the data:
    data_05 = pd.DataFrame(data_05).T
    # Data is in thousands of dollars:
    data_05 = data_05 * pa_05_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa05cross = pd.read_csv(pa_05_cross_file)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree = data_tree, data_df = data_05,
                    cross_df = pa05cross, data_cols = cols_05,
                    df_name = "PA_types"
                    )
    # Defaults:
    if blueprint == None and "PA_inc_loss" in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = "PA_inc_loss"
    elif blueprint == None and "tot_corps" in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["PA_types"])
    naics.pop_forward(tree=data_tree, df_list=["PA_types"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree

Example #3

0

Show file

def load_income(data_tree=naics.generate_tree(),
                blue_tree=None, blueprint=None,
                from_out=False, out_path=None):
    """ This function loads the soi partnership income data.
    
    :param data_tree: The NAICS tree to read the data into.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    """
    # Initializing the output path:
    if out_path == None:
        out_path = _INC_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path, 
                                        tree=data_tree)
        return data_tree
    # Opening data on net income/loss:
    wb = xlrd.open_workbook(_INC_IN_PATH)
    ws = wb.sheet_by_index(0)
    start_col = naics.search_ws(ws, _INC_STRT_COL_NM, 20)[1]
    # Initializing dataframe to hold pertinent income/loss data:
    data_df = pd.DataFrame(np.zeros((ws.ncols-start_col,3)), 
                           columns = _INC_PRT_DF_COL_NMS)
    # Extracting the data from the worksheet:
    for row in xrange(0, ws.nrows):
        # Going through each row of excel file, looking for input rows:
        if(_INC_NET_INC_ROW_NM in str(ws.cell_value(row,0)).lower()):
            data_df[_INC_NET_INC_COL_NM] = ws.row_values(row+1, start_col)
            data_df[_INC_NET_LOSS_COL_NM] = ws.row_values(row+2, start_col)
            break
        if(_INC_DEPR_ROW_NM in str(ws.cell_value(row,0)).lower()):
            data_df[_INC_DEPR_COL_NM] = ws.row_values(row, start_col)
    # Scaling the data to the correct units:
    data_df = data_df * _INC_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(_INC_IN_CROSS_PATH)
    # Processing the inc/loss data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=data_df,
                    cross_df=pa01cross, df_nm=_INC_DF_NM
                    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint == None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_INC_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_INC_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    
    return data_tree

Example #4

0

Show file

File: pull_soi.py Project: evan-magnusson/dynamic

def load_pa_01_data(data_tree = None, blue_tree = None, blueprint = None):
    # Defining constants:
    pa_01_fctr = 10 ** 3
    #
    #if data_tree == None:
    #    data_tree = naics.generate_tree()
    # Names of the files with the partnership data:
    for i in os.listdir(prt_dir):
        if("pa01.xls" in i):
            pa_01_file = os.path.abspath(prt_dir + "\\" + i)
        elif("pa01_Crosswalk.csv" in i):
            pa_01_cross_file = os.path.abspath(prt_dir + "\\" + i)
    # Inputting data on net income/loss by industry from "**pa01.xls":
    book_01 = xlrd.open_workbook(pa_01_file)
    sheet_01 = book_01.sheet_by_index(0)
    num_rows = sheet_01.nrows
    # The data to be extracted:
    cols_01 = ["Total net income", "Total net loss", "Depreciation"]
    num_cols = sheet_01.ncols
    start_col = naics.search_ws(sheet_01, "All\nindustries", 20)[1]
    data_01 = pd.DataFrame(np.zeros((num_cols-start_col,3)), columns = cols_01)
    # Extracting the data:
    for i in xrange(0, num_rows):
        if("total net income" in str(sheet_01.cell_value(i,0)).lower()):
            data_01["Total net income"] = sheet_01.row_values(i+1,start_col)
            data_01["Total net loss"] = sheet_01.row_values(i+2,start_col)
            break
        if("depreciation" in str(sheet_01.cell_value(i,0)).lower()):
            data_01["Depreciation"] = sheet_01.row_values(i,start_col)
    #
    data_01 = data_01 * pa_01_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa01cross = pd.read_csv(pa_01_cross_file)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree = data_tree, data_df = data_01,
                    cross_df = pa01cross, df_name = "PA_inc_loss"
                    )
    #
    if blueprint == None and "tot_corps" in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["PA_inc_loss"])
    naics.pop_forward(tree=data_tree, df_list=["PA_inc_loss"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree

Example #5

0

Show file

File: pull_soi_partner.py Project: evan-magnusson/dynamic

def load_asset(data_tree=naics.generate_tree(),
             blue_tree=None, blueprint=None,
             from_out=False):
    """ This function loads the soi partnership asset data.
    
    :param data_tree: The NAICS tree to read the data into.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    """
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_file=_AST_OUT_PATH,
                                        tree=data_tree)
        return data_tree
    # Opening data on depreciable fixed assets, inventories, and land:
    wb = xlrd.open_workbook(_AST_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Columns of the asset dataframe:
    df_cols = _AST_DF_DICT.values()
    # Initializing dataframe to hold pertinent asset data:
    ast_df = pd.DataFrame(np.zeros((ws.ncols-1,len(df_cols))), columns=df_cols)
    ''' Extracting the data (note that the rows with total data appear first).
    For each input row:'''
    for in_row_nm in _AST_IN_ROW_NMS:
        # Key corresponding to total asset column:
        df_net_col_key = _AST_IN_ROWS_DF_NET_DICT[in_row_nm]
        # Asset dataframes net income column name:
        df_net_col_nm = _AST_DF_DICT[df_net_col_key]
        # Key corresponding to assets of net income partnerships column:
        df_inc_col_key = _AST_IN_ROWS_DF_INC_DICT[in_row_nm]
        # Asset dataframes total income column name:
        df_inc_col_nm = _AST_DF_DICT[df_inc_col_key]
        in_row_nm = in_row_nm.lower()
        # Finding the first input row with in_row_nm:
        for in_row1 in xrange(0, num_rows):
            in_net_row_nm = str(ws.cell_value(in_row1,0)).lower()
            if(in_row_nm in in_net_row_nm):
                # Total asset data:
                ast_df[df_net_col_nm] = ws.row_values(in_row1, 1)
                # Finding the second input row with in_row_nm:
                for in_row2 in xrange(in_row1+1, num_rows):
                    in_inc_row_nm = str(ws.cell_value(in_row2,0)).lower()
                    if(in_row_nm in in_inc_row_nm):
                        # Asset data for companies with net income:
                        ast_df[df_inc_col_nm] = ws.row_values(in_row2,1)
                        break
                break
    # Scaling the data to the correct units:
    ast_df = ast_df * _AST_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    ast_cross = pd.read_csv(_AST_IN_CROSS_PATH)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=ast_df,
                    cross_df=ast_cross, df_nm=_AST_DF_NM
                    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint == None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_AST_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_AST_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree

Example #6

0

Show file

File: pull_soi.py Project: evan-magnusson/dynamic

def load_pa_03_data(data_tree = None, blue_tree = None, blueprint = None):
    # Defining constants:
    pa_03_fctr = 10 ** 3
    #
    if data_tree == None:
        data_tree = naics.generate_tree()
    #
    for i in os.listdir(prt_dir):
        if("pa03.xls" in i):
            pa_03_file = os.path.abspath(prt_dir + "\\" + i)
        elif("pa03_Crosswalk.csv" in i):
            pa_03_cross_file = os.path.abspath(prt_dir + "\\" + i)
    # Inputting data on depreciable fixed assets, inventories, and land:
    book_03 = xlrd.open_workbook(pa_03_file)
    sheet_03 = book_03.sheet_by_index(0)
    # Finding the relevant details about the table, e.g. dimensions:
    cur_rows = sheet_03.nrows
    # The following categories of data to be extracted:
    cols_03 = ["Depreciable assets (Net)", "Accumulated depreciation (Net)", 
               "Inventories (Net)", "Land (Net)", 
               "Depreciable assets (Income)", 
               "Accumulated depreciation (Income)", "Inventories (Income)",
               "Land (Income)"]
    # The more general column names that are used in the input file:
    gen_03 = ["Depreciable assets", "Accumulated depreciation", 
                    "Inventories", "Land"]
    # The data to be extracted on partnerships as a whole:
    tot_data_03 = [None]*len(gen_03)
    # The data to be extracted on partnerships with income:
    inc_data_03 = [None]*len(gen_03)
    # Extracting the data (note that the rows with total data appear first):
    for i in xrange(0, len(gen_03)):
        for row1 in xrange(0, cur_rows):
            if(gen_03[i].lower() in str(sheet_03.cell_value(row1,0)).lower()):
                tot_data_03[i] = sheet_03.row_values(row1,1)
                for row2 in xrange(row1+1, cur_rows):
                    cur_cell = str(sheet_03.cell_value(row2,0)).lower()
                    if(gen_03[i].lower() in cur_cell):
                        inc_data_03[i] = sheet_03.row_values(row2,1)
                        break
                break
    # Reformatting the data:
    data_03 = pd.concat([pd.DataFrame(tot_data_03).T,
                         pd.DataFrame(inc_data_03).T], axis = 1)
    # Data is in the thousands:
    data_03 = data_03 * pa_03_fctr
    # Reading in the crosswalks between the columns and the NAICS codes:
    pa03cross = pd.read_csv(pa_03_cross_file)
    #
    data_tree = naics.load_data_with_cross(
                    data_tree = data_tree, data_df = data_03,
                    cross_df = pa03cross, data_cols = cols_03,
                    df_name = "PA_assets"
                    )
    #
    if blueprint == None and "tot_corps" in data_tree.enum_inds[0].data.dfs.keys():
        blueprint = "tot_corps"
    naics.pop_back(tree=data_tree, df_list=["PA_assets"])
    naics.pop_forward(tree=data_tree, df_list=["PA_assets"],
                      blueprint=blueprint, blue_tree=blue_tree)
    #
    return data_tree

Example #7

0

Show file

def load_asset(data_tree=naics.generate_tree(),
             blue_tree=None, blueprint=None,
             from_out=False, out_path=None):
    """ This function loads the soi partnership asset data.
    
    :param data_tree: The NAICS tree to read the data into.
    :param blueprint: The key corresponding to a dataframe in a tree to be
           used as a "blueprint" for populating the df_list dataframes forward.
    :param blue_tree: A NAICS tree with the "blueprint" dataframe. The default
           is the original NAICS tree.
    :param from_out: Whether to read in the data from output.
    """
    # Initializing the output path:
    if out_path == None:
        out_path = _AST_OUT_PATH
    # If from_out, load the data tree from output:
    if from_out:
        data_tree = naics.load_tree_dfs(input_path=out_path,
                                        tree=data_tree)
        return data_tree
    # Opening data on depreciable fixed assets, inventories, and land:
    wb = xlrd.open_workbook(_AST_IN_PATH)
    ws = wb.sheet_by_index(0)
    num_rows = ws.nrows
    # Columns of the asset dataframe:
    df_cols = _AST_DF_DICT.values()
    # Initializing dataframe to hold pertinent asset data:
    ast_df = pd.DataFrame(np.zeros((ws.ncols-1,len(df_cols))), columns=df_cols)
    ''' Extracting the data (note that the rows with total data appear first).
    For each input row:'''
    for in_row_nm in _AST_IN_ROW_NMS:
        # Key corresponding to total asset column:
        df_net_col_key = _AST_IN_ROWS_DF_NET_DICT[in_row_nm]
        # Asset dataframes net income column name:
        df_net_col_nm = _AST_DF_DICT[df_net_col_key]
        # Key corresponding to assets of net income partnerships column:
        df_inc_col_key = _AST_IN_ROWS_DF_INC_DICT[in_row_nm]
        # Asset dataframes total income column name:
        df_inc_col_nm = _AST_DF_DICT[df_inc_col_key]
        in_row_nm = in_row_nm.lower()
        # Finding the first input row with in_row_nm:
        for in_row1 in xrange(0, num_rows):
            in_net_row_nm = str(ws.cell_value(in_row1,0)).lower()
            if(in_row_nm in in_net_row_nm):
                # Total asset data:
                ast_df[df_net_col_nm] = ws.row_values(in_row1, 1)
                # Finding the second input row with in_row_nm:
                for in_row2 in xrange(in_row1+1, num_rows):
                    in_inc_row_nm = str(ws.cell_value(in_row2,0)).lower()
                    if(in_row_nm in in_inc_row_nm):
                        # Asset data for companies with net income:
                        ast_df[df_inc_col_nm] = ws.row_values(in_row2,1)
                        break
                break
    # Scaling the data to the correct units:
    ast_df = ast_df * _AST_FILE_FCTR
    # Reading in the crosswalks between the columns and the NAICS codes:
    ast_cross = pd.read_csv(_AST_IN_CROSS_PATH)
    # Processing the asset data into the NAICS tree:
    data_tree = naics.load_data_with_cross(
                    data_tree=data_tree, data_df=ast_df,
                    cross_df=ast_cross, df_nm=_AST_DF_NM
                    )
    # Default blueprint is tot_corps:
    has_tot_df = _TOT_CORP_DF_NM in data_tree.enum_inds[0].data.dfs.keys()
    if blueprint == None and has_tot_df:
        blueprint = _TOT_CORP_DF_NM
    # Populate all levels of specificity in the NAICS tree:
    naics.pop_back(tree=data_tree, df_list=[_AST_DF_NM])
    naics.pop_forward(tree=data_tree, df_list=[_AST_DF_NM],
                      blueprint=blueprint, blue_tree=blue_tree)
    return data_tree