Ejemplo n.º 1
0
def main():
    """Load NIPA wages into a fresh NAICS tree, fill the tree, and print it.

    Uses the module-level ``naics_codes_file``, ``data_folder`` and
    ``output_folder`` names. The tree is handed to ``naics.pop_back`` and
    ``naics.pop_forward`` for the wages dataframe only, and finally to
    ``naics.print_tree_dfs`` together with ``output_folder``.
    """
    tree = naics.load_naics(naics_codes_file)
    read_wages.load_nipa_wages_ind(data_folder, tree)
    # The wages dataframe is the only one propagated through the tree.
    wage_dfs = [read_wages.WAGES]
    naics.pop_back(tree, wage_dfs)
    naics.pop_forward(tree, wage_dfs, None, None, None, True)
    naics.print_tree_dfs(tree, output_folder)
Ejemplo n.º 2
0
def calc_depr_rates(asset_tree, inv_tree, land_tree, data_folder):
    """Compute asset-weighted economic depreciation rates for each industry.

    For every industry in a fresh NAICS tree and every ownership type
    ("All", "Corp", "Non-Corp"), the rate is the asset-value-weighted mean of
    the per-asset rates read from "Economic Depreciation Rates.csv". It is
    then scaled by the share of fixed assets in fixed assets + inventories +
    land, because inventories and land are treated as having a zero rate.

    Args:
        asset_tree: tree whose industries hold "All"/"Corp"/"Non-Corp"
            dataframes with one column per asset.
        inv_tree: tree whose industries hold an "Inventories" dataframe.
        land_tree: tree whose industries hold a "Land" dataframe.
        data_folder: folder containing "Depreciation Rates" and
            "2012_NAICS_Codes.csv".

    Returns:
        A new tree where every industry has an "Economic" dataframe
        (one row; columns "All", "Corp", "Non-Corp").
    """
    # The directory with depreciation rates data:
    depr_folder = os.path.abspath(os.path.join(data_folder, "Depreciation Rates"))
    # Opening file containing depreciation rates by asset type:
    depr_econ = pd.read_csv(
        os.path.abspath(os.path.join(depr_folder, "Economic Depreciation Rates.csv"))
    )
    depr_econ = depr_econ.fillna(1)
    econ_assets = depr_econ["Asset"]
    econ_rates = depr_econ["Economic Depreciation Rate"]
    types = ["All", "Corp", "Non-Corp"]
    # Initialize tree for depreciation rates:
    depr_tree = naics.load_naics(os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    for ind in depr_tree.enum_inds:
        ind.data.append(("Economic", pd.DataFrame(np.zeros((1, 3)), columns=types)))
    # Row of each asset name in the rates table. When a name appears more
    # than once the last row wins, as in a full nested scan.
    rate_row = {}
    for row, name in enumerate(econ_assets):
        rate_row[str(name).strip()] = row
    for i in types:
        asset_list = asset_tree.enum_inds[0].data.dfs[i].columns
        asset_list = asset_list.values.tolist()
        # match[k] is the rates-table row of asset column k, or -1 if absent.
        match = [rate_row.get(str(name).strip(), -1) for name in asset_list]
        for j in range(len(depr_tree.enum_inds)):
            cur_sum = 0
            asset_df = asset_tree.enum_inds[j].data.dfs[i]
            depr_df = depr_tree.enum_inds[j].data.dfs["Economic"]
            for k, row in enumerate(match):
                if row == -1:
                    # Report the column index of an asset with no known rate.
                    print(k)
                    continue
                cur_sum += asset_df.iloc[0, k] * econ_rates[row]
            asset_total = sum(asset_df.iloc[0, :])
            # Write with .loc: chained "df[col][0] = v" may hit a temporary
            # copy and leave the tree unchanged.
            if asset_total != 0:
                depr_df.loc[0, i] = cur_sum / asset_total
            else:
                depr_df.loc[0, i] = 0
        # Inventories and land have an approximately zero depreciation rate:
        # NOTE(review): the totals below always use the "All" columns, even
        # when i is "Corp" or "Non-Corp"; calc_tax_depr_rates uses column i.
        # Confirm which is intended.
        for j in range(len(depr_tree.enum_inds)):
            tot_assets = sum(asset_tree.enum_inds[j].data.dfs["All"].iloc[0, :])
            tot_inv = inv_tree.enum_inds[j].data.dfs["Inventories"]["All"][0]
            tot_land = land_tree.enum_inds[j].data.dfs["Land"]["All"][0]
            total = tot_assets + tot_inv + tot_land
            if total == 0:
                continue
            ratio = tot_assets / total
            cur_df = depr_tree.enum_inds[j].data.dfs["Economic"]
            cur_df.loc[0, i] = ratio * cur_df.loc[0, i]
    return depr_tree
Ejemplo n.º 3
0
def get_incs():
    """Build a NAICS tree holding NIPA income data and return it.

    Uses the module-level ``naics_codes_file``, ``data_folder`` and
    ``output_folder`` names. After the ``read_inc`` loaders have run, the
    business, interest and financial income dataframes are passed to
    ``naics.pop_back`` and ``naics.pop_forward``; the tree is then handed to
    ``naics.print_tree_dfs`` and returned.
    """
    tree = naics.load_naics(naics_codes_file)
    # Load the NIPA income and interest tables, then derive business income.
    read_inc.load_nipa_inc_ind(data_folder, tree)
    read_inc.load_nipa_int_ind(data_folder, tree)
    read_inc.calc_bus_inc(tree)
    income_dfs = [read_inc.BUS_INC, read_inc.INT_INC, read_inc.FIN_INC]
    naics.pop_back(tree, income_dfs)
    naics.pop_forward(tree, income_dfs)
    naics.print_tree_dfs(tree, output_folder)
    return tree
Ejemplo n.º 4
0
def test_load_naics(path=None, messages=True):
    """Check that ``naics.load_naics`` keeps every code listed in the input.

    The first column of the CSV at ``path`` is read, each entry is split on
    "-", and the resulting integer codes are compared with the codes stored
    in the "Codes:" dataframes of the loaded tree.

    Args:
        path: CSV file of NAICS codes. Defaults to
            "data/2012_NAICS_Codes.csv" under the current working directory.
        messages: when true, print a one-line pass/fail message.

    Returns:
        None when every code of the file is found in the tree, otherwise the
        number of file codes missing from the tree (int).
    """
    # Default path if none is specified:
    if path is None:
        path = os.path.abspath(
            os.path.join(os.getcwd(), "data", "2012_NAICS_Codes.csv")
        )
    # Using the function being tested to create a tree:
    cur_tree = naics.load_naics(path)
    # Replicating the codes in the input file:
    rep_codes = np.zeros(0)
    for ind in cur_tree.enum_inds:
        rep_codes = np.append(rep_codes, ind.data.dfs["Codes:"].iloc[:, 0])
    # np.unique already returns its result sorted.
    rep_codes = np.unique(rep_codes.astype(int))
    # Codes as written in the file; an entry may hold several joined by "-".
    orig_codes = []
    for entry in pd.read_csv(path).iloc[:, 0]:
        orig_codes.extend(str(entry).split("-"))
    orig_codes = np.unique(np.array(orig_codes).astype(int))
    # Both arrays are duplicate-free, so the set difference counts exactly
    # the file codes that have no counterpart in the tree.
    missing = int(np.setdiff1d(orig_codes, rep_codes).size)
    if missing == 0:
        if messages:
            print("\"load_naics\" passed test 1.")
        return None
    if messages:
        print("\"load_naics\" failed test 1. Mismatches:" + str(missing) + ".")
    return missing
Ejemplo n.º 5
0
def test_load_naics(path = None, messages = True):
    """Check that ``naics.load_naics`` keeps every code listed in the input.

    The first column of the CSV at ``path`` is read, each entry is split on
    "-", and the resulting integer codes are compared with the codes stored
    in the "Codes:" dataframes of the loaded tree.

    Args:
        path: CSV file of NAICS codes. Defaults to
            "data/2012_NAICS_Codes.csv" under the current working directory.
        messages: when true, print a one-line pass/fail message.

    Returns:
        None when every code of the file is found in the tree, otherwise the
        number of file codes missing from the tree (int).
    """
    # Default path if none is specified:
    if path is None:
        path = os.path.abspath(
            os.path.join(os.getcwd(), "data", "2012_NAICS_Codes.csv")
        )
    # Using the function being tested to create a tree:
    cur_tree = naics.load_naics(path)
    # Replicating the codes in the input file:
    rep_codes = np.zeros(0)
    for ind in cur_tree.enum_inds:
        rep_codes = np.append(rep_codes, ind.data.dfs["Codes:"].iloc[:,0])
    # np.unique already returns its result sorted.
    rep_codes = np.unique(rep_codes.astype(int))
    # Codes as written in the file; an entry may hold several joined by "-".
    orig_codes = []
    for entry in pd.read_csv(path).iloc[:,0]:
        orig_codes.extend(str(entry).split("-"))
    orig_codes = np.unique(np.array(orig_codes).astype(int))
    # Both arrays are duplicate-free, so the set difference counts exactly
    # the file codes that have no counterpart in the tree.
    missing = int(np.setdiff1d(orig_codes, rep_codes).size)
    if missing == 0:
        if messages:
            print("\"load_naics\" passed test 1.")
        return None
    if messages:
        print("\"load_naics\" failed test 1. Mismatches:" + str(missing) + ".")
    return missing
Ejemplo n.º 6
0
def read_land(output_tree, data_folder):
    """Create a tree holding the land totals from the financial accounts file.

    Every industry of a fresh NAICS tree gets a one-row "Land" dataframe with
    columns "All", "Corp" and "Non-Corp". Only the first industry
    (``enum_inds[0]``) is filled in, from the first row of
    "Land/Fin_Accounts-Land.csv" multiplied by 10**9, and only when the
    "LAND" dataframe of the first industry of ``output_tree`` sums to a
    non-zero value over the listed sectors.

    Args:
        output_tree: tree whose first industry holds a "LAND" dataframe with
            one column per sector name listed below.
        data_folder: folder containing "Land" and "2012_NAICS_Codes.csv".

    Returns:
        The newly created land tree.
    """
    land_folder = os.path.abspath(os.path.join(data_folder, "Land"))
    land_file = os.path.abspath(os.path.join(land_folder, "Fin_Accounts-Land.csv"))
    land_data = pd.read_csv(land_file)
    # Data is in billions:
    land_data = (10 ** 9) * land_data
    corp_types = ["C Corporations", "Corporate general partners", "Corporate limited partners"]
    non_corp_types = [
        "S Corporations",
        "Individual general partners",
        "Individual limited partners",
        "Partnership general partners",
        "Partnership limited partners",
        "Tax-exempt organization general partners",
        "Tax-exempt organization limited partners",
        "Nominee and other general partners",
        "Nominee and other limited partners",
        "Sole Proprietors",
    ]
    land_tree = naics.load_naics(os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    df_cols = ["All", "Corp", "Non-Corp"]
    for ind in land_tree.enum_inds:
        ind.data.append(("Land", pd.DataFrame(np.zeros((1, len(df_cols))), columns=df_cols)))
    # Land reported for the first industry of output_tree, by sector.
    reported_land = output_tree.enum_inds[0].data.dfs["LAND"]
    corp_sum = 0.0
    non_corp_sum = 0.0
    for sector in corp_types:
        corp_sum += reported_land[sector][0]
    for sector in non_corp_types:
        non_corp_sum += reported_land[sector][0]
    # With no land reported at all, leave the tree at its zero defaults.
    if corp_sum + non_corp_sum == 0:
        return land_tree
    corp_land = land_data["Corporate"][0]
    non_corp_land = land_data["Non-Corporate"][0]
    land_df = land_tree.enum_inds[0].data.dfs["Land"]
    # Write with .loc: chained "df[col][0] = v" may hit a temporary copy and
    # leave the tree unchanged.
    land_df.loc[0, "Corp"] = corp_land
    land_df.loc[0, "Non-Corp"] = non_corp_land
    land_df.loc[0, "All"] = corp_land + non_corp_land
    return land_tree
Ejemplo n.º 7
0
def summary_tree(data_tree, data_folder):
    """Summarize the loaded SOI data into FA, INV and LAND totals per sector.

    A fresh NAICS tree is created in which every industry carries three
    one-row dataframes, "FA", "INV" and "LAND", with one column per sector
    in ``all_sectors``. They are filled from the matching industry of
    ``data_tree``:

    * C and S corporations: copied from the "c_corps" / "s_corps" frames.
    * Partner types (columns of "PA_types"): the "PA_assets" amounts scaled
      by that partner type's share of the "PA_types" row.
    * Sole proprietors: the "PA_assets" amounts scaled by the ratio of
      "soi_prop" depreciation deductions to "PA_inc_loss" depreciation, plus
      "farm_prop" amounts.

    Args:
        data_tree: tree whose industries hold the dataframes named above.
        data_folder: folder containing "2012_NAICS_Codes.csv".

    Returns:
        The newly created summary tree.
    """
    all_sectors = ["C Corporations",
                   "S Corporations",
                   "Corporate general partners",
                   "Corporate limited partners",
                   "Individual general partners",
                   "Individual limited partners",
                   "Partnership general partners",
                   "Partnership limited partners",
                   "Tax-exempt organization general partners",
                   "Tax-exempt organization limited partners",
                   "Nominee and other general partners",
                   "Nominee and other limited partners",
                   "Sole Proprietors"]

    pa_types = data_tree.enum_inds[0].data.dfs["PA_types"].columns
    pa_types = pa_types.values.tolist()
    output_tree = naics.load_naics(data_folder + "\\2012_NAICS_Codes.csv")
    # Every industry starts with zeroed FA, INV and LAND frames.
    for ind in output_tree.enum_inds:
        for df_name in ("FA", "INV", "LAND"):
            ind.append_dfs((df_name,
                            pd.DataFrame(np.zeros((1, len(all_sectors))),
                                         columns=all_sectors)))
    for i in range(0, len(output_tree.enum_inds)):
        cur_dfs = data_tree.enum_inds[i].data.dfs
        out_dfs = output_tree.enum_inds[i].data.dfs
        partner_sum = sum(cur_dfs["PA_types"].iloc[0, :])
        for sector in all_sectors:
            if sector == "C Corporations":
                cur_df = cur_dfs["c_corps"]
                fa = cur_df["Depreciable Assets"][0]
                inv = cur_df["Inventories"][0]
                land = cur_df["Land"][0]
            elif sector == "S Corporations":
                cur_df = cur_dfs["s_corps"]
                fa = cur_df["Depreciable Assets"][0]
                inv = cur_df["Inventories"][0]
                land = cur_df["Land"][0]
            elif sector in pa_types:
                if partner_sum != 0:
                    ratio = abs(cur_dfs["PA_types"][sector][0])/partner_sum
                else:
                    # NOTE(review): shape[0] is the row count, so this is 1.0
                    # for a one-row frame; an equal split across partner
                    # types would need shape[1]. Confirm the intent.
                    ratio = abs(1.0/cur_dfs["PA_types"].shape[0])
                cur_df = cur_dfs["PA_assets"]
                fa = abs(ratio*cur_df["Depreciable assets (Net)"][0])
                inv = abs(ratio*cur_df["Inventories (Net)"][0])
                land = abs(ratio*cur_df["Land (Net)"][0])
            elif sector == "Sole Proprietors":
                if cur_dfs["PA_inc_loss"]["Depreciation"][0] != 0:
                    ratio = abs(cur_dfs["soi_prop"]["Depr Deductions"][0]/
                                cur_dfs["PA_inc_loss"]["Depreciation"][0])
                else:
                    ratio = 0.0
                cur_df = cur_dfs["PA_assets"]
                fa = abs((ratio*cur_df["Depreciable assets (Net)"][0]) +
                         cur_dfs["farm_prop"]["FA"][0])
                # NOTE(review): farm "Land" is added to inventories here and
                # not to LAND below; confirm this is intended.
                inv = abs((ratio*cur_df["Inventories (Net)"][0]) +
                          cur_dfs["farm_prop"]["Land"][0])
                land = abs(ratio*cur_df["Land (Net)"][0])
            else:
                # Sector has no data source in this tree; keep the zeros.
                continue
            # Write with .loc: chained "df[col][0] = v" may hit a temporary
            # copy and leave the tree unchanged.
            out_dfs["FA"].loc[0, sector] = fa
            out_dfs["INV"].loc[0, sector] = inv
            out_dfs["LAND"].loc[0, sector] = land
    return output_tree
Ejemplo n.º 8
0
-------------------------------------------------------------------------------
The main script of the program:
    --Loading the SOI Tax Stats-Corporation Data.
    --Loading the SOI Tax Stats-Partnership Data.
    --Loading tax data for Proprietorships.
    --Creating "output_tree" stating FA, INV, and LAND for various sectors.
-------------------------------------------------------------------------------
'''
# Working directory:
path = os.getcwd()
# Relevant path and file names (joined portably, not with a hard-coded "\\"):
data_folder = os.path.abspath(os.path.join(path, "data"))
output_folder = os.path.abspath(os.path.join(path, "OUTPUT"))

# Create a tree based off NAICS Codes:
data_tree = naics.load_naics(os.path.join(data_folder, "2012_NAICS_Codes.csv"))
# Reading in the SOI Tax Stats-Corporation Data:
naics.load_soi_corporate_data(data_tree, data_folder)
# Reading in the SOI Tax Stats-Partnership Data:
naics.load_soi_partner_data(data_tree, data_folder)
# Reading in the SOI Tax Stats-Proprietorship Data:
naics.load_soi_proprietor_data(data_tree, data_folder)

'''
Many industries are not listed in the SOI datasets. The data for these missing
    industries are interpolated.
'''
# Get a list of the names of all the pd dfs besides the list of codes.
# list(...) gives a removable copy whether keys() returns a list or a view
# (assumes dfs is a plain dict -- confirm against the tree's data class):
cur_names = list(data_tree.enum_inds[0].data.dfs.keys())
cur_names.remove("Codes:")
# Populate missing industry data backwards through the tree:
Ejemplo n.º 9
0
def calc_tax_depr_rates(asset_tree, inv_tree, land_tree, data_folder):
    """Compute asset-weighted tax depreciation rates for each industry.

    Per-asset rates are derived from the recovery periods in
    "BEA_IRS_Crosswalk.csv" (columns "GDS" and "ADS") for each depreciation
    method, using a fixed discount rate of 0.05. For every industry and
    ownership type ("All", "Corp", "Non-Corp") the asset-value-weighted mean
    rate is scaled by the share of fixed assets in fixed assets +
    inventories + land and stored in the returned tree.

    Args:
        asset_tree: tree whose industries hold "All"/"Corp"/"Non-Corp"
            dataframes with one column per asset.
        inv_tree: tree whose industries hold an "Inventories" dataframe.
        land_tree: tree whose industries hold a "Land" dataframe.
        data_folder: folder containing "Depreciation Rates" and
            "2012_NAICS_Codes.csv".

    Returns:
        A new tree where every industry has one dataframe per method
        ("GDS 200%", "GDS 150%", "GDS SL", "ADS SL") plus "Recommended",
        each with one row and columns "All", "Corp", "Non-Corp". Industries
        with no fixed assets keep a rate of 0.
    """
    # The directory with depreciation rates data:
    depr_folder = os.path.abspath(os.path.join(data_folder, "Depreciation Rates"))
    tax_file = os.path.abspath(os.path.join(depr_folder, "BEA_IRS_Crosswalk.csv"))
    tax_data = pd.read_csv(tax_file).fillna(0)
    # Cleaned asset names: non-breaking spaces replaced, whitespace trimmed.
    tax_assets = tax_data["Asset Type"].map(
        lambda name: str(name).replace("\xa0", " ").strip()
    )
    # Discount rate used in the present-value formula below.
    r = 0.05
    # Declining-balance factor of each method, grouped by recovery system.
    tax_gds_mthds = {"GDS 200%": 2.0, "GDS 150%": 1.5, "GDS SL": 1.0}
    tax_ads_mthds = {"ADS SL": 1.0}
    tax_cols = list(tax_gds_mthds.keys()) + list(tax_ads_mthds.keys())
    tax_systems = {"GDS": tax_gds_mthds, "ADS": tax_ads_mthds}
    tax_rates = pd.DataFrame(np.zeros((len(tax_assets), len(tax_cols))), columns=tax_cols)
    tax_rates["Asset"] = tax_assets
    # Compute the tax rates:
    for system in tax_systems:
        tax_yrs = tax_data[system]
        for method in tax_systems[system]:
            tax_b = tax_systems[system][method]
            tax_beta = tax_b / tax_yrs
            tax_star = tax_yrs * (1 - (1 / tax_b))
            tax_z = ((tax_beta / (tax_beta + r)) * (1 - np.exp(-1 * (tax_beta + r) * tax_star))) + (
                (np.exp(-1 * tax_beta * tax_star) / ((tax_yrs - tax_star) * r))
                * (np.exp(-1 * r * tax_star) - np.exp(-1 * r * tax_yrs))
            )
            tax_rates[method] = r / ((1 / tax_z) - 1)
    tax_rates = tax_rates.fillna(0)
    types = ["All", "Corp", "Non-Corp"]
    # Initialize tree for depreciation rates:
    depr_tree = naics.load_naics(os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    for ind in depr_tree.enum_inds:
        for system in tax_systems:
            for method in tax_systems[system]:
                ind.data.append((method, pd.DataFrame(np.zeros((1, 3)), columns=types)))
        ind.data.append(("Recommended", pd.DataFrame(np.zeros((1, 3)), columns=types)))
    # Row of each asset name in the crosswalk. When a name appears more than
    # once the last row wins, as in a full nested scan.
    tax_row = {}
    for row, name in enumerate(tax_assets):
        tax_row[str(name).strip()] = row
    for i in types:
        asset_list = asset_tree.enum_inds[0].data.dfs[i].columns
        asset_list = asset_list.values.tolist()
        # (asset column, crosswalk row) pairs for assets found in the crosswalk.
        matched = []
        for col, name in enumerate(asset_list):
            row = tax_row.get(str(name).strip(), -1)
            if row != -1:
                matched.append((col, row))
        for j in range(len(depr_tree.enum_inds)):
            cur_ind = depr_tree.enum_inds[j]
            asset_df = asset_tree.enum_inds[j].data.dfs[i]
            tot_assets = sum(asset_df.iloc[0, :])
            tot_inv = inv_tree.enum_inds[j].data.dfs["Inventories"][i][0]
            tot_land = land_tree.enum_inds[j].data.dfs["Land"][i][0]
            if tot_assets + tot_inv + tot_land == 0:
                continue
            if tot_assets == 0:
                # Only inventories/land: the weighted mean would be 0/0, so
                # keep the initial rate of 0 instead of storing NaN.
                continue
            ratio = tot_assets / (tot_assets + tot_inv + tot_land)
            for k in tax_cols:
                cur_sum = 0.0
                for col, row in matched:
                    cur_sum += asset_df.iloc[0, col] * tax_rates[k][row]
                # Write with .loc: assigning through a column selection may
                # hit a temporary copy and leave the tree unchanged.
                cur_ind.data.dfs[k].loc[0, i] = ratio * (cur_sum / tot_assets)
            # "Recommended" uses, per asset, the method named in the
            # crosswalk's "Method" column.
            cur_sum = 0
            for col, row in matched:
                cur_rate = tax_rates[tax_data["Method"][row]][row]
                cur_sum += asset_df.iloc[0, col] * cur_rate
            cur_ind.data.dfs["Recommended"].loc[0, i] = ratio * (cur_sum / tot_assets)
    return depr_tree
Ejemplo n.º 10
0
def read_bea(output_tree, data_folder):
    """Read BEA fixed-asset stocks and distribute them over a NAICS tree.

    Steps:
      1. Open "BEA/detailnonres_stk1.xlsx" and locate the industry table of
         its readme sheet.
      2. Build a chart linking worksheet names (BEA codes) to industry names
         and, through "BEA/detailnonres_naics.csv", to NAICS codes.
      3. Read, for every charted industry worksheet, the last-column value
         of each asset listed in "BEA/detailnonres_list.csv".
      4. Spread those amounts over the industries of a fresh NAICS tree
         using the "FA" data of ``output_tree``.

    Args:
        output_tree: tree whose industries hold an "FA" dataframe with one
            column per sector.
        data_folder: folder containing "BEA" and "2012_NAICS_Codes.csv".

    Returns:
        A new tree where every industry has "All", "Corp" and "Non-Corp"
        one-row dataframes with one column per asset, or None when the
        readme sheet cannot be parsed.
    """
    # The directory with BEA data:
    bea_folder = os.path.abspath(os.path.join(data_folder, "BEA"))
    # Opening BEA's excel file on depreciable assets by industry:
    bea_book = xlrd.open_workbook(
        os.path.abspath(os.path.join(bea_folder, "detailnonres_stk1.xlsx"))
    )
    sht_names = bea_book.sheet_names()
    num_shts = bea_book.nsheets
    # Opening "readme" sheet:
    try:
        bea_readme = bea_book.sheet_by_name("readme")
    except xlrd.XLRDError:
        bea_readme = bea_book.sheet_by_index(0)
    # Finding relevant positions in the readme sheet:
    sht_pos = naics.search_ws(bea_readme, "Industry Title", 25, False)
    if sht_pos == [-1, -1]:
        sht_pos = naics.search_ws(bea_readme, "bea code", 25, False, [0, 0], True)
        # Test for failure BEFORE shifting the column: shifting first would
        # turn the failure marker [-1, -1] into [-1, -2] and hide the error.
        if sht_pos == [-1, -1]:
            print("Error in reading BEA fixed asset \"readme\" sheet.")
            return None
        # The industry title sits one column left of the BEA code.
        sht_pos[1] = sht_pos[1] - 1
    first_row = sht_pos[0] + 1
    cur_col = sht_pos[1]
    # BEA codes taken from the worksheet names (the first sheet is skipped):
    bea_codes1 = [str(name) for name in sht_names[1:]]
    # Reading in a list of the assets in the BEA file:
    list_file = os.path.abspath(os.path.join(bea_folder, "detailnonres_list.csv"))
    asset_list = pd.read_csv(list_file)
    for i in range(asset_list.shape[0]):
        asset_list.iloc[i, 0] = asset_list.iloc[i, 0].replace("\xa0", " ").strip()
    # Reading in the corresponding naics codes:
    naics_file = os.path.abspath(os.path.join(bea_folder, "detailnonres_naics.csv"))
    naics_cross = pd.read_csv(naics_file).replace("\xa0", " ")
    naics_inds = naics_cross["Industry"].map(
        lambda name: name.replace("\xa0", " ").strip()
    )
    # Creating a chart cross-referencing industry names, BEA and NAICS codes.
    # It has num_shts-2 rows, so the last worksheet name is not charted.
    chart_cols = ["Industry", "BEA Code", "NAICS Code"]
    bea_chart = pd.DataFrame(np.zeros(shape=(num_shts-2, 3), dtype=object),
                             columns=chart_cols)
    num_naics = naics_cross.shape[0]
    # Filling chart with naics codes that are in both lists and the crosswalk.
    # naics_counter is kept across industries, so each crosswalk search
    # resumes just after the previous hit.
    naics_counter = 0
    for i in range(num_shts-2):
        sheet_code = str(bea_codes1[i])
        for cur_row in range(first_row, bea_readme.nrows):
            readme_code = str(bea_readme.cell_value(cur_row, cur_col+1))
            if sheet_code == readme_code:
                matched_code = readme_code
            elif sheet_code == readme_code[:-2]:
                # They match except one has ".0" at the end.
                matched_code = readme_code[:-2]
            else:
                continue
            bea_ind = str(bea_readme.cell_value(cur_row, cur_col))
            bea_ind = bea_ind.replace('\xa0', ' ').strip()
            # Write with .loc: assigning through a column selection may hit
            # a temporary copy and leave the chart unchanged.
            bea_chart.loc[i, "Industry"] = bea_ind
            bea_chart.loc[i, "BEA Code"] = matched_code
            for _ in range(num_naics):
                naics_counter = (naics_counter+1) % num_naics
                if naics_inds[naics_counter] == bea_ind:
                    bea_chart.loc[i, "NAICS Code"] = naics_cross["NAICS"][naics_counter]
                    break
            break
    # Initializing the table of assets (rows: assets, columns: BEA codes):
    bea_table = pd.DataFrame(np.zeros((asset_list.shape[0],
                                       bea_chart.shape[0])),
                             columns=bea_chart["BEA Code"])
    # For each industry, reading the last-column amount of every listed asset:
    for code in bea_chart["BEA Code"]:
        cur_sht = bea_book.sheet_by_name(code)
        asset_pos = naics.search_ws(cur_sht, "asset codes", 25, False)
        # Clean the asset-name cells once per sheet rather than once per asset.
        sheet_rows = []
        for k in range(asset_pos[0]+2, cur_sht.nrows):
            cur_cell = str(cur_sht.cell_value(k, asset_pos[1]+1))
            sheet_rows.append((k, cur_cell.replace("\xa0", " ").strip()))
        for j in range(len(asset_list)):
            cur_asset = asset_list.iloc[j, 0]
            for k, cur_cell in sheet_rows:
                if cur_asset == cur_cell:
                    bea_table.loc[j, code] = float(
                        cur_sht.cell_value(k, cur_sht.ncols-1)
                    )
    # The table only ever holds floats, so no numeric conversion is needed
    # before filling gaps. The dollar amounts are in millions:
    bea_table = bea_table.fillna(0)
    bea_table = bea_table * 1000000
    # Breaking down by corporate tax status:
    corp_types = ["C Corporations",
                  "Corporate general partners",
                  "Corporate limited partners"]
    non_corp_types = ["S Corporations",
                      "Individual general partners",
                      "Individual limited partners",
                      "Partnership general partners",
                      "Partnership limited partners",
                      "Tax-exempt organization general partners",
                      "Tax-exempt organization limited partners",
                      "Nominee and other general partners",
                      "Nominee and other limited partners",
                      "Sole Proprietors"]
    # Initialize tree for assets data:
    asset_tree = naics.load_naics(os.path.join(data_folder, "2012_NAICS_Codes.csv"))
    num_assets = asset_list.shape[0]
    for ind in asset_tree.enum_inds:
        for df_name in ("All", "Corp", "Non-Corp"):
            ind.data.append((df_name,
                             pd.DataFrame(np.zeros((1, num_assets)),
                                          columns=asset_list.iloc[:, 0])))
    # Fill in data from BEA's fixed asset table. enum_index is kept across
    # iterations, so each search resumes just after the previous hit and
    # wraps around the industry list.
    enum_index = len(output_tree.enum_inds) - 1
    for i in range(bea_table.shape[1]):
        cur_codes = str(bea_chart["NAICS Code"][i]).split(".")
        tot_share = 0
        all_proportions = naics.get_proportions(cur_codes, output_tree,
                                                "FA").iloc[1, :]
        corp_proportions = naics.get_proportions(cur_codes, output_tree, "FA",
                                                 corp_types).iloc[1, :]
        non_corp_proportions = naics.get_proportions(cur_codes, output_tree,
                                                     "FA", non_corp_types).iloc[1, :]
        for code_index in range(len(cur_codes)):
            for j in range(len(asset_tree.enum_inds)):
                enum_index = (enum_index+1) % len(asset_tree.enum_inds)
                out_dfs = output_tree.enum_inds[enum_index].data.dfs
                fa_total = sum(out_dfs["FA"].iloc[0, :])
                # Industries without any reported fixed assets are skipped.
                if fa_total == 0:
                    continue
                # Corporate / non-corporate shares of this industry's "FA".
                all_ratio = 1.0
                corp_ratio = 0.0
                non_corp_ratio = 0.0
                for category in corp_types:
                    corp_ratio += out_dfs["FA"][category][0]/fa_total
                for category in non_corp_types:
                    non_corp_ratio += out_dfs["FA"][category][0]/fa_total
                cur_data = asset_tree.enum_inds[enum_index].data
                ind_codes = cur_data.dfs["Codes:"].iloc[:, 0]
                share = naics.compare_codes(cur_codes, ind_codes)
                tot_share += share
                if share == 0:
                    continue
                for k in range(num_assets):
                    cur_data.dfs["All"].iloc[0, k] = (bea_table.iloc[k, i] *
                                                      all_ratio *
                                                      all_proportions[code_index])
                    cur_data.dfs["Corp"].iloc[0, k] = (bea_table.iloc[k, i] *
                                                       corp_ratio *
                                                       corp_proportions[code_index])
                    cur_data.dfs["Non-Corp"].iloc[0, k] = (bea_table.iloc[k, i] *
                                                           non_corp_ratio *
                                                           non_corp_proportions[code_index])
                # One industry per NAICS code: stop at the first hit.
                break
            if tot_share == 1:
                break
    return asset_tree
Ejemplo n.º 11
0
-------------------------------------------------------------------------------
The main script of the program:
    --Loading the SOI Tax Stats-Corporation Data.
    --Loading the SOI Tax Stats-Partnership Data.
    --Loading tax data for Proprietorships.
    --Creating "output_tree" stating FA, INV, and LAND for various sectors.
-------------------------------------------------------------------------------
'''
# Working directory:
path = os.getcwd()
# Relevant path and file names (joined portably, not with a hard-coded "\\"):
data_folder = os.path.abspath(os.path.join(path, "data"))
output_folder = os.path.abspath(os.path.join(path, "OUTPUT"))

# Create a tree based off NAICS Codes:
data_tree = naics.load_naics(os.path.join(data_folder, "2012_NAICS_Codes.csv"))
# Reading in the SOI Tax Stats-Corporation Data:
naics.load_soi_corporate_data(data_tree, data_folder)
# Reading in the SOI Tax Stats-Partnership Data:
naics.load_soi_partner_data(data_tree, data_folder)
# Reading in the SOI Tax Stats-Proprietorship Data:
naics.load_soi_proprietor_data(data_tree, data_folder)
'''
Many industries are not listed in the SOI datasets. The data for these missing
    industries are interpolated.
'''
# Get a list of the names of all the pd dfs besides the list of codes.
# list(...) gives a removable copy whether keys() returns a list or a view
# (assumes dfs is a plain dict -- confirm against the tree's data class):
cur_names = list(data_tree.enum_inds[0].data.dfs.keys())
cur_names.remove("Codes:")
# Populate missing industry data backwards through the tree:
naics.pop_back(data_tree, cur_names)
Ejemplo n.º 12
0
def read_inventories(output_tree, data_folder):
    """Read inventory amounts and distribute them over a NAICS tree.

    The first sheet of "Inventories/Inventories.xls" is scanned for the rows
    named in "Inventories/Inventories_Crosswalk.csv" (matched in crosswalk
    order on the "List" and "Industry" columns). The last-column value of
    each matched row, multiplied by 10**9, is split over the industries
    returned by ``naics.get_proportions`` and over corporate / non-corporate
    sectors in proportion to the "INV" data of ``output_tree``.

    Args:
        output_tree: tree whose industries hold an "INV" dataframe with one
            column per sector.
        data_folder: folder containing "Inventories" and
            "2012_NAICS_Codes.csv".

    Returns:
        A new tree where every industry has a one-row "Inventories"
        dataframe with columns "All", "Corp" and "Non-Corp".
    """
    # Locate and open the inventory workbook:
    inv_folder = os.path.abspath(data_folder + "\\Inventories")
    workbook_path = os.path.abspath(inv_folder + "\\Inventories.xls")
    inv_book = xlrd.open_workbook(workbook_path)
    sht0 = inv_book.sheet_by_index(0)
    num_rows = sht0.nrows
    last_col = sht0.ncols - 1
    # Find where the data starts; the "line" search is a column cross-check.
    start = naics.search_ws(sht0, 1, 25, True, [0,0], True)
    header = naics.search_ws(sht0, "line", 20)
    if not (start[1] == header[1]):
        print("ERROR")
    # Sectors counted as corporate and as non-corporate:
    corp_types = [
        "C Corporations",
        "Corporate general partners",
        "Corporate limited partners",
    ]
    non_corp_types = [
        "S Corporations",
        "Individual general partners",
        "Individual limited partners",
        "Partnership general partners",
        "Partnership limited partners",
        "Tax-exempt organization general partners",
        "Tax-exempt organization limited partners",
        "Nominee and other general partners",
        "Nominee and other limited partners",
        "Sole Proprietors",
    ]
    # Crosswalk between worksheet rows and NAICS codes:
    crosswalk_path = os.path.abspath(inv_folder + "\\Inventories_Crosswalk.csv")
    inv_cross = pd.read_csv(crosswalk_path)
    num_cross = inv_cross.shape[0]
    # Tree receiving the inventory data, zero-initialized:
    inv_tree = naics.load_naics(data_folder + "\\2012_NAICS_Codes.csv")
    data_cols = ["All", "Corp", "Non-Corp"]
    for ind in inv_tree.enum_inds:
        blank = pd.DataFrame(np.zeros((1, len(data_cols))), columns = data_cols)
        ind.data.append(("Inventories", blank))
    # One amount per crosswalk row; rows never matched stay at zero.
    inv_data = np.zeros(num_cross)
    cross_index = 0
    for row in xrange(start[0], num_rows):
        if cross_index >= num_cross:
            break
        cur_list = str(sht0.cell_value(row, start[1])).strip()
        cur_name = str(sht0.cell_value(row, start[1]+1)).strip()
        if str(cur_list) != str(inv_cross["List"][cross_index]):
            continue
        if str(cur_name) != str(inv_cross["Industry"][cross_index]):
            continue
        # The crosswalk entry is consumed even if its value is unreadable.
        slot = cross_index
        cross_index += 1
        try:
            cur_value = float(sht0.cell_value(row, last_col))
        except ValueError:
            continue
        # Data is in billions:
        inv_data[slot] = (10**9) * cur_value
    # Distribute each crosswalk amount over the matching industries:
    for i in xrange(0, num_cross):
        cur_codes = inv_cross["NAICS"][i].strip().split(".")
        proportions = naics.get_proportions(cur_codes, output_tree, "INV")
        for j in xrange(0, proportions.shape[1]):
            ind_pos = int(proportions.iloc[0,j])
            cur_ind = inv_tree.enum_inds[ind_pos]
            prev_df = output_tree.enum_inds[ind_pos].data.dfs["INV"]
            prev_total = sum(prev_df.iloc[0, :])
            # Without reported inventories there are no weights to split by.
            if prev_total == 0:
                continue
            amount = inv_data[i] * proportions.iloc[1,j]
            cur_dfs = (prev_df/prev_total) * amount
            inv_df = cur_ind.data.dfs["Inventories"]
            inv_df["All"] += sum(cur_dfs.iloc[0,:])
            for sector in corp_types:
                inv_df["Corp"] += cur_dfs[sector][0]
            for sector in non_corp_types:
                inv_df["Non-Corp"] += cur_dfs[sector][0]
    return inv_tree