def get_ybp_rcas(ymbp, geo_level):
    """RCA-masked export shares per location (bra_id) and product (hs_id).

    Keeps only yearly-total rows (month == "00"), 6-digit HS products and
    locations whose id length matches geo_level, then multiplies export
    values by the 0/1 RCA mask so only "revealed" products keep a share.
    """
    flat = ymbp.reset_index()
    keep = (
        flat["month"].map(lambda m: m == "00")
        & flat["hs_id"].map(lambda h: len(h) == 6)
        & flat["bra_id"].map(lambda b: len(b) == geo_level)
    )
    matrix = (
        flat[keep][["bra_id", "hs_id", "export_val"]]
        .pivot(index="bra_id", columns="hs_id", values="export_val")
        .fillna(0)
    )

    # Binarize RCA: 1 where a location exports its "fair share", else 0.
    mask = ps_calcs.rca(matrix)
    mask[mask >= 1] = 1
    mask[mask < 1] = 0
    mask = mask.fillna(0)

    # Drop locations with no revealed product at all.
    diversity = mask.sum(axis=1)
    active_bras = diversity[diversity > 0].index
    mask = mask.reindex(active_bras)
    matrix = matrix.reindex(active_bras)

    shares = matrix * mask
    return shares
# Example #2 (score: 0)
def calc_rca(ybc, year):

    rcas = pd.DataFrame()
    for geo_level in [2, 4, 8]:

        print "geo level:", geo_level

        ybc_data = ybc.reset_index()

        bra_criterion = ybc_data["bra_id"].map(lambda x: len(x) == geo_level)
        course_criterion = ybc_data["course_sc_id"].map(lambda x: len(x) == 5)
        ybc_data = ybc_data[bra_criterion & course_criterion]

        ybc_data = ybc_data[["bra_id", "course_sc_id", "students"]]

        ybc_data = ybc_data.pivot(index="bra_id", columns="course_sc_id", values="students")
        ybc_data_rca = ps_calcs.rca(ybc_data)
        ybc_data_rca = pd.DataFrame(ybc_data_rca.stack(), columns=["students_rca"])

        if rcas.empty:
            rcas = ybc_data_rca
        else:
            rcas = pd.concat([rcas, ybc_data_rca])
        rcas = rcas.replace(0, np.nan)
        rcas = rcas.dropna(how="all")

    rcas["year"] = int(year)
    rcas = rcas.set_index("year", append=True)
    rcas = rcas.swaplevel("year", "course_sc_id")
    rcas = rcas.swaplevel("year", "bra_id")
    ybc = ybc.merge(rcas, how="outer", left_index=True, right_index=True)

    return ybc
def get_shares(ymbp, geo_level):
    """Export shares per (bra, hs): export values masked by binary RCA.

    Restricts to yearly totals (month == '00'), 6-digit products and
    bra ids at the requested geo_level.
    """
    data = ymbp.reset_index()
    is_year_total = data['month'].map(lambda x: x == '00')
    is_hs6 = data['hs_id'].map(lambda x: len(x) == 6)
    is_level = data['bra_id'].map(lambda x: len(x) == geo_level)
    data = data[is_year_total & is_hs6 & is_level]

    exports = data[["bra_id", "hs_id", "export_val"]] \
        .pivot(index="bra_id", columns="hs_id", values="export_val") \
        .fillna(0)

    # 0/1 comparative-advantage mask.
    mask = ps_calcs.rca(exports)
    mask[mask >= 1] = 1
    mask[mask < 1] = 0
    mask = mask.fillna(0)

    # Keep only locations with at least one revealed product.
    revealed = mask.sum(axis=1)
    keep = revealed[revealed > 0].index

    return exports.reindex(keep) * mask.reindex(keep)
# Example #4 (score: 0)
def get_ybp_rcas(ymbp, geo_level):
    """Return export values masked by the binary RCA matrix.

    Rows kept: month == '00' (yearly totals), 6-character hs ids, bra ids
    whose length equals geo_level. Locations with zero revealed products
    are dropped before the final multiplication.
    """
    df = ymbp.reset_index()
    df = df[df['month'].map(lambda x: x == '00')
            & df['hs_id'].map(lambda x: len(x) == 6)
            & df['bra_id'].map(lambda x: len(x) == geo_level)]
    df = df[["bra_id", "hs_id", "export_val"]]
    pivoted = df.pivot(index="bra_id", columns="hs_id",
                       values="export_val").fillna(0)

    # Continuous RCA -> 0/1 advantage matrix.
    binary = ps_calcs.rca(pivoted)
    binary[binary >= 1] = 1
    binary[binary < 1] = 0
    binary = binary.fillna(0)

    # Threshold: keep bras with product diversity > 0.
    diversity = binary.sum(axis=1)
    active = diversity[diversity > 0].index
    binary = binary.reindex(active)
    pivoted = pivoted.reindex(active)

    return pivoted * binary
# Example #5 (score: 0)
def main():
    ''' 
        Step 1:
        Import the data file to a pandas DataFrame.
    '''
    try:
        # hs92 is read as str so product codes keep their leading zeros
        oec_df = pd.read_csv("data/year_origin_hs92_4.tsv", \
                                sep="\t", \
                                converters={"hs92":str})
    except IOError:
        sys.exit("File doesn't exist, use fetch_oec_data.sh to download.")
    ''' 
        Step 2:
        Convert our vertically oriented data CPY (country-product-year) into
        the multidimensional Mcp matrix.
        rows     = countries
        columns  = products
    '''
    # Only use most recent year (could loop through each year too...)
    most_recent_year = sorted(oec_df.year.unique())[-1]
    oec_df = oec_df[oec_df.year == most_recent_year]

    # We only care about the country, product and export_val columns
    # so let's drop all the others
    oec_df = oec_df[["origin", "hs92", "export_val"]]

    # Drop all rows without export value
    oec_df = oec_df[~oec_df.export_val.isnull()]

    # Now we pivot our flat file to be countries X products
    mcp = oec_df.pivot(index="origin", columns="hs92", values="export_val")
    ''' 
        Step 3:
        Now this is the easiest part, we use the ps_calcs library to run the
        RCA calculation on the Mcp matrix.
    '''
    # NOTE(review): `rca` is presumably imported at module level from
    # ps_calcs - confirm against the file's imports.
    rcas = rca(mcp)

    # Here are some tests...
    # 1. Print the 10 products New Zealand (nzl) has the highest RCA in.
    # 0204 = Sheep and Goat Meat
    # print rcas.ix['nzl'].order(ascending=False).head(10)

    # Here are some tests...
    # 1. Print the 10 countries with the highest RCA in cars (8703).
    # SVK = Slovakia
    # print rcas['8703'].order(ascending=False).head(10)
    ''' 
        Step 4:
        Lastly, we can convert our nominal RCA values into binary 1s and 0s,
        1 and > meaning that countries exports their fair share of the product
        and 0 meaning they don't.
    '''
    rcas[rcas >= 1] = 1
    rcas[rcas < 1] = 0

    print "Calculation run successfully! Read the source code to see what's going on."
def prox(year, output_path, attr, i_attr, table, column):
    """Write <attr>-space proximity matrices to TSV, one file per year.

    Uses module-level globals: `depths` (id lengths per attribute type),
    `db` (open DB connection) and `get_years`. For each year and each
    attr depth, pulls (attr, i_attr, column) rows from `table`, pivots
    them, binarizes the RCA matrix and writes the stacked
    ps_calcs.proximity() result; successive depths append to one file.
    """

    attr_depths = depths[attr]
    i_attr_depths = depths[i_attr]

    years = get_years(year)

    for year in years:
        print "year:", year

        for i, depth in enumerate(attr_depths):
            print attr, "depth:", depth

            query = """
                SELECT {0}_id, {1}_id, {2}
                FROM {3}
                WHERE year=%s
            """.format(attr, i_attr, column, table)

            # only constrain id length when the attribute has several depths
            if len(attr_depths) > 1:
                query += " and {}_id_len={}".format(attr, depth)

            if len(i_attr_depths) > 1:
                query += " and {}_id_len={}".format(i_attr, i_attr_depths[-1])

            # secex tables carry monthly rows; month=0 is the yearly total
            if "secex" in table:
                query += " and month=0"

            data = sql.read_sql(query, db, params=[year])
            data = data.pivot(index="{}_id".format(i_attr),
                              columns="{}_id".format(attr),
                              values=column)

            rcas = ps_calcs.rca(data)

            # binarize: 1 = revealed comparative advantage
            rcas[rcas >= 1] = 1
            rcas[rcas < 1] = 0

            prox = ps_calcs.proximity(rcas)
            prox = pd.DataFrame(prox.unstack(),
                                columns=["{}_proximity".format(i_attr)])
            prox["year"] = year
            prox = prox.set_index("year", append=True)

            output_path_w_year = os.path.abspath(
                os.path.join(output_path, str(year)))
            if not os.path.exists(output_path_w_year):
                os.makedirs(output_path_w_year)
            fp = os.path.join(output_path_w_year,
                              "{}_{}_proximity.tsv".format(attr, i_attr))

            # first depth overwrites with a header, later depths append
            file_mode = 'a' if i else 'w'
            user_header = False if i else True
            with open(fp, file_mode) as f:
                prox.to_csv(f, header=user_header, sep="\t")
# Example #7 (score: 0)
def calc_rca(ypw):
    """Add a continuous 'rca' column to ypw (zeros replaced by NaN)."""
    wide = ypw.reset_index().pivot(index="wld_id", columns="hs_id",
                                   values="val_usd")
    # Transpose-then-stack yields a (hs_id, wld_id) long-form index that
    # aligns with ypw's own index on assignment.
    long_form = pd.DataFrame(ps_calcs.rca(wide).T.stack(), columns=["rca"])
    long_form = long_form.replace(0, np.nan)

    ypw['rca'] = long_form['rca']
    return ypw
# Example #8 (score: 0)
def get_domestic_rcas(geo_level, year, ybp, depths):
    """Continuous RCA matrix (bra x hs) of val_usd; `year` is unused here.

    Filters ybp to the deepest hs depth and to bra ids at geo_level.
    """
    flat = ybp.reset_index()
    deepest_hs = depths['hs'][-1]
    keep = (flat['hs_id'].map(lambda x: len(x) == deepest_hs)
            & flat['bra_id'].map(lambda x: len(x) == geo_level))

    wide = flat[keep][["bra_id", "hs_id", "val_usd"]] \
        .pivot(index="bra_id", columns="hs_id", values="val_usd") \
        .fillna(0)

    rcas = ps_calcs.rca(wide).fillna(0)
    # Scrub infinities produced by zero denominators.
    rcas[rcas == np.inf] = 0
    return rcas
def calc_rca(ypw):
    """Attach an 'rca' column to the ypw table (zero RCAs become NaN)."""
    flat = ypw.reset_index()
    matrix = flat.pivot(index="wld_id", columns="hs_id", values="val_usd")
    stacked = ps_calcs.rca(matrix).T.stack()
    rca_col = pd.DataFrame(stacked, columns=["rca"])
    rca_col = rca_col.replace(0, np.nan)

    # Index-aligned assignment back onto the original table.
    ypw['rca'] = rca_col['rca']
    return ypw
def prox(year, output_path, attr, i_attr, table, column):
    """Compute and write proximity matrices per year and attr depth.

    Depends on module-level globals `depths`, `db` and `get_years`.
    Results for successive depths are appended to the same TSV file
    (header written only for the first depth).
    """
    
    attr_depths = depths[attr]
    i_attr_depths = depths[i_attr]
    
    years = get_years(year)
    
    for year in years:
        print "year:", year
        
        for i, depth in enumerate(attr_depths):
            print attr, "depth:", depth
            
            query = """
                SELECT {0}_id, {1}_id, {2}
                FROM {3}
                WHERE year=%s
            """.format(attr, i_attr, column, table)
            
            # length filters only matter when an attribute has several depths
            if len(attr_depths) > 1:
                query += " and {}_id_len={}".format(attr, depth)
            
            if len(i_attr_depths) > 1:
                query += " and {}_id_len={}".format(i_attr, i_attr_depths[-1])
            
            # secex tables store monthly rows; month=0 marks the yearly total
            if "secex" in table:
                query += " and month=0"
            
            data = sql.read_sql(query, db, params=[year])
            data = data.pivot(index="{}_id".format(i_attr), columns="{}_id".format(attr), values=column)
        
            rcas = ps_calcs.rca(data)
        
            # binarize the RCA matrix before computing proximity
            rcas[rcas >= 1] = 1
            rcas[rcas < 1] = 0
        
            prox = ps_calcs.proximity(rcas)
            prox = pd.DataFrame(prox.unstack(), columns=["{}_proximity".format(i_attr)])
            prox["year"] = year
            prox = prox.set_index("year", append=True)
        
            output_path_w_year = os.path.abspath(os.path.join(output_path, str(year)))
            if not os.path.exists(output_path_w_year): os.makedirs(output_path_w_year)
            fp = os.path.join(output_path_w_year, "{}_{}_proximity.tsv".format(attr, i_attr))
        
            # append after the first depth so all depths share one file
            file_mode = 'a' if i else 'w'
            user_header = False if i else True
            with open(fp, file_mode) as f:
                prox.to_csv(f, header=user_header, sep="\t")
# Example #11 (score: 0)
def get_wld_proximity(year, ypw_file_path):
    """Build the product-proximity matrix from the world (ypw) export file.

    `year` is unused here; the file at ypw_file_path already holds one year.
    """
    world = pd.read_csv(ypw_file_path, compression="bz2", sep="\t",
                        converters={"hs_id": str})
    world = world.rename(columns={"val_usd": "export_val"})
    matrix = world.pivot(index="wld_id", columns="hs_id",
                         values="export_val").fillna(0)

    # Binary Mcp: 1 where a country has revealed comparative advantage.
    mcp = ps_calcs.rca(matrix)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0

    return ps_calcs.proximity(mcp)
# Example #12 (score: 0)
def get_domestic_rcas(geo_level, year, ybp, depths):
    """RCA matrix (bra x hs) of val_usd; infinities and NaNs zeroed.

    `year` is unused here; ybp is assumed to hold a single year already.
    """
    records = ybp.reset_index()
    hs_len = depths['hs'][-1]

    records = records[records['hs_id'].map(lambda x: len(x) == hs_len)]
    records = records[records['bra_id'].map(lambda x: len(x) == geo_level)]
    records = records[["bra_id", "hs_id", "val_usd"]]
    matrix = records.pivot(index="bra_id", columns="hs_id",
                           values="val_usd").fillna(0)

    result = ps_calcs.rca(matrix).fillna(0)
    result[result == np.inf] = 0  # zero denominators produce inf
    return result
# Example #13 (score: 0)
def get_ybi_rcas(ybi, geo_level):
    """Binary RCA matrix (bra x cnae) based on industry wages."""
    data = ybi.reset_index()
    keep = (data['cnae_id'].str.len() == 6) & \
           (data['bra_id'].str.len() == geo_level)
    data = data[keep][["bra_id", "cnae_id", "wage"]]

    wages = data.pivot(index="bra_id", columns="cnae_id",
                       values="wage").fillna(0)

    # Revealed advantage -> 1, otherwise 0.
    rcas = ps_calcs.rca(wages)
    rcas[rcas >= 1] = 1
    rcas[rcas < 1] = 0
    return rcas
def get_ybp_rcas(ybp, geo_level, depths):
    """Binary RCA matrix (bra x hs) from export values at the deepest hs depth."""
    flat = ybp.reset_index()
    target_len = depths["hs"][-1]
    flat = flat[flat['hs_id'].map(lambda x: len(x) == target_len)
                & flat['bra_id'].map(lambda x: len(x) == geo_level)]

    exports = flat[["bra_id", "hs_id", "val_usd"]].pivot(
        index="bra_id", columns="hs_id", values="val_usd").fillna(0)

    rcas = ps_calcs.rca(exports)
    # Binarize: revealed advantage -> 1, otherwise 0.
    rcas[rcas >= 1] = 1
    rcas[rcas < 1] = 0
    return rcas
# Example #15 (score: 0)
def get_domestic_rcas(geo_level, year, ymbp, trade_flow):
    """Continuous RCA matrix for one trade flow ('export'/'import' -> *_val).

    `year` is unused here; filters to yearly totals (month == "00"),
    6-digit hs ids and bra ids at geo_level.
    """
    value_column = trade_flow + "_val"
    flat = ymbp.reset_index()

    selector = (flat['month'].map(lambda x: x == "00")
                & flat['hs_id'].map(lambda x: len(x) == 6)
                & flat['bra_id'].map(lambda x: len(x) == geo_level))
    flat = flat[selector][["bra_id", "hs_id", value_column]]
    wide = flat.pivot(index="bra_id", columns="hs_id",
                      values=value_column).fillna(0)

    rcas = ps_calcs.rca(wide).fillna(0)
    rcas[rcas == np.inf] = 0  # scrub infinities from zero denominators
    return rcas
def get_ybp_rcas(ybp, geo_level, depths):
    """0/1 RCA matrix (bra_id x hs_id) built from val_usd exports."""
    records = ybp.reset_index()
    hs_len = depths["hs"][-1]
    selected = records[
        records['hs_id'].map(lambda x: len(x) == hs_len)
        & records['bra_id'].map(lambda x: len(x) == geo_level)
    ]

    matrix = selected[["bra_id", "hs_id", "val_usd"]] \
        .pivot(index="bra_id", columns="hs_id", values="val_usd") \
        .fillna(0)

    binary = ps_calcs.rca(matrix)
    binary[binary >= 1] = 1
    binary[binary < 1] = 0
    return binary
# Example #17 (score: 0)
def get_wld_proximity(year, ypw_file_path):
    """Product proximity from the bz2 world export file (`year` unused here)."""
    ypw = pd.read_csv(ypw_file_path,
                      compression="bz2",
                      sep="\t",
                      converters={"hs_id": str})
    ypw = ypw.rename(columns={"val_usd": "export_val"})
    table = ypw.pivot(index="wld_id", columns="hs_id",
                      values="export_val").fillna(0)

    # Continuous RCA -> binary country-product advantage matrix.
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0

    prox = ps_calcs.proximity(mcp)
    return prox
# Example #18 (score: 0)
def get_domestic_rcas(geo_level, year, ymbp, trade_flow):
    """Continuous RCA matrix for a trade flow; `year` is unused here."""
    records = ymbp.reset_index()
    value_col = trade_flow + "_val"

    records = records[records['month'].map(lambda x: x == "00")]
    records = records[records['hs_id'].map(lambda x: len(x) == 6)]
    records = records[records['bra_id'].map(lambda x: len(x) == geo_level)]

    wide = records[["bra_id", "hs_id", value_col]].pivot(
        index="bra_id", columns="hs_id", values=value_col).fillna(0)

    out = ps_calcs.rca(wide).fillna(0)
    # Zero denominators in the RCA produce infinities; zero them out.
    out[out == np.inf] = 0
    return out
# Example #19 (score: 0)
def get_wld_proximity(year, ypw_file_path):
    """Product-proximity matrix built from the world (ypw) export file.

    `year` is unused: the file at ypw_file_path already holds one year.
    Fix: the original opened a MySQL connection here that was never used
    (and never closed); that dead resource acquisition has been removed.
    """
    # Get world values from the ypw file (hs ids kept as strings).
    table = pd.read_csv(ypw_file_path, compression="bz2", sep="\t", converters={"hs_id":str})
    table = table.rename(columns={"val_usd":"export_val"})
    table = table.pivot(index="wld_id", columns="hs_id", values="export_val").fillna(0)

    # Binarize the RCA (Mcp) matrix: 1 = revealed comparative advantage.
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0

    prox = ps_calcs.proximity(mcp)

    return prox
# Example #20 (score: 0)
def calc_rca(ybuc, year):
    """Enrollment RCA per (bra, course), indexed by (bra_id, year, course)."""
    # Collapse the university level, keep only enrollment counts.
    grouped = ybuc.groupby(level=["year", "bra_id", "course_hedu_id"]).sum()
    grouped = grouped[["enrolled"]].reset_index().drop("year", axis=1)

    wide = grouped.pivot(index="bra_id", columns="course_hedu_id",
                         values="enrolled")
    rcas = pd.DataFrame(ps_calcs.rca(wide).stack(),
                        columns=["enrolled_rca"])

    # Zero RCAs are uninformative: treat as missing and drop empty rows.
    rcas = rcas.replace(0, np.nan).dropna(how="all")

    rcas["year"] = int(year)
    rcas = rcas.set_index("year", append=True)
    rcas = rcas.swaplevel("year", "course_hedu_id")
    rcas = rcas.swaplevel("year", "bra_id")

    return rcas
# Example #21 (score: 0)
def get_wld_proximity(year):
    """Build the product-proximity matrix for one year from comtrade_ypw.

    Reads 6-digit hs rows for `year` from the database, binarizes the
    RCA matrix and feeds it to ps_calcs.proximity.
    Fix: removed a no-op rename (val_usd -> val_usd) from the original.
    """
    ''' Connect to DB '''
    db = MySQLdb.connect(host=os.environ["DATAVIVA_DB_HOST"],
                         user=os.environ["DATAVIVA_DB_USER"],
                         passwd=os.environ["DATAVIVA_DB_PW"],
                         db=os.environ["DATAVIVA_DB_NAME"])
    '''Get values from database'''
    # year comes from trusted internal callers; formatted into the query
    q = "select wld_id, hs_id, val_usd " \
        "from comtrade_ypw " \
        "where year = {0} and length(hs_id) = 6".format(year)
    table = sql.read_sql(q, db)
    table = table.pivot(index="wld_id", columns="hs_id", values="val_usd")
    table = table.fillna(0)
    '''Use growth library to run RCA calculation on data'''
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0

    prox = ps_calcs.proximity(mcp)

    return prox
# Example #22 (score: 0)
def get_wld_proximity(year, ypw_file_path):
    """Product-proximity matrix built from the world (ypw) export file.

    `year` is unused: the file at ypw_file_path already holds one year.
    Fix: the original opened a MySQL connection here that was never used
    (and never closed); that dead resource acquisition has been removed.
    """
    # Get world values from the ypw file (hs ids kept as strings).
    table = pd.read_csv(ypw_file_path,
                        compression="bz2",
                        sep="\t",
                        converters={"hs_id": str})
    table = table.rename(columns={"val_usd": "export_val"})
    table = table.pivot(index="wld_id", columns="hs_id",
                        values="export_val").fillna(0)
    # Binarize the RCA (Mcp) matrix: 1 = revealed comparative advantage.
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0

    prox = ps_calcs.proximity(mcp)

    return prox
def calc_complexity(ypw):
    """Economic complexity (via ps_calcs.complexity) on a trimmed ypw table.

    Drops countries exporting <= 200 distinct products or <= $50M total,
    and products present in <= 20 countries, then rebuilds the binary RCA
    matrix and hands it to ps_calcs.complexity (presumably an (eci, pci)
    pair - confirm against callers).
    """
    
    ubiquity_required = 20
    diversity_required = 200
    total_exports_required = 50000000
    
    '''trim country list by diversity'''
    # NOTE(review): value_counts() counts rows regardless of val_usd, so
    # zero-value rows (if any) would inflate diversity - confirm upstream.
    origin_diversity = ypw.reset_index()
    origin_diversity = origin_diversity["wld_id"].value_counts()
    origin_diversity = origin_diversity[origin_diversity > diversity_required]
    
    '''trim country list by total exports'''
    origin_totals = ypw.groupby(level=['wld_id']).sum()
    origin_totals = origin_totals['val_usd']
    origin_totals = origin_totals[origin_totals > total_exports_required]
    
    # countries must pass BOTH the diversity and total-export thresholds
    filtered_origins = set(origin_diversity.index).intersection(set(origin_totals.index))
    
    '''trim product list by ubiquity'''
    product_ubiquity = ypw.reset_index()
    product_ubiquity = product_ubiquity["hs_id"].value_counts()
    product_ubiquity = product_ubiquity[product_ubiquity > ubiquity_required]
    
    filtered_products = set(product_ubiquity.index)
    
    '''re-calculate rcas'''
    # drop everything outside the filtered sets, then rebuild the matrix
    origins_to_drop = set(ypw.index.get_level_values('wld_id')).difference(filtered_origins)
    products_to_drop = set(ypw.index.get_level_values('hs_id')).difference(filtered_products)
    
    ypw = ypw.drop(list(origins_to_drop), axis=0, level='wld_id')
    ypw = ypw.drop(list(products_to_drop), axis=0, level='hs_id')
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)
    
    # binarize before the complexity calculation
    ypw_rca[ypw_rca >= 1] = 1
    ypw_rca[ypw_rca < 1] = 0
    
    return ps_calcs.complexity(ypw_rca)
# Example #24 (score: 0)
def calc_rca(ypw):
    """Continuous RCA matrix (wld_id x hs_id) after trimming sparse data.

    Countries must export more than 200 distinct products and more than
    $50M in total; products must be exported (val_usd > 0) by more than
    20 countries. Rows/columns failing the cut are dropped before the
    RCA is computed. NaNs in the result are returned as 0.
    """

    ubiquity_required = 20
    diversity_required = 200
    total_exports_required = 50000000

    '''trim country list by diversity'''
    origin_diversity = ypw.reset_index()
    origin_diversity = origin_diversity["wld_id"].value_counts()
    origin_diversity = origin_diversity[origin_diversity > diversity_required]

    '''trim country list by total exports'''
    origin_totals = ypw.groupby(level=['wld_id']).sum()
    origin_totals = origin_totals['val_usd']
    origin_totals = origin_totals[origin_totals > total_exports_required]

    # countries must pass BOTH the diversity and total-export thresholds
    filtered_origins = set(origin_diversity.index).intersection(set(origin_totals.index))

    '''trim product list by ubiquity'''
    # only rows with positive value count toward a product's ubiquity
    product_ubiquity = ypw.reset_index()
    product_ubiquity = product_ubiquity[product_ubiquity['val_usd'] > 0]
    product_ubiquity = product_ubiquity["hs_id"].value_counts()
    product_ubiquity = product_ubiquity[product_ubiquity > ubiquity_required]

    filtered_products = set(product_ubiquity.index)

    '''re-calculate rcas'''
    # drop everything outside the filtered sets, then rebuild the matrix
    origins_to_drop = set(ypw.index.get_level_values('wld_id')).difference(filtered_origins)
    products_to_drop = set(ypw.index.get_level_values('hs_id')).difference(filtered_products)

    ypw = ypw.drop(list(origins_to_drop), axis=0, level='wld_id')
    ypw = ypw.drop(list(products_to_drop), axis=0, level='hs_id')

    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)

    return ypw_rca.fillna(0)
# Example #25 (score: 0)
def get_wld_proximity(year):
    """Product-proximity matrix for one year of comtrade_ypw (6-digit hs).

    Fix: removed a no-op rename (val_usd -> val_usd) from the original.
    """
    ''' Connect to DB '''
    db = MySQLdb.connect(host=os.environ["DATAVIVA_DB_HOST"],
                         user=os.environ["DATAVIVA_DB_USER"],
                         passwd=os.environ["DATAVIVA_DB_PW"],
                         db=os.environ["DATAVIVA_DB_NAME"])

    '''Get values from database'''
    # year comes from trusted internal callers; formatted into the query
    q = "select wld_id, hs_id, val_usd " \
        "from comtrade_ypw " \
        "where year = {0} and length(hs_id) = 6".format(year)
    table = sql.read_sql(q, db)
    table = table.pivot(index="wld_id", columns="hs_id", values="val_usd")
    table = table.fillna(0)

    '''Use growth library to run RCA calculation on data'''
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0

    prox = ps_calcs.proximity(mcp)

    return prox
def calc_rca(ybc, year):

    rcas = pd.DataFrame()
    for geo_level in [2, 4, 8]:

        print "geo level:", geo_level

        ybc_data = ybc.reset_index()

        bra_criterion = ybc_data["bra_id"].map(lambda x: len(x) == geo_level)
        course_criterion = ybc_data["course_sc_id"].map(lambda x: len(x) == 5)
        ybc_data = ybc_data[bra_criterion & course_criterion]

        ybc_data = ybc_data[["bra_id", "course_sc_id", "students"]]

        ybc_data = ybc_data.pivot(index="bra_id",
                                  columns="course_sc_id",
                                  values="students")
        ybc_data_rca = ps_calcs.rca(ybc_data)
        ybc_data_rca = pd.DataFrame(ybc_data_rca.stack(),
                                    columns=["students_rca"])

        if rcas.empty:
            rcas = ybc_data_rca
        else:
            rcas = pd.concat([rcas, ybc_data_rca])
        rcas = rcas.replace(0, np.nan)
        rcas = rcas.dropna(how="all")

    rcas["year"] = int(year)
    rcas = rcas.set_index("year", append=True)
    rcas = rcas.swaplevel("year", "course_sc_id")
    rcas = rcas.swaplevel("year", "bra_id")
    ybc = ybc.merge(rcas, how="outer", left_index=True, right_index=True)

    return ybc
# Example #27 (score: 0)
def main(input_file, year, output_dir):
    """Compute RCA, distance, opportunity gain and complexity for one year
    of comtrade data and write ypw/pci/eci TSVs (bz2) to output_dir/<year>.

    The grouped ypw table is cached in an HDF5 store ('yodp.h5') so
    re-runs skip the raw import. NOTE(review): the HDFStore is never
    closed here - confirm whether a store.close() is needed.
    """

    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    store = pd.HDFStore(os.path.join(output_dir, 'yodp.h5'))

    try:
        ypw = store.get('ypw')
    except KeyError:
        '''
        Import file to pandas dataframe
        '''
        comtrade_df = import_file(input_file)
        '''
        Add indexes
        '''
        ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()

        store.put('ypw', ypw)
    '''
    Calculate RCA
    '''
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)

    # keep the continuous RCA for output; binarize a copy for the
    # proximity/distance/opp-gain calculations
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0
    '''
        DISTANCES
    '''
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)
    '''
        COMPLEXITY
    '''
    eci, pci = calc_complexity(ypw)
    '''
        OPP GAIN
    '''
    # restrict to products that survived the complexity trimming (pci.index)
    ypw_opp_gain = ps_calcs.opportunity_gain(
        ypw_rca_binary[pci.index], ypw_prox[pci.index].reindex(pci.index), pci)
    '''
        MERGE DATA
    '''
    # stack each wide matrix into long form; zeros become NaN so the
    # outer merges below don't fabricate values
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(), columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)

    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)

    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)

    new_ypw = ypw \
                .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
                .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
                .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()
    new_ypw["year"] = year
    # move the year column to the front
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]
    '''
    Write out to files
    '''
    new_file_path = os.path.abspath(
        os.path.join(output_dir, "comtrade_ypw.tsv.bz2"))
    new_ypw.to_csv(bz2.BZ2File(new_file_path, 'wb'),
                   sep="\t",
                   index=False,
                   float_format="%.3f")

    new_file_path = os.path.abspath(
        os.path.join(output_dir, "comtrade_pci.tsv.bz2"))
    pd.DataFrame(pci, columns=["pci"]).to_csv(bz2.BZ2File(new_file_path, 'wb'),
                                              sep="\t",
                                              index=True,
                                              float_format="%.3f")

    new_file_path = os.path.abspath(
        os.path.join(output_dir, "comtrade_eci.tsv.bz2"))
    pd.DataFrame(eci, columns=["eci"]).to_csv(bz2.BZ2File(new_file_path, 'wb'),
                                              sep="\t",
                                              index=True,
                                              float_format="%.3f")
# Example #28 (score: 0)
def rdo(ybi, yi, year, depths):

    rca_dist_opp = []
    for geo_level in depths["bra"]:
        print "geo level:", geo_level

        ybi_data = ybi.reset_index()

        bra_criterion = ybi_data["bra_id"].str.len() == geo_level
        cnae_criterion = ybi_data["cnae_id"].str.len() == 6
        ybi_data = ybi_data[bra_criterion & cnae_criterion]

        # ybi_data = ybi_data.reindex(index=ybi_index)
        # ybi_data = ybi_data.drop(["year", "num_emp", "num_est", "wage_avg", "num_emp_est"], axis=1)
        ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]]

        # ybi_data = ybi_data.unstack()
        # levels = ybi_data.columns.levels
        # labels = ybi_data.columns.labels
        # ybi_data.columns = levels[1][labels[1]]
        '''
            RCAS
        '''

        # ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage").fillna(0)
        ybi_data = ybi_data.pivot(index="bra_id",
                                  columns="cnae_id",
                                  values="wage")
        rcas = ps_calcs.rca(ybi_data)

        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0
        '''
            DISTANCES
        '''
        '''calculate proximity for opportunity gain calculation'''
        prox = ps_calcs.proximity(rcas_binary)
        '''calculate distances using proximity'''
        dist = ps_calcs.distance(rcas_binary, prox).fillna(0)
        '''
            OPP GAIN
        '''
        '''calculate product complexity'''
        pci = ps_calcs.complexity(rcas_binary)[1]
        '''calculate opportunity gain'''
        opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci)

        rdo = []
        for bra in rcas.index:
            for cnae in rcas.columns:
                rdo.append([
                    year, bra, cnae, rcas[cnae][bra], dist[cnae][bra],
                    opp_gain[cnae][bra]
                ])

        rca_dist_opp += rdo

    # now time to merge!
    print "merging datasets..."
    ybi_rdo = pd.DataFrame(
        rca_dist_opp,
        columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    ybi_rdo["rca"][ybi_rdo["rca"] == 0] = np.nan
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])

    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))

    all_ybi_indexes = pd.MultiIndex.from_tuples(
        all_ybi_indexes, names=["year", "bra_id", "cnae_id"])
    # ybi = ybi.reindex(index=all_ybi_indexes, fill_value=0)
    ybi = ybi.reindex(index=all_ybi_indexes)
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]

    return ybi
# Example #29 (score: 0)
def rdo(ybi, yi, year, depths):
    
    rca_dist_opp = []
    for geo_level in depths["bra"]:
        print "geo level:", geo_level
        
        ybi_data = ybi.reset_index()
        
        bra_criterion = ybi_data["bra_id"].str.len() == geo_level
        cnae_criterion = ybi_data["cnae_id"].str.len() == 6
        ybi_data = ybi_data[bra_criterion & cnae_criterion]
        
        # ybi_data = ybi_data.reindex(index=ybi_index)
        # ybi_data = ybi_data.drop(["year", "num_emp", "num_est", "wage_avg", "num_emp_est"], axis=1)
        ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]]
    
        # ybi_data = ybi_data.unstack()
        # levels = ybi_data.columns.levels
        # labels = ybi_data.columns.labels
        # ybi_data.columns = levels[1][labels[1]]

        '''
            RCAS
        '''
        
        # ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage").fillna(0)
        ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage")
        rcas = ps_calcs.rca(ybi_data)
    
        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0
    
        '''
            DISTANCES
        '''
    
        '''calculate proximity for opportunity gain calculation'''    
        prox = ps_calcs.proximity(rcas_binary)
        '''calculate distances using proximity'''    
        dist = ps_calcs.distance(rcas_binary, prox).fillna(0)
    
        '''
            OPP GAIN
        '''
    
        '''calculate product complexity'''
        pci = ps_calcs.complexity(rcas_binary)[1]
        '''calculate opportunity gain'''
        opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci)
    
        rdo = []
        for bra in rcas.index:
            for cnae in rcas.columns:
                rdo.append([year, bra, cnae, rcas[cnae][bra], dist[cnae][bra], opp_gain[cnae][bra]])
    
        rca_dist_opp += rdo
    
    # now time to merge!
    print "merging datasets..."
    ybi_rdo = pd.DataFrame(rca_dist_opp, columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    ybi_rdo["rca"][ybi_rdo["rca"] == 0] = np.nan
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])
    
    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))
    
    all_ybi_indexes = pd.MultiIndex.from_tuples(all_ybi_indexes, names=["year", "bra_id", "cnae_id"])
    # ybi = ybi.reindex(index=all_ybi_indexes, fill_value=0)
    ybi = ybi.reindex(index=all_ybi_indexes)
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]
    
    return ybi
def main(input_file, year, output_dir):
    """Build the year-product-world (ypw) Comtrade table for one year.

    Computes RCA, distance and opportunity gain on the product-by-country
    export matrix and writes three bz2-compressed TSVs (ypw, pci, eci)
    into ``<output_dir>/<year>/``.

    Parameters:
        input_file -- raw Comtrade file, read via the project's import_file()
        year       -- year being processed (used for the output dir and column)
        output_dir -- base directory; a per-year subdirectory is created
    """
    # Per-year output directory; created on first run.
    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # HDF5 store used as a cache so the expensive import/groupby only
    # happens once per year.
    # NOTE(review): the store is never closed/flushed explicitly — relies on
    # interpreter exit; confirm this is acceptable.
    store = pd.HDFStore(os.path.join(output_dir,'yodp.h5'))
    
    # EAFP: use the cached table if present, otherwise build and cache it.
    try:
        ypw = store.get('ypw')
    except KeyError:
        '''
        Import file to pandas dataframe
        '''
        comtrade_df = import_file(input_file)
    
        '''
        Add indexes
        '''
        # Aggregate to one row per (product, country) pair.
        ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
        
        store.put('ypw', ypw)
    
    '''
    Calculate RCA
    '''
    # Pivot to a countries x products matrix of export values.
    # Assumes the imported data carries a "val_usd" column — TODO confirm
    # against import_file().
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)
    
    # Binarize: 1 where the country has revealed comparative advantage.
    # NaNs fail both comparisons and therefore stay NaN.
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0
    
    '''
        DISTANCES
    '''
    # Product-product proximity, then each country's distance to products.
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)
    
    '''
        COMPLEXITY
    '''
    # Economic (country) and product complexity indices.
    eci, pci = calc_complexity(ypw)
    
    '''
        OPP GAIN
    '''
    # Align both the RCA matrix columns and the proximity matrix (rows and
    # columns) to the products pci was computed for before combining.
    ypw_opp_gain = ps_calcs.opportunity_gain(ypw_rca_binary[pci.index], ypw_prox[pci.index].reindex(pci.index), pci)
    
    '''
        MERGE DATA
    '''
    # Transpose + stack converts each countries x products matrix into a
    # single column indexed by (hs_id, wld_id), matching ypw's index order.
    # Zeros are treated as "no value" and blanked to NaN.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(), columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)
    
    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)
    
    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)
    
    # Outer merge keeps every (product, country) pair seen in any table.
    new_ypw = ypw \
                .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
                .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
                .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()
    new_ypw["year"] = year
    # Rotate the just-appended "year" column to the front.
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]
    
    
    '''
    Write out to files
    '''
    # bz2-compressed TSVs, floats truncated to 3 decimal places.
    new_file_path = os.path.abspath(os.path.join(output_dir, "comtrade_ypw.tsv.bz2"))
    new_ypw.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=False, float_format="%.3f")
    
    new_file_path = os.path.abspath(os.path.join(output_dir, "comtrade_pci.tsv.bz2"))
    pd.DataFrame(pci, columns=["pci"]).to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f")
    
    new_file_path = os.path.abspath(os.path.join(output_dir, "comtrade_eci.tsv.bz2"))
    pd.DataFrame(eci, columns=["eci"]).to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f")
Example #31
0
def main():
    """Walk through a minimal RCA calculation on the OEC bulk data.

    Loads the year/origin/HS92 export table, restricts it to the most
    recent year, pivots it into a country-by-product (Mcp) matrix, runs
    the RCA calculation, binarizes the result, and prints a sanity check.
    """
    # Step 1: load the raw export table; the HS92 code must stay a string
    # so leading zeros survive.
    try:
        exports = pd.read_csv("data/year_origin_hs92_4.tsv", \
                                sep="\t", \
                                converters={"hs92":str})
    except IOError:
        sys.exit("File doesn't exist, use fetch_oec_data.sh to download.")


    # Step 2: turn the flat CPY (country-product-year) records into the
    # multidimensional Mcp matrix — rows are countries, columns products.
    # Keep only the most recent year present in the file (a loop over all
    # years would work the same way).
    latest_year = exports.year.max()
    exports = exports[exports.year == latest_year]

    # Only country, product and export value matter from here on; rows
    # with a missing export value are dropped.
    exports = exports[["origin", "hs92", "export_val"]]
    exports = exports[exports.export_val.notnull()]

    # Flat records -> countries x products matrix.
    mcp = exports.pivot(index="origin", columns="hs92", values="export_val")

    # Step 3: the easy part — hand the Mcp matrix to ps_calcs.
    rcas = rca(mcp)

    # A couple of spot checks worth trying interactively:
    #   - top 10 products for New Zealand (0204 = Sheep and Goat Meat):
    #       rcas.loc['nzl'].sort_values(ascending=False).head(10)
    #   - top 10 countries for cars, HS 8703 (SVK = Slovakia):
    #       rcas['8703'].sort_values(ascending=False).head(10)

    # Step 4: collapse nominal RCAs to binary — 1 means the country
    # exports at least its "fair share" of the product, 0 means it
    # doesn't. (NaNs fail both comparisons and are left untouched.)
    rcas[rcas < 1] = 0
    rcas[rcas >= 1] = 1

    print("\nThe top 10 HS product codes that Brazil has RCA in:\n")
    print(rcas.loc["bra"].sort_values(ascending=False).head(10))

    print("\nCalculation run successfully! Read the source code to see what's going on.")