def get_ybp_rcas(ymbp, geo_level):
    """Return export values masked to (bra, hs) cells where RCA >= 1.

    Despite the name, the return value is the masked export matrix
    ("shares"), not the RCA matrix itself.
    """
    flat = ymbp.reset_index()
    keep = (
        (flat["month"] == "00")
        & (flat["hs_id"].str.len() == 6)
        & (flat["bra_id"].str.len() == geo_level)
    )
    flat = flat.loc[keep, ["bra_id", "hs_id", "export_val"]]
    exports = flat.pivot(index="bra_id", columns="hs_id",
                         values="export_val").fillna(0)

    # Binary RCA mask: 1 where the region exports its "fair share".
    rcas = ps_calcs.rca(exports).fillna(0)
    rcas[rcas < 1] = 0
    rcas[rcas >= 1] = 1

    # Drop regions with no RCA in any product at all.
    diversity = rcas.sum(axis=1)
    active = diversity[diversity > 0].index
    return exports.reindex(active) * rcas.reindex(active)
def calc_rca(ybc, year): rcas = pd.DataFrame() for geo_level in [2, 4, 8]: print "geo level:", geo_level ybc_data = ybc.reset_index() bra_criterion = ybc_data["bra_id"].map(lambda x: len(x) == geo_level) course_criterion = ybc_data["course_sc_id"].map(lambda x: len(x) == 5) ybc_data = ybc_data[bra_criterion & course_criterion] ybc_data = ybc_data[["bra_id", "course_sc_id", "students"]] ybc_data = ybc_data.pivot(index="bra_id", columns="course_sc_id", values="students") ybc_data_rca = ps_calcs.rca(ybc_data) ybc_data_rca = pd.DataFrame(ybc_data_rca.stack(), columns=["students_rca"]) if rcas.empty: rcas = ybc_data_rca else: rcas = pd.concat([rcas, ybc_data_rca]) rcas = rcas.replace(0, np.nan) rcas = rcas.dropna(how="all") rcas["year"] = int(year) rcas = rcas.set_index("year", append=True) rcas = rcas.swaplevel("year", "course_sc_id") rcas = rcas.swaplevel("year", "bra_id") ybc = ybc.merge(rcas, how="outer", left_index=True, right_index=True) return ybc
def get_shares(ymbp, geo_level):
    """Export values kept only where the region has binary RCA >= 1."""
    df = ymbp.reset_index()
    mask = ((df['month'] == '00')
            & (df['hs_id'].map(len) == 6)
            & (df['bra_id'].map(len) == geo_level))
    df = df[mask][["bra_id", "hs_id", "export_val"]]
    exports = df.pivot(index="bra_id", columns="hs_id",
                       values="export_val").fillna(0)

    # Turn continuous RCA into a 0/1 mask (NaN counts as 0).
    rca_mask = ps_calcs.rca(exports).fillna(0)
    rca_mask[rca_mask < 1] = 0
    rca_mask[rca_mask >= 1] = 1

    # Keep only regions with at least one RCA product.
    n_products = rca_mask.sum(axis=1)
    keep_bras = n_products[n_products > 0].index
    return exports.reindex(keep_bras) * rca_mask.reindex(keep_bras)
def get_ybp_rcas(ymbp, geo_level):
    """Return export shares: export_val zeroed where binary RCA is 0.

    Note the name is historical -- the masked export matrix is returned,
    not the raw RCAs.
    """
    data = ymbp.reset_index()
    is_year_total = data['month'].map(lambda m: m == '00')
    is_leaf_hs = data['hs_id'].map(lambda h: len(h) == 6)
    is_level = data['bra_id'].map(lambda b: len(b) == geo_level)
    data = data[is_year_total & is_leaf_hs & is_level]
    exports = (data[["bra_id", "hs_id", "export_val"]]
               .pivot(index="bra_id", columns="hs_id", values="export_val")
               .fillna(0))

    mask = ps_calcs.rca(exports)
    mask[mask >= 1] = 1
    mask[mask < 1] = 0
    mask = mask.fillna(0)

    diversity = mask.sum(axis=1)
    bras = diversity[diversity > 0].index
    return exports.reindex(bras) * mask.reindex(bras)
def main():
    # End-to-end demo: load OEC trade data, build the Mcp matrix, compute
    # RCAs and binarize them.
    '''
    Step 1: Import the data file to a pandas DataFrame.
    '''
    try:
        oec_df = pd.read_csv("data/year_origin_hs92_4.tsv", \
                             sep="\t", \
                             converters={"hs92":str})
    except IOError:
        sys.exit("File doesn't exist, use fetch_oec_data.sh to download.")
    '''
    Step 2: Convert our vertically oriented data CPY (country-product-year)
    into the multidimensional Mcp matrix. rows = countries columns = products
    '''
    # Only use most recent year (could loop through each year too...)
    most_recent_year = sorted(oec_df.year.unique())[-1]
    oec_df = oec_df[oec_df.year == most_recent_year]
    # We only care about the country, product and export_val columns
    # so let's drop all the others
    oec_df = oec_df[["origin", "hs92", "export_val"]]
    # Drop all rows without export value
    oec_df = oec_df[~oec_df.export_val.isnull()]
    # Now we pivot our flat file to be countries X products
    mcp = oec_df.pivot(index="origin", columns="hs92", values="export_val")
    '''
    Step 3: Now this is the easiest part, we use the ps_calcs library to run
    the RCA calculation on the Mcp matrix.
    '''
    rcas = rca(mcp)
    # Here are some tests...
    # 1. Print the 10 products New Zealand (nzl) has the highest RCA in.
    # 0204 = Sheep and Goat Meat
    # print rcas.ix['nzl'].order(ascending=False).head(10)
    # Here are some tests...
    # 1. Print the 10 countries with the highest RCA in cars (8703).
    # SVK = Slovakia
    # print rcas['8703'].order(ascending=False).head(10)
    '''
    Step 4: Lastly, we can convert our nominal RCA values into binary 1s and
    0s, 1 and > meaning that countries exports their fair share of the
    product and 0 meaning they don't.
    '''
    # NaNs are neither >= 1 nor < 1, so they survive the binarization here.
    rcas[rcas >= 1] = 1
    rcas[rcas < 1] = 0
    print "Calculation run successfully! Read the source code to see what's going on."
def prox(year, output_path, attr, i_attr, table, column):
    # For each year and each depth of `attr`, query the DB, compute binary
    # RCAs, derive the proximity matrix between `attr` items, and append
    # the result to a per-year TSV file.
    attr_depths = depths[attr]
    i_attr_depths = depths[i_attr]
    years = get_years(year)
    for year in years:
        print "year:", year
        for i, depth in enumerate(attr_depths):
            print attr, "depth:", depth
            # Identifiers are interpolated (cannot be bound parameters);
            # only the year is a bound parameter.
            query = """ SELECT {0}_id, {1}_id, {2} FROM {3} WHERE year=%s """.format(attr, i_attr, column, table)
            if len(attr_depths) > 1:
                query += " and {}_id_len={}".format(attr, depth)
            if len(i_attr_depths) > 1:
                # Always use the deepest level of the "other" attribute.
                query += " and {}_id_len={}".format(i_attr, i_attr_depths[-1])
            if "secex" in table:
                query += " and month=0"
            data = sql.read_sql(query, db, params=[year])
            data = data.pivot(index="{}_id".format(i_attr), columns="{}_id".format(attr), values=column)
            rcas = ps_calcs.rca(data)
            # Binarize; NaNs pass through both masks untouched.
            rcas[rcas >= 1] = 1
            rcas[rcas < 1] = 0
            prox = ps_calcs.proximity(rcas)
            prox = pd.DataFrame(prox.unstack(), columns=["{}_proximity".format(i_attr)])
            prox["year"] = year
            prox = prox.set_index("year", append=True)
            output_path_w_year = os.path.abspath( os.path.join(output_path, str(year)))
            if not os.path.exists(output_path_w_year):
                os.makedirs(output_path_w_year)
            fp = os.path.join(output_path_w_year, "{}_{}_proximity.tsv".format(attr, i_attr))
            # First depth writes the file with a header; later depths append.
            file_mode = 'a' if i else 'w'
            user_header = False if i else True
            with open(fp, file_mode) as f:
                prox.to_csv(f, header=user_header, sep="\t")
def calc_rca(ypw):
    """Compute RCA per (hs_id, wld_id) pair and attach it as column 'rca'."""
    matrix = ypw.reset_index().pivot(index="wld_id", columns="hs_id",
                                     values="val_usd")
    long_rca = pd.DataFrame(ps_calcs.rca(matrix).T.stack(), columns=["rca"])
    # Treat zero RCA as missing.
    long_rca = long_rca.replace(0, np.nan)
    # Assignment aligns on ypw's index.
    ypw['rca'] = long_rca['rca']
    return ypw
def get_domestic_rcas(geo_level, year, ybp, depths):
    """Continuous (non-binary) bra_id x hs_id RCA matrix from val_usd."""
    df = ybp.reset_index()
    wanted = ((df['hs_id'].str.len() == depths['hs'][-1])
              & (df['bra_id'].str.len() == geo_level))
    df = df.loc[wanted, ["bra_id", "hs_id", "val_usd"]]
    matrix = df.pivot(index="bra_id", columns="hs_id",
                      values="val_usd").fillna(0)
    rcas = ps_calcs.rca(matrix).fillna(0)
    # Zero denominators can produce inf; zero them out as well.
    rcas[rcas == np.inf] = 0
    return rcas
def prox(year, output_path, attr, i_attr, table, column):
    # Build and write attr-to-attr proximity matrices, one TSV per year,
    # appending one section per depth of `attr`.
    attr_depths = depths[attr]
    i_attr_depths = depths[i_attr]
    years = get_years(year)
    for year in years:
        print "year:", year
        for i, depth in enumerate(attr_depths):
            print attr, "depth:", depth
            # Table/column names are formatted in; the year is bound via %s.
            query = """ SELECT {0}_id, {1}_id, {2} FROM {3} WHERE year=%s """.format(attr, i_attr, column, table)
            if len(attr_depths) > 1:
                query += " and {}_id_len={}".format(attr, depth)
            if len(i_attr_depths) > 1:
                # The co-occurring attribute is always taken at its deepest level.
                query += " and {}_id_len={}".format(i_attr, i_attr_depths[-1])
            if "secex" in table:
                query += " and month=0"
            data = sql.read_sql(query, db, params=[year])
            data = data.pivot(index="{}_id".format(i_attr), columns="{}_id".format(attr), values=column)
            rcas = ps_calcs.rca(data)
            # Binary RCA; NaN cells remain NaN.
            rcas[rcas >= 1] = 1
            rcas[rcas < 1] = 0
            prox = ps_calcs.proximity(rcas)
            prox = pd.DataFrame(prox.unstack(), columns=["{}_proximity".format(i_attr)])
            prox["year"] = year
            prox = prox.set_index("year", append=True)
            output_path_w_year = os.path.abspath(os.path.join(output_path, str(year)))
            if not os.path.exists(output_path_w_year):
                os.makedirs(output_path_w_year)
            fp = os.path.join(output_path_w_year, "{}_{}_proximity.tsv".format(attr, i_attr))
            # Write (with header) on the first depth, append on later ones.
            file_mode = 'a' if i else 'w'
            user_header = False if i else True
            with open(fp, file_mode) as f:
                prox.to_csv(f, header=user_header, sep="\t")
def get_wld_proximity(year, ypw_file_path):
    """Build the hs_id x hs_id proximity matrix from a world ypw TSV file."""
    # Load world export values, keeping hs_id as a string code.
    ypw = pd.read_csv(ypw_file_path, compression="bz2", sep="\t",
                      converters={"hs_id": str})
    ypw = ypw.rename(columns={"val_usd": "export_val"})
    mcp = ypw.pivot(index="wld_id", columns="hs_id",
                    values="export_val").fillna(0)
    # Binarize RCAs before computing proximity.
    mcp = ps_calcs.rca(mcp)
    mcp[mcp < 1] = 0
    mcp[mcp >= 1] = 1
    return ps_calcs.proximity(mcp)
def get_ybi_rcas(ybi, geo_level):
    """Binary bra_id x cnae_id RCA matrix computed on wages."""
    df = ybi.reset_index()
    df = df[(df['cnae_id'].str.len() == 6)
            & (df['bra_id'].str.len() == geo_level)]
    wages = (df[["bra_id", "cnae_id", "wage"]]
             .pivot(index="bra_id", columns="cnae_id", values="wage")
             .fillna(0))
    rcas = ps_calcs.rca(wages)
    rcas[rcas < 1] = 0
    rcas[rcas >= 1] = 1
    return rcas
def get_ybp_rcas(ybp, geo_level, depths):
    """Binary bra_id x hs_id RCA matrix from val_usd at the deepest hs level."""
    flat = ybp.reset_index()
    deepest_hs = depths["hs"][-1]
    flat = flat[(flat['hs_id'].str.len() == deepest_hs)
                & (flat['bra_id'].str.len() == geo_level)]
    matrix = (flat[["bra_id", "hs_id", "val_usd"]]
              .pivot(index="bra_id", columns="hs_id", values="val_usd")
              .fillna(0))
    rcas = ps_calcs.rca(matrix)
    rcas[rcas < 1] = 0
    rcas[rcas >= 1] = 1
    return rcas
def get_domestic_rcas(geo_level, year, ymbp, trade_flow):
    """Continuous bra_id x hs_id RCA matrix on the <trade_flow>_val column."""
    value_column = trade_flow + "_val"
    flat = ymbp.reset_index()
    keep = (
        (flat['month'] == "00")  # keep month == "00" rows only
        & (flat['hs_id'].str.len() == 6)
        & (flat['bra_id'].str.len() == geo_level)
    )
    flat = flat.loc[keep, ["bra_id", "hs_id", value_column]]
    matrix = flat.pivot(index="bra_id", columns="hs_id",
                        values=value_column).fillna(0)
    rcas = ps_calcs.rca(matrix).fillna(0)
    # Guard against inf from zero totals.
    rcas[rcas == np.inf] = 0
    return rcas
def get_ybp_rcas(ybp, geo_level, depths):
    """Binary bra x hs RCA matrix computed from val_usd."""
    data = ybp.reset_index()
    hs_ok = data['hs_id'].map(lambda h: len(h) == depths["hs"][-1])
    bra_ok = data['bra_id'].map(lambda b: len(b) == geo_level)
    data = data[hs_ok & bra_ok][["bra_id", "hs_id", "val_usd"]]
    pivoted = data.pivot(index="bra_id", columns="hs_id",
                         values="val_usd").fillna(0)
    out = ps_calcs.rca(pivoted)
    out[out >= 1] = 1
    out[out < 1] = 0
    return out
def get_wld_proximity(year, ypw_file_path):
    """Proximity between HS products from world (wld x hs) export values."""
    raw = pd.read_csv(ypw_file_path, compression="bz2", sep="\t",
                      converters={"hs_id": str})
    exports = (raw.rename(columns={"val_usd": "export_val"})
                  .pivot(index="wld_id", columns="hs_id", values="export_val")
                  .fillna(0))
    mcp = ps_calcs.rca(exports)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    return ps_calcs.proximity(mcp)
def get_domestic_rcas(geo_level, year, ymbp, trade_flow):
    """Continuous RCA matrix (bra_id x hs_id) for one trade-flow value column."""
    col = trade_flow + "_val"
    data = ymbp.reset_index()
    data = data[(data['month'] == "00")
                & (data['hs_id'].map(len) == 6)
                & (data['bra_id'].map(len) == geo_level)]
    pivoted = (data[["bra_id", "hs_id", col]]
               .pivot(index="bra_id", columns="hs_id", values=col)
               .fillna(0))
    result = ps_calcs.rca(pivoted).fillna(0)
    result[result == np.inf] = 0
    return result
def get_wld_proximity(year, ypw_file_path):
    """Return the hs_id x hs_id proximity matrix from a world ypw TSV file.

    Fix: the previous version opened a MySQL connection that was never
    used and never closed (a connection leak); this function only reads
    the ypw file, so the connection has been removed.
    """
    # World export values; hs_id kept as string code.
    table = pd.read_csv(ypw_file_path, compression="bz2", sep="\t",
                        converters={"hs_id": str})
    table = table.rename(columns={"val_usd": "export_val"})
    table = table.pivot(index="wld_id", columns="hs_id",
                        values="export_val").fillna(0)
    # Binarize RCAs, then compute proximity between products.
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    prox = ps_calcs.proximity(mcp)
    return prox
def calc_rca(ybuc, year):
    """Enrollment RCA per (bra_id, course_hedu_id), indexed by
    (year, bra_id, course_hedu_id)."""
    totals = ybuc.groupby(level=["year", "bra_id", "course_hedu_id"]).sum()
    totals = totals[["enrolled"]]
    flat = totals.reset_index().drop("year", axis=1)
    matrix = flat.pivot(index="bra_id", columns="course_hedu_id",
                        values="enrolled")
    rcas = pd.DataFrame(ps_calcs.rca(matrix).stack(),
                        columns=["enrolled_rca"])
    # Zeros carry no signal; drop fully-empty rows.
    rcas = rcas.replace(0, np.nan).dropna(how="all")
    rcas["year"] = int(year)
    rcas = rcas.set_index("year", append=True)
    # Reorder index levels to (year, bra_id, course_hedu_id).
    rcas = rcas.swaplevel("year", "course_hedu_id").swaplevel("year", "bra_id")
    return rcas
def get_wld_proximity(year):
    """Build the hs_id x hs_id proximity matrix from comtrade_ypw for one year.

    Fixes: the year is now bound as a query parameter instead of being
    string-formatted into the SQL (avoids injection/quoting issues); the
    no-op rename of val_usd -> val_usd was removed; the DB connection is
    closed once the query completes.
    """
    ''' Connect to DB '''
    db = MySQLdb.connect(host=os.environ["DATAVIVA_DB_HOST"],
                         user=os.environ["DATAVIVA_DB_USER"],
                         passwd=os.environ["DATAVIVA_DB_PW"],
                         db=os.environ["DATAVIVA_DB_NAME"])
    '''Get values from database'''
    q = "select wld_id, hs_id, val_usd " \
        "from comtrade_ypw " \
        "where year = %s and length(hs_id) = 6"
    try:
        table = sql.read_sql(q, db, params=[year])
    finally:
        db.close()
    table = table.pivot(index="wld_id", columns="hs_id", values="val_usd")
    table = table.fillna(0)
    '''Use growth library to run RCA calculation on data'''
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    prox = ps_calcs.proximity(mcp)
    return prox
def get_wld_proximity(year, ypw_file_path):
    """Product-space proximity (hs_id x hs_id) built from a world ypw file.

    Fix: removed a MySQL connection that was opened here but never used
    and never closed -- this function reads only the TSV file, so the
    connection was pure leakage.
    """
    '''Get world values from ypw file'''
    table = pd.read_csv(ypw_file_path, compression="bz2", sep="\t",
                        converters={"hs_id": str})
    table = table.rename(columns={"val_usd": "export_val"})
    table = table.pivot(index="wld_id", columns="hs_id",
                        values="export_val").fillna(0)
    '''Use growth library to run RCA calculation on data'''
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    prox = ps_calcs.proximity(mcp)
    return prox
def calc_complexity(ypw):
    # Compute economic complexity (ECI, PCI) after trimming sparse
    # countries and rare products. Assumes ypw is indexed by
    # ('wld_id', 'hs_id') with a 'val_usd' column -- TODO confirm callers.
    # Trimming thresholds (row counts / USD).
    ubiquity_required = 20
    diversity_required = 200
    total_exports_required = 50000000
    '''trim country list by diversity'''
    # "Diversity" here = number of rows per country in the panel.
    origin_diversity = ypw.reset_index()
    origin_diversity = origin_diversity["wld_id"].value_counts()
    origin_diversity = origin_diversity[origin_diversity > diversity_required]
    '''trim country list by total exports'''
    origin_totals = ypw.groupby(level=['wld_id']).sum()
    origin_totals = origin_totals['val_usd']
    origin_totals = origin_totals[origin_totals > total_exports_required]
    # Countries must pass BOTH the diversity and total-export filters.
    filtered_origins = set(origin_diversity.index).intersection(set(origin_totals.index))
    '''trim product list by ubiquity'''
    # NOTE(review): unlike the trimming in calc_rca, rows with zero
    # val_usd are NOT excluded before counting ubiquity here -- confirm
    # this difference is intentional.
    product_ubiquity = ypw.reset_index()
    product_ubiquity = product_ubiquity["hs_id"].value_counts()
    product_ubiquity = product_ubiquity[product_ubiquity > ubiquity_required]
    filtered_products = set(product_ubiquity.index)
    '''re-calculate rcas'''
    # Drop everything that failed the filters, then rebuild the RCA matrix.
    origins_to_drop = set(ypw.index.get_level_values('wld_id')).difference(filtered_origins)
    products_to_drop = set(ypw.index.get_level_values('hs_id')).difference(filtered_products)
    ypw = ypw.drop(list(origins_to_drop), axis=0, level='wld_id')
    ypw = ypw.drop(list(products_to_drop), axis=0, level='hs_id')
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)
    # Binarize before the complexity computation.
    ypw_rca[ypw_rca >= 1] = 1
    ypw_rca[ypw_rca < 1] = 0
    return ps_calcs.complexity(ypw_rca)
def calc_rca(ypw):
    # Compute continuous RCAs on a trimmed (wld_id x hs_id) panel.
    # Assumes ypw is indexed by ('wld_id', 'hs_id') with 'val_usd'.
    # Trimming thresholds (row counts / USD).
    ubiquity_required = 20
    diversity_required = 200
    total_exports_required = 50000000
    '''trim country list by diversity'''
    # Diversity = number of panel rows per country.
    origin_diversity = ypw.reset_index()
    origin_diversity = origin_diversity["wld_id"].value_counts()
    origin_diversity = origin_diversity[origin_diversity > diversity_required]
    '''trim country list by total exports'''
    origin_totals = ypw.groupby(level=['wld_id']).sum()
    origin_totals = origin_totals['val_usd']
    origin_totals = origin_totals[origin_totals > total_exports_required]
    # Keep only countries passing both filters.
    filtered_origins = set(origin_diversity.index).intersection(set(origin_totals.index))
    '''trim product list by ubiquity'''
    # Zero-value rows are excluded before counting ubiquity here (unlike
    # the companion calc_complexity trimming).
    product_ubiquity = ypw.reset_index()
    product_ubiquity = product_ubiquity[product_ubiquity['val_usd'] > 0]
    product_ubiquity = product_ubiquity["hs_id"].value_counts()
    product_ubiquity = product_ubiquity[product_ubiquity > ubiquity_required]
    filtered_products = set(product_ubiquity.index)
    '''re-calculate rcas'''
    origins_to_drop = set(ypw.index.get_level_values('wld_id')).difference(filtered_origins)
    products_to_drop = set(ypw.index.get_level_values('hs_id')).difference(filtered_products)
    ypw = ypw.drop(list(origins_to_drop), axis=0, level='wld_id')
    ypw = ypw.drop(list(products_to_drop), axis=0, level='hs_id')
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)
    # Continuous RCAs returned (not binarized); NaNs become 0.
    return ypw_rca.fillna(0)
def get_wld_proximity(year):
    """Proximity matrix between 6-digit HS products from DB world values.

    Fixes: year is bound as a DB-API parameter rather than formatted
    into the SQL string; the no-op val_usd -> val_usd rename was removed;
    the connection is closed after the query.
    """
    ''' Connect to DB '''
    db = MySQLdb.connect(host=os.environ["DATAVIVA_DB_HOST"],
                         user=os.environ["DATAVIVA_DB_USER"],
                         passwd=os.environ["DATAVIVA_DB_PW"],
                         db=os.environ["DATAVIVA_DB_NAME"])
    '''Get values from database'''
    q = "select wld_id, hs_id, val_usd " \
        "from comtrade_ypw " \
        "where year = %s and length(hs_id) = 6"
    try:
        table = sql.read_sql(q, db, params=[year])
    finally:
        db.close()
    table = table.pivot(index="wld_id", columns="hs_id", values="val_usd")
    table = table.fillna(0)
    '''Use growth library to run RCA calculation on data'''
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0
    prox = ps_calcs.proximity(mcp)
    return prox
def main(input_file, year, output_dir):
    # Full comtrade pipeline for one year: load/cache the ypw panel,
    # compute RCA, distance, complexity and opportunity gain, then write
    # bz2-compressed TSVs to <output_dir>/<year>/.
    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # HDF5 store caches the grouped panel so reruns skip the raw import.
    store = pd.HDFStore(os.path.join(output_dir, 'yodp.h5'))
    try:
        ypw = store.get('ypw')
    except KeyError:
        ''' Import file to pandas dataframe '''
        comtrade_df = import_file(input_file)
        ''' Add indexes '''
        ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
        store.put('ypw', ypw)
    ''' Calculate RCA '''
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)
    # Keep continuous RCAs; the binary copy feeds proximity/distance/opp-gain.
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0
    ''' DISTANCES '''
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)
    ''' COMPLEXITY '''
    eci, pci = calc_complexity(ypw)
    ''' OPP GAIN '''
    # Restrict rows/columns to the products that survived complexity trimming.
    ypw_opp_gain = ps_calcs.opportunity_gain(
        ypw_rca_binary[pci.index],
        ypw_prox[pci.index].reindex(pci.index),
        pci)
    ''' MERGE DATA '''
    # Stack each matrix to long (hs_id, wld_id) form; zeros become NaN so
    # they drop out of the sparse merged output.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(), columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)
    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)
    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)
    new_ypw = ypw \
        .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
        .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
        .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()
    new_ypw["year"] = year
    # Move the just-appended year column to the front.
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]
    ''' Write out to files '''
    new_file_path = os.path.abspath(
        os.path.join(output_dir, "comtrade_ypw.tsv.bz2"))
    new_ypw.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=False,
                   float_format="%.3f")
    new_file_path = os.path.abspath(
        os.path.join(output_dir, "comtrade_pci.tsv.bz2"))
    pd.DataFrame(pci, columns=["pci"]).to_csv(bz2.BZ2File(new_file_path, 'wb'),
                                              sep="\t", index=True,
                                              float_format="%.3f")
    new_file_path = os.path.abspath(
        os.path.join(output_dir, "comtrade_eci.tsv.bz2"))
    pd.DataFrame(eci, columns=["eci"]).to_csv(bz2.BZ2File(new_file_path, 'wb'),
                                              sep="\t", index=True,
                                              float_format="%.3f")
def rdo(ybi, yi, year, depths):
    # Compute rca / distance / opportunity-gain per (bra, cnae) at every
    # bra depth, then merge the three columns back onto ybi.
    rca_dist_opp = []
    for geo_level in depths["bra"]:
        print "geo level:", geo_level
        ybi_data = ybi.reset_index()
        bra_criterion = ybi_data["bra_id"].str.len() == geo_level
        cnae_criterion = ybi_data["cnae_id"].str.len() == 6
        ybi_data = ybi_data[bra_criterion & cnae_criterion]
        ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]]
        ''' RCAS '''
        # NB: no fillna(0) here, so missing (bra, cnae) cells stay NaN.
        ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage")
        rcas = ps_calcs.rca(ybi_data)
        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0
        ''' DISTANCES '''
        '''calculate proximity for opportunity gain calculation'''
        prox = ps_calcs.proximity(rcas_binary)
        '''calculate distances using proximity'''
        dist = ps_calcs.distance(rcas_binary, prox).fillna(0)
        ''' OPP GAIN '''
        '''calculate product complexity'''
        pci = ps_calcs.complexity(rcas_binary)[1]
        '''calculate opportunity gain'''
        opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci)
        # Flatten the three matrices into long-form rows for this level.
        rdo = []
        for bra in rcas.index:
            for cnae in rcas.columns:
                rdo.append([
                    year, bra, cnae, rcas[cnae][bra], dist[cnae][bra],
                    opp_gain[cnae][bra]
                ])
        rca_dist_opp += rdo
    # now time to merge!
    print "merging datasets..."
    ybi_rdo = pd.DataFrame(
        rca_dist_opp,
        columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    # Zero RCA means "no activity"; store as NaN.
    ybi_rdo["rca"][ybi_rdo["rca"] == 0] = np.nan
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])
    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))
    all_ybi_indexes = pd.MultiIndex.from_tuples(
        all_ybi_indexes, names=["year", "bra_id", "cnae_id"])
    # Expand ybi to the union index, then align-assign the new columns.
    ybi = ybi.reindex(index=all_ybi_indexes)
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]
    return ybi
def rdo(ybi, yi, year, depths):
    # For every bra depth: build the binary RCA matrix on wages, derive
    # proximity, distance, PCI and opportunity gain, and collect long-form
    # (year, bra, cnae) rows; finally merge those columns onto ybi.
    rca_dist_opp = []
    for geo_level in depths["bra"]:
        print "geo level:", geo_level
        ybi_data = ybi.reset_index()
        bra_criterion = ybi_data["bra_id"].str.len() == geo_level
        cnae_criterion = ybi_data["cnae_id"].str.len() == 6
        ybi_data = ybi_data[bra_criterion & cnae_criterion]
        ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]]
        ''' RCAS '''
        # Pivot without fillna: absent cells remain NaN through the RCA calc.
        ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage")
        rcas = ps_calcs.rca(ybi_data)
        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0
        ''' DISTANCES '''
        '''calculate proximity for opportunity gain calculation'''
        prox = ps_calcs.proximity(rcas_binary)
        '''calculate distances using proximity'''
        dist = ps_calcs.distance(rcas_binary, prox).fillna(0)
        ''' OPP GAIN '''
        '''calculate product complexity'''
        pci = ps_calcs.complexity(rcas_binary)[1]
        '''calculate opportunity gain'''
        opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci)
        rdo = []
        for bra in rcas.index:
            for cnae in rcas.columns:
                rdo.append([year, bra, cnae, rcas[cnae][bra],
                            dist[cnae][bra], opp_gain[cnae][bra]])
        rca_dist_opp += rdo
    # now time to merge!
    print "merging datasets..."
    ybi_rdo = pd.DataFrame(rca_dist_opp,
                           columns=["year", "bra_id", "cnae_id", "rca",
                                    "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    # Treat zero RCA as missing.
    ybi_rdo["rca"][ybi_rdo["rca"] == 0] = np.nan
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])
    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))
    all_ybi_indexes = pd.MultiIndex.from_tuples(all_ybi_indexes,
                                                names=["year", "bra_id", "cnae_id"])
    ybi = ybi.reindex(index=all_ybi_indexes)
    # Index-aligned assignment of the three computed columns.
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]
    return ybi
def main(input_file, year, output_dir):
    # One-year comtrade pipeline: cache ypw in HDF5, compute RCA /
    # distance / complexity / opportunity gain, write bz2 TSV outputs.
    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Cached grouped panel: reruns read from the store instead of re-importing.
    store = pd.HDFStore(os.path.join(output_dir,'yodp.h5'))
    try:
        ypw = store.get('ypw')
    except KeyError:
        ''' Import file to pandas dataframe '''
        comtrade_df = import_file(input_file)
        ''' Add indexes '''
        ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
        store.put('ypw', ypw)
    ''' Calculate RCA '''
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)
    # Binary copy drives proximity/distance/opp-gain; continuous RCA is kept.
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0
    ''' DISTANCES '''
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)
    ''' COMPLEXITY '''
    eci, pci = calc_complexity(ypw)
    ''' OPP GAIN '''
    # Align to the product set that survived complexity trimming.
    ypw_opp_gain = ps_calcs.opportunity_gain(ypw_rca_binary[pci.index],
                                             ypw_prox[pci.index].reindex(pci.index),
                                             pci)
    ''' MERGE DATA '''
    # Stack to long (hs_id, wld_id) form; zeros -> NaN for sparse output.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(), columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)
    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)
    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)
    new_ypw = ypw \
        .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
        .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
        .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()
    new_ypw["year"] = year
    # Rotate the year column to the front of the output.
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]
    ''' Write out to files '''
    new_file_path = os.path.abspath(os.path.join(output_dir, "comtrade_ypw.tsv.bz2"))
    new_ypw.to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=False, float_format="%.3f")
    new_file_path = os.path.abspath(os.path.join(output_dir, "comtrade_pci.tsv.bz2"))
    pd.DataFrame(pci, columns=["pci"]).to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f")
    new_file_path = os.path.abspath(os.path.join(output_dir, "comtrade_eci.tsv.bz2"))
    pd.DataFrame(eci, columns=["eci"]).to_csv(bz2.BZ2File(new_file_path, 'wb'), sep="\t", index=True, float_format="%.3f")
def main():
    # Demo pipeline: load OEC trade data, pivot to the Mcp matrix,
    # compute and binarize RCAs, and print Brazil's top RCA products.
    '''
    Step 1: Import the data file to a pandas DataFrame.
    '''
    try:
        oec_df = pd.read_csv("data/year_origin_hs92_4.tsv", \
                             sep="\t", \
                             converters={"hs92":str})
    except IOError:
        sys.exit("File doesn't exist, use fetch_oec_data.sh to download.")
    '''
    Step 2: Convert our vertically oriented data CPY (country-product-year)
    into the multidimensional Mcp matrix. rows = countries columns = products
    '''
    # Only use most recent year (could loop through each year too...)
    most_recent_year = sorted(oec_df.year.unique())[-1]
    oec_df = oec_df[oec_df.year == most_recent_year]
    # We only care about the country, product and export_val columns
    # so let's drop all the others
    oec_df = oec_df[["origin", "hs92", "export_val"]]
    # Drop all rows without export value
    oec_df = oec_df[~oec_df.export_val.isnull()]
    # Now we pivot our flat file to be countries X products
    mcp = oec_df.pivot(index="origin", columns="hs92", values="export_val")
    '''
    Step 3: Now this is the easiest part, we use the ps_calcs library to run
    the RCA calculation on the Mcp matrix.
    '''
    rcas = rca(mcp)
    # Here are some tests...
    # 1. Print the 10 products New Zealand (nzl) has the highest RCA in.
    # 0204 = Sheep and Goat Meat
    # print rcas.ix['nzl'].order(ascending=False).head(10)
    # Here are some tests...
    # 1. Print the 10 countries with the highest RCA in cars (8703).
    # SVK = Slovakia
    # print rcas['8703'].order(ascending=False).head(10)
    '''
    Step 4: Lastly, we can convert our nominal RCA values into binary 1s and
    0s, 1 and > meaning that countries exports their fair share of the
    product and 0 meaning they don't.
    '''
    # NaNs fall through both masks and remain NaN.
    rcas[rcas >= 1] = 1
    rcas[rcas < 1] = 0
    print("\nThe top 10 HS product codes that Brazil has RCA in:\n")
    print(rcas.loc["bra"].sort_values(ascending=False).head(10))
    print("\nCalculation run successfully! Read the source code to see what's going on.")