def main(input_file, year, output_dir):
    """Compute yearly product/country indicators and write them as TSV files.

    Trade values are summed by ``hs_id``/``wld_id``; the aggregated table is
    cached under the ``ypw`` key of ``<output_dir>/<year>/yodp.h5`` and reused
    on later runs.  RCA, distance, complexity (ECI/PCI) and opportunity gain
    are then computed and written as bzip2-compressed, tab-separated files
    (``comtrade_ypw``, ``comtrade_pci``, ``comtrade_eci``) into
    ``<output_dir>/<year>``.

    Args:
        input_file: Passed to ``import_file`` when no cached ``ypw`` table
            exists in the HDF store.
        year: Names the output sub-directory and fills the leading ``year``
            column of ``comtrade_ypw.tsv.bz2``.
        output_dir: Base directory; the ``<year>`` sub-directory is created
            below it when missing.
    """

    def write_tsv_bz2(frame, file_name, index):
        # Close the compressed stream explicitly so the bzip2 trailer is
        # flushed deterministically instead of whenever the handle is
        # garbage-collected.
        path = os.path.abspath(os.path.join(output_dir, file_name))
        handle = bz2.BZ2File(path, 'wb')
        try:
            frame.to_csv(handle, sep="\t", index=index, float_format="%.3f")
        finally:
            handle.close()

    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Load the cached aggregate, or build and cache it.  The store is always
    # closed afterwards so the HDF5 file handle is not leaked.
    store = pd.HDFStore(os.path.join(output_dir, 'yodp.h5'))
    try:
        try:
            ypw = store.get('ypw')
        except KeyError:
            comtrade_df = import_file(input_file)
            ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
            store.put('ypw', ypw)
    finally:
        store.close()

    # RCA on the country x product matrix of trade values.
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)

    # Binarise: 1 where RCA >= 1, 0 where RCA < 1 (NaN cells stay NaN).
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0

    # Distances.
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)

    # Complexity.
    eci, pci = calc_complexity(ypw)

    # Opportunity gain, restricted to the products that have a PCI.
    ypw_opp_gain = ps_calcs.opportunity_gain(
        ypw_rca_binary[pci.index],
        ypw_prox[pci.index].reindex(pci.index),
        pci)

    # Reshape every matrix into a single-column frame (transposed, then
    # stacked) and blank out zeros before merging.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(), columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)

    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)

    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)

    new_ypw = ypw \
        .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
        .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
        .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()

    # Add the year and move it to the front of the column list.
    new_ypw["year"] = year
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]

    # Write out to files.
    write_tsv_bz2(new_ypw, "comtrade_ypw.tsv.bz2", False)
    write_tsv_bz2(pd.DataFrame(pci, columns=["pci"]),
                  "comtrade_pci.tsv.bz2", True)
    write_tsv_bz2(pd.DataFrame(eci, columns=["eci"]),
                  "comtrade_eci.tsv.bz2", True)
def rdo(ybi, yi, year, depths):
    """Add ``rca``, ``distance`` and ``opp_gain`` columns to ``ybi``.

    For every location depth in ``depths["bra"]`` the ``wage`` values of the
    rows whose ``bra_id`` has that length and whose ``cnae_id`` has length 6
    are pivoted into a ``bra_id`` x ``cnae_id`` matrix, from which RCA,
    proximity, distance, complexity and opportunity gain are computed with
    ``ps_calcs``.

    Args:
        ybi: DataFrame that provides ``bra_id``, ``cnae_id`` and ``wage``
            after ``reset_index()``; its index is combined with the keys
            (year, bra_id, cnae_id) of the computed rows.
        yi: Not used by this function.
        year: Value written into the ``year`` index level of computed rows.
        depths: Mapping whose ``"bra"`` entry lists the ``bra_id`` lengths
            to process.

    Returns:
        ``ybi`` reindexed to the union of its own index and the computed
        keys, with the three new columns assigned.
    """
    rca_dist_opp = []
    for geo_level in depths["bra"]:
        # Single pre-formatted argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("geo level: {0}".format(geo_level))

        ybi_data = ybi.reset_index()
        bra_criterion = ybi_data["bra_id"].str.len() == geo_level
        cnae_criterion = ybi_data["cnae_id"].str.len() == 6
        ybi_data = ybi_data[bra_criterion & cnae_criterion]
        ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]]

        # RCAs on the location x industry wage matrix.
        ybi_data = ybi_data.pivot(
            index="bra_id", columns="cnae_id", values="wage")
        rcas = ps_calcs.rca(ybi_data)

        # Binarise: 1 where RCA >= 1, 0 where RCA < 1 (NaN cells stay NaN).
        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0

        # Proximity feeds both the distance and the opportunity gain.
        prox = ps_calcs.proximity(rcas_binary)
        dist = ps_calcs.distance(rcas_binary, prox).fillna(0)

        # Second element of the complexity result is used as the PCI.
        pci = ps_calcs.complexity(rcas_binary)[1]
        opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci)

        for bra in rcas.index:
            for cnae in rcas.columns:
                rca_dist_opp.append([
                    year, bra, cnae,
                    rcas[cnae][bra], dist[cnae][bra], opp_gain[cnae][bra]
                ])

    # now time to merge!
    print("merging datasets...")
    ybi_rdo = pd.DataFrame(
        rca_dist_opp,
        columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    # Blank out zero RCAs.  Column-level replace() avoids chained assignment
    # (frame["rca"][mask] = ...), which may write to a temporary copy.
    ybi_rdo["rca"] = ybi_rdo["rca"].replace(0, np.nan)
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])

    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))
    all_ybi_indexes = pd.MultiIndex.from_tuples(
        all_ybi_indexes, names=["year", "bra_id", "cnae_id"])
    ybi = ybi.reindex(index=all_ybi_indexes)

    # Column assignment aligns on the shared index.
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]

    return ybi
def rdo(ybp, yp, year, depths):
    """Merge domestic and world RCA, distance and opportunity gain into ``ybp``.

    Args:
        ybp: DataFrame passed to ``get_domestic_rcas`` / ``get_wld_rcas`` and
            outer-merged (on index) with the computed rows.
        yp: DataFrame with a ``val_usd`` column and an ``hs_id`` index level;
            also passed to ``get_pcis``.
        year: Value written into the ``year`` index level of computed rows.
        depths: Mapping; ``depths["bra"]`` lists the geo levels to process and
            ``depths["hs"][-1]`` is the ``hs_id`` length kept as products.

    Returns:
        ``ybp`` outer-merged with a frame indexed by (year, bra_id, hs_id)
        holding ``rca``, ``rca_wld``, ``distance``, ``distance_wld``,
        ``opp_gain`` and ``opp_gain_wld``.
    """

    def tryto(df, col, ind):
        # Cell lookup that yields None when the row or column is absent.
        if col in df.columns and ind in df.index:
            return df[col][ind]
        return None

    hs = yp[["val_usd"]].groupby(level=["hs_id"]).sum().dropna()
    hs = [h for h in hs.index if len(h) == depths["hs"][-1]]

    rca_dist_opp = []
    for geo_level in depths["bra"]:
        # Single pre-formatted argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("geo_level {0}".format(geo_level))

        # RCAs, restricted to the selected products.
        rcas_dom = get_domestic_rcas(geo_level, year, ybp, depths)
        rcas_dom = rcas_dom.reindex(columns=hs)

        rcas_wld = get_wld_rcas(geo_level, year, ybp, depths)
        rcas_wld = rcas_wld.reindex(columns=hs)

        # Binarise: 1 where RCA >= 1, 0 where RCA < 1 (NaN cells stay NaN).
        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0

        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0

        # Domestic distances.
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        # World distances: keep only products present in both matrices.
        # sorted() gives reindex() a deterministic label order instead of an
        # arbitrary set iteration order.
        prox_wld = get_wld_proximity(year)
        hs_wld = sorted(
            set(rcas_wld_binary.columns).intersection(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)
        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        # Opportunity gain: same PCIs for all since we are using world PCIs.
        pcis = get_pcis(geo_level, yp, depths)

        all_hs_dom = sorted(set(pcis.index).intersection(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        all_hs_wld = sorted(set(pcis.index).intersection(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        opp_gain_wld = ps_calcs.opportunity_gain(
            rcas_wld_binary, prox_wld, pcis_wld)
        # Use the PCIs aligned with the domestic matrices.  Both RCA frames
        # are reindexed to the same product list above, so this selects the
        # same values the world-aligned series did.
        opp_gain_dom = ps_calcs.opportunity_gain(
            rcas_dom_binary, prox_dom, pcis_dom)

        # Set zero RCAs to null.
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)

        for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
            for h in hs:
                rca_dist_opp.append([
                    year, bra, h,
                    tryto(rcas_dom, h, bra), tryto(rcas_wld, h, bra),
                    tryto(dist_dom, h, bra), tryto(dist_wld, h, bra),
                    tryto(opp_gain_dom, h, bra), tryto(opp_gain_wld, h, bra)
                ])

    # now time to merge!
    ybp_rdo = pd.DataFrame(rca_dist_opp, columns=[
        "year", "bra_id", "hs_id", "rca", "rca_wld", "distance",
        "distance_wld", "opp_gain", "opp_gain_wld"
    ])
    ybp_rdo["year"] = ybp_rdo["year"].astype("int")
    ybp_rdo = ybp_rdo.set_index(["year", "bra_id", "hs_id"])

    ybp = pd.merge(ybp, ybp_rdo, how="outer",
                   left_index=True, right_index=True)

    return ybp
def rdo(ymbp, ymp, year, geo_depths, ypw_file_path):
    """Merge export/import RCA, distance and opportunity gain into ``ymbp``.

    Args:
        ymbp: DataFrame passed to ``get_domestic_rcas`` / ``get_wld_rcas`` and
            outer-merged (on index) with the computed rows.
        ymp: DataFrame with ``export_val`` and ``import_val`` columns and an
            ``hs_id`` index level; also passed to ``get_pcis``.
        year: Value written into the ``year`` index level of computed rows.
        geo_depths: Iterable of geo levels to process.
        ypw_file_path: Forwarded to ``get_wld_rcas`` and
            ``get_wld_proximity``.

    Returns:
        ``ymbp`` outer-merged with a frame indexed by
        (year, month, bra_id, hs_id), where month is always ``"00"``, holding
        ``rca``, ``rca_wld``, ``rcd``, ``distance``, ``distance_wld``,
        ``opp_gain`` and ``opp_gain_wld``.
    """

    def tryto(df, col, ind):
        # Cell lookup that yields None when the row or column is absent.
        if col in df.columns and ind in df.index:
            return df[col][ind]
        return None

    # Six-character product ids that have export / import values.
    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]

    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]

    # Loop-invariant: every location is paired with every traded product.
    all_hs = set(export_hs).union(set(import_hs))

    rca_dist_opp = []
    for geo_level in geo_depths:
        # Single pre-formatted argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("geo_level {0}".format(geo_level))

        # RCAs (exports) and the import-side equivalent ("rcd").
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)

        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)

        rcas_wld = get_wld_rcas(geo_level, year, ymbp, ypw_file_path)
        rcas_wld = rcas_wld.reindex(columns=export_hs)

        # Binarise: 1 where RCA >= 1, 0 where RCA < 1 (NaN cells stay NaN).
        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0

        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0

        # Domestic distances.
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        # World distances: keep only products present in both matrices.
        # sorted() gives reindex() a deterministic label order instead of an
        # arbitrary set iteration order.
        prox_wld = get_wld_proximity(year, ypw_file_path)
        hs_wld = sorted(
            set(rcas_wld_binary.columns).intersection(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)
        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        # Opportunity gain: same PCIs for all since we are using world PCIs.
        pcis = get_pcis(geo_level, ymp)

        all_hs_dom = sorted(set(pcis.index).intersection(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        all_hs_wld = sorted(set(pcis.index).intersection(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        opp_gain_wld = ps_calcs.opportunity_gain(
            rcas_wld_binary, prox_wld, pcis_wld)
        # Use the PCIs aligned with the domestic matrices.  Both RCA frames
        # are reindexed to the same export product list above, so this
        # selects the same values the world-aligned series did.
        opp_gain_dom = ps_calcs.opportunity_gain(
            rcas_dom_binary, prox_dom, pcis_dom)

        # Set zero RCAs to null.
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
            for hs in all_hs:
                rca_dist_opp.append([
                    year, bra, hs,
                    tryto(rcas_dom, hs, bra), tryto(rcas_wld, hs, bra),
                    tryto(rcd, hs, bra),
                    tryto(dist_dom, hs, bra), tryto(dist_wld, hs, bra),
                    tryto(opp_gain_dom, hs, bra), tryto(opp_gain_wld, hs, bra)
                ])

    # now time to merge!
    ybp_rdo = pd.DataFrame(rca_dist_opp, columns=[
        "year", "bra_id", "hs_id", "rca", "rca_wld", "rcd", "distance",
        "distance_wld", "opp_gain", "opp_gain_wld"
    ])
    ybp_rdo["year"] = ybp_rdo["year"].astype("int")
    ybp_rdo["month"] = "00"
    ybp_rdo = ybp_rdo.set_index(["year", "month", "bra_id", "hs_id"])

    ymbp = pd.merge(ymbp, ybp_rdo, how="outer",
                    left_index=True, right_index=True)

    return ymbp
def main(input_file, year, output_dir):
    """Compute yearly product/country indicators and write them as TSV files.

    Trade values are summed by ``hs_id``/``wld_id``; the aggregated table is
    cached under the ``ypw`` key of ``<output_dir>/<year>/yodp.h5`` and reused
    on later runs.  RCA, distance, complexity (ECI/PCI) and opportunity gain
    are then computed and written as bzip2-compressed, tab-separated files
    (``comtrade_ypw``, ``comtrade_pci``, ``comtrade_eci``) into
    ``<output_dir>/<year>``.

    Args:
        input_file: Passed to ``import_file`` when no cached ``ypw`` table
            exists in the HDF store.
        year: Names the output sub-directory and fills the leading ``year``
            column of ``comtrade_ypw.tsv.bz2``.
        output_dir: Base directory; the ``<year>`` sub-directory is created
            below it when missing.
    """

    def write_tsv_bz2(frame, file_name, index):
        # Close the compressed stream explicitly so the bzip2 trailer is
        # flushed deterministically instead of whenever the handle is
        # garbage-collected.
        path = os.path.abspath(os.path.join(output_dir, file_name))
        handle = bz2.BZ2File(path, 'wb')
        try:
            frame.to_csv(handle, sep="\t", index=index, float_format="%.3f")
        finally:
            handle.close()

    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Load the cached aggregate, or build and cache it.  The store is always
    # closed afterwards so the HDF5 file handle is not leaked.
    store = pd.HDFStore(os.path.join(output_dir, 'yodp.h5'))
    try:
        try:
            ypw = store.get('ypw')
        except KeyError:
            comtrade_df = import_file(input_file)
            ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
            store.put('ypw', ypw)
    finally:
        store.close()

    # RCA on the country x product matrix of trade values.
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)

    # Binarise: 1 where RCA >= 1, 0 where RCA < 1 (NaN cells stay NaN).
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0

    # Distances.
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)

    # Complexity.
    eci, pci = calc_complexity(ypw)

    # Opportunity gain, restricted to the products that have a PCI.
    ypw_opp_gain = ps_calcs.opportunity_gain(
        ypw_rca_binary[pci.index],
        ypw_prox[pci.index].reindex(pci.index),
        pci)

    # Reshape every matrix into a single-column frame (transposed, then
    # stacked) and blank out zeros before merging.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(), columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)

    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)

    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)

    new_ypw = ypw \
        .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
        .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
        .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()

    # Add the year and move it to the front of the column list.
    new_ypw["year"] = year
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]

    # Write out to files.
    write_tsv_bz2(new_ypw, "comtrade_ypw.tsv.bz2", False)
    write_tsv_bz2(pd.DataFrame(pci, columns=["pci"]),
                  "comtrade_pci.tsv.bz2", True)
    write_tsv_bz2(pd.DataFrame(eci, columns=["eci"]),
                  "comtrade_eci.tsv.bz2", True)
def rdo(ymbp, ymp, year, geo_depths):
    """Compute world RCA/distance/opportunity gain and write them to MySQL.

    For every geo level the world-based ``rca_wld``, ``opp_gain_wld`` and
    ``distance_wld`` values are written with one ``UPDATE secex_ymbp``
    statement per (bra_id, hs_id) pair, for the given ``year`` and
    ``month=0``.  Connection settings are read from the ``DATAVIVA_DB_HOST``
    (default ``localhost``), ``DATAVIVA_DB_USER``, ``DATAVIVA_DB_PW`` and
    ``DATAVIVA_DB_NAME`` environment variables.

    Args:
        ymbp: DataFrame passed to ``get_domestic_rcas`` / ``get_wld_rcas``.
        ymp: DataFrame with ``export_val`` and ``import_val`` columns and an
            ``hs_id`` index level; also passed to ``get_pcis``.
        year: Year used in the ``WHERE`` clause and passed to the helpers.
        geo_depths: Iterable of geo levels to process.

    Returns:
        None.  Results are only written to the database.

    Raises:
        KeyError: If one of the required ``DATAVIVA_DB_*`` environment
            variables is missing.
    """

    def tryto(df, col, ind):
        # Cell lookup that yields None when the row or column is absent.
        if col in df.columns and ind in df.index:
            return df[col][ind]
        return None

    def sql_value(value):
        # The zero RCAs are deliberately turned into NaN below ("set RCAs to
        # null"); MySQL cannot store NaN, so send None, which the driver
        # transmits as SQL NULL.
        if isinstance(value, float) and np.isnan(value):
            return None
        return value

    # Six-character product ids that have export / import values.
    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]

    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]

    # Loop-invariant: every location is paired with every traded product.
    all_hs = set(export_hs).union(set(import_hs))

    for geo_level in geo_depths:
        # Single pre-formatted argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("geo_level {0}".format(geo_level))

        # NOTE(review): the domestic and import-side results (rcas_dom
        # values, rcd, dist_dom, opp_gain_dom) are computed here but only the
        # world columns are written below; only rcas_dom's index is used.
        # Kept because the helper calls are opaque from this block.
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)

        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)

        rcas_wld = get_wld_rcas(geo_level, year, ymbp)
        rcas_wld = rcas_wld.reindex(columns=export_hs)

        # Binarise: 1 where RCA >= 1, 0 where RCA < 1 (NaN cells stay NaN).
        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0

        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0

        # Domestic distances.
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        # World distances: keep only products present in both matrices.
        # sorted() gives reindex() a deterministic label order instead of an
        # arbitrary set iteration order.
        prox_wld = get_wld_proximity(year)
        hs_wld = sorted(
            set(rcas_wld_binary.columns).intersection(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)
        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        # Opportunity gain: same PCIs for all since we are using world PCIs.
        pcis = get_pcis(geo_level, ymp)

        all_hs_dom = sorted(set(pcis.index).intersection(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        all_hs_wld = sorted(set(pcis.index).intersection(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        opp_gain_wld = ps_calcs.opportunity_gain(
            rcas_wld_binary, prox_wld, pcis_wld)
        # Use the PCIs aligned with the domestic matrices.
        opp_gain_dom = ps_calcs.opportunity_gain(
            rcas_dom_binary, prox_dom, pcis_dom)

        # Set zero RCAs to null.
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        # One connection per geo level, as before, but now always closed so
        # connections do not pile up across levels or leak on errors.
        db = MySQLdb.connect(
            host=os.environ.get("DATAVIVA_DB_HOST", "localhost"),
            user=os.environ["DATAVIVA_DB_USER"],
            passwd=os.environ["DATAVIVA_DB_PW"],
            db=os.environ["DATAVIVA_DB_NAME"],
        )
        try:
            db.autocommit(1)
            cursor = db.cursor()
            try:
                for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
                    for hs in all_hs:
                        cursor.execute(
                            "update secex_ymbp set rca_wld=%s, opp_gain_wld=%s, distance_wld=%s where year=%s and month=0 and bra_id=%s and hs_id=%s;",
                            [
                                sql_value(tryto(rcas_wld, hs, bra)),
                                sql_value(tryto(opp_gain_wld, hs, bra)),
                                sql_value(tryto(dist_wld, hs, bra)),
                                year, bra, hs,
                            ],
                        )
            finally:
                cursor.close()
        finally:
            db.close()
def rdo(ybi, yi, year, depths):
    """Add ``rca``, ``distance`` and ``opp_gain`` columns to ``ybi``.

    For every location depth in ``depths["bra"]`` the ``wage`` values of the
    rows whose ``bra_id`` has that length and whose ``cnae_id`` has length 6
    are pivoted into a ``bra_id`` x ``cnae_id`` matrix, from which RCA,
    proximity, distance, complexity and opportunity gain are computed with
    ``ps_calcs``.

    Args:
        ybi: DataFrame that provides ``bra_id``, ``cnae_id`` and ``wage``
            after ``reset_index()``; its index is combined with the keys
            (year, bra_id, cnae_id) of the computed rows.
        yi: Not used by this function.
        year: Value written into the ``year`` index level of computed rows.
        depths: Mapping whose ``"bra"`` entry lists the ``bra_id`` lengths
            to process.

    Returns:
        ``ybi`` reindexed to the union of its own index and the computed
        keys, with the three new columns assigned.
    """
    rca_dist_opp = []
    for geo_level in depths["bra"]:
        # Single pre-formatted argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("geo level: {0}".format(geo_level))

        ybi_data = ybi.reset_index()
        bra_criterion = ybi_data["bra_id"].str.len() == geo_level
        cnae_criterion = ybi_data["cnae_id"].str.len() == 6
        ybi_data = ybi_data[bra_criterion & cnae_criterion]
        ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]]

        # RCAs on the location x industry wage matrix.
        ybi_data = ybi_data.pivot(
            index="bra_id", columns="cnae_id", values="wage")
        rcas = ps_calcs.rca(ybi_data)

        # Binarise: 1 where RCA >= 1, 0 where RCA < 1 (NaN cells stay NaN).
        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0

        # Proximity feeds both the distance and the opportunity gain.
        prox = ps_calcs.proximity(rcas_binary)
        dist = ps_calcs.distance(rcas_binary, prox).fillna(0)

        # Second element of the complexity result is used as the PCI.
        pci = ps_calcs.complexity(rcas_binary)[1]
        opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci)

        for bra in rcas.index:
            for cnae in rcas.columns:
                rca_dist_opp.append([
                    year, bra, cnae,
                    rcas[cnae][bra], dist[cnae][bra], opp_gain[cnae][bra]
                ])

    # now time to merge!
    print("merging datasets...")
    ybi_rdo = pd.DataFrame(
        rca_dist_opp,
        columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    # Blank out zero RCAs.  Column-level replace() avoids chained assignment
    # (frame["rca"][mask] = ...), which may write to a temporary copy.
    ybi_rdo["rca"] = ybi_rdo["rca"].replace(0, np.nan)
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])

    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))
    all_ybi_indexes = pd.MultiIndex.from_tuples(
        all_ybi_indexes, names=["year", "bra_id", "cnae_id"])
    ybi = ybi.reindex(index=all_ybi_indexes)

    # Column assignment aligns on the shared index.
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]

    return ybi
def rdo(ymbp, ymp, year, geo_depths):
    """Compute world RCA/distance/opportunity gain and write them to MySQL.

    For every geo level the world-based ``rca_wld``, ``opp_gain_wld`` and
    ``distance_wld`` values are written with one ``UPDATE secex_ymbp``
    statement per (bra_id, hs_id) pair, for the given ``year`` and
    ``month=0``.  Connection settings are read from the ``DATAVIVA_DB_HOST``
    (default ``localhost``), ``DATAVIVA_DB_USER``, ``DATAVIVA_DB_PW`` and
    ``DATAVIVA_DB_NAME`` environment variables.

    Args:
        ymbp: DataFrame passed to ``get_domestic_rcas`` / ``get_wld_rcas``.
        ymp: DataFrame with ``export_val`` and ``import_val`` columns and an
            ``hs_id`` index level; also passed to ``get_pcis``.
        year: Year used in the ``WHERE`` clause and passed to the helpers.
        geo_depths: Iterable of geo levels to process.

    Returns:
        None.  Results are only written to the database.

    Raises:
        KeyError: If one of the required ``DATAVIVA_DB_*`` environment
            variables is missing.
    """

    def tryto(df, col, ind):
        # Cell lookup that yields None when the row or column is absent.
        if col in df.columns and ind in df.index:
            return df[col][ind]
        return None

    def sql_value(value):
        # The zero RCAs are deliberately turned into NaN below ("set RCAs to
        # null"); MySQL cannot store NaN, so send None, which the driver
        # transmits as SQL NULL.
        if isinstance(value, float) and np.isnan(value):
            return None
        return value

    # Six-character product ids that have export / import values.
    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]

    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]

    # Loop-invariant: every location is paired with every traded product.
    all_hs = set(export_hs).union(set(import_hs))

    for geo_level in geo_depths:
        # Single pre-formatted argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("geo_level {0}".format(geo_level))

        # NOTE(review): the domestic and import-side results (rcas_dom
        # values, rcd, dist_dom, opp_gain_dom) are computed here but only the
        # world columns are written below; only rcas_dom's index is used.
        # Kept because the helper calls are opaque from this block.
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)

        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)

        rcas_wld = get_wld_rcas(geo_level, year, ymbp)
        rcas_wld = rcas_wld.reindex(columns=export_hs)

        # Binarise: 1 where RCA >= 1, 0 where RCA < 1 (NaN cells stay NaN).
        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0

        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0

        # Domestic distances.
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        # World distances: keep only products present in both matrices.
        # sorted() gives reindex() a deterministic label order instead of an
        # arbitrary set iteration order.
        prox_wld = get_wld_proximity(year)
        hs_wld = sorted(
            set(rcas_wld_binary.columns).intersection(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)
        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        # Opportunity gain: same PCIs for all since we are using world PCIs.
        pcis = get_pcis(geo_level, ymp)

        all_hs_dom = sorted(set(pcis.index).intersection(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        all_hs_wld = sorted(set(pcis.index).intersection(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        opp_gain_wld = ps_calcs.opportunity_gain(
            rcas_wld_binary, prox_wld, pcis_wld)
        # Use the PCIs aligned with the domestic matrices.
        opp_gain_dom = ps_calcs.opportunity_gain(
            rcas_dom_binary, prox_dom, pcis_dom)

        # Set zero RCAs to null.
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        # One connection per geo level, as before, but now always closed so
        # connections do not pile up across levels or leak on errors.
        db = MySQLdb.connect(
            host=os.environ.get("DATAVIVA_DB_HOST", "localhost"),
            user=os.environ["DATAVIVA_DB_USER"],
            passwd=os.environ["DATAVIVA_DB_PW"],
            db=os.environ["DATAVIVA_DB_NAME"],
        )
        try:
            db.autocommit(1)
            cursor = db.cursor()
            try:
                for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
                    for hs in all_hs:
                        cursor.execute(
                            "update secex_ymbp set rca_wld=%s, opp_gain_wld=%s, distance_wld=%s where year=%s and month=0 and bra_id=%s and hs_id=%s;",
                            [
                                sql_value(tryto(rcas_wld, hs, bra)),
                                sql_value(tryto(opp_gain_wld, hs, bra)),
                                sql_value(tryto(dist_wld, hs, bra)),
                                year, bra, hs,
                            ],
                        )
            finally:
                cursor.close()
        finally:
            db.close()