Exemple #1
0
def main(input_file, year, output_dir):
    """Compute trade indicators for one year and write them as bz2 TSV files.

    The aggregated table ``ypw`` is read from the HDF5 cache
    ``<output_dir>/<year>/yodp.h5`` when it is already stored there; otherwise
    it is built from ``input_file`` (via ``import_file``), summed per
    ``(hs_id, wld_id)`` and saved to the cache. RCA, distance, opportunity
    gain and the ECI/PCI complexity indices are then derived from it.

    Args:
        input_file: Passed to ``import_file`` when the cache has no ``ypw``.
        year: Names the output sub-directory and fills the ``year`` column
            of the merged table.
        output_dir: Base directory; results go to ``<output_dir>/<year>``.

    Writes ``comtrade_ypw.tsv.bz2``, ``comtrade_pci.tsv.bz2`` and
    ``comtrade_eci.tsv.bz2`` into the output directory. Returns nothing.
    """
    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    def write_tsv(frame, file_name, index):
        """Write ``frame`` as a bz2-compressed TSV and close the file."""
        path = os.path.abspath(os.path.join(output_dir, file_name))
        # The context manager guarantees the compressed stream is flushed
        # and closed even if to_csv raises.
        with bz2.BZ2File(path, 'wb') as handle:
            frame.to_csv(handle, sep="\t", index=index, float_format="%.3f")

    # Load the cached aggregate, or build and cache it. The store is only
    # needed for this step, so it is closed as soon as ypw is in memory.
    store = pd.HDFStore(os.path.join(output_dir, 'yodp.h5'))
    try:
        try:
            ypw = store.get('ypw')
        except KeyError:
            # Import file to pandas dataframe
            comtrade_df = import_file(input_file)
            # Add indexes
            ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
            store.put('ypw', ypw)
    finally:
        store.close()

    # Calculate RCA on the wld_id x hs_id matrix of val_usd
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)

    # Binary matrix: 1 where RCA >= 1, 0 where RCA < 1
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0

    # Distances
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)

    # Complexity
    eci, pci = calc_complexity(ypw)

    # Opportunity gain, restricted to the products that have a PCI
    ypw_opp_gain = ps_calcs.opportunity_gain(
        ypw_rca_binary[pci.index],
        ypw_prox[pci.index].reindex(pci.index),
        pci)

    # Merge data: each matrix is stacked into a single column; zeros are
    # turned into NaN.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(), columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)

    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)

    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)

    new_ypw = ypw \
                .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
                .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
                .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()
    new_ypw["year"] = year
    # Move the freshly appended "year" column to the front
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]

    # Write out to files
    write_tsv(new_ypw, "comtrade_ypw.tsv.bz2", False)
    write_tsv(pd.DataFrame(pci, columns=["pci"]), "comtrade_pci.tsv.bz2", True)
    write_tsv(pd.DataFrame(eci, columns=["eci"]), "comtrade_eci.tsv.bz2", True)
Exemple #2
0
def rdo(ybi, yi, year, depths):
    """Add ``rca``, ``distance`` and ``opp_gain`` columns to ``ybi``.

    For every id length in ``depths["bra"]``, the rows of ``ybi`` whose
    ``bra_id`` has that length and whose ``cnae_id`` has length 6 are pivoted
    into a bra_id x cnae_id matrix of ``wage``. RCA, proximity-based distance
    and opportunity gain are computed on that matrix with ``ps_calcs``.

    Args:
        ybi: DataFrame with a ``wage`` column whose index levels provide
            ``bra_id`` and ``cnae_id``. The final reindex names the levels
            (year, bra_id, cnae_id) -- presumably ybi is already indexed
            that way; verify against the caller.
        yi: Not used by this function.
        year: Value stored in the ``year`` level of the computed rows.
        depths: Mapping whose ``"bra"`` entry lists the bra_id lengths to
            process.

    Returns:
        ``ybi`` reindexed to the union of its own index and the computed
        (year, bra_id, cnae_id) rows, with the three new columns attached.
    """
    rca_dist_opp = []
    for geo_level in depths["bra"]:
        # Same text as the Python 2 statement `print "geo level:", geo_level`
        print("geo level: %s" % (geo_level,))

        ybi_data = ybi.reset_index()

        bra_criterion = ybi_data["bra_id"].str.len() == geo_level
        cnae_criterion = ybi_data["cnae_id"].str.len() == 6
        ybi_data = ybi_data[bra_criterion & cnae_criterion]
        ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]]

        # RCAs
        ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage")
        rcas = ps_calcs.rca(ybi_data)

        # Binary matrix: 1 where RCA >= 1, 0 where RCA < 1
        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0

        # Distances: proximity first (also needed for opportunity gain)
        prox = ps_calcs.proximity(rcas_binary)
        dist = ps_calcs.distance(rcas_binary, prox).fillna(0)

        # Opportunity gain, using the second element returned by
        # ps_calcs.complexity as the product complexity
        pci = ps_calcs.complexity(rcas_binary)[1]
        opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci)

        # One output row per (bra, cnae) cell of the matrix
        for bra in rcas.index:
            for cnae in rcas.columns:
                rca_dist_opp.append([
                    year, bra, cnae,
                    rcas[cnae][bra], dist[cnae][bra], opp_gain[cnae][bra],
                ])

    # now time to merge!
    print("merging datasets...")
    ybi_rdo = pd.DataFrame(
        rca_dist_opp,
        columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    # Zero RCAs become NaN. Assigning the replaced column back avoids
    # chained assignment, which is not guaranteed to write through to the
    # frame.
    ybi_rdo["rca"] = ybi_rdo["rca"].replace(0, np.nan)
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])

    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))
    all_ybi_indexes = pd.MultiIndex.from_tuples(
        all_ybi_indexes, names=["year", "bra_id", "cnae_id"])

    ybi = ybi.reindex(index=all_ybi_indexes)
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]

    return ybi
Exemple #3
0
def rdo(ybp, yp, year, depths):
    """Merge domestic and world RCA, distance and opportunity gain into ``ybp``.

    For every entry of ``depths["bra"]`` the domestic and world RCA matrices
    are fetched, binarised at RCA >= 1, and used to compute distances and
    opportunity gains. One row is produced per (bra, hs) pair, and the rows
    are outer-merged into ``ybp`` on the index.

    Args:
        ybp: DataFrame merged on its index with rows indexed by
            (year, bra_id, hs_id) -- presumably ybp uses the same index;
            verify against the caller. Also passed to the RCA helpers.
        yp: DataFrame with a ``val_usd`` column and an ``hs_id`` index
            level; also passed to ``get_pcis``.
        year: Year passed to the helpers and stored in the output rows.
        depths: Mapping with ``"bra"`` (levels to iterate over) and ``"hs"``
            (its last entry is the hs_id length that is kept).

    Returns:
        ``ybp`` outer-merged with the columns rca, rca_wld, distance,
        distance_wld, opp_gain and opp_gain_wld.
    """

    def binarize(rcas):
        """Return a copy with RCA >= 1 set to 1 and RCA < 1 set to 0."""
        binary = rcas.copy()
        binary[binary >= 1] = 1
        binary[binary < 1] = 0
        return binary

    def tryto(df, col, ind):
        """Return df[col][ind], or None when either label is missing."""
        if col in df.columns and ind in df.index:
            return df[col][ind]
        return None

    # Products with a non-null total, restricted to the deepest hs length
    hs = yp[["val_usd"]].groupby(level=["hs_id"]).sum().dropna()
    hs = [h for h in hs.index if len(h) == depths["hs"][-1]]

    rca_dist_opp = []
    for geo_level in depths["bra"]:
        # Same text as the Python 2 statement `print "geo_level", geo_level`
        print("geo_level %s" % (geo_level,))

        # RCAs
        rcas_dom = get_domestic_rcas(geo_level, year, ybp, depths)
        rcas_dom = rcas_dom.reindex(columns=hs)

        rcas_wld = get_wld_rcas(geo_level, year, ybp, depths)
        rcas_wld = rcas_wld.reindex(columns=hs)

        rcas_dom_binary = binarize(rcas_dom)
        rcas_wld_binary = binarize(rcas_wld)

        # Domestic distances
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        # World distances: only products present in both the RCA matrix and
        # the world proximity matrix
        prox_wld = get_wld_proximity(year)
        hs_wld = set(rcas_wld_binary.columns).intersection(
            set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)

        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        # Opportunity gain: same PCIs for all since we are using world PCIs
        pcis = get_pcis(geo_level, yp, depths)

        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld,
                                                 pcis_wld)
        # Use the PCIs reindexed to the *domestic* product set so all three
        # inputs share the same labels. (Previously pcis_wld was passed here
        # and pcis_dom was never used.)
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom,
                                                 pcis_dom)

        # Set RCAs to null
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)

        for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
            for h in hs:
                rca_dist_opp.append([
                    year, bra, h,
                    tryto(rcas_dom, h, bra), tryto(rcas_wld, h, bra),
                    tryto(dist_dom, h, bra), tryto(dist_wld, h, bra),
                    tryto(opp_gain_dom, h, bra), tryto(opp_gain_wld, h, bra),
                ])

    # now time to merge!
    ybp_rdo = pd.DataFrame(rca_dist_opp,
                           columns=[
                               "year", "bra_id", "hs_id", "rca", "rca_wld",
                               "distance", "distance_wld", "opp_gain",
                               "opp_gain_wld"
                           ])
    ybp_rdo["year"] = ybp_rdo["year"].astype("int")
    ybp_rdo = ybp_rdo.set_index(["year", "bra_id", "hs_id"])

    ybp = pd.merge(ybp,
                   ybp_rdo,
                   how="outer",
                   left_index=True,
                   right_index=True)

    return ybp
Exemple #4
0
def rdo(ymbp, ymp, year, geo_depths, ypw_file_path):
    """Merge RCA, RCD, distance and opportunity-gain columns into ``ymbp``.

    For every entry of ``geo_depths`` the domestic export RCAs, the domestic
    import RCAs ("rcd") and the world RCAs are fetched. Export RCAs are
    binarised at RCA >= 1 and used to compute distances and opportunity
    gains. One row is produced per (bra, hs) pair with month ``"00"``, and
    the rows are outer-merged into ``ymbp`` on the index.

    Args:
        ymbp: DataFrame merged on its index with rows indexed by
            (year, month, bra_id, hs_id) -- presumably ymbp uses the same
            index; verify against the caller. Also passed to the RCA helpers.
        ymp: DataFrame with ``export_val`` and ``import_val`` columns and an
            ``hs_id`` index level; also passed to ``get_pcis``.
        year: Year passed to the helpers and stored in the output rows.
        geo_depths: Iterable of geographic levels to process.
        ypw_file_path: Forwarded to ``get_wld_rcas`` and
            ``get_wld_proximity``.

    Returns:
        ``ymbp`` outer-merged with the columns rca, rca_wld, rcd, distance,
        distance_wld, opp_gain and opp_gain_wld.
    """

    def binarize(rcas):
        """Return a copy with RCA >= 1 set to 1 and RCA < 1 set to 0."""
        binary = rcas.copy()
        binary[binary >= 1] = 1
        binary[binary < 1] = 0
        return binary

    def tryto(df, col, ind):
        """Return df[col][ind], or None when either label is missing."""
        if col in df.columns and ind in df.index:
            return df[col][ind]
        return None

    # 6-character products with a non-null export / import total
    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]

    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]

    # Loop-invariant: every product that appears on either side
    all_hs = set(export_hs).union(set(import_hs))

    rca_dist_opp = []
    for geo_level in geo_depths:
        # Same text as the Python 2 statement `print "geo_level", geo_level`
        print("geo_level %s" % (geo_level,))

        # RCAs
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)

        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)

        rcas_wld = get_wld_rcas(geo_level, year, ymbp, ypw_file_path)
        rcas_wld = rcas_wld.reindex(columns=export_hs)

        rcas_dom_binary = binarize(rcas_dom)
        rcas_wld_binary = binarize(rcas_wld)

        # Domestic distances
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        # World distances: only products present in both the RCA matrix and
        # the world proximity matrix
        prox_wld = get_wld_proximity(year, ypw_file_path)
        hs_wld = set(rcas_wld_binary.columns).intersection(set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)

        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        # Opportunity gain: same PCIs for all since we are using world PCIs
        pcis = get_pcis(geo_level, ymp)

        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld, pcis_wld)
        # Use the PCIs reindexed to the *domestic* product set so all three
        # inputs share the same labels. (Previously pcis_wld was passed here
        # and pcis_dom was never used.)
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom, pcis_dom)

        # Set RCAs to null
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
            for hs in all_hs:
                rca_dist_opp.append([
                    year, bra, hs,
                    tryto(rcas_dom, hs, bra), tryto(rcas_wld, hs, bra),
                    tryto(rcd, hs, bra),
                    tryto(dist_dom, hs, bra), tryto(dist_wld, hs, bra),
                    tryto(opp_gain_dom, hs, bra), tryto(opp_gain_wld, hs, bra),
                ])

    # now time to merge!
    ybp_rdo = pd.DataFrame(
        rca_dist_opp,
        columns=["year", "bra_id", "hs_id", "rca", "rca_wld", "rcd",
                 "distance", "distance_wld", "opp_gain", "opp_gain_wld"])
    ybp_rdo["year"] = ybp_rdo["year"].astype("int")
    ybp_rdo["month"] = "00"
    ybp_rdo = ybp_rdo.set_index(["year", "month", "bra_id", "hs_id"])

    ymbp = pd.merge(ymbp, ybp_rdo, how="outer", left_index=True, right_index=True)

    return ymbp
def main(input_file, year, output_dir):
    """Compute trade indicators for one year and write them as bz2 TSV files.

    The aggregated table ``ypw`` is read from the HDF5 cache
    ``<output_dir>/<year>/yodp.h5`` when it is already stored there; otherwise
    it is built from ``input_file`` (via ``import_file``), summed per
    ``(hs_id, wld_id)`` and saved to the cache. RCA, distance, opportunity
    gain and the ECI/PCI complexity indices are then derived from it.

    Args:
        input_file: Passed to ``import_file`` when the cache has no ``ypw``.
        year: Names the output sub-directory and fills the ``year`` column
            of the merged table.
        output_dir: Base directory; results go to ``<output_dir>/<year>``.

    Writes ``comtrade_ypw.tsv.bz2``, ``comtrade_pci.tsv.bz2`` and
    ``comtrade_eci.tsv.bz2`` into the output directory. Returns nothing.
    """
    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    def write_tsv(frame, file_name, index):
        """Write ``frame`` as a bz2-compressed TSV and close the file."""
        path = os.path.abspath(os.path.join(output_dir, file_name))
        # The context manager guarantees the compressed stream is flushed
        # and closed even if to_csv raises.
        with bz2.BZ2File(path, 'wb') as handle:
            frame.to_csv(handle, sep="\t", index=index, float_format="%.3f")

    # Load the cached aggregate, or build and cache it. The store is only
    # needed for this step, so it is closed as soon as ypw is in memory.
    store = pd.HDFStore(os.path.join(output_dir, 'yodp.h5'))
    try:
        try:
            ypw = store.get('ypw')
        except KeyError:
            # Import file to pandas dataframe
            comtrade_df = import_file(input_file)
            # Add indexes
            ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
            store.put('ypw', ypw)
    finally:
        store.close()

    # Calculate RCA on the wld_id x hs_id matrix of val_usd
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)

    # Binary matrix: 1 where RCA >= 1, 0 where RCA < 1
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0

    # Distances
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)

    # Complexity
    eci, pci = calc_complexity(ypw)

    # Opportunity gain, restricted to the products that have a PCI
    ypw_opp_gain = ps_calcs.opportunity_gain(
        ypw_rca_binary[pci.index],
        ypw_prox[pci.index].reindex(pci.index),
        pci)

    # Merge data: each matrix is stacked into a single column; zeros are
    # turned into NaN.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(), columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)

    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)

    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)

    new_ypw = ypw \
                .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
                .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
                .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()
    new_ypw["year"] = year
    # Move the freshly appended "year" column to the front
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]

    # Write out to files
    write_tsv(new_ypw, "comtrade_ypw.tsv.bz2", False)
    write_tsv(pd.DataFrame(pci, columns=["pci"]), "comtrade_pci.tsv.bz2", True)
    write_tsv(pd.DataFrame(eci, columns=["eci"]), "comtrade_eci.tsv.bz2", True)
def rdo(ymbp, ymp, year, geo_depths):
    """Compute world RCA, distance and opportunity gain and write them to MySQL.

    For every entry of ``geo_depths`` the domestic and world RCA matrices are
    fetched, binarised at RCA >= 1, and used to compute distances and
    opportunity gains. For every (bra, hs) pair the columns ``rca_wld``,
    ``opp_gain_wld`` and ``distance_wld`` of table ``secex_ymbp`` are updated
    for the rows matching ``year``, ``month=0``, ``bra_id`` and ``hs_id``.

    Connection settings come from the environment: ``DATAVIVA_DB_HOST``
    (default ``"localhost"``), ``DATAVIVA_DB_USER``, ``DATAVIVA_DB_PW`` and
    ``DATAVIVA_DB_NAME``.

    Args:
        ymbp: DataFrame passed to the RCA helpers.
        ymp: DataFrame with ``export_val`` and ``import_val`` columns and an
            ``hs_id`` index level; also passed to ``get_pcis``.
        year: Year passed to the helpers and used in the UPDATE filter.
        geo_depths: Iterable of geographic levels to process.

    Returns:
        None. The only output is the database update.

    Raises:
        KeyError: If a required ``DATAVIVA_DB_*`` variable is not set.
    """

    def binarize(rcas):
        """Return a copy with RCA >= 1 set to 1 and RCA < 1 set to 0."""
        binary = rcas.copy()
        binary[binary >= 1] = 1
        binary[binary < 1] = 0
        return binary

    def tryto(df, col, ind):
        """Return df[col][ind], or None when either label is missing."""
        if col in df.columns and ind in df.index:
            return df[col][ind]
        return None

    def sql_value(df, col, ind):
        """Like tryto, but map NaN to None so it is bound as SQL NULL."""
        value = tryto(df, col, ind)
        if isinstance(value, (float, np.floating)) and value != value:
            return None
        return value

    # 6-character products with a non-null export / import total
    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]

    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]

    # Loop-invariant: every product that appears on either side
    all_hs = set(export_hs).union(set(import_hs))

    update_sql = "update secex_ymbp set rca_wld=%s, opp_gain_wld=%s, distance_wld=%s where year=%s and month=0 and bra_id=%s and hs_id=%s;"

    for geo_level in geo_depths:
        # Same text as the Python 2 statement `print "geo_level", geo_level`
        print("geo_level %s" % (geo_level,))

        # RCAs
        # NOTE(review): the domestic results (rcd, dist_dom, opp_gain_dom)
        # are computed but not written by this function; only
        # rcas_dom.index feeds the update loop. Kept so the helper calls
        # still run exactly as before.
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)

        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)

        rcas_wld = get_wld_rcas(geo_level, year, ymbp)
        rcas_wld = rcas_wld.reindex(columns=export_hs)

        rcas_dom_binary = binarize(rcas_dom)
        rcas_wld_binary = binarize(rcas_wld)

        # Domestic distances
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        # World distances: only products present in both the RCA matrix and
        # the world proximity matrix
        prox_wld = get_wld_proximity(year)
        hs_wld = set(rcas_wld_binary.columns).intersection(set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)

        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        # Opportunity gain: same PCIs for all since we are using world PCIs
        pcis = get_pcis(geo_level, ymp)

        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld, pcis_wld)
        # Use the PCIs reindexed to the *domestic* product set so all three
        # inputs share the same labels. (Previously pcis_wld was passed here
        # and pcis_dom was never used.)
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom, pcis_dom)

        # Set RCAs to null
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        # Connect to DB right before writing this level, and always release
        # the cursor and connection afterwards.
        db = MySQLdb.connect(
            host=os.environ.get("DATAVIVA_DB_HOST", "localhost"),
            user=os.environ["DATAVIVA_DB_USER"],
            passwd=os.environ["DATAVIVA_DB_PW"],
            db=os.environ["DATAVIVA_DB_NAME"],
        )
        try:
            db.autocommit(1)
            cursor = db.cursor()
            try:
                for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
                    for hs in all_hs:
                        cursor.execute(
                            update_sql,
                            [
                                sql_value(rcas_wld, hs, bra),
                                sql_value(opp_gain_wld, hs, bra),
                                sql_value(dist_wld, hs, bra),
                                year, bra, hs,
                            ],
                        )
            finally:
                cursor.close()
        finally:
            db.close()
def rdo(ybi, yi, year, depths):
    """Add ``rca``, ``distance`` and ``opp_gain`` columns to ``ybi``.

    For every id length in ``depths["bra"]``, the rows of ``ybi`` whose
    ``bra_id`` has that length and whose ``cnae_id`` has length 6 are pivoted
    into a bra_id x cnae_id matrix of ``wage``. RCA, proximity-based distance
    and opportunity gain are computed on that matrix with ``ps_calcs``.

    Args:
        ybi: DataFrame with a ``wage`` column whose index levels provide
            ``bra_id`` and ``cnae_id``. The final reindex names the levels
            (year, bra_id, cnae_id) -- presumably ybi is already indexed
            that way; verify against the caller.
        yi: Not used by this function.
        year: Value stored in the ``year`` level of the computed rows.
        depths: Mapping whose ``"bra"`` entry lists the bra_id lengths to
            process.

    Returns:
        ``ybi`` reindexed to the union of its own index and the computed
        (year, bra_id, cnae_id) rows, with the three new columns attached.
    """
    rca_dist_opp = []
    for geo_level in depths["bra"]:
        # Same text as the Python 2 statement `print "geo level:", geo_level`
        print("geo level: %s" % (geo_level,))

        ybi_data = ybi.reset_index()

        bra_criterion = ybi_data["bra_id"].str.len() == geo_level
        cnae_criterion = ybi_data["cnae_id"].str.len() == 6
        ybi_data = ybi_data[bra_criterion & cnae_criterion]
        ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]]

        # RCAs
        ybi_data = ybi_data.pivot(index="bra_id",
                                  columns="cnae_id",
                                  values="wage")
        rcas = ps_calcs.rca(ybi_data)

        # Binary matrix: 1 where RCA >= 1, 0 where RCA < 1
        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0

        # Distances: proximity first (also needed for opportunity gain)
        prox = ps_calcs.proximity(rcas_binary)
        dist = ps_calcs.distance(rcas_binary, prox).fillna(0)

        # Opportunity gain, using the second element returned by
        # ps_calcs.complexity as the product complexity
        pci = ps_calcs.complexity(rcas_binary)[1]
        opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci)

        # One output row per (bra, cnae) cell of the matrix
        for bra in rcas.index:
            for cnae in rcas.columns:
                rca_dist_opp.append([
                    year, bra, cnae,
                    rcas[cnae][bra], dist[cnae][bra], opp_gain[cnae][bra],
                ])

    # now time to merge!
    print("merging datasets...")
    ybi_rdo = pd.DataFrame(
        rca_dist_opp,
        columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    # Zero RCAs become NaN. Assigning the replaced column back avoids
    # chained assignment, which is not guaranteed to write through to the
    # frame.
    ybi_rdo["rca"] = ybi_rdo["rca"].replace(0, np.nan)
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])

    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))
    all_ybi_indexes = pd.MultiIndex.from_tuples(
        all_ybi_indexes, names=["year", "bra_id", "cnae_id"])

    ybi = ybi.reindex(index=all_ybi_indexes)
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]

    return ybi
def rdo(ymbp, ymp, year, geo_depths):
    """Compute world RCA, distance and opportunity gain and write them to MySQL.

    For every entry of ``geo_depths`` the domestic and world RCA matrices are
    fetched, binarised at RCA >= 1, and used to compute distances and
    opportunity gains. For every (bra, hs) pair the columns ``rca_wld``,
    ``opp_gain_wld`` and ``distance_wld`` of table ``secex_ymbp`` are updated
    for the rows matching ``year``, ``month=0``, ``bra_id`` and ``hs_id``.

    Connection settings come from the environment: ``DATAVIVA_DB_HOST``
    (default ``"localhost"``), ``DATAVIVA_DB_USER``, ``DATAVIVA_DB_PW`` and
    ``DATAVIVA_DB_NAME``.

    Args:
        ymbp: DataFrame passed to the RCA helpers.
        ymp: DataFrame with ``export_val`` and ``import_val`` columns and an
            ``hs_id`` index level; also passed to ``get_pcis``.
        year: Year passed to the helpers and used in the UPDATE filter.
        geo_depths: Iterable of geographic levels to process.

    Returns:
        None. The only output is the database update.

    Raises:
        KeyError: If a required ``DATAVIVA_DB_*`` variable is not set.
    """

    def binarize(rcas):
        """Return a copy with RCA >= 1 set to 1 and RCA < 1 set to 0."""
        binary = rcas.copy()
        binary[binary >= 1] = 1
        binary[binary < 1] = 0
        return binary

    def tryto(df, col, ind):
        """Return df[col][ind], or None when either label is missing."""
        if col in df.columns and ind in df.index:
            return df[col][ind]
        return None

    def sql_value(df, col, ind):
        """Like tryto, but map NaN to None so it is bound as SQL NULL."""
        value = tryto(df, col, ind)
        if isinstance(value, (float, np.floating)) and value != value:
            return None
        return value

    # 6-character products with a non-null export / import total
    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]

    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]

    # Loop-invariant: every product that appears on either side
    all_hs = set(export_hs).union(set(import_hs))

    update_sql = "update secex_ymbp set rca_wld=%s, opp_gain_wld=%s, distance_wld=%s where year=%s and month=0 and bra_id=%s and hs_id=%s;"

    for geo_level in geo_depths:
        # Same text as the Python 2 statement `print "geo_level", geo_level`
        print("geo_level %s" % (geo_level,))

        # RCAs
        # NOTE(review): the domestic results (rcd, dist_dom, opp_gain_dom)
        # are computed but not written by this function; only
        # rcas_dom.index feeds the update loop. Kept so the helper calls
        # still run exactly as before.
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)

        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)

        rcas_wld = get_wld_rcas(geo_level, year, ymbp)
        rcas_wld = rcas_wld.reindex(columns=export_hs)

        rcas_dom_binary = binarize(rcas_dom)
        rcas_wld_binary = binarize(rcas_wld)

        # Domestic distances
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        # World distances: only products present in both the RCA matrix and
        # the world proximity matrix
        prox_wld = get_wld_proximity(year)
        hs_wld = set(rcas_wld_binary.columns).intersection(
            set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)

        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        # Opportunity gain: same PCIs for all since we are using world PCIs
        pcis = get_pcis(geo_level, ymp)

        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld,
                                                 pcis_wld)
        # Use the PCIs reindexed to the *domestic* product set so all three
        # inputs share the same labels. (Previously pcis_wld was passed here
        # and pcis_dom was never used.)
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom,
                                                 pcis_dom)

        # Set RCAs to null
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        # Connect to DB right before writing this level, and always release
        # the cursor and connection afterwards.
        db = MySQLdb.connect(host=os.environ.get("DATAVIVA_DB_HOST",
                                                 "localhost"),
                             user=os.environ["DATAVIVA_DB_USER"],
                             passwd=os.environ["DATAVIVA_DB_PW"],
                             db=os.environ["DATAVIVA_DB_NAME"])
        try:
            db.autocommit(1)
            cursor = db.cursor()
            try:
                for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
                    for hs in all_hs:
                        cursor.execute(
                            update_sql,
                            [
                                sql_value(rcas_wld, hs, bra),
                                sql_value(opp_gain_wld, hs, bra),
                                sql_value(dist_wld, hs, bra),
                                year, bra, hs,
                            ])
            finally:
                cursor.close()
        finally:
            db.close()