def prox(year, output_path, attr, i_attr, table, column):

    attr_depths = depths[attr]
    i_attr_depths = depths[i_attr]

    years = get_years(year)

    for year in years:
        print "year:", year

        for i, depth in enumerate(attr_depths):
            print attr, "depth:", depth

            query = """
                SELECT {0}_id, {1}_id, {2}
                FROM {3}
                WHERE year=%s
            """.format(attr, i_attr, column, table)

            if len(attr_depths) > 1:
                query += " and {}_id_len={}".format(attr, depth)

            if len(i_attr_depths) > 1:
                query += " and {}_id_len={}".format(i_attr, i_attr_depths[-1])

            if "secex" in table:
                query += " and month=0"

            data = sql.read_sql(query, db, params=[year])
            data = data.pivot(index="{}_id".format(i_attr),
                              columns="{}_id".format(attr),
                              values=column)

            rcas = ps_calcs.rca(data)

            rcas[rcas >= 1] = 1
            rcas[rcas < 1] = 0

            prox = ps_calcs.proximity(rcas)
            prox = pd.DataFrame(prox.unstack(),
                                columns=["{}_proximity".format(i_attr)])
            prox["year"] = year
            prox = prox.set_index("year", append=True)

            output_path_w_year = os.path.abspath(
                os.path.join(output_path, str(year)))
            if not os.path.exists(output_path_w_year):
                os.makedirs(output_path_w_year)
            fp = os.path.join(output_path_w_year,
                              "{}_{}_proximity.tsv".format(attr, i_attr))

            file_mode = 'a' if i else 'w'
            user_header = False if i else True
            with open(fp, file_mode) as f:
                prox.to_csv(f, header=user_header, sep="\t")
def prox(year, output_path, attr, i_attr, table, column):
    
    attr_depths = depths[attr]
    i_attr_depths = depths[i_attr]
    
    years = get_years(year)
    
    for year in years:
        print "year:", year
        
        for i, depth in enumerate(attr_depths):
            print attr, "depth:", depth
            
            query = """
                SELECT {0}_id, {1}_id, {2}
                FROM {3}
                WHERE year=%s
            """.format(attr, i_attr, column, table)
            
            if len(attr_depths) > 1:
                query += " and {}_id_len={}".format(attr, depth)
            
            if len(i_attr_depths) > 1:
                query += " and {}_id_len={}".format(i_attr, i_attr_depths[-1])
            
            if "secex" in table:
                query += " and month=0"
            
            data = sql.read_sql(query, db, params=[year])
            data = data.pivot(index="{}_id".format(i_attr), columns="{}_id".format(attr), values=column)
        
            rcas = ps_calcs.rca(data)
        
            rcas[rcas >= 1] = 1
            rcas[rcas < 1] = 0
        
            prox = ps_calcs.proximity(rcas)
            prox = pd.DataFrame(prox.unstack(), columns=["{}_proximity".format(i_attr)])
            prox["year"] = year
            prox = prox.set_index("year", append=True)
        
            output_path_w_year = os.path.abspath(os.path.join(output_path, str(year)))
            if not os.path.exists(output_path_w_year): os.makedirs(output_path_w_year)
            fp = os.path.join(output_path_w_year, "{}_{}_proximity.tsv".format(attr, i_attr))
        
            file_mode = 'a' if i else 'w'
            user_header = False if i else True
            with open(fp, file_mode) as f:
                prox.to_csv(f, header=user_header, sep="\t")
Beispiel #3
0
def get_wld_proximity(year, ypw_file_path):
    """Return the proximity matrix computed from a world ypw file.

    The bz2-compressed, tab-separated file at ``ypw_file_path`` is pivoted
    into a ``wld_id`` x ``hs_id`` matrix of ``val_usd`` (missing cells set to
    0), binarised on RCA >= 1 and passed to ``ps_calcs.proximity``.
    ``year`` is accepted but not used by this function.
    """
    # hs_id is read through a str converter rather than type-inferred.
    world = pd.read_csv(ypw_file_path, compression="bz2", sep="\t",
                        converters={"hs_id": str})
    world = world.rename(columns={"val_usd": "export_val"})
    exports = world.pivot(index="wld_id", columns="hs_id",
                          values="export_val").fillna(0)

    # RCA >= 1 -> 1, RCA < 1 -> 0, applied in place on the RCA frame.
    rca_flags = ps_calcs.rca(exports)
    rca_flags[rca_flags >= 1] = 1
    rca_flags[rca_flags < 1] = 0

    return ps_calcs.proximity(rca_flags)
Beispiel #4
0
def get_wld_proximity(year, ypw_file_path):
    """Build the world proximity matrix from the ypw TSV file.

    Reads ``ypw_file_path`` (bz2, tab separated), reshapes ``val_usd`` into
    one row per ``wld_id`` and one column per ``hs_id`` with gaps filled by
    0, converts the RCA values to 0/1 and returns
    ``ps_calcs.proximity`` of that binary matrix. The ``year`` argument is
    not referenced in the body.
    """
    read_options = {
        "compression": "bz2",
        "sep": "\t",
        "converters": {"hs_id": str},
    }
    ypw = pd.read_csv(ypw_file_path, **read_options)
    ypw = ypw.rename(columns={"val_usd": "export_val"})

    matrix = ypw.pivot(index="wld_id", columns="hs_id", values="export_val")
    matrix = matrix.fillna(0)

    binary_rca = ps_calcs.rca(matrix)
    # Two passes over disjoint masks; NaN cells match neither and stay NaN.
    binary_rca[binary_rca >= 1] = 1
    binary_rca[binary_rca < 1] = 0

    world_proximity = ps_calcs.proximity(binary_rca)
    return world_proximity
Beispiel #5
0
def get_wld_proximity(year, ypw_file_path):
    """Return the world proximity matrix computed from a ypw file.

    The bz2-compressed, tab-separated file at ``ypw_file_path`` is pivoted
    into a ``wld_id`` x ``hs_id`` matrix of ``val_usd`` (missing cells set to
    0), binarised on RCA >= 1 and passed to ``ps_calcs.proximity``.

    ``year`` is kept for interface compatibility; it is not used here.

    The original body also opened a MySQL connection that was never used
    and never closed. It has been dropped: all data comes from the file.
    """
    # Get world values from the ypw file; hs_id is read via a str converter.
    table = pd.read_csv(ypw_file_path, compression="bz2", sep="\t",
                        converters={"hs_id": str})
    table = table.rename(columns={"val_usd": "export_val"})
    table = table.pivot(index="wld_id", columns="hs_id",
                        values="export_val").fillna(0)

    # Run the RCA calculation and threshold it into a 0/1 matrix.
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0

    return ps_calcs.proximity(mcp)
Beispiel #6
0
def get_wld_proximity(year, ypw_file_path):
    """Compute world product proximities from the ypw TSV file.

    Reads ``ypw_file_path`` (bz2, tab separated), reshapes ``val_usd`` into
    one row per ``wld_id`` and one column per ``hs_id`` with gaps filled by
    0, converts the RCA values to 0/1 and returns ``ps_calcs.proximity`` of
    that binary matrix.

    ``year`` is unused and only kept so existing callers keep working.

    No database access is needed: the MySQL connection the original opened
    here was never read from nor closed, so it has been removed.
    """
    # Get world values from the ypw file.
    table = pd.read_csv(ypw_file_path,
                        compression="bz2",
                        sep="\t",
                        converters={"hs_id": str})
    table = table.rename(columns={"val_usd": "export_val"})
    table = table.pivot(index="wld_id", columns="hs_id",
                        values="export_val").fillna(0)

    # Use the growth library to run the RCA calculation, then binarise.
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0

    prox = ps_calcs.proximity(mcp)

    return prox
Beispiel #7
0
def get_wld_proximity(year):
    """Return the world product proximity matrix for ``year``.

    Selects ``wld_id``, ``hs_id`` and ``val_usd`` from ``comtrade_ypw`` for
    the given year and 6-character ``hs_id`` values, pivots them into a
    ``wld_id`` x ``hs_id`` matrix (gaps filled with 0), binarises the RCA
    values on >= 1 and returns ``ps_calcs.proximity`` of the result.

    Raises:
        KeyError: if any of the ``DATAVIVA_DB_*`` environment variables
            read below is not set.
    """
    # Connect to DB.
    db = MySQLdb.connect(host=os.environ["DATAVIVA_DB_HOST"],
                         user=os.environ["DATAVIVA_DB_USER"],
                         passwd=os.environ["DATAVIVA_DB_PW"],
                         db=os.environ["DATAVIVA_DB_NAME"])
    try:
        # Bind the year as a parameter instead of formatting it into the
        # SQL text (same style as the other read_sql call in this file).
        q = "select wld_id, hs_id, val_usd " \
            "from comtrade_ypw " \
            "where year = %s and length(hs_id) = 6"
        table = sql.read_sql(q, db, params=[year])
    finally:
        # The connection is only needed for this single query.
        db.close()

    table = table.pivot(index="wld_id", columns="hs_id", values="val_usd")
    table = table.fillna(0)

    # Use the growth library to run the RCA calculation, then binarise.
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0

    prox = ps_calcs.proximity(mcp)

    return prox
Beispiel #8
0
def get_wld_proximity(year):
    """Compute world product proximities for ``year`` from the database.

    Queries ``comtrade_ypw`` for the rows of ``year`` whose ``hs_id`` is 6
    characters long, reshapes ``val_usd`` into a ``wld_id`` x ``hs_id``
    matrix with missing cells set to 0, thresholds the RCA matrix into 0/1
    and returns ``ps_calcs.proximity`` of it.

    Raises:
        KeyError: if one of the required ``DATAVIVA_DB_*`` environment
            variables is missing.
    """
    # Connect to DB.
    db = MySQLdb.connect(host=os.environ["DATAVIVA_DB_HOST"],
                         user=os.environ["DATAVIVA_DB_USER"],
                         passwd=os.environ["DATAVIVA_DB_PW"],
                         db=os.environ["DATAVIVA_DB_NAME"])

    # Get values from the database. The year is passed as a bound
    # parameter, and the connection is released even if the query fails.
    q = ("select wld_id, hs_id, val_usd "
         "from comtrade_ypw "
         "where year = %s and length(hs_id) = 6")
    try:
        table = sql.read_sql(q, db, params=[year])
    finally:
        db.close()

    # (The original renamed "val_usd" to "val_usd" here, a no-op.)
    table = table.pivot(index="wld_id", columns="hs_id", values="val_usd")
    table = table.fillna(0)

    # Use the growth library to run the RCA calculation on the data.
    mcp = ps_calcs.rca(table)
    mcp[mcp >= 1] = 1
    mcp[mcp < 1] = 0

    return ps_calcs.proximity(mcp)
Beispiel #9
0
def rdo(ybp, yp, year, depths):

    hs = yp[["val_usd"]].groupby(level=["hs_id"]).sum().dropna()
    hs = [h for h in hs.index if len(h) == depths["hs"][-1]]

    rca_dist_opp = []
    for geo_level in depths["bra"]:
        print "geo_level", geo_level
        '''
            RCAS
        '''
        rcas_dom = get_domestic_rcas(geo_level, year, ybp, depths)
        rcas_dom = rcas_dom.reindex(columns=hs)

        rcas_wld = get_wld_rcas(geo_level, year, ybp, depths)
        rcas_wld = rcas_wld.reindex(columns=hs)

        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0

        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0
        '''
            DISTANCES
        '''
        '''domestic distances'''
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)
        '''world distances'''
        prox_wld = get_wld_proximity(year)
        hs_wld = set(rcas_wld_binary.columns).intersection(
            set(prox_wld.columns))

        # hs_wld = set(rcas_wld_binary.columns).union(set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)

        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)
        '''
            OPP GAIN
        '''
        '''same PCIs for all since we are using world PCIs'''
        pcis = get_pcis(geo_level, yp, depths)

        # all_hs_dom = set(pcis.index).union(set(rcas_dom.columns))
        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        # print rcas_dom_binary.shape, prox_dom.shape, pcis.shape

        # all_hs_wld = set(pcis.index).union(set(rcas_wld.columns))
        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        # print rcas_dom_binary.shape, prox_dom.shape, pcis.shape
        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld,
                                                 pcis_wld)
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom,
                                                 pcis_wld)
        '''
            SET RCAS TO NULL
        '''
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)

        def tryto(df, col, ind):
            if col in df.columns:
                if ind in df.index:
                    return df[col][ind]
            return None

        # print opp_gain_wld.ix["al000107"].ix["041601"]
        # print opp_gain_dom.ix["al000107"].ix["041601"]
        # print tryto(opp_gain_dom, "041601", "al000107")
        # print "al000107" in set(rcas_dom.index).union(set(rcas_wld.index))
        # print "041601" in set(export_hs).union(set(import_hs))
        # sys.exit()

        for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
            for h in hs:
                rca_dist_opp.append([year, bra, h, \
                                tryto(rcas_dom, h, bra), tryto(rcas_wld, h, bra), \
                                tryto(dist_dom, h, bra), tryto(dist_wld, h, bra), \
                                tryto(opp_gain_dom, h, bra), tryto(opp_gain_wld, h, bra) ])

        # print len(rca_dist_opp), "rows updated"

    # now time to merge!
    # print "merging datasets..."
    ybp_rdo = pd.DataFrame(rca_dist_opp,
                           columns=[
                               "year", "bra_id", "hs_id", "rca", "rca_wld",
                               "distance", "distance_wld", "opp_gain",
                               "opp_gain_wld"
                           ])
    ybp_rdo["year"] = ybp_rdo["year"].astype("int")
    ybp_rdo = ybp_rdo.set_index(["year", "bra_id", "hs_id"])

    ybp = pd.merge(ybp,
                   ybp_rdo,
                   how="outer",
                   left_index=True,
                   right_index=True)

    return ybp
Beispiel #10
0
def rdo(ybi, yi, year, depths):
    
    rca_dist_opp = []
    for geo_level in depths["bra"]:
        print "geo level:", geo_level
        
        ybi_data = ybi.reset_index()
        
        bra_criterion = ybi_data["bra_id"].str.len() == geo_level
        cnae_criterion = ybi_data["cnae_id"].str.len() == 6
        ybi_data = ybi_data[bra_criterion & cnae_criterion]
        
        # ybi_data = ybi_data.reindex(index=ybi_index)
        # ybi_data = ybi_data.drop(["year", "num_emp", "num_est", "wage_avg", "num_emp_est"], axis=1)
        ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]]
    
        # ybi_data = ybi_data.unstack()
        # levels = ybi_data.columns.levels
        # labels = ybi_data.columns.labels
        # ybi_data.columns = levels[1][labels[1]]

        '''
            RCAS
        '''
        
        # ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage").fillna(0)
        ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage")
        rcas = ps_calcs.rca(ybi_data)
    
        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0
    
        '''
            DISTANCES
        '''
    
        '''calculate proximity for opportunity gain calculation'''    
        prox = ps_calcs.proximity(rcas_binary)
        '''calculate distances using proximity'''    
        dist = ps_calcs.distance(rcas_binary, prox).fillna(0)
    
        '''
            OPP GAIN
        '''
    
        '''calculate product complexity'''
        pci = ps_calcs.complexity(rcas_binary)[1]
        '''calculate opportunity gain'''
        opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci)
    
        rdo = []
        for bra in rcas.index:
            for cnae in rcas.columns:
                rdo.append([year, bra, cnae, rcas[cnae][bra], dist[cnae][bra], opp_gain[cnae][bra]])
    
        rca_dist_opp += rdo
    
    # now time to merge!
    print "merging datasets..."
    ybi_rdo = pd.DataFrame(rca_dist_opp, columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    ybi_rdo["rca"][ybi_rdo["rca"] == 0] = np.nan
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])
    
    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))
    
    all_ybi_indexes = pd.MultiIndex.from_tuples(all_ybi_indexes, names=["year", "bra_id", "cnae_id"])
    # ybi = ybi.reindex(index=all_ybi_indexes, fill_value=0)
    ybi = ybi.reindex(index=all_ybi_indexes)
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]
    
    return ybi
Beispiel #11
0
def rdo(ymbp, ymp, year, geo_depths, ypw_file_path):

    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]

    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]

    rca_dist_opp = []
    for geo_level in geo_depths:
        print "geo_level",geo_level

        '''
            RCAS
        '''
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)

        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)
        # print rcd.ix["mg"]
        # sys.exit()

        rcas_wld = get_wld_rcas(geo_level, year, ymbp, ypw_file_path)
        rcas_wld = rcas_wld.reindex(columns=export_hs)
        # print rcas_wld.ix["mg"]
        # print rcas_wld['010204']
        # sys.exit()

        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0

        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0

        '''
            DISTANCES
        '''
        '''domestic distances'''
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        '''world distances'''
        prox_wld = get_wld_proximity(year, ypw_file_path)
        hs_wld = set(rcas_wld_binary.columns).intersection(set(prox_wld.columns))

        # hs_wld = set(rcas_wld_binary.columns).union(set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)

        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        '''
            OPP GAIN
        '''

        '''same PCIs for all since we are using world PCIs'''
        pcis = get_pcis(geo_level, ymp)

        # all_hs_dom = set(pcis.index).union(set(rcas_dom.columns))
        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns = all_hs_dom)
        prox_dom = prox_dom.reindex(index = all_hs_dom, columns = all_hs_dom)

        # print rcas_dom_binary.shape, prox_dom.shape, pcis.shape

        # all_hs_wld = set(pcis.index).union(set(rcas_wld.columns))
        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns = all_hs_wld)
        prox_wld = prox_wld.reindex(index = all_hs_wld, columns = all_hs_wld)

        # print rcas_dom_binary.shape, prox_dom.shape, pcis.shape
        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld, pcis_wld)
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom, pcis_wld)

        '''
            SET RCAS TO NULL
        '''
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        def tryto(df, col, ind):
            if col in df.columns:
                if ind in df.index:
                    return df[col][ind]
            return None

        # print opp_gain_wld.ix["al000107"].ix["041601"]
        # print opp_gain_dom.ix["al000107"].ix["041601"]
        # print tryto(opp_gain_dom, "041601", "al000107")
        # print "al000107" in set(rcas_dom.index).union(set(rcas_wld.index))
        # print "041601" in set(export_hs).union(set(import_hs))
        # sys.exit()

        for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
            for hs in set(export_hs).union(set(import_hs)):
                rca_dist_opp.append([year, bra, hs, \
                                tryto(rcas_dom, hs, bra), tryto(rcas_wld, hs, bra), \
                                tryto(rcd, hs, bra), \
                                tryto(dist_dom, hs, bra), tryto(dist_wld, hs, bra), \
                                tryto(opp_gain_dom, hs, bra), tryto(opp_gain_wld, hs, bra) ])

        # print len(rca_dist_opp), "rows updated"

    # now time to merge!
    # print "merging datasets..."
    ybp_rdo = pd.DataFrame(rca_dist_opp, columns=["year", "bra_id", "hs_id", "rca", "rca_wld", "rcd", "distance", "distance_wld", "opp_gain", "opp_gain_wld"])
    ybp_rdo["year"] = ybp_rdo["year"].astype("int")
    ybp_rdo["month"] = "00"
    ybp_rdo = ybp_rdo.set_index(["year", "month", "bra_id", "hs_id"])

    ymbp = pd.merge(ymbp, ybp_rdo, how="outer", left_index=True, right_index=True)

    return ymbp
def _write_bz2_tsv(frame, path, index):
    """Write ``frame`` to ``path`` as a bz2-compressed TSV and close it."""
    out = bz2.BZ2File(path, 'wb')
    try:
        frame.to_csv(out, sep="\t", index=index, float_format="%.3f")
    finally:
        # Closing explicitly guarantees the compressed stream is flushed.
        out.close()


def main(input_file, year, output_dir):
    """Build the comtrade ypw, pci and eci files for ``year``.

    Everything is written below ``<output_dir>/<year>`` (created if
    missing). The aggregated ``ypw`` table is cached in ``yodp.h5`` under
    the key ``'ypw'``; when that key is absent it is rebuilt from
    ``input_file`` via ``import_file`` and stored.

    Output files (bz2-compressed TSV, floats formatted with ``%.3f``):

    * ``comtrade_ypw.tsv.bz2``: ``ypw`` plus ``rca``, ``distance`` and
      ``opp_gain`` columns (zeros stored as NaN), with ``year`` first,
    * ``comtrade_pci.tsv.bz2`` and ``comtrade_eci.tsv.bz2``: the two
      results of ``calc_complexity(ypw)``.
    """
    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    store = pd.HDFStore(os.path.join(output_dir,'yodp.h5'))
    try:
        try:
            ypw = store.get('ypw')
        except KeyError:
            # Import the file to a pandas dataframe and add indexes.
            comtrade_df = import_file(input_file)
            ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
            store.put('ypw', ypw)
    finally:
        # Release the HDF5 file handle; ypw is already held in memory.
        store.close()

    # ---- Calculate RCA ----
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)

    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0

    # ---- Distances ----
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)

    # ---- Complexity ----
    eci, pci = calc_complexity(ypw)

    # ---- Opportunity gain ----
    # RCA and proximity are restricted to the products that have a PCI.
    ypw_opp_gain = ps_calcs.opportunity_gain(
        ypw_rca_binary[pci.index],
        ypw_prox[pci.index].reindex(pci.index),
        pci)

    # ---- Merge data ----
    # Each matrix is transposed and stacked into a single-column frame.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(), columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)

    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)

    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)

    new_ypw = ypw \
                .merge(ypw_rca, how="outer", left_index=True, right_index=True) \
                .merge(ypw_dist, how="outer", left_index=True, right_index=True) \
                .merge(ypw_opp_gain, how="outer", left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()
    new_ypw["year"] = year
    # Move the last column (the freshly added year) to the front.
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]

    # ---- Write out to files ----
    new_file_path = os.path.abspath(os.path.join(output_dir, "comtrade_ypw.tsv.bz2"))
    _write_bz2_tsv(new_ypw, new_file_path, index=False)

    new_file_path = os.path.abspath(os.path.join(output_dir, "comtrade_pci.tsv.bz2"))
    _write_bz2_tsv(pd.DataFrame(pci, columns=["pci"]), new_file_path, index=True)

    new_file_path = os.path.abspath(os.path.join(output_dir, "comtrade_eci.tsv.bz2"))
    _write_bz2_tsv(pd.DataFrame(eci, columns=["eci"]), new_file_path, index=True)
Beispiel #13
0
def main():
    """Run the RCA / proximity / density example on the OEC data file.

    Reads ``data/year_origin_hs92_4.tsv``, keeps the most recent year,
    pivots it into a country x product matrix of ``export_val`` and prints
    sample RCA rankings followed by the binary RCA, proximity and density
    matrices. Exits through ``sys.exit`` with a message when the data file
    cannot be opened.
    """
    # Step 1: import the data file to a pandas DataFrame.
    try:
        oec_df = pd.read_csv("data/year_origin_hs92_4.tsv",
                             sep="\t",
                             converters={"hs92": str})
    except IOError:
        sys.exit("File doesn't exist, use fetch_oec_data.sh to download.")

    # Step 2: convert the vertically oriented CPY (country-product-year)
    # data into the Mcp matrix: rows = countries, columns = products.

    # Only use most recent year (could loop through each year too...)
    most_recent_year = sorted(oec_df.year.unique())[-1]
    oec_df = oec_df[oec_df.year == most_recent_year]

    # We only care about the country, product and export_val columns
    # so let's drop all the others
    oec_df = oec_df[["origin", "hs92", "export_val"]]

    # Drop all rows without export value
    oec_df = oec_df[~oec_df.export_val.isnull()]

    # Now we pivot our flat file to be countries X products
    mcp = oec_df.pivot(index="origin", columns="hs92", values="export_val")

    # Step 3: run the RCA calculation on the Mcp matrix.
    rcas = rca(mcp)

    # Here are some tests...
    # 1. Print the 10 products New Zealand (nzl) has the highest RCA in.
    # 0204 = Sheep and Goat Meat
    # (.loc instead of the removed .ix indexer; the lookup is label-based,
    # as with the "bra" lookup further down.)
    print(rcas.loc['nzl'].sort_values(ascending=False).head(10))

    # 2. Print the 10 countries with the highest RCA in cars (8703).
    # SVK = Slovakia
    print(rcas['8703'].sort_values(ascending=False).head(10))

    # Step 4: convert the nominal RCA values into binary 1s and 0s,
    # 1 meaning the country exports its fair share of the product
    # (RCA >= 1) and 0 meaning it doesn't.
    rcas[rcas >= 1] = 1
    rcas[rcas < 1] = 0

    proximities = proximity(rcas)
    densities = density(rcas, proximities)

    print("\nThe top 10 HS product codes that Brazil has RCA in:\n")
    print(rcas.loc["bra"].sort_values(ascending=False).head(10))

    print("\n Rcas")
    print(rcas)

    print("\n Proximities")
    print(proximities)

    print("\n Densities")
    print(densities)

    print(
        "\nCalculation run successfully! Read the source code to see what's going on."
    )
Beispiel #14
0
def rdo(ybi, yi, year, depths):

    rca_dist_opp = []
    for geo_level in depths["bra"]:
        print "geo level:", geo_level

        ybi_data = ybi.reset_index()

        bra_criterion = ybi_data["bra_id"].str.len() == geo_level
        cnae_criterion = ybi_data["cnae_id"].str.len() == 6
        ybi_data = ybi_data[bra_criterion & cnae_criterion]

        # ybi_data = ybi_data.reindex(index=ybi_index)
        # ybi_data = ybi_data.drop(["year", "num_emp", "num_est", "wage_avg", "num_emp_est"], axis=1)
        ybi_data = ybi_data[["bra_id", "cnae_id", "wage"]]

        # ybi_data = ybi_data.unstack()
        # levels = ybi_data.columns.levels
        # labels = ybi_data.columns.labels
        # ybi_data.columns = levels[1][labels[1]]
        '''
            RCAS
        '''

        # ybi_data = ybi_data.pivot(index="bra_id", columns="cnae_id", values="wage").fillna(0)
        ybi_data = ybi_data.pivot(index="bra_id",
                                  columns="cnae_id",
                                  values="wage")
        rcas = ps_calcs.rca(ybi_data)

        rcas_binary = rcas.copy()
        rcas_binary[rcas_binary >= 1] = 1
        rcas_binary[rcas_binary < 1] = 0
        '''
            DISTANCES
        '''
        '''calculate proximity for opportunity gain calculation'''
        prox = ps_calcs.proximity(rcas_binary)
        '''calculate distances using proximity'''
        dist = ps_calcs.distance(rcas_binary, prox).fillna(0)
        '''
            OPP GAIN
        '''
        '''calculate product complexity'''
        pci = ps_calcs.complexity(rcas_binary)[1]
        '''calculate opportunity gain'''
        opp_gain = ps_calcs.opportunity_gain(rcas_binary, prox, pci)

        rdo = []
        for bra in rcas.index:
            for cnae in rcas.columns:
                rdo.append([
                    year, bra, cnae, rcas[cnae][bra], dist[cnae][bra],
                    opp_gain[cnae][bra]
                ])

        rca_dist_opp += rdo

    # now time to merge!
    print "merging datasets..."
    ybi_rdo = pd.DataFrame(
        rca_dist_opp,
        columns=["year", "bra_id", "cnae_id", "rca", "distance", "opp_gain"])
    ybi_rdo["year"] = ybi_rdo["year"].astype(int)
    ybi_rdo["rca"][ybi_rdo["rca"] == 0] = np.nan
    ybi_rdo = ybi_rdo.set_index(["year", "bra_id", "cnae_id"])

    # get union of both sets of indexes
    all_ybi_indexes = set(ybi.index).union(set(ybi_rdo.index))

    all_ybi_indexes = pd.MultiIndex.from_tuples(
        all_ybi_indexes, names=["year", "bra_id", "cnae_id"])
    # ybi = ybi.reindex(index=all_ybi_indexes, fill_value=0)
    ybi = ybi.reindex(index=all_ybi_indexes)
    ybi["rca"] = ybi_rdo["rca"]
    ybi["distance"] = ybi_rdo["distance"]
    ybi["opp_gain"] = ybi_rdo["opp_gain"]

    return ybi
def rdo(ymbp, ymp, year, geo_depths):

    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]

    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]

    rca_dist_opp = []
    for geo_level in geo_depths:
        print "geo_level", geo_level

        """
            RCAS
        """
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)

        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)
        # print rcd.ix["mg"]
        # sys.exit()

        rcas_wld = get_wld_rcas(geo_level, year, ymbp)
        rcas_wld = rcas_wld.reindex(columns=export_hs)
        # print rcas_wld.ix["4"]
        # print rcas_wld['010204']
        # sys.exit()

        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0

        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0

        """
            DISTANCES
        """
        """domestic distances"""
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        """world distances"""
        prox_wld = get_wld_proximity(year)
        hs_wld = set(rcas_wld_binary.columns).intersection(set(prox_wld.columns))

        # hs_wld = set(rcas_wld_binary.columns).union(set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)

        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        """
            OPP GAIN
        """

        """same PCIs for all since we are using world PCIs"""
        pcis = get_pcis(geo_level, ymp)

        # all_hs_dom = set(pcis.index).union(set(rcas_dom.columns))
        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        # print rcas_dom_binary.shape, prox_dom.shape, pcis.shape

        # all_hs_wld = set(pcis.index).union(set(rcas_wld.columns))
        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        # print rcas_dom_binary.shape, prox_dom.shape, pcis.shape
        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld, pcis_wld)
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom, pcis_wld)

        """
            SET RCAS TO NULL
        """
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        def tryto(df, col, ind):
            if col in df.columns:
                if ind in df.index:
                    return df[col][ind]
            return None

        # print opp_gain_wld.ix["al000107"].ix["041601"]
        # print opp_gain_dom.ix["al000107"].ix["041601"]
        # print tryto(opp_gain_dom, "041601", "al000107")
        # print "al000107" in set(rcas_dom.index).union(set(rcas_wld.index))
        # print "041601" in set(export_hs).union(set(import_hs))
        # sys.exit()

        """ Connect to DB """
        db = MySQLdb.connect(
            host=os.environ.get("DATAVIVA_DB_HOST", "localhost"),
            user=os.environ["DATAVIVA_DB_USER"],
            passwd=os.environ["DATAVIVA_DB_PW"],
            db=os.environ["DATAVIVA_DB_NAME"],
        )
        db.autocommit(1)
        cursor = db.cursor()

        for bra in set(rcas_dom.index).union(set(rcas_wld.index)):
            for hs in set(export_hs).union(set(import_hs)):
                cursor.execute(
                    "update secex_ymbp set rca_wld=%s, opp_gain_wld=%s, distance_wld=%s where year=%s and month=0 and bra_id=%s and hs_id=%s;",
                    [tryto(rcas_wld, hs, bra), tryto(opp_gain_wld, hs, bra), tryto(dist_wld, hs, bra), year, bra, hs],
                )
Beispiel #16
0
def main(input_file, year, output_dir):
    """Build the per-year product/country indicator tables and write them out.

    The aggregated table ``ypw`` (summed over ``hs_id`` and ``wld_id``) is
    read from the ``yodp.h5`` store inside ``<output_dir>/<year>`` when it is
    already cached there; otherwise it is built from ``input_file`` via
    ``import_file`` and cached.  RCA, distance and opportunity gain are then
    computed with ``ps_calcs`` and merged back onto ``ypw``.

    Three bz2-compressed, tab-separated files are written into
    ``<output_dir>/<year>``: ``comtrade_ypw.tsv.bz2``,
    ``comtrade_pci.tsv.bz2`` and ``comtrade_eci.tsv.bz2``.

    Args:
        input_file: passed unchanged to ``import_file`` when the cache is
            empty.
        year: used as the output sub-directory name and as the value of the
            ``year`` column in the merged table.
        output_dir: base directory; the year sub-directory is created if
            missing.

    Returns:
        None.
    """
    output_dir = os.path.abspath(os.path.join(output_dir, str(year)))
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    def write_tsv_bz2(frame, file_name, index):
        # pandas does not close a handle it was given, so close it here to
        # make sure the compressed stream is flushed and finalized.
        path = os.path.abspath(os.path.join(output_dir, file_name))
        handle = bz2.BZ2File(path, 'wb')
        try:
            frame.to_csv(handle, sep="\t", index=index, float_format="%.3f")
        finally:
            handle.close()

    # The store is only needed to fetch or cache ``ypw``; close it right
    # after so the HDF5 file handle is not leaked for the rest of the run.
    store = pd.HDFStore(os.path.join(output_dir, 'yodp.h5'))
    try:
        try:
            ypw = store.get('ypw')
        except KeyError:
            # Cache miss: import the raw file and aggregate it.
            comtrade_df = import_file(input_file)
            ypw = comtrade_df.groupby(['hs_id', 'wld_id']).sum()
            store.put('ypw', ypw)
    finally:
        store.close()

    # RCA on a wld_id x hs_id matrix of val_usd.
    ypw_rca = ypw.reset_index()
    ypw_rca = ypw_rca.pivot(index="wld_id", columns="hs_id", values="val_usd")
    ypw_rca = ps_calcs.rca(ypw_rca)

    # Binarize on a copy: 1 where RCA >= 1, 0 where RCA < 1.
    ypw_rca_binary = ypw_rca.copy()
    ypw_rca_binary[ypw_rca_binary >= 1] = 1
    ypw_rca_binary[ypw_rca_binary < 1] = 0

    # Distances.
    ypw_prox = ps_calcs.proximity(ypw_rca_binary)
    ypw_dist = ps_calcs.distance(ypw_rca_binary, ypw_prox).fillna(0)

    # Complexity (presumably ECI per country and PCI per product -- the
    # helper is defined elsewhere; confirm against calc_complexity).
    eci, pci = calc_complexity(ypw)

    # Opportunity gain, restricted to the labels present in pci.index.
    ypw_opp_gain = ps_calcs.opportunity_gain(
        ypw_rca_binary[pci.index], ypw_prox[pci.index].reindex(pci.index), pci)

    # Reshape each matrix to one long column; zeros are stored as NaN.
    ypw_opp_gain = pd.DataFrame(ypw_opp_gain.T.stack(), columns=["opp_gain"])
    ypw_opp_gain = ypw_opp_gain.replace(0, np.nan)

    ypw_dist = pd.DataFrame(ypw_dist.T.stack(), columns=["distance"])
    ypw_dist = ypw_dist.replace(0, np.nan)

    ypw_rca = pd.DataFrame(ypw_rca.T.stack(), columns=["rca"])
    ypw_rca = ypw_rca.replace(0, np.nan)

    # Outer-merge the indicators onto the aggregated table by index.
    new_ypw = ypw
    for indicator in (ypw_rca, ypw_dist, ypw_opp_gain):
        new_ypw = new_ypw.merge(indicator, how="outer",
                                left_index=True, right_index=True)
    new_ypw = new_ypw.reset_index()
    new_ypw["year"] = year

    # "year" was appended last; move it to the first column.
    cols = new_ypw.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    new_ypw = new_ypw[cols]

    # Write out to files.
    write_tsv_bz2(new_ypw, "comtrade_ypw.tsv.bz2", False)
    write_tsv_bz2(pd.DataFrame(pci, columns=["pci"]),
                  "comtrade_pci.tsv.bz2", True)
    write_tsv_bz2(pd.DataFrame(eci, columns=["eci"]),
                  "comtrade_eci.tsv.bz2", True)
def rdo(ymbp, ymp, year, geo_depths):
    """Compute world RCA, distance and opportunity gain and store them.

    For every entry of ``geo_depths`` the RCA matrices are obtained from
    ``get_domestic_rcas`` / ``get_wld_rcas``, binarized at 1, combined with
    the world proximity matrix and PCIs, and the resulting ``rca_wld``,
    ``opp_gain_wld`` and ``distance_wld`` values are written to the
    ``secex_ymbp`` table (``month=0`` rows of ``year``) with one UPDATE per
    (bra_id, hs_id) pair.

    Args:
        ymbp: passed through to ``get_domestic_rcas`` and ``get_wld_rcas``.
        ymp: table with ``export_val`` and ``import_val`` columns and an
            ``hs_id`` index level; also passed to ``get_pcis``.
        year: year used for the world proximity lookup and the UPDATE filter.
        geo_depths: iterable of geographic levels to process.

    Returns:
        None.

    Raises:
        KeyError: if ``DATAVIVA_DB_USER``, ``DATAVIVA_DB_PW`` or
            ``DATAVIVA_DB_NAME`` is missing from the environment.
    """

    def tryto(df, col, ind):
        # Safe lookup of df[col][ind]; missing labels give None.  NaN is
        # also mapped to None so the driver writes SQL NULL, which is what
        # the "set RCAs to NULL" step below is meant to produce.
        if col in df.columns and ind in df.index:
            value = df[col][ind]
            if isinstance(value, float) and value != value:
                return None
            return value
        return None

    update_sql = "update secex_ymbp set rca_wld=%s, opp_gain_wld=%s, distance_wld=%s where year=%s and month=0 and bra_id=%s and hs_id=%s;"

    # Only 6-character HS ids are kept.
    export_hs = ymp[["export_val"]].groupby(level=["hs_id"]).sum().dropna()
    export_hs = [hs for hs in export_hs.index if len(hs) == 6]

    import_hs = ymp[["import_val"]].groupby(level=["hs_id"]).sum().dropna()
    import_hs = [hs for hs in import_hs.index if len(hs) == 6]

    # Loop-invariant: every HS id that appears as an export or an import.
    all_hs = set(export_hs).union(import_hs)

    # NOTE(review): never appended to or returned in the visible code.
    rca_dist_opp = []
    for geo_level in geo_depths:
        # Same text as the Python 2 statement `print "geo_level", geo_level`.
        print("geo_level %s" % (geo_level,))

        # RCAs
        rcas_dom = get_domestic_rcas(geo_level, year, ymbp, "export")
        rcas_dom = rcas_dom.reindex(columns=export_hs)

        rcd = get_domestic_rcas(geo_level, year, ymbp, "import")
        rcd = rcd.reindex(columns=import_hs)

        rcas_wld = get_wld_rcas(geo_level, year, ymbp)
        rcas_wld = rcas_wld.reindex(columns=export_hs)

        # Binarize on copies: 1 where RCA >= 1, 0 where RCA < 1.
        rcas_dom_binary = rcas_dom.copy()
        rcas_dom_binary[rcas_dom_binary >= 1] = 1
        rcas_dom_binary[rcas_dom_binary < 1] = 0

        rcas_wld_binary = rcas_wld.copy()
        rcas_wld_binary[rcas_wld_binary >= 1] = 1
        rcas_wld_binary[rcas_wld_binary < 1] = 0

        # Distances -- domestic.
        # NOTE(review): the domestic and import results (dist_dom,
        # opp_gain_dom, pcis_dom, rcd) are computed but not written by the
        # UPDATE below; kept so the helper calls are unchanged.
        prox_dom = ps_calcs.proximity(rcas_dom_binary)
        dist_dom = ps_calcs.distance(rcas_dom_binary, prox_dom).fillna(0)

        # Distances -- world; restrict both matrices to their shared HS ids.
        prox_wld = get_wld_proximity(year)
        hs_wld = set(rcas_wld_binary.columns).intersection(
            set(prox_wld.columns))
        prox_wld = prox_wld.reindex(columns=hs_wld, index=hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=hs_wld)

        dist_wld = ps_calcs.distance(rcas_wld_binary, prox_wld).fillna(0)

        # Opportunity gain; the same PCIs are used for domestic and world.
        pcis = get_pcis(geo_level, ymp)

        all_hs_dom = set(pcis.index).intersection(set(rcas_dom.columns))
        pcis_dom = pcis.reindex(index=all_hs_dom)
        rcas_dom_binary = rcas_dom_binary.reindex(columns=all_hs_dom)
        prox_dom = prox_dom.reindex(index=all_hs_dom, columns=all_hs_dom)

        all_hs_wld = set(pcis.index).intersection(set(rcas_wld.columns))
        pcis_wld = pcis.reindex(index=all_hs_wld)
        rcas_wld_binary = rcas_wld_binary.reindex(columns=all_hs_wld)
        prox_wld = prox_wld.reindex(index=all_hs_wld, columns=all_hs_wld)

        opp_gain_wld = ps_calcs.opportunity_gain(rcas_wld_binary, prox_wld,
                                                 pcis_wld)
        # NOTE(review): uses pcis_wld rather than pcis_dom, as in the
        # original; both are reindexed from the same export_hs columns.
        opp_gain_dom = ps_calcs.opportunity_gain(rcas_dom_binary, prox_dom,
                                                 pcis_wld)

        # Set RCAs to NULL: zeros become NaN (written as NULL via tryto).
        rcas_dom = rcas_dom.replace(0, np.nan)
        rcas_wld = rcas_wld.replace(0, np.nan)
        rcd = rcd.replace(0, np.nan)

        bra_ids = set(rcas_dom.index).union(set(rcas_wld.index))

        # One connection per geo level, always closed -- the original left
        # every connection open.
        db = MySQLdb.connect(host=os.environ.get("DATAVIVA_DB_HOST",
                                                 "localhost"),
                             user=os.environ["DATAVIVA_DB_USER"],
                             passwd=os.environ["DATAVIVA_DB_PW"],
                             db=os.environ["DATAVIVA_DB_NAME"])
        try:
            db.autocommit(1)
            cursor = db.cursor()

            for bra in bra_ids:
                for hs in all_hs:
                    cursor.execute(
                        update_sql,
                        [
                            tryto(rcas_wld, hs, bra),
                            tryto(opp_gain_wld, hs, bra),
                            tryto(dist_wld, hs, bra), year, bra, hs
                        ])
        finally:
            db.close()