Ejemplo n.º 1
0
def test_json_error():
    """A non-existent indicator code should raise a descriptive ValueError."""
    bad_indicator = 'NV.IND.MANF.KD.87'
    expected = (
        'The indicator was not found. It may have been deleted or archived.')
    with pytest.raises(ValueError, match=expected):
        get_series(bad_indicator, mrv=1)
Ejemplo n.º 2
0
def test_indicator_most_recent_value():
    """mrv=5 should return exactly five times as many rows as mrv=1."""
    latest = get_series('SP.POP.TOTL', mrv=1)
    assert len(latest.index) > 200
    assert_numeric_or_string(latest)

    last_five = get_series('SP.POP.TOTL', mrv=5)
    assert len(last_five.index) == 5 * len(latest.index)
    assert_numeric_or_string(last_five)
Ejemplo n.º 3
0
def test_indicator_values():
    """2017 world population tops the sorted series, by name and by id."""
    by_name = get_series('SP.POP.TOTL', date='2017',
                         simplify_index=True).sort_values(ascending=False)
    assert len(by_name.index) > 200
    assert by_name.index.values[0] == 'World'
    assert by_name.iloc[0] == 7530360149.0

    by_id = get_series('SP.POP.TOTL',
                       date='2017',
                       simplify_index=True,
                       id_or_value='id').sort_values(ascending=False)
    assert len(by_id.index) > 200
    assert by_id.index.values[0] == 'WLD'
    assert by_id.iloc[0] == 7530360149.0
Ejemplo n.º 4
0
def test_indicator_use_id():
    """With id_or_value='id', the series is named after the indicator code."""
    series = get_series('SP.POP.TOTL',
                        mrv=1,
                        id_or_value='id',
                        simplify_index=True)
    assert len(series.index) > 200
    assert_numeric_or_string(series)
    assert series.name == 'SP.POP.TOTL'
    assert series.index.names == ['Country']
def get_wbd_by_indicator(indicator: str, mvr_value=20):
    """Fetch `indicator` from the World Bank and keep the last value per country.

    :param indicator: World Bank indicator code (e.g. 'SP.POP.TOTL')
    :param mvr_value: number of most-recent values to request per country
    :return: DataFrame indexed by country code with one column, `indicator`
    """
    series = wb.get_series(indicator,
                           mrv=mvr_value,
                           id_or_value='id',
                           simplify_index=True)
    frame = pandas.DataFrame(series)
    # One row per country: the most recent non-missing observation wins
    return frame.groupby(['Country']).aggregate({indicator: 'last'})
Ejemplo n.º 6
0
def world_bank_data(url, date):
    """
    Takes a URL for input and extracts the indicator string. This is then used to extract data from world bank data
    :param url: URL of the data page; the indicator code is the last path segment before any query string
    :param date: date or date range forwarded to wb.get_series (combined with mrv=1, the most recent value)
    :return: Dataframe with indicator as the last column
    """
    indicator = url.split('?')[0].split('/')[-1]
    data = wb.get_series(indicator, date=date, mrv=1).to_frame().reset_index()
    # The human-readable series name replaces the raw indicator code as column name
    series = data['Series'].unique()[0]
    data = data.drop(['Series'], axis=1)
    data = dfops.rename_pd(data, [data.columns[-1]], [series])
    return data
Ejemplo n.º 7
0
def info_countries_df():
    """Build a per-country table with columns LOCATION, POPULATION, LAT, LONG, name.

    Joins World Bank country metadata and most-recent population with centroid
    coordinates from info_gdf.GLOBAL_INFO_GDF, then remaps country names to the
    naming used by the COVID dataset.
    """
    countries = wb.get_countries()

    # Population dataset, by the World Bank (most recent value), indexed with the country code
    population = wb.get_series('SP.POP.TOTL', id_or_value='id', simplify_index=True, mrv=1)
    # PATCH: if last line is not working (sometimes World Bank doesn't work) replace with the line below
    # population = pd.read_csv('countries_population.csv').set_index('id')['population']

    # Aggregate region, country and population
    df = countries[['region', 'latitude', 'longitude','name']].loc[countries.region != 'Aggregates']
    # Index-aligned assignment: both frames are indexed by the country code
    df['population'] = population
    df = df.reset_index().rename(columns={'id':'LOCATION'})
    df['LOCATION']=df['LOCATION'].apply(normalize_str)
    df['POPULATION']=df['population']

    # Centroid coordinates come from the global geometry table, aligned on LOCATION
    gdf_indexed = info_gdf.GLOBAL_INFO_GDF.set_index('LOCATION')
    df = df.set_index('LOCATION')
    df['LAT'] = gdf_indexed['geometry'].centroid.apply(lambda p : p.coords[0][1])
    df['LONG'] = gdf_indexed['geometry'].centroid.apply(lambda p : p.coords[0][0])
    df = df.reset_index()

    df = df[['LOCATION','POPULATION', 'LAT', 'LONG','name']]

    # Map World Bank country names onto the names used by the COVID dataset
    df['name']=df['name'].apply(normalize_str)
    name_replace = {
        'Brunei Darussalam': 'Brunei',
        'Congo, Dem. Rep.': 'Congo (Kinshasa)',
        'Congo, Rep.': 'Congo (Brazzaville)',
        'Czech Republic': 'Czechia',
        'Egypt, Arab Rep.': 'Egypt',
        'Iran, Islamic Rep.': 'Iran',
        'Korea, Rep.': 'Korea, South',
        'St. Lucia': 'Saint Lucia',
        'Russian Federation': 'Russia',
        'Slovak Republic': 'Slovakia',
        'United States': 'US',
        'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
        'Venezuela, RB': 'Venezuela',
        'Taiwan, China': 'Taiwan*',
        'Lao PDR': 'Laos',
        'Syrian Arab Republic': 'Syria',
        'BAHAMAS, THE': 'Bahamas',
        'ST. KITTS AND NEVIS': 'SAINT KITTS AND NEVIS',
        'KYRGYZ REPUBLIC': 'KYRGYZSTAN',
        'GAMBIA, THE': 'GAMBIA',
        'MYANMAR': 'BURMA',
        'YEMEN, REP.': 'YEMEN',
    }
    # Normalize both sides so the mapping applies to already-normalized names
    name_replace = { normalize_str(k): normalize_str(v) for k,v in name_replace.items() }
    df['name']=df['name'].replace(name_replace)
    return df
Ejemplo n.º 8
0
 def load_wdi(self):
     """Load this variable's WDI timeseries, caching it as a local CSV.

     Reads `<datasets>/wdi/<code>.csv` if present; otherwise downloads the
     series from the World Bank and writes the cache file.

     :return: pandas Series indexed by (Country, Year)
     :raises ValueError: when no WDI code is associated with this variable
     """
     if not self.wdi_code:
         raise ValueError('{}: no associated WDI variable'.format(
             self.label))
     fname = os.path.join(datasets, 'wdi', self.wdi_code + '.csv')
     try:
         timeseries = pd.read_csv(fname, index_col=('Country',
                                                    'Year'))[self.wdi_code]
     # Narrowed from a bare `except:` (which also swallowed KeyboardInterrupt):
     # OSError covers a missing/unreadable cache file, KeyError a missing
     # column, ValueError covers pandas parse errors (EmptyDataError).
     except (OSError, KeyError, ValueError):
         # NOTE: mrv=1 for most recent value would be equivalent to subsequent treatment
         # ....: except that sometimes it results to NaN (e.g CO2 emissions for PSE, Palestine)
         timeseries = wb.get_series(self.wdi_code,
                                    id_or_value='id',
                                    simplify_index=True)
         timeseries.to_csv(fname)
     return timeseries
Ejemplo n.º 9
0
def get_pop_data():
    """Return a DataFrame of (Country, Slug, Population) for COVID API countries.

    Merges World Bank most-recent population onto the country list exposed by
    the covid19api.com service, translating the World Bank country names that
    differ from the API's.
    """
    countries = wb.get_countries()
    population = wb.get_series('SP.POP.TOTL', mrv=1).reset_index()
    countries = countries[['region', 'name']].rename(columns={
        'name': 'country'
    }).loc[countries.region != 'Aggregates']
    countries = pd.merge(left=countries,
                         right=population,
                         left_on='country',
                         right_on='Country',
                         how='left')
    countries = countries[['Country', 'SP.POP.TOTL']]
    countries.columns = ['Country', 'Population']

    # Match country names with COVID API data.
    # Assign the result back rather than calling replace(..., inplace=True) on
    # the column: the chained inplace call can act on a temporary copy
    # (SettingWithCopy) and is unsupported under pandas copy-on-write.
    countries['Country_std'] = countries['Country'].replace({
        'United States': 'United States of America',
        'Iran, Islamic Rep.': 'Iran, Islamic Republic of',
        'Hong Kong SAR, China': 'Hong Kong, SAR China',
        'Korea, Rep.': 'Korea (South)',
        'Vietnam': 'Viet Nam',
        'Egypt, Arab Rep.': 'Egypt',
        'Yemen, Rep.': 'Yemen',
        'Syrian Arab Republic': 'Syrian Arab Republic (Syria)',
        'Kyrgyz Republic': 'Kyrgyzstan',
        'Venezuela, RB': 'Venezuela (Bolivarian Republic)'
    })

    # Get list of countries from COVID API and merge on population
    covid_countries = requests.get("https://api.covid19api.com/countries")
    covid_countries = pd.DataFrame(json.loads(covid_countries.text))

    countries = pd.merge(how='left',
                         left=covid_countries,
                         right=countries,
                         left_on='Country',
                         right_on='Country_std')

    countries = countries[['Country_x', 'Slug',
                           'Population']].rename({'Country_x': 'Country'},
                                                 axis=1)

    return countries
Ejemplo n.º 10
0
def sundial_plot(metric='SP.POP.TOTL', title='World Population', year=2000):
    """Plot the given metric as a sundial plot

    :param metric: World Bank indicator code to plot
    :param title: title prefix for the figure
    :param year: single year passed as the series date
    :return: a plotly Figure with a three-level sunburst (World > region > country)
    """
    countries = wb.get_countries()
    values = wb.get_series(metric,
                           date=year,
                           id_or_value='id',
                           simplify_index=True)

    df = countries[['region', 'name']].rename(columns={
        'name': 'country'
    }).loc[countries.region != 'Aggregates']
    # Index-aligned on the country code shared by both World Bank tables
    df['values'] = values

    # The sunburst plot requires weights (values), labels, and parent (region, or World)
    # We build the corresponding table here
    columns = ['parents', 'labels', 'values']

    # Level 1: one wedge per country, parented to its region
    level1 = df.copy()
    level1.columns = columns
    level1['text'] = level1['values'].apply(lambda pop: '{:,.0f}'.format(pop))

    # Level 2: one wedge per region, parented to 'World'
    level2 = df.groupby('region')['values'].sum().reset_index()[[
        'region', 'region', 'values'
    ]]
    level2.columns = columns
    level2['parents'] = 'World'
    # move value to text for this level
    level2['text'] = level2['values'].apply(lambda pop: '{:,.0f}'.format(pop))
    level2['values'] = 0

    # Level 3: the root node; 'WLD' is the World Bank aggregate for the world
    level3 = pd.DataFrame({
        'parents': [''],
        'labels': ['World'],
        'values': [0.0],
        'text': ['{:,.0f}'.format(values.loc['WLD'])]
    })

    all_levels = pd.concat([level1, level2, level3],
                           axis=0).reset_index(drop=True)

    # **all_levels unpacks the DataFrame's columns as Sunburst keyword args
    return go.Figure(data=[go.Sunburst(hoverinfo='text', **all_levels)],
                     layout=go.Layout(title='{} (World Bank, {})'.format(
                         title, year),
                                      width=800,
                                      height=800))
Ejemplo n.º 11
0
def fetch_series(series=default_series,
                 scale=['SI.SPR.PCAP','SI.POV.XPND.MD'], scaleby=360,
                 date="1980:%s" %(datetime.now().year), cachedir="data/cache"):
    """
    fetches a definded indicators and formates them in a wide-dataframe

    @param series: an array of names/string series as defined by the worldbank - defaults to:
      SI.POV.XPND.MD: Median daily per capita income or consumption expenditure (2011 PPP)
      SI.SPR.PCAP: Survey mean consumption or income per capita, total population (2011 PPP $ per day)
      SP.POP.TOTL: Population, total
      AG.SRF.TOTL.K2: Surface area (sq. km)
    @param scale: array of series-names that are scaled
    @param scaleby: the scalefactor to apply to the series that should be scaled - used to scale daily to year by 360
    @param date: the including timerange - defaults to 1980 upto the current year in the format 'from:to' ex. '1980:2020'
    @param cachedir: since the data is on a remote server and cannot be downloaded as persistent file,
                     the result of the api-call is stored in this cachedir

    @return: a dataframe with selected series as columns and country, countrycode, year
    """
    # Local import: only concat is needed; DataFrame.append was removed in pandas 2.0
    import pandas as pd

    os.makedirs(cachedir, exist_ok=True)
    frames = []

    for indicator in series:
        cached_df_filename = cachedir+"/"+indicator+".p"
        df = None
        try:
            with open(cached_df_filename, 'rb') as fd:
                print("Reading Data from cached file: %s" %(cached_df_filename))
                df = pickle.load(fd)
        # Narrowed from bare `except:`: a missing cache file (OSError) or a
        # corrupt pickle (UnpicklingError) triggers a fresh download.
        except (OSError, pickle.UnpicklingError):
            df = DataFrame(wb.get_series(indicator, date=date,id_or_value='id', simplify_index=True))
            print("Writing cached_df_file: %s" %(cached_df_filename))
            with open(cached_df_filename, 'wb') as fd:
                pickle.dump(df, fd)

        df = df[df[indicator].notnull()]
        if indicator in scale:
            df[indicator] = df[indicator] * scaleby
        frames.append(df)

    # Stack all indicator frames, then sum duplicates per (Country, Year);
    # pd.concat replaces the deprecated/removed DataFrame.append loop.
    odf = pd.concat(frames) if frames else DataFrame()
    odf = DataFrame(odf.groupby(['Country','Year']).sum())
    odf = odf.reset_index()
    return odf
Ejemplo n.º 12
0
def severityplot(data, fig=None, ax=None, logScale = False, quick=False,
        nameUnfocusedCountries = False, legendOnSide = False):
    """Plot deaths-per-1M against confirmed-cases-per-1M, one line per country.

    Focused countries are coloured and labelled; all others are grey.
    Countries with population below one million are skipped.

    :param data: object exposing .df, .aggregation, and .giturl/.gitdate/.githash
        (shape assumed from usage here -- confirm against the caller)
    :param fig: existing matplotlib figure (a new fig/ax pair is made if either is missing)
    :param ax: existing matplotlib axes
    :param logScale: use log-log axes when True
    :param quick: restrict the plot to Canada and the US
    :param nameUnfocusedCountries: also label the grey background countries
    :param legendOnSide: show a legend entry per focused country
    :return: the matplotlib figure
    """

    df = data.df

    # we will get the population of each country from this data set...
    pop = wb.get_series('SP.POP.TOTL', mrv=1).reset_index()
    #print(pop['Country'].unique())
    # some countries in the pop database have different names
    popnames = {'US':'United States',
               'Korea, South':'Korea, Rep.',
               'Russia':'Russian Federation'}

    # countries we want to show in colour...
    if quick:
        focus = ['Canada','US']
    else:
        focus = ['Canada','US','China','Korea, South','United Kingdom','Poland','Mexico','Italy','Spain','France','Germany','Russia','Japan','Belgium','Norway','Austria','Australia','Sweden','Denmark','Singapore','Malaysia','Switzerland','Finland','Portugal','India']

    # aggregate data...
    pc = ['Country/Region', 'Province/State', 'Date', 'Confirmed', 'ConfirmedIncrease', 'Deaths', 'DeathsIncrease', 'Recovered', 'RecoveredIncrease', 'Active', 'ActiveIncrease']
    d = df[pc]
    d = d.groupby(['Country/Region','Date'],as_index=False).agg(data.aggregation)

    # these are all the countries...
    countries = d['Country/Region'].unique()
    if quick:
        countries = focus

    # create a plot...
    if not fig or not ax:
        fig, ax = plt.subplots(1,1)

    if logScale:
        xlim = [10,20000]
        ylim = [0.1,2000]
        ax.set_xscale('log')
        ax.set_yscale('log')
    else:
        xlim = [0,20000]
        ylim = [0,900]

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # helper, returns (x,y,lastValue,daysToDouble), for country label
    def build_label_data(c):
        # x/y: the country's latest per-1M values; lab: the annotation text
        end = c.tail(1)
        x = float(end['ConfirmedPer1M'])
        y = float(end['DeathsPer1M'])

        if x>=1:
            p = float(y*100)/x
            lab = "(%u/%u, %.2f%%)" % (y,x,p)
        else:
            lab = "(%u/%u)" % (y,x)

        if x <= xlim[1] and y <= ylim[1]:
            return (x, y, lab)

        # make it fit in the plot
        end = c[ (c['ConfirmedPer1M'] < xlim[1]) & (c['DeathsPer1M'] < ylim[1]) ].tail(1)
        ex = float(end['ConfirmedPer1M'])
        ey = float(end['DeathsPer1M'])
        return (ex, ey, lab)

    # start plotting...
    for cn in countries:
        #print("--- %s ---" % cn)

        # figure out how many people live in the country
        pn = cn
        if cn in popnames.keys():
            pn = popnames[cn]

        num = 0
        try:
            # NOTE(review): int() on a one-element Series is deprecated in
            # newer pandas; .item() would be the explicit form -- verify.
            num = int(pop[pop['Country'] == pn]['SP.POP.TOTL'])
        except Exception as err:
            if cn in focus:
                print(cn,err)
            pass

        # skip countries with low populations
        if num < 1000000:
            continue

        try:
            c = d[d['Country/Region'] == cn].copy()
            c['ConfirmedPer1M'] = c['Confirmed'] * 1000000 / num
            c['DeathsPer1M'] = c['Deaths'] * 1000000 / num

            if cn in focus:
                linewidth=1
                textweight='normal'
                if cn in ['Canada','US']:
                    linewidth=2
                    textweight='bold'

                c.plot(kind='line',x='ConfirmedPer1M',y='DeathsPer1M', label=cn, linewidth=linewidth, legend=legendOnSide, ax=ax)

                (ex,ey,lab) = build_label_data(c)
                ax.text(ex, ey, cn, va='bottom', fontweight=textweight)
                ax.text(ex, ey, lab, va='top', fontweight=textweight, alpha=0.5)

            else:
                c.plot(kind='line',x='ConfirmedPer1M',y='DeathsPer1M', legend=False, color='gray', alpha=0.2, ax=ax)

                if nameUnfocusedCountries:
                    (ex,ey,lab) = build_label_data(c)
                    ax.text(ex, ey, cn, alpha=0.2)

        # best-effort: a country with unparseable data is silently skipped
        except Exception as err:
            #print(cn,err)
            if cn == "Angola":
                pass
            #raise err
            pass

    dataDesc = 'Deaths per Confirmed cases'

    ax.set_title("%s, per population %s"
            % (dataDesc, "(logarithmic)" if logScale else ""),
            fontsize=20)

    ax.set_xlabel("Confirmed cases per 1M population")
    ax.set_ylabel("Deaths per 1M population")

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    caption = "%s @ %s (%s)" % (data.giturl, data.gitdate, data.githash)
    ax.text(0.5, -0.05, caption, size=8, ha="center", transform=ax.transAxes)

    return fig
Ejemplo n.º 13
0
                  )  # extract the id as string to input into series object
renewEnergyId = str(Renewable_energy['id'].iloc[0]
                    )  # extract the id as string to input into series object
foreignInvestId = str(Foreign_Investment['id'].iloc[0]
                      )  # extract the id as string to input into series object

# Echo each resolved indicator id. The f-string prefixes were removed: the
# literals contain no placeholders, so the f made them plain strings anyway
# and only suggested interpolation that never happened (flake8 F541).
print('The following value is inputed to series:',
      CO2Id)  # run this to see input to series
print('The following value is inputed to series:',
      popGrowthId)  # run this to see input to series
print('The following value is inputed to series:',
      renewEnergyId)  # run this to see input to series
print('The following value is inputed to series:',
      foreignInvestId)  # run this to see input to series

# Download each searched indicator and flatten it into a country/year table
series1 = pd.DataFrame(wb.get_series(CO2Id)).reset_index()
series2 = pd.DataFrame(wb.get_series(popGrowthId)).reset_index()
series3 = pd.DataFrame(wb.get_series(renewEnergyId)).reset_index()
series4 = pd.DataFrame(wb.get_series(foreignInvestId)).reset_index()
countryList1 = [
    'Denmark', 'Sweden', 'Norway'
]  # define the number of countries and which you'd like to explore
# Fixed: ' Sweden' carried a stray leading space, so it could never match
# the 'Sweden' spelling used by countryList1 and the World Bank data.
countryList2 = [
    'Denmark', 'Sweden', 'Norway'
]  # define the number of countries and which you'd like to explore
countryList3 = [
    'Denmark', 'Sweden', 'Norway'
Ejemplo n.º 14
0
def test_indicator_date():
    """A YYYY:YYYY range yields at least one row per country per year."""
    series = get_series('SP.POP.TOTL', date='2010:2018')
    assert len(series.index) > 200 * 8
    assert_numeric_or_string(series)
Ejemplo n.º 15
0
import world_bank_data as wb

# Get estimates for the world population:
# (single year -> one value per country for 2017)
wb.get_series('SP.POP.TOTL', date='2017')

# Get timeseries of "Agricultural machinery, tractors" in Albania
# (country code restricts the result to a single country)
wb.get_series('AG.AGR.TRAC.NO', country='ALB')
Ejemplo n.º 16
0
# Load Cell Data
cellsDF = pd.read_csv('./data/year_2018__cell_500k/squares_and_triangles/cells.csv')
# Two half-triangles make one square; count both cell shapes per country
num_LL_triangles = cellsDF.groupby(['CountryCode'])['LowerLeft'].agg('sum')
num_UR_triangles = cellsDF.groupby(['CountryCode'])['UpperRight'].agg('sum')
num_squares =      cellsDF.groupby(['CountryCode'])['IncludeInSquares'].agg('sum')
cellQty_sq_tri = pd.DataFrame((num_LL_triangles+num_UR_triangles)/2, columns=['qty'])
cellQty_sq = pd.DataFrame((num_squares)).rename(columns={'IncludeInSquares':'qty'})
cellQty = cellQty_sq_tri.join(cellQty_sq, lsuffix='_sqtri', rsuffix='_sq').reset_index()

# Append Alpha Country Code for population join
def Numeric2Alpha(num):
    """Convert a numeric ISO 3166 country code to its alpha-3 code."""
    return pycountry.countries.get(numeric=str(num).zfill(3)).alpha_3
cellQty['CountryAlpha3'] = cellQty['CountryCode'].apply(Numeric2Alpha)

# join population (World Bank 2018, indexed by alpha-3 code)
wb_pop = pd.DataFrame(wb.get_series('SP.POP.TOTL', date='2018', id_or_value='id', simplify_index=True))
cellQty = cellQty.join(wb_pop, on='CountryAlpha3')

# plot the result
# hasTri marks small countries (<300k people); Italy is force-included
cellQty['hasTri'] = cellQty['SP.POP.TOTL']<3e5
cellQty.loc[cellQty['CountryAlpha3']=='ITA', 'hasTri'] = True

fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2, figsize=(16,6))

sns.set(style="whitegrid")

# Reference line plus population-vs-cell-count scatter on log-log axes
sns.lineplot(x = [250e3,2e9], y = [0.5,4000], ax=ax1, color='#333333')
sns.scatterplot(x = "SP.POP.TOTL", y = "qty_sq", hue="hasTri", data=cellQty, ax=ax1, legend=False, palette=["#34495e", "#2ecc71"], linewidth=0, size=1.5)
plt.xscale("log")
plt.yscale("log")
ax1.title.set_text('Squares Only')
Ejemplo n.º 17
0
def test_indicator_monthly():
    """Monthly-frequency indicators accept the YYYYMmm date syntax."""
    monthly = get_series('DPANUSSPB',
                         country=['CHN', 'BRA'],
                         date='2012M01:2012M08')
    assert len(monthly.index) > 200 * 12
    assert_numeric_or_string(monthly)
Ejemplo n.º 18
0
import pandas as pd
import world_bank_data as wb

# Most recent world population (kept for its country index / later use)
rs = wb.get_series('SP.POP.TOTL', mrv=1, simplify_index=True)

confirmados = pd.read_csv(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
)

confirmados.drop(["Province/State", "Lat", "Long"], axis=1, inplace=True)
confirmados = confirmados[confirmados['Country/Region'] == 'Argentina']
confirmados.rename(columns={'Country/Region': 'Pais'}, inplace=True)
confirmados = confirmados.groupby(['Pais']).sum()

# Capture the country list before transposing (dates become the index below).
# Two earlier dead assignments of `lista` (rs.index.unique(), computed and
# immediately overwritten without being read) were removed.
lista = confirmados.index

confirmados = confirmados.transpose()

recuperados = pd.read_csv(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
)

recuperados.drop(["Province/State", "Lat", "Long"], axis=1, inplace=True)
recuperados = recuperados[recuperados['Country/Region'] == 'Argentina']
recuperados.rename(columns={'Country/Region': 'Pais'}, inplace=True)
recuperados = recuperados.groupby(['Pais']).sum()
Ejemplo n.º 19
0
 def get_series_data(series, date='2019'):
     """Fetch indicator `series` (most recent value within `date`) as a flat DataFrame."""
     raw = wb.get_series(series, mrv=1, date=date)
     return raw.to_frame().reset_index()
Ejemplo n.º 20
0
import csv, json
import pandas as pd
import world_bank_data as wb

pd.set_option('display.max_rows', 12)

# Countries and associated regions
countries = wb.get_countries()

# Population dataset, indexed with the country code
population = wb.get_series('SP.POP.TOTL',
                           id_or_value='id',
                           simplify_index=True,
                           mrv=1)

# Aggregate region, country and population
df = countries[['region', 'name']].rename(columns={
    'name': 'country'
}).loc[countries.region != 'Aggregates']
# Index-aligned assignment: both tables are indexed by the country code
df['population'] = population

# One bucket of {name, value} entries per region
regions_list = set(df['region'].to_list())
region_clusters = {region: [] for region in regions_list}

for row in df.itertuples():
    # population > 10 also filters out NaN populations (NaN compares False)
    if row.country and row.population > 10:
        region_clusters[row.region].append({
            'name': row.country,
            'value': row.population
        })
Ejemplo n.º 21
0
    def loadData(self):
        """Assemble the feature matrix X and target y for the covid model.

        Joins John-Hopkins case counts with World Bank indicators (GDP, Gini,
        population, hospital beds, density, trade, infant mortality), local
        CSV/Excel political and democracy data, and each country's geocoded
        distance from Wuhan. Countries with any missing value are dropped.

        :return: tuple (X, y) -- X the explanatory columns, y the remaining
            per-date case-count columns
        """

        print('Loading John-Hopkins covid 19 Data')
        # Loading Covid 19 Data
        public_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
        corona_data = pd.read_csv(public_url)
        # NOTE(review): this drop() is neither assigned back nor inplace, so it
        # has no effect; Lat/Long are dropped again after the groupby anyway.
        corona_data.drop(['Lat', 'Long', 'Province/State'], axis=1)
        country_data = corona_data.groupby('Country/Region').sum()
        country_data = country_data.drop(['Lat', 'Long'], axis=1)
        country_data = country_data.rename(
            columns={'Country/Region': 'Country'},
            index={'US': 'United States'})

        print('Loading World Bank indicators')
        # Get GDP data
        GDP = pd.DataFrame(wb.get_series('NY.GDP.MKTP.CD', mrv=1))
        GDP = GDP.droplevel(level=[1, 2])  # Droping multi level indexing
        # Get gini Index
        Gini = pd.DataFrame(wb.get_series('SI.POV.GINI', date='2010'))
        Gini = Gini.droplevel(level=[1, 2])  # Droping multi level indexing

        # Get population data
        Pop = pd.DataFrame(wb.get_series('SP.POP.TOTL', mrv=1))
        Pop = Pop.droplevel(level=[1, 2])  # Droping multi level indexing

        # Get Health System Data
        Health = pd.DataFrame(wb.get_series('SH.MED.BEDS.ZS', date='2010'))
        Health = Health.droplevel(level=[1, 2])  # Droping multi level indexing

        # Get Density Data
        Dens = pd.DataFrame(wb.get_series('EN.POP.DNST', mrv=1))
        Dens = Dens.droplevel(level=[1, 2])

        # Get Trade data
        Trade = pd.DataFrame(wb.get_series('NE.TRD.GNFS.ZS', mrv=1))
        Trade = Trade.droplevel(level=[1, 2])

        # Get Child mortality data
        Child = pd.DataFrame(wb.get_series('SP.DYN.IMRT.IN', mrv=1))
        Child = Child.droplevel(level=[1, 2])
        # NOTE(review): self-assignment below is a no-op
        Child = Child

        print('Loading from World Data')
        politics = pd.read_csv('Data/politics.csv')
        politics = politics.set_index('Country Name')
        politics = politics.drop(
            ['Series Name', 'Country Code', 'Series Code'], axis=1)
        politics = politics.rename(
            columns={'2018 [YR2018]': 'Political Stability'})

        GOV = pd.read_csv('Data/governement.csv')
        GOV = GOV.set_index('Country Name')
        GOV = GOV.drop(['Series Name', 'Country Code', 'Series Code'], axis=1)
        GOV = GOV.rename(columns={'2018 [YR2018]': 'GOV'})

        print('Loading the Economist Data')
        # Economist businne unit
        df = pd.read_excel('Data/DemocracyIndex.xlsx')
        year = df['time'] == 2018
        DEM = df[year]
        DEM = DEM.drop(['geo', 'a', 'b', 'c', 'd', 'e', 'time', 'f'], axis=1)
        DEM = DEM.set_index('name')
        DEM = DEM.rename(columns={'name': 'Country'})

        # Continent data
        Cont = pd.read_csv('Data/Countries-Continents.csv')
        Cont = Cont.set_index('Country')
        Cont = Cont.rename(index={'US': 'United States'})

        print(
            'Merging all data and selecting only the countries with all the data available'
        )
        allData = country_data.join([
            GDP, Gini, DEM, Pop, Health, Child, Dens, Trade, Cont, politics,
            GOV
        ])
        # NOTE(review): the 'Political Stability\t' key below ends in a literal
        # tab -- it appears meant to normalize a tab-suffixed column name;
        # confirm it actually matches a column after the join.
        allData.rename(columns={
            'NY.GDP.MKTP.CD': 'GDP',
            'SI.POV.GINI': 'Gini',
            'Democracy index (EIU)': 'Dem',
            'SP.POP.TOTL': 'Pop',
            'SH.MED.BEDS.ZS': 'Health',
            'SP.DYN.IMRT.IN': 'Child',
            'EN.POP.DNST': 'Dens',
            'NE.TRD.GNFS.ZS': 'Trade',
            'Political Stability	': 'Political Stability'
        },
                       inplace=True)
        allData = allData.dropna()

        print('Computing distance between countries !')
        geolocator = Nominatim(user_agent="my-application")
        Distance = []
        count = 0
        countries = list(allData.index)
        Wuhan = geolocator.geocode("Wuhan")
        Wuhan = (Wuhan.latitude, Wuhan.longitude)

        # NOTE(review): the hard-coded slices below (0:16, 16:33, 33:45, 45:59)
        # assume exactly 59 countries survive dropna(); fewer/more would skip
        # or miss rows -- verify against the merged data.
        for i in countries[0:16]:
            c = geolocator.geocode(i)
            Distance.append(
                distance.distance((c.latitude, c.longitude), Wuhan).km)
            count += 1
        print('25 %')
        for i in countries[16:33]:
            c = geolocator.geocode(i)
            Distance.append(
                distance.distance((c.latitude, c.longitude), Wuhan).km)
            count += 1
        print('50 %')
        for i in countries[33:45]:
            c = geolocator.geocode(i)
            Distance.append(
                distance.distance((c.latitude, c.longitude), Wuhan).km)
            count += 1
        print('75 %')
        for i in countries[45:59]:
            c = geolocator.geocode(i)
            Distance.append(
                distance.distance((c.latitude, c.longitude), Wuhan).km)
            count += 1

        Distances = pd.DataFrame(Distance,
                                 index=list(allData.index),
                                 columns=['Distance'])

        allData = allData.join([Distances])
        print('100 %')

        # y: everything that is not an explanatory column (the per-date counts)
        self.y = allData.drop([
            'GDP', 'Gini', 'Dem', 'Pop', 'Health', 'Child', 'Dens', 'Trade',
            'Continent', 'Political Stability', 'GOV', 'Distance'
        ],
                              axis=1)

        # X: the explanatory columns only
        self.X = allData.loc[:, [
            'GDP', 'Gini', 'Dem', 'Pop', 'Health', 'Child', 'Dens', 'Trade',
            'Continent', 'Political Stability', 'GOV', 'Distance'
        ]]
        return self.X, self.y
Ejemplo n.º 22
0
def sinceplot(data, fig=None, ax=None,
        logScale = False, dataColumn='Confirmed',
        startCountingAfter = 1, startCountingAfter1M = True,
        nameUnfocusedCountries = False, legendOnSide = False):
    """Plot `dataColumn` per 1M population against days since an outbreak threshold.

    Each country's clock starts when it first reaches `startCountingAfter`
    (per 1M population when `startCountingAfter1M`, absolute otherwise).
    Focused countries are coloured and labelled; others are grey. Doubling-time
    guide lines are drawn in red. Countries under 1M population are skipped.

    :param data: object exposing .df, .numerical, .aggregation, and
        .giturl/.gitdate/.githash (assumed from usage -- confirm with caller)
    :param fig: existing matplotlib figure (new fig/ax made if either missing)
    :param ax: existing matplotlib axes
    :param logScale: use a log y-axis when True
    :param dataColumn: which numeric column to plot
    :param startCountingAfter: threshold that starts each country's day count
    :param startCountingAfter1M: interpret the threshold per 1M population
    :param nameUnfocusedCountries: label the grey countries too
    :param legendOnSide: show a legend entry per focused country
    :return: the matplotlib figure
    :raises Exception: when `dataColumn` is not one of data.numerical
    """

    df = data.df

    if not dataColumn in data.numerical:
        raise Exception("cannot plot %s" % dataColumn)

    # we will get the population of each country from this data set...
    pop = wb.get_series('SP.POP.TOTL', mrv=1).reset_index()
    #print(pop['Country'].unique())
    # some countries in the pop database have different names
    popnames = {'US':'United States',
               'Korea, South':'Korea, Rep.',
               'Russia':'Russian Federation'}

    # countries we want to show in colour...
    focus = ['Canada','US','China','Korea, South','United Kingdom','Poland','Mexico','Italy','Spain','France','Germany','Russia','Japan','Belgium','Norway','Austria','Australia','Sweden','Denmark','Singapore','Malaysia','Switzerland','Finland','Portugal','India']
    #focus = ['Canada','US']

    # aggregate data...
    pc = ['Country/Region', 'Province/State', 'Date', 'Confirmed', 'ConfirmedIncrease', 'Deaths', 'DeathsIncrease', 'Recovered', 'RecoveredIncrease', 'Active', 'ActiveIncrease']
    d = df[pc]
    d = d.groupby(['Country/Region','Date'],as_index=False).agg(data.aggregation)

    # these are all the countries...
    countries = d['Country/Region'].unique()
    #countries = focus

    # create a plot...
    if not fig or not ax:
        fig, ax = plt.subplots(1,1)

    # axis limits depend on which column is plotted
    if dataColumn == 'Confirmed':   # confirmed
        xlim = [0,250]
        ylim = [0,20000]
    elif dataColumn == 'ConfirmedIncrease':   # delta in confirmed
        xlim = [0,250]
        ylim = [0,500]
    elif dataColumn == 'Deaths':   # deaths
        xlim = [0,220]
        ylim = [0,900]
    else:                           # delta in deaths
        xlim = [0,220]
        ylim = [0,50]

    if logScale:
        xlim[0] = -1
        ylim[0] = 1
        ylim[1] *= 1.3
        ax.set_yscale('log')
        showDoublingAtY = ylim[1] * (3/4)
        doubleindays=[1,2,3,4,5,6,7,8,10,12,15,20]
    else:
        showDoublingAtY = ylim[1] - 100
        doubleindays=[1,2,3,4,5,6,7,8]

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # helper, returns (x,y,lastValue,daysToDouble), for country label
    def build_label_data(c):
        # slope between the last two observations gives the growth rate
        pre = c.tail(2).head(1)
        px = int(pre['Since'])
        py = float(pre['Per1M'])

        end = c.tail(1)
        x = int(end['Since'])
        y = float(end['Per1M'])

        dx = x-px    # days
        dy = y-py    # increase

        sl = 0
        if dx:
            sl = dy/dx   # increase/days

        if logScale:
            rt = 0
            if sl:
                rt = y / sl  # days to double
            rtl = "dtd"  # describe "rt"

        else:
            rt = sl
            rtl = "Δ"    # describe "rt"

        #print("%-20s - x=%u..%u (%u), y=%.3f..%.3f (%.3f), sl=%.3f, %s=%.3f"
        #        % (cn,px,x,dx,py,y,dy,sl,rtl,rt))

        if x <= xlim[1] and y <= ylim[1]:
            return (x, y, y, rt, rtl)

        # make it fit in the plot
        end = c[ (c['Since'] < xlim[1]) & (c['Per1M'] < ylim[1]) ].tail(1)
        ex = int(end['Since'])
        ey = float(end['Per1M'])
        return (ex, ey, y, rt, rtl)

    # start plotting...
    for cn in countries:
        #print("--- %s ---" % cn)

        # figure out how many people live in the country
        pn = cn
        if cn in popnames.keys():
            pn = popnames[cn]

        num = 0
        try:
            # NOTE(review): int() on a one-element Series is deprecated in
            # newer pandas; .item() would be the explicit form -- verify.
            num = int(pop[pop['Country'] == pn]['SP.POP.TOTL'])
        except Exception as err:
            if cn in focus:
                print(cn,err)
            pass

        # skip countries with low populations
        if num < 1000000:
            continue

        try:
            c = d[d['Country/Region'] == cn].copy()
            c['Per1M'] = c[dataColumn] * 1000000 / num

            # first index where the country crosses the outbreak threshold
            if startCountingAfter1M:
                idx = c[c['Per1M'].ge(startCountingAfter)].index[0]
            else:
                idx = c[c[dataColumn].ge(startCountingAfter)].index[0]

            # 'Since' = days relative to the threshold date; keep from -10 on
            s = c.loc[idx]['Date']
            c['Since'] = c['Date'] - s
            c['Since'] = c['Since']/np.timedelta64(1,'D')
            c = c[c['Since'] > -10]

            if cn in focus:
                linewidth=1
                textweight='normal'
                if cn in ['Canada','US']:
                    linewidth=2
                    textweight='bold'

                c.plot(kind='line',x='Since',y='Per1M', label=cn, linewidth=linewidth, legend=legendOnSide, ax=ax)

                (ex,ey,v,rt,rtl) = build_label_data(c)
                ax.text(ex, ey, cn, va='bottom', fontweight=textweight)
                ax.text(ex, ey, "(%u, %s=%.2f)"%(v,rtl,rt), va='top', fontweight=textweight, alpha=0.5)

            else:
                c.plot(kind='line',x='Since',y='Per1M', legend=False, color='gray', alpha=0.2, ax=ax)

                if nameUnfocusedCountries:
                    (ex,ey,v,rt,rtl) = build_label_data(c)
                    ax.text(ex, ey, cn, alpha=0.2)

        # best-effort: a country that never crosses the threshold (empty
        # index[0]) or has bad data is silently skipped
        except Exception as err:
            #print(cn,err)
            if cn == "Angola":
                pass
            #raise err
            pass

    # red dotted guide lines: growth curves for fixed doubling times
    if showDoublingAtY:
        def double_daily(base, arr):
            arr = np.asarray(arr)
            result = np.power(base,arr)
            return result
        for doublein in doubleindays:
            base = np.power(2,1/doublein)
            x = np.linspace(0,xlim[1])
            y = double_daily(base,x)
            plt.plot(x,y,color='red',alpha=0.25,linestyle=':')
            y = showDoublingAtY
            x = math.log(y, base)
            s = "%u day%s" % (doublein, "s" if doublein>1 else "")
            if x > xlim[1]:
                x = xlim[1]
                y = np.power(base,x)
            plt.text(x, y, s, color='red', alpha=0.5)
            if doublein == 1:
                plt.text(x, y, "double in ", color='red', alpha=0.5, ha='right')

    if dataColumn == 'Confirmed':
        dataDesc = 'Confirmed cases'
    else:
        dataDesc = dataColumn

    ax.set_title("%s per population, since %u observed %s"
            % (dataDesc, startCountingAfter, "(logarithmic)" if logScale else ""),
            fontsize=20)

    if startCountingAfter1M:
        ax.set_xlabel("Days since %u %s / 1M population" % (startCountingAfter, dataDesc))
    else:
        ax.set_xlabel("Days since %u %s" % (startCountingAfter, dataDesc))

    ax.set_ylabel("%s per 1M population" % dataDesc)

    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    caption = "%s @ %s (%s)" % (data.giturl, data.gitdate, data.githash)
    ax.text(0.5, -0.05, caption, size=8, ha="center", transform=ax.transAxes)

    return fig
    how='left')

# Share of each node in its country's total 2015 demand
Node_Demand_2015['Share_%_Country_Demand'] = Node_Demand_2015[
    'Node_Demand_2015'] / Node_Demand_2015['Country_Demand_2015']

# Bare expression: displays one sample row when run as a notebook cell
Node_Demand_2015.iloc[49:50]

# ## Historical relationships

# ### Creates historic relationships based on World Bank Data

# In[11]:

#Extracts historical GDPppp per capita (constant 2017 international $) from the World Bank API
Country_GDPppp_WB = wb.get_series('NY.GDP.PCAP.PP.KD',
                                  date='1980:2014',
                                  id_or_value='id')

Country_GDPppp_WB = Country_GDPppp_WB.reset_index().rename(
    columns={
        'NY.GDP.PCAP.PP.KD': 'WB_GDPppp'
    }).set_index('Country')

#Extracts Electricity consumption per capita in kWh from the World Bank API. Data available for up till 2014
Country_Elec_WB = wb.get_series('EG.USE.ELEC.KH.PC',
                                date='1980:2014',
                                id_or_value='id')

Country_Elec_WB = Country_Elec_WB.reset_index().rename(
    columns={
        'EG.USE.ELEC.KH.PC': 'WB_Elec'
Ejemplo n.º 24
0
        covid1.plot(x='date', y=u, color='orange', ax=ax)
    else:
        covid1.plot(x='date', y=u, color='lightgrey', ax=ax)

ax.get_legend().remove()
ax.set_xlabel('')
plt.show()

#plt.yscale('log')

#%% Area plot

import world_bank_data as wb

# Annual GDP growth (%) for 2000-2019; id_or_value='id' returns ISO
# country codes, and simplify_index drops the constant 'Series' level.
wbd = wb.get_series('NY.GDP.MKTP.KD.ZG',
                    date='2000:2019',
                    id_or_value='id',
                    simplify_index=True).reset_index()
# Because ids were requested above, the selector is 'JPN', not 'Japan'.
jpn_data = wbd[wbd.Country == 'JPN']

# Line plot of Japan's GDP growth over time.
jpn_data.plot(x='Year', y='NY.GDP.MKTP.KD.ZG', kind='line')

# Same series rendered as a single (non-stacked) area plot.
jpn_data.plot(x='Year',
              y='NY.GDP.MKTP.KD.ZG',
              kind='area',
              stacked=False,
              legend=False)

#%% Streamgraph

# NOTE(review): `id` here is Python's *builtin* function, not a column
# name — groupby(id) groups index labels through a callable, which is
# almost certainly unintended (likely meant groupby('id') or a real key
# column). Also, item assignment on a GroupBy object raises TypeError,
# so the second line cannot run as written — confirm against the
# original notebook before relying on this snippet.
covid2 = covid.groupby(id)
covid2['daily'] = covid2.deaths - covid2.deaths.shift(1)
Ejemplo n.º 25
0
        granular_country_data['Country_Region'] == country_region]
    return generalized_country_data, granular_country_data


#only grab the cleaned up aggregated file
country_aggregated_data = daily_file_aggregator(daily_file_data)[0]
granular_data_united_states = daily_file_aggregator(daily_file_data,
                                                    country_region='US')[1]
#note, some data can sometimes be added/reported late (i.e. China on 4/16/2020 reporting > 1200 deaths)

###Getting data from API's (world bank API [world_bank_data]

#grab datasets from the web (population, hospital beds per 1000 people)
#and join them into our current covid-19 dataset as features
#fetch population by country data sets (world bank data)
# mrv=1 -> most recent value only; to_frame().reset_index() flattens the
# (Country, Series, Year) MultiIndex into ordinary columns.
pop_data = wb.get_series('sp.pop.totl', mrv=1).to_frame().reset_index()
#rename columns so they can be joined/fuzzy matched to COVID data, delete unnecessary columns
pop_data = pop_data.rename(columns={
    'sp.pop.totl': 'Population',
    'Country': 'Country_Region'
}).drop(['Series', 'Year'], axis=1)
#fetch hospital beds per capita data (world bank data), remove countries with nan/missing data
# No mrv argument: every available year is fetched, NaNs dropped.
hosp_bed_data = wb.get_series(
    'sh.med.beds.zs').to_frame().reset_index().dropna()
#keep only the most recent year when this metric was captured (per country)
# assumes rows arrive year-ascending per country, so keep='last'
# retains the latest observation — TODO confirm ordering from the API
hosp_bed_data = hosp_bed_data.drop_duplicates('Country', keep='last')
#rename columns, drop unnecessary columns
hosp_bed_data = hosp_bed_data.rename(
    columns={
        'sh.med.beds.zs': 'hosp_beds_per_1000_people',
        'Year': 'MostRecentYearHospBedInfoCollected',
Ejemplo n.º 26
0
def test_indicator_simplify_scalar():
    """A single-country, most-recent-value query with simplify_index
    collapses all the way down to a plain numeric scalar."""
    value = get_series('SP.POP.TOTL', 'CHN', mrv=1, simplify_index=True)
    assert isinstance(value, numbers.Number)
Ejemplo n.º 27
0
# EN.CLC.GHGR.MT.CE	GHG net emissions/removals by LUCF (Mt of CO2 equivalent)

# SP.POP.TOTL	Population, total

# NY.GDP.MKTP.CD	GDP (current US$)
# NY.GDP.MKTP.KD	GDP (constant 2010 US$)
# NY.GDP.MKTP.PP.CD	GDP, PPP (current international $)
# NY.GDP.MKTP.PP.KD	GDP, PPP (constant 2011 international $)
# NY.GDP.PCAP.CD	GDP per capita (current US$)
# NY.GDP.PCAP.KD	GDP per capita (constant 2010 US$)
# NY.GDP.PCAP.PP.CD	GDP per capita, PPP (current international $)
# NY.GDP.PCAP.PP.KD	GDP per capita, PPP (constant 2011 international $)

# Browse the available reference tables interactively.
# NOTE(review): .show()/.show_csv() are not pandas methods — presumably a
# project-local DataFrame extension; confirm it is registered elsewhere.
wb.get_countries().show()
wb.get_regions().show()
wb.get_series('SP.POP.TOTL', id_or_value='id')
wb.get_series('SP.POP.TOTL').reset_index()

# looks simple - so I need:
#   - GCA country to WB code conversion
#   - then I can just join everything and I should have all available years, so should be able to do ASOF over countries

# country mapping
# Windows-only absolute paths to the Global Carbon Atlas export and a
# hand-maintained WB<->GCA country-name mapping CSV.
root = 'D:\\projects\\fakta-o-klimatu\\work\\111-emise-svet-srovnani\\data'
path_gca = root + '\\global-carbon-atlas\\export_20190819_2250.csv'
country_map = pd.read_csv(root + '\\country_mapping.csv')[['wb', 'gca']]

# Attach each World Bank country id by joining on country *name*: the WB
# 'name' column is renamed to 'wb' so merge finds the shared key.
country_map = pd.merge(country_map,
                       wb.get_countries()['name'].rename('wb').reset_index())
country_map.show_csv()
# The merged-in index column is the WB country id; expose it as 'code'.
country_map = country_map.rename(columns={'id': 'code'})
Ejemplo n.º 28
0
# -*- coding: utf-8 -*-
"""
This code is part of a larger analysis of EU emissions data.

The purpose of this program is to pull GDP data from the World Bank,
clean and subset the data, and export the data as a CSV to be used in
the main analysis program.

@author: Colburn Hassman
"""

# Import required packages
import pandas as pd
import world_bank_data as wb
import matplotlib.pyplot as plt

# Fetch GDP, PPP (current international $) for every country/region;
# simplify_index drops the constant 'Series' level from the index.
gdp = wb.get_series('NY.GDP.MKTP.PP.CD', simplify_index=True)

# Countries under study, in the column order the analysis expects.
_countries = ['Germany', 'France', 'Italy', 'Spain', 'Netherlands',
              'Switzerland', 'Poland', 'Sweden', 'Belgium']

# Build one frame with a column per country, aligned on the year index.
GDP = pd.DataFrame({name: gdp[name] for name in _countries})

# Convert the year index to datetimes and keep only 2008-2018.
GDP.index = pd.to_datetime(GDP.index)
GDP = GDP[(GDP.index > "2007") & (GDP.index < '2019')]
Ejemplo n.º 29
0
def test_update_population():
    """Refresh the population sample CSV from the latest World Bank data.

    Fetches the most recent total-population figure per country and
    overwrites sample_dir / "population.csv" with its CSV rendering.
    """
    latest = wb.get_series("SP.POP.TOTL", mrv=1, simplify_index=True)
    csv_text = latest.to_csv()
    with open(sample_dir / "population.csv", "w") as out:
        out.write(csv_text)
Ejemplo n.º 30
0
def test_non_wdi_indicator():
    """Indicators outside the default WDI source are still retrievable."""
    series = get_series('TX.VAL.MRCH.CD.WB', mrv=1)
    # A reasonable number of countries should report this indicator.
    assert len(series.index) > 50
    assert_numeric_or_string(series)