def _fetch_hiv_prevalence(indicator_url, gender, data_date):
    """Download one HIV-prevalence indicator and reshape it into a long table.

    Args:
        indicator_url: World Bank indicator URL; the indicator code is the
            fifth "/"-separated piece of it.
        gender: label written into the ``Gender`` column.
        data_date: ``(start, end)`` datetime tuple passed to wbdata.

    Returns:
        DataFrame with columns ``Country``, ``Year``, ``Prevalence`` and
        ``Gender``.
    """
    code = indicator_url.split("/")[4]
    frame = wbdata.get_dataframe(indicators={code: code}, data_date=data_date)
    # The original used ``.ix[340:, :]``; ``.ix`` was removed in pandas 1.0.
    # On a non-integer index it fell back to positional slicing, i.e. ``iloc``.
    frame = frame.fillna(0).iloc[340:, :]
    # Turn the index levels into ordinary columns
    frame = frame.reset_index()
    # Skip a further 510 leading rows (per the original author's note these
    # are non-countries / troublesome countries -- TODO confirm the offsets).
    frame = frame.drop(frame.index[0:510])
    frame['Gender'] = gender
    frame.columns = ['Country', 'Year', 'Prevalence', 'Gender']
    return frame[frame.Country != "Cote d'Ivoire"]


def getWBData():
    """Build one long table of HIV prevalence for males, females and total.

    Downloads three World Bank indicators for 1990-2014, applies the same
    row trimming and column naming to each, and stacks them.

    Returns:
        DataFrame with columns ``Country``, ``Year``, ``Prevalence`` and
        ``Gender`` (``'Male'``, ``'Female'`` or ``'Total'``).
    """
    # Local import so this block does not depend on the file-level imports
    import pandas as pd

    # Links manually grabbed from the pdf, paired with their Gender label
    sources = [
        ("http://data.worldbank.org/indicator/SH.HIV.1524.MA.ZS", 'Male'),
        ("http://data.worldbank.org/indicator/SH.HIV.1524.FE.ZS", 'Female'),
        ("http://data.worldbank.org/indicator/SH.DYN.AIDS.ZS", 'Total'),
    ]

    # Set the date range
    data_date = (datetime.datetime(1990, 1, 1), datetime.datetime(2014, 1, 1))

    frames = [
        _fetch_hiv_prevalence(url, gender, data_date)
        for url, gender in sources
    ]

    # ``DataFrame.append`` was removed in pandas 2.0; ``concat`` stacks the
    # three tables in the same Male, Female, Total order.
    return pd.concat(frames)
Exemple #2
0
def get_economic_dataframes():
    """
    Fetch three World Bank dataframes for the configured countries.

    Returns a tuple ``(ppps, unemployment, debt)``: GDP per capita at PPP
    (column ``gdpppp``, date range ``DATA_DATE``), unemployment (column
    ``percent``) and total government debt (column ``debt``), the last two
    over ``DATA_DATE2``.
    """
    iso_codes = get_countries_as_iso_codes()

    def fetch(indicator, date_range):
        # Every request shares the same country list
        return wbdata.get_dataframe(
            indicator, country=iso_codes, data_date=date_range)

    gdp_ppp = fetch({"NY.GDP.PCAP.PP.KD": "gdpppp"}, DATA_DATE)
    unemployment = fetch({"SL.UEM.TOTL.ZS": "percent"}, DATA_DATE2)
    government_debt = fetch({"GC.DOD.TOTL.GD.ZS": "debt"}, DATA_DATE2)
    return gdp_ppp, unemployment, government_debt
def plot2():
    """Render 'plot2.html' with a time series built from two indicators.

    Downloads 'SH.H2O.SAFE.ZS' and 'SP.POP.TOTL' for six countries,
    multiplies the two tables element-wise from 1990 onwards, and embeds the
    resulting chart's script/div into the template.
    """
    # This figure is rebound to the TimeSeries chart further down; the call
    # is kept so that whatever constructing it does still happens.
    p = figure(title='Data from worldbank.org (SH.H2O.SAFE.ZS)',
               x_axis_label='date',
               x_axis_type='datetime')

    country_codes = ['AF', 'TZ', 'AO', 'MG', 'MZ', 'CG']
    water = wbdata.get_dataframe({'SH.H2O.SAFE.ZS': 'Improved water source'},
                                 country=country_codes,
                                 convert_date=False)
    population = wbdata.get_dataframe({'SP.POP.TOTL': 'Total Population'},
                                      country=country_codes,
                                      convert_date=False)

    # Move index level 0 into the columns and keep rows labelled '1990' onwards
    water_wide = water.unstack(level=0)['1990':]
    population_wide = population.unstack(level=0)['1990':]

    product = pd.DataFrame(water_wide.values * population_wide.values,
                           columns=water_wide.columns,
                           index=water_wide.index)
    # Replace the column labels by their positions 0..k-1
    product.columns = range(product.shape[1])
    product['Date'] = product.index

    # NOTE(review): names are attached purely by column position; this
    # assumes the unstacked columns arrive in exactly this order -- confirm.
    labels = ('Afghanistan', 'Tanzania', 'Angola', 'Madagascar',
              'Mozambique', 'Congo')
    series = {}
    for position, label in enumerate(labels):
        series[label] = product[position]
    series['Date'] = product['Date']
    xyvalues = pd.DataFrame(series)

    output_file("stocks_timeseries.html")

    # NOTE(review): the plotted value is water-source share times population,
    # while the label says "no access" -- confirm which is intended.
    p = TimeSeries(
        xyvalues,
        x='Date',
        legend=True,
        title="",
        ylabel='Population with no access to improved source of water')

    script, div = components(p)
    return render_template('plot2.html', script=script, div=div)
def load_world_bank(year=2019):
    """Return World Bank population, GDP and GNI columns for a single year.

    The result is indexed by country name and has the columns
    ``population``, ``gdp`` and ``gni``; only rows whose integer ``date``
    equals *year* are kept.
    """
    column_by_code = {
        'SP.POP.TOTL': 'Population',
        'NY.GDP.PCAP.PP.CD': 'GDP',  # GDP per capita
        'NY.GNP.PCAP.PP.KD': 'GNI',  # GNI per capita
    }
    table = wbdata.get_dataframe(column_by_code).reset_index()

    # Rename Slovak Republic with Slovakia
    table['country'] = table['country'].replace(
        {'Slovak Republic': 'Slovakia'})

    lower_case_names = {'Population': 'population', 'GDP': 'gdp', 'GNI': 'gni'}
    table = table.rename(columns=lower_case_names)

    # Dates are compared against the integer ``year`` argument
    table['date'] = table['date'].astype(int)

    in_requested_year = table['date'] == year
    selected = table[in_requested_year]
    return selected.drop(columns='date').set_index('country')
Exemple #5
0
def get_data():
    """Build a per-country table of Gini index, GDP per capita and population.

    Only countries with at least one Gini value and one GDP value are kept.
    Within each country missing values are back-filled, rows that are still
    incomplete are dropped, a ``Region`` column is added and rows are sorted
    by date. A fixed set of countries is then moved to the top of the table.

    Returns:
        DataFrame with the indicator columns plus ``country``, ``date``
        (int64) and ``Region``; an empty DataFrame if no country qualifies.
    """
    gini_col = 'Gini Index'
    gdp_col = 'GDP per capita (constant 2010 US$)'
    indicators = {'SI.POV.GINI': gini_col,
                  'NY.GDP.PCAP.PP.KD': gdp_col,
                  'SP.POP.TOTL': 'Population'}

    data = wbdata.get_dataframe(indicators=indicators).reset_index()

    # One metadata request (the original issued the same request twice)
    region_by_country = {row['name']: row['region']['value']
                         for row in wbdata.get_country("")}

    frames = []
    for country in data["country"].unique():
        rows = data[data["country"] == country]
        if not rows[gini_col].notna().any():
            continue
        if not rows[gdp_col].notna().any():
            continue
        # ``fillna(method="bfill")`` is deprecated; ``bfill()`` is equivalent
        rows = rows.bfill().dropna()
        rows["Region"] = region_by_country[country]
        frames.append(rows.sort_values(by="date"))

    if not frames:
        # Nothing qualified: hand back an empty table instead of failing
        # on the missing "date" column below.
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)
    df["date"] = df["date"].astype('int64')

    # Each step moves one country to the front, so the last name listed
    # ends up first in the final table.
    for country in ("Austria", "Algeria", "Botswana", "Australia",
                    "India", "United States", "Chile"):
        is_country = df["country"] == country
        df = pd.concat([df[is_country], df[~is_country]], ignore_index=True)
    return df
    def df_request(self):
        '''Fetch the World Bank dataframe described by this object's settings.

        Reads ``self.indicators``, ``self.codes`` (sent as ``country``) and
        ``self.date`` (sent as ``data_date``) and stores the frame returned
        by ``wbdata.get_dataframe`` on ``self.df``. Nothing is returned.
        '''
        indicators = self.indicators
        query = dict(country=self.codes, data_date=self.date)
        self.df = wbdata.get_dataframe(indicators, **query)
def download_once(indicators, path):
    """Return the indicator table stored at *path*, downloading it if absent.

    When *path* is not an existing file the data is fetched with converted
    dates, sorted by index, written to *path* as HDF under the key
    'indicators' and returned; otherwise the stored table is read back.
    """
    hdf_key = 'indicators'
    if not os.path.isfile(path):
        fetched = wb.get_dataframe(indicators, convert_date=True)
        fetched = fetched.sort_index()
        fetched.to_hdf(path, hdf_key)
        return fetched

    return pd.read_hdf(path, hdf_key)
Exemple #8
0
def return_data(number, country=None):
    """Fetch the indicator(s) selected by *number* as a wide table.

    Args:
        number: value passed to ``category`` to obtain the indicator mapping.
        country: list of country codes; defaults to ``["USA"]``. ``None`` is
            used as the default so no mutable list is shared between calls.

    Returns:
        The dataframe with index level 0 unstacked into columns, or ``None``
        (after printing a message) when an ``IndexError`` is raised.
    """
    if country is None:
        country = ["USA"]
    try:
        cat = category(number)
        df = wbdata.get_dataframe(cat, country=country, convert_date=False)
        return df.unstack(level=0)
    except IndexError:
        print("no data on this!")
        return None
0
def get_WB_data(indicators=None, countries=None):
    """Download World Bank data and return it with a flat index.

    Args:
        indicators: dict mapping indicator code to column name; defaults to
            an empty dict.
        countries: list of country codes; defaults to an empty list.

    ``None`` defaults replace the original mutable ``{}`` / ``[]`` defaults
    so no object is shared between calls; the values passed on to wbdata
    are unchanged.

    Returns:
        DataFrame with the former index levels as ordinary columns.
    """
    if indicators is None:
        indicators = {}
    if countries is None:
        countries = []
    # access data
    df = wbdata.get_dataframe(indicators,
                              country=countries,
                              convert_date=False)
    # reset index for navigation
    return df.reset_index()
Exemple #10
0
 def test_monthly_freq(self):
     """A monthly request (freq="M") yields the known "2012M01" value."""
     frame = wbd.get_dataframe(
         {"DPANUSSPB": "var"},
         country="bra",
         data_date=dt.datetime(2012, 1, 1),
         freq="M",
     )
     january = frame["var"]["2012M01"]
     assert january == 1.78886363636
Exemple #11
0
 def test_quarterly_freq(self):
     """A quarterly request (freq="Q") yields the known "2013Q1" value."""
     frame = wbd.get_dataframe(
         {"DP.DOD.DECD.CR.BC.CD": "var"},
         country="chl",
         data_date=dt.datetime(2013, 1, 1),
         freq="Q",
     )
     first_quarter = frame["var"]["2013Q1"]
     assert first_quarter == 31049138725.7794
Exemple #12
0
def _get_wb_year_with_source(indicators, year):
    """Fetch one year of World Bank data and tag where each value came from."""
    single_year = (datetime.datetime(year, 1, 1),
                   datetime.datetime(year, 1, 1))
    frame = wbdata.get_dataframe(indicators, data_date=single_year)

    # Snapshot the column names first: the loop adds columns to the frame
    for column in list(frame.columns):
        column_source = column + ' source'
        frame[column_source] = None
        # Single .loc assignment instead of chained ``df[col][mask] = ...``,
        # which is not guaranteed to write into the frame.
        frame.loc[frame[column].notnull(),
                  column_source] = 'WB data ' + str(year)
    return frame


def GetDataWB(indicators, year1=2000, year2=2016):
    """
    This function first retrieves World Bank data from the latest year, and then
    fills any missing data in the dataframe with data from previous years (in the specified range).

    For every data column a companion '<column> source' column records which
    year's download supplied the value ('WB data <year>').

    ------
    Inputs
    ------
    indicators:     The indicators passed on to wbdata.get_dataframe (per the
                    original note, built with the function GetIndicatorsWB())
    year1:          The lower bound for time-period, inclusive (default=2000).
                    The original loop stopped at year1 + 1 and never fetched
                    the lower bound itself.
    year2:          The upper bound for the time-period (default=2016)

    -------
    Outputs
    -------
    dataframe:      The resulting dataframe
    """
    df_filled = _get_wb_year_with_source(indicators, year2)

    # Walk backwards in time; combine_first only fills values still missing,
    # so the most recent available year wins.
    for year in range(year2 - 1, year1 - 1, -1):
        df_year = _get_wb_year_with_source(indicators, year)
        df_filled = df_filled.combine_first(df_year)

    return df_filled
Exemple #13
0
 def getRawData(self, sSaveFile):
     """Download three indicators for the "LMY" income level and save as CSV.

     Country ids come from ``wbdata.get_country(incomelevel="LMY")``; the
     resulting dataframe (dates converted) is written to *sSaveFile*.
     """
     aCountryIds = []
     for oCountry in wbdata.get_country(incomelevel = "LMY", display = False):
         aCountryIds.append(oCountry["id"])

     # Indicator code -> column title used in the CSV
     mIndicators = {}
     mIndicators["NY.GDP.PCAP.PP.CD"] = "GDP per capita (current US$)"
     mIndicators["SH.DYN.MORT"] = "Mortality rate, under-5 (per 1,000 live births)"
     mIndicators["SG.GEN.PARL.ZS"] = "Proportion of seats held by women in national parliaments (%)"

     wbdata.get_dataframe(
         mIndicators, country = aCountryIds, convert_date = True
     ).to_csv(sSaveFile)
    def retrieve_data(self):
        """Download the configured indicators and drop sparse columns.

        Stores the flat-indexed result on ``self.df`` and returns it. A
        column survives only if it holds at least ``0.9 * number of rows``
        non-null values.
        """
        indicators = self.indicators
        iso_codes = self.country_converter(self.countries)
        self.df = wbdata.get_dataframe(indicators,
                                       country=iso_codes,
                                       data_date=self.date,
                                       convert_date=True)

        flat = self.df.reset_index()
        min_non_null = 0.9 * len(self.df)
        self.df = flat.dropna(thresh=min_non_null, axis=1)

        return self.df
Exemple #15
0
def get_country_indicator(country, indicator, start, end):
    """Return one indicator's values for *country* between two years.

    The request spans January 1st of *start* to January 1st of *end*; the
    'indicator' column of the flat-indexed result is returned.
    """
    first_day = dt.datetime(start, 1, 1)
    last_day = dt.datetime(end, 1, 1)
    request = {
        'country': country,
        'data_date': (first_day, last_day),
        'convert_date': False,
        'keep_levels': True,
    }
    frame = wb.get_dataframe({indicator: 'indicator'}, **request)
    return frame.reset_index()['indicator']
Exemple #16
0
def load_word_bank_dataset():
    """
    Load World Bank life-expectancy and fertility-rate data and project both
    onto two canonical components with CCA.

    The two tables are read from local CSV files when both exist; otherwise
    they are downloaded with wbdata. Only the yearly columns 1960-2016 are
    used, rows with missing values are dropped, and only countries present
    in both tables are kept.

    Returns:
        tuple: ``(life_exp_cca, fert_rate_cca)`` -- the CCA scores of the
        life-expectancy and the fertility-rate matrix, in that order.
    """
    fert_dataset_path = './demo/WorldBankData/fertility_rate.csv'
    life_exp_dataset_path = './demo/WorldBankData/life_expectancy.csv'
    years_str_list = [str(year) for year in range(1960, 2017)]

    # Logical ``and`` (the original used bitwise ``&`` on two booleans)
    if (os.path.exists(fert_dataset_path)
            and os.path.exists(life_exp_dataset_path)):
        # If files exists, load from files
        # Load and drop rows with missing values
        fert_rate = pd.read_csv(fert_dataset_path).dropna()
        life_exp = pd.read_csv(life_exp_dataset_path).dropna()
        country_field_name = 'Country Code'
    else:
        # If files don't exist, download data with wbdata instead
        # Get life expectancy and fertility rate data
        life_exp = wbdata.get_dataframe(indicators={
            "SP.DYN.LE00.IN": 'value'
        }).unstack(level=0).transpose().reset_index()
        fert_rate = wbdata.get_dataframe(indicators={
            "SP.DYN.TFRT.IN": 'value'
        }).unstack(level=0).transpose().reset_index()

        # Keep only country name and years columns, filter row with N/A's
        life_exp = life_exp[['country'] + years_str_list].dropna()
        fert_rate = fert_rate[['country'] + years_str_list].dropna()
        country_field_name = 'country'

    # Keep only countries which appear on both dataframes
    valid_countries = list(
        set(life_exp[country_field_name]) & set(fert_rate[country_field_name]))
    life_exp = life_exp[life_exp[country_field_name].isin(valid_countries)]
    fert_rate = fert_rate[fert_rate[country_field_name].isin(valid_countries)]

    # Convert to numpy
    # NOTE(review): CCA pairs rows by position; this assumes both tables list
    # the shared countries in the same order -- confirm for the CSV files.
    life_exp = life_exp[years_str_list].to_numpy()
    fert_rate = fert_rate[years_str_list].to_numpy()

    # Apply CCA. fit_transform(X, Y) returns (X scores, Y scores); X is the
    # fertility matrix here, so the fertility scores come first. The original
    # unpacked them under swapped names and returned them mislabelled.
    cca_transformer = CCA(n_components=2)
    fert_rate_cca, life_exp_cca = cca_transformer.fit_transform(
        fert_rate, life_exp)
    return life_exp_cca, fert_rate_cca
def collect():
    """Download the Takwimu indicators and write them to a CSV file.

    The indicator mapping is read from 'key/takwimu_indicators.csv' (first
    column as index); data for the module-level ``country_code`` is fetched
    and written to 'data/takwimu_worldbank_data.csv'.

    Returns:
        Whatever ``DataFrame.to_csv`` returns for a file path (``None``).
    """
    # generate a dict from the indicators file.
    # ``read_csv(squeeze=True)`` was removed in pandas 2.0; squeezing the
    # column axis afterwards gives the same Series for a one-column file.
    takwimu_indicators = pd.read_csv(
        'key/takwimu_indicators.csv',
        index_col=0,
    ).squeeze("columns").to_dict()
    # Gather indicator data on the selected countries
    data = wbdata.get_dataframe(takwimu_indicators,
                                country=country_code,
                                convert_date=False)
    return data.to_csv('data/takwimu_worldbank_data.csv')
Exemple #18
0
    async def gni_percap(self, ctx, country: str, year: int):
        """Reply with the GNI per capita of *country* in *year*.

        The country name is converted to an ISO2 code, the indicator
        'NY.GNP.PCAP.CD' is downloaded for it, and an embed with the value
        (or a "Sorry" embed when the value is NaN or anything fails) is sent
        to *ctx*.
        """
        await ctx.defer(hidden=True)
        year_key = str(year)

        def sorry_embed(description):
            # Shared layout of both "no data" replies.
            # NOTE(review): ``url`` is not defined in this block; it must
            # come from the enclosing module -- confirm.
            embed = discord.Embed(
                title="Sorry",
                description=description,
                color=0xFF5733,
            )
            embed.set_thumbnail(url=url)
            return embed

        try:
            iso2 = coco.convert(names=country, to="iso2")

            # set up the indicator I want (just build up the dict if you want more than one)
            indicators = {"NY.GNP.PCAP.CD": "GNI per Capita"}

            # grab the indicator for the country and pick the requested year
            frame = wbdata.get_dataframe(
                indicators, country=[iso2], convert_date=False
            )
            value = frame.to_dict()["GNI per Capita"][year_key]

            if str(value) == "nan":
                await ctx.send(
                    embed=sorry_embed("**We couldn't find data for that year**")
                )
            else:
                embed = discord.Embed(
                    title="GNI per capita of {}".format(country),
                    description=f"The gni per capita of {country} in {year_key} was/is $`{str(value)}`",
                    color=0xFF5733,
                )
                # The ISO2 code computed above also names the flag image
                embed.set_thumbnail(
                    url=f"https://flagcdn.com/w80/{iso2.lower()}.jpg"
                )
                embed.set_footer(text="Information requested by: {}".format(ctx.author))

                await ctx.send(embed=embed)

        # ``Exception`` rather than a bare ``except`` so that task
        # cancellation and interpreter exit are not swallowed.
        except Exception:
            await ctx.send(
                embed=sorry_embed("** We could not find data for that year**")
            )
Exemple #19
0
def load_population_wb(fileName='population_world_countries.csv'):
    """Return a Country/Population table, cached in a CSV file.

    If *fileName* is given and the file exists, it is read and returned
    as-is. Otherwise the 'SP.POP.TOTL' indicator is downloaded for all
    countries, reduced to one row per country, renamed to match the WHO
    datasource, extended with a few hand-entered territories, and (when
    *fileName* is truthy) written to that file.

    Args:
        fileName: CSV cache path; pass a falsy value to skip caching.

    Returns:
        DataFrame with the columns ``Country`` and ``Population`` (for a
        cached file: whatever columns that file contains).
    """
    import os
    if fileName and os.path.isfile(fileName):
        return pd.read_csv(fileName)

    import wbdata

    wbdf = wbdata.get_dataframe({'SP.POP.TOTL': 'Population'},
                                country='all',
                                convert_date=True)
    wbdf = wbdf.reset_index()
    wbdf = wbdf.dropna()
    # First remaining row per country (presumably the most recent year --
    # depends on the order wbdata returns dates in; TODO confirm)
    wbdf = wbdf.groupby(by=['country']).first()
    wbdf = wbdf.reset_index()
    wbdf = wbdf.rename(columns={'country': 'Country'})
    wbdf = wbdf.drop(columns=['date'])

    # fix names to match the WHO datasource
    oldNames = [
        "Brunei Darussalam", "Congo, Dem. Rep.", "Congo, Rep.",
        "Czech Republic", "Egypt, Arab Rep.", "Iran, Islamic Rep.",
        "Korea, Rep.", "St. Lucia", "West Bank and Gaza",
        "Russian Federation", "Slovak Republic", "United States",
        "St. Vincent and the Grenadines", "Venezuela, RB"
    ]
    newNames = [
        "Brunei", "Congo (Kinshasa)", "Congo (Brazzaville)", "Czechia",
        "Egypt", "Iran", "Korea, South", "Saint Lucia",
        "occupied Palestinian territory", "Russia", "Slovakia", "US",
        "Saint Vincent and the Grenadines", "Venezuela"
    ]
    correctCountryNamesDict = dict(zip(oldNames, newNames))
    wbdf = wbdf.replace({"Country": correctCountryNamesDict})

    # Data from wikipedia
    noDataCountries = pd.DataFrame({
        'Country': [
            "Cruise Ship", "Guadeloupe", "Guernsey", "Holy See", "Jersey",
            "Martinique", "Reunion", "Taiwan*"
        ],
        'Population':
        [3700, 395700, 63026, 800, 106800, 376480, 859959, 23780452]
    })

    # ``DataFrame.append`` was removed in pandas 2.0; ``concat`` is the
    # equivalent row-wise stacking.
    wbdf = pd.concat([wbdf, noDataCountries]).sort_values(
        by=['Country']).reset_index(drop=True)

    standardNamesDict = getStandardNamesDict()
    wbdf = wbdf.replace({"Country": standardNamesDict})

    if fileName:
        wbdf.to_csv(fileName, index=False)

    return wbdf
Exemple #20
0
def load_from_wbdata(countries, indicators, year_from, year_to):
    """Download World Bank data for the given countries, indicators and years.

    :param countries: list of codes
    :param indicators: dict {ind_code : ind_name}
    :param year_from: starting year (January 1st is used)
    :param year_to: ending year (January 1st is used)
    :returns: the data frame returned by wbdata, dates left unconverted
    """
    period_start = datetime.datetime(year_from, 1, 1)
    period_end = datetime.datetime(year_to, 1, 1)
    return wbdata.get_dataframe(
        indicators,
        country=countries,
        data_date=(period_start, period_end),
        convert_date=False,
    )
    def _retrieve_from_server(self, country):
        """
        Retrieve the dataset of the country from the server.

        Each downloaded column name is split on "-" into a minimum age, a
        maximum age ("UP" is replaced by ``self.ELDEST``) and a sex code; the
        value of every age band is then spread evenly over the single ages
        it covers.

        Args:
            country (str): country name

        Raises:
            SubsetNotFoundError: when wbdata raises RuntimeError

        Returns:
            pandas.DataFrame: retrieved data, restricted to the columns in
                ``self.PYRAMID_COLS``
                Index
                    reset index
                Columns (per the original documentation)
                    - Country (object): country name
                    - Year (int): year
                    - Sex (object): Female/Male
                    - Age (object): age
                    - Population (int64): population value
        """
        if self.verbose:
            print(
                f"Retrieving population pyramid dataset ({country}) from https://data.worldbank.org/"
            )
        # Retrieve from World Bank Open Data
        iso3_code = coco.convert(country, to="ISO3", not_found=None)
        try:
            df = wbdata.get_dataframe(self.INDICATOR_DICT,
                                      country=iso3_code,
                                      convert_date=True)
        except RuntimeError:
            raise SubsetNotFoundError(country=country) from None
        # Preprocessing (-> Country, Population, Min, Max, Sex, Year)
        df = df.stack().reset_index()
        df.insert(0, self.COUNTRY, country)
        df.columns = [self.COUNTRY, "Date", "Attribute", self.N]
        df2 = df["Attribute"].str.split("-", expand=True)
        df2.columns = ["Min", "Max", self.SEX]
        df = pd.concat([df.drop("Attribute", axis=1), df2], axis=1)
        df["Max"] = df["Max"].replace("UP", self.ELDEST)
        for col in [self.N, "Min", "Max"]:
            df[col] = pd.to_numeric(df[col], downcast="integer")
        # Assign the result back: an ``inplace=True`` replace on a column
        # selection is not guaranteed to modify the parent frame.
        df[self.SEX] = df[self.SEX].replace({"FE": "Female", "MA": "Male"})
        df[self.YEAR] = df["Date"].dt.year
        df = df.drop("Date", axis=1)
        # Preprocessing (-> Country, Year, Sex, Age, Population)
        # Rows are addressed by column label; integer keys on a
        # label-indexed row rely on a deprecated positional fallback.
        df[self.AGE] = df[["Min", "Max"]].apply(
            lambda row: range(row["Min"], row["Max"] + 1), axis=1)
        df[self.N] = df[["Min", "Max", self.N]].apply(
            lambda row: row[self.N] / (row["Max"] - row["Min"] + 1), axis=1)
        # One output row per single age
        df = df.explode(self.AGE).reset_index(drop=True)
        df[self.N] = df[self.N].astype(np.int64)
        return df.loc[:, self.PYRAMID_COLS]
Exemple #22
0
def load_wb_data():
    """Download indicator 'SH.DTH.IMRT' for PAK and IND, 2010 to 2015.

    Returns the data as a flat-indexed frame whose value column is named
    'values'; dates are left unconverted and index levels are kept.
    """
    indicator_code = 'SH.DTH.IMRT'
    first_year, last_year = 2010, 2015
    period = (dt.datetime(first_year, 1, 1), dt.datetime(last_year, 1, 1))
    frame = wb.get_dataframe(
        {indicator_code: 'values'},
        country=('PAK', 'IND'),
        data_date=period,
        convert_date=False,
        keep_levels=True,
    )
    return frame.reset_index()
def overall_trend(ax):
    """Draw the world-wide sanitation series on the given matplotlib axes.

    Data for country code 'WLD' (the code of world) is downloaded, rows with
    missing values are dropped, the result is passed through ``clean_data``
    and plotted against its index with a legend, a title and a y-label.
    """
    world_raw = wbdata.get_dataframe(indicators_sanitation,
                                     country=["WLD"],
                                     convert_date=False)
    world = clean_data(world_raw.dropna())

    ax.plot(world.index, world)
    ax.legend(["World"], loc='best')
    ax.set_title("Safely Managed Sanitation Services (% population)")
    ax.set_ylabel('% population')
def get_indicator(x, y):
    """Return a country-by-year table of one indicator for 1990-2016.

    Args:
        x: World Bank indicator code.
        y: column name given to the indicator's values.

    Returns:
        DataFrame indexed by ``Country`` with one integer-labelled column
        per year from 1990 to 2016; countries with any missing value in
        that span are dropped.
    """
    df = wbdata.get_dataframe({x: y}, country="all", convert_date=True)
    df = df.reset_index(drop=False)
    df.rename(index=str,
              columns={
                  "country": "Country",
                  "date": "Year"
              },
              inplace=True)
    # Dates become plain integer years
    df['Year'] = df['Year'].apply(lambda stamp: int(stamp.year))
    df = df.pivot(index='Country', columns='Year', values=y)
    # The year columns are integers after the conversion above, so the label
    # slice must use integers; string bounds do not match an integer index.
    df = df.loc[:, 1990:2016].dropna(axis='rows')
    return df
def data_incomelevel(inc):  # Parameter inc is the incomelevel
    """Given an income level, return the plottable "trend" column.

    Steps (as described by the original notes on the helper functions):
    clean the download, add san_pop = sanitation * population, sum per
    "date", then compute trend = san_pop / population.
    """
    raw = wbdata.get_dataframe(indicators,
                               country=countries_incomelevel(inc),
                               convert_date=False)
    # drop rows with missing population or sanitation value, sort index
    cleaned = clean_data(raw)
    # san_pop = sanitation * population, a new column
    weighted = multiply_two_columns(cleaned, "sanitation", "population",
                                    "san_pop")
    # group on "date" and sum the population and san_pop values
    totals = sum_at_index(weighted, "date")
    # trend = san_pop / population
    with_trend = divide_two_columns(totals, "san_pop", "population", "trend")
    # only the "trend" column is needed for the plot
    return with_trend["trend"]
Exemple #26
0
def get_pop(df):
    '''Takes a dataframe where the columns are country names and the rows are dates.
    Returns population from the WDI.

    Country names are translated to ISO codes with the ``ISO`` column of
    ``ISO_codes.csv`` (first column as index) found under the module-level
    ``data`` directory.

    NOTE(review): the date range of ``df`` is not applied to the request
    (the original computed it but never used it) -- confirm whether the
    download should be limited to that range.

    Parameters
    ----------

    df: a Pandas dataframe

    Returns
    -------

    The 'population' column with one index level unstacked and the result
    transposed.
    '''
    ISO_dict = pd.read_csv(data + '/ISO_codes.csv',
                           index_col=[0]).ISO.to_dict()

    ISOs = [ISO_dict[country] for country in df.columns]
    indicators = {'SP.POP.TOTL': 'population'}
    result = wbdata.get_dataframe(indicators, country=ISOs)
    return result['population'].unstack().T
Exemple #27
0
def wb_country_data(indicator, start=2015, end=2015):
    """
    grab indicator data (per the original note: the gender parity index)
    from world bank api and save it as '../data/gender_coef.csv'

    :param indicator: World Bank indicator code
    :param start: start year
    :param end: end year
    :return: a dataframe with the columns 'indicator' and 'CountryName',
        rows with missing values removed
    """
    data_dates = (datetime.datetime(start, 1, 1), datetime.datetime(end, 1, 1))
    # call the api
    data = wbdata.get_dataframe({indicator: 'indicator'},
                                data_date=data_dates,
                                convert_date=True,
                                keep_levels=False)
    # Work on an explicit copy: adding a column to a selection of ``data``
    # triggers SettingWithCopyWarning.
    df_wb = data[['indicator']].copy()
    df_wb['CountryName'] = df_wb.index
    df_wb = df_wb.reset_index(drop=True).dropna()
    df_wb.to_csv('../data/gender_coef.csv')
    return df_wb
Exemple #28
0
def download_world_bank():
    """
    Download data from the World Bank

    ``delete_directory`` is called on './data/world_bank' first; then each
    indicator is downloaded and written to '<path>/<indicator code>.csv'.
    A failed download is reported on stdout and the loop continues.
    """

    path = './data/world_bank'

    delete_directory(path=path)

    # Indicator code -> column label used in the CSV. The labels of
    # 'EN.POP.SLUM.UR.ZS' and 'SP.RUR.TOTL.ZS' were copy-pasted from the
    # urban-population entry in the original and are corrected here.
    labels = {
        'NY.GDP.PCAP.PP.CD': 'GDP per capita, PPP (current international $)',
        'SP.POP.TOTL': 'Population, total',
        'SP.URB.TOTL.IN.ZS': 'Urban population (% of total population)',
        'EN.POP.SLUM.UR.ZS':
        'Population living in slums (% of urban population)',
        'SP.RUR.TOTL.ZS': 'Rural population (% of total population)',
        'SP.DYN.LE00.IN': 'Life expectancy at birth, total (years)',
        'SH.XPD.CHEX.GD.ZS': 'Current health expenditure (% of GDP)',
    }

    for file_name, label in labels.items():

        # One single-entry mapping per request, as wbdata expects
        indicator = {file_name: label}
        full_path = f'{path}/{file_name}.csv'

        print(f'Downloading {indicator}.')

        try:
            df = wbdata.get_dataframe(indicator)
            df.to_csv(full_path)
            # Pause between successful downloads
            sleep(2)
        except Exception:
            print(f'Download failed for {indicator}')
0
def WB_country_data(indicator, start=2015, end=2015):
    """
    Fetch one World Bank indicator and return it as a single-column frame.

    :param indicator: the indicator of database, a string
    :param start: start year (January 1st is used)
    :param end: end year (January 1st is used)
    :return: a dataframe holding only the 'indicator' column
    """
    import datetime
    import wbdata

    first_day = datetime.datetime(start, 1, 1)
    last_day = datetime.datetime(end, 1, 1)
    # call the api
    frame = wbdata.get_dataframe({indicator: 'indicator'},
                                 data_date=(first_day, last_day),
                                 convert_date=True,
                                 keep_levels=False)

    # rows with missing values are kept
    return frame.reset_index()[['indicator']]
Exemple #30
0
def get_dataframe_spec(request):
    """Build a GetDataFrameSpec for one parameter combination.

    ``request.param`` supplies (country, data_date, source, convert_date,
    column_name, keep_levels). The spec bundles the frame returned by
    ``wbd.get_dataframe`` for those arguments with the values the tests
    expect to find in it.
    """
    country, data_date, source, convert_date, column_name, keep_levels = (
        request.param)

    frame = wbd.get_dataframe(
        {
            "NY.GDP.MKTP.CD": column_name,
            "NY.GDP.MKTP.PP.CD": "ppp"
        },
        country=country,
        data_date=data_date,
        source=source,
        convert_date=convert_date,
        keep_levels=keep_levels,
    )

    if convert_date:
        expected_date = dt.datetime(2010, 1, 1)
    else:
        expected_date = "2010"

    # Expected value per source id; a falsy source falls back to "2"
    value_by_source = {"2": 2117039512.19512, "11": 2117008130.0813}
    expected_value = value_by_source[source or "2"]

    several_countries = country == "all" or not isinstance(country, str)
    single_date = isinstance(data_date, dt.datetime)

    return GetDataFrameSpec(
        result=frame,
        country=country,
        data_date=data_date,
        source=source,
        convert_date=convert_date,
        column_names=[column_name, "ppp"],
        keep_levels=keep_levels,
        expected_country="Eritrea",
        expected_date=expected_date,
        expected_column=column_name,
        expected_value=expected_value,
        country_in_index=(several_countries or keep_levels),
        date_in_index=(not single_date or keep_levels),
    )
Exemple #31
0
def retrieve_data_from_api(indicator):
    """
    Calls wbdata API client to retrieve WDI data and returns data as Pandas dataframe.

    The request covers 1960-01-01 to 2020-01-01 and the values are placed in
    a column named "value". If the retrieval fails, a message is printed and
    ``None`` is returned.

    >>> VALID_INDICATOR = 'SH.STA.BASS.ZS'
    >>> INVALID_INDICATOR = 'foo'

    >>> type(retrieve_data_from_api(VALID_INDICATOR))
    <class 'pandas.core.frame.DataFrame'>

    >>> retrieve_data_from_api(INVALID_INDICATOR)
    This indicator could not be retrieved.
    """
    min_date = datetime.datetime(1960, 1, 1)
    max_date = datetime.datetime(2020, 1, 1)
    data_date = (min_date, max_date)

    try:
        return wbdata.get_dataframe(
            {indicator: "value"}, data_date=data_date
        ).reset_index()
    # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
    # SystemExit still propagate.
    except Exception:
        print("This indicator could not be retrieved.")
        return None
def load_avg(start_yr, end_yr):
    """Return the per-column means of the module-level indicators.

    Data between January 1st of *start_yr* and December 30th of *end_yr* is
    downloaded, index level 0 is unstacked, and the mean of every resulting
    column is taken. The means are arranged into the columns Poverty, Sec_M,
    Sec_F, Sec, Prim_M, Prim_F, Prim and Gini; rows that are entirely
    missing are dropped.
    """
    period = (datetime.datetime(start_yr, 1, 1),
              datetime.datetime(end_yr, 12, 30))
    wide = wbdata.get_dataframe(indicators, data_date=period).unstack(level=0)
    means = wide.mean()

    # Pull every indicator's means up front
    wanted = ('SecondaryMale', 'SecondaryFemale', 'Secondary', 'Primary',
              'PrimaryMale', 'PrimaryFemale', 'Poverty', 'GINI')
    picked = {name: means[name] for name in wanted}

    mean_df = pd.DataFrame(picked['Poverty'], columns=['Poverty'])
    # (output column, source indicator) in output column order
    layout = (
        ('Sec_M', 'SecondaryMale'),
        ('Sec_F', 'SecondaryFemale'),
        ('Sec', 'Secondary'),
        ('Prim_M', 'PrimaryMale'),
        ('Prim_F', 'PrimaryFemale'),
        ('Prim', 'Primary'),
        ('Gini', 'GINI'),
    )
    for target, source in layout:
        mean_df[target] = picked[source]

    return mean_df.dropna(how='all')
Exemple #33
0
# Collect the ids of the indicators from position 1000 onwards
for i in range(1000, len(indicators)):
    indicatorID.append(indicators[i]['id'])

# Make indDict for fetching data: indicator id -> id with '.' replaced by '_'
indDict = dict()
for indStr in indicatorID:
    indDict[indStr] = indStr.replace('.','_')

# fetch data: one request per (country, indicator) pair for 2005-2016
data_date = (datetime.datetime(2005, 1, 1), datetime.datetime(2016, 1, 1))
for countryStr in countryID:
    for key, value in indDict.items():
        d = dict()
        d[key] = value
        try:
            df = wbdata.get_dataframe(d, country=countryStr, data_date=data_date, convert_date=True)
            df = df.dropna()
        #except (TypeError, ValueError):
        # NOTE(review): the bare except skips the pair on ANY error,
        # including KeyboardInterrupt.
        except:
            #print('failed and continue')
            continue
        
        try:
            # Build, as source text, a loop that inserts every row of the
            # downloaded column into table "hua" through ``con``.
            # NOTE(review): the SQL is assembled by string concatenation and
            # run through exec(); values are not parameterized -- flag for
            # replacement with a plain loop and a parameterized INSERT.
            dfname="df."+str(value)
            countryStr_tmp = "'"+countryStr+"'";
            valueStr_tmp = "'"+str(value).replace('_', '.')+"'";
            code = '''for i in range(len('''+dfname+''')): y=int(str(df.index[i])[:4]); v='''+dfname+'''[i]; con.execute("INSERT INTO hua (tag, year, country, category, value, property) VALUES ('''+valueStr_tmp+''', '%d', '''+countryStr_tmp+''', 'CATEGORY', '%f', 'float');" % (y, v)); print('''+valueStr_tmp+''', y, '''+countryStr_tmp+''')'''
                        
            #code = "for i in range(len("+dfname+")): y=str(df.index[i])[:4]; v="+dfname+"[i]; con.execute('INSERT INTO hua (tag, year, country, category, value, property) VALUES ('TAGGY', '%d', '%s', '%s', '%f', 'float');' % (y,"+countryStr+", "+str(value)+", v))"
            exec(code)
            #for i in range(len(df.IC_BUS_EASE_XQ)):
Exemple #34
0
import wbdata
import matplotlib.pyplot as plt
 
# Countries to plot
countries = ["CN", "IN", "US"]

# Indicator to fetch (add more code -> label entries to request several)
indicators = {'NY.GNP.PCAP.CD': 'GNI per Capita'}

# Fetch the indicator for the countries above into a data frame
df = wbdata.get_dataframe(indicators, country=countries, convert_date=False)

# df is "pivoted"; pandas' unstack function moves index level 0 into the
# columns, which reshapes it into something plottable
dfu = df.unstack(level=0)

# A simple matplotlib plot with legend, labels and a title
dfu.plot()
plt.legend(loc='best')
plt.title("GNI Per Capita ($USD, Atlas Method)")
plt.xlabel('Date')
# The original label was missing its closing parenthesis
plt.ylabel('GNI Per Capita ($USD, Atlas Method)')
plt.show()
# Health and demographic indicators to download for 2014.
# Keys are World Bank indicator codes; values are the labels handed to wbdata.
# NOTE(review): the World Bank describes SP.DYN.CDRT.IN as a crude death rate
# per 1,000 people, so the 'Deaths_per_100k_Population' label looks off --
# confirm before relying on the unit.
indicators = {
        'SP.DYN.LE00.MA.IN' : 'LifeExp_Male',
        'SH.HIV.1524.FE.ZS': 'Percent_HIV-AIDS_Female',
        'SP.DYN.CDRT.IN': 'Deaths_per_100k_Population',
        'SP.DYN.IMRT.IN': "Infant_mortality_rate",
        'SP.DYN.TFRT.IN': "Total_fertility_rate",
        'SP.POP.65UP.TO.ZS': "Percent_of_pop_over_65",
}


# Date window covering calendar year 2014.
start = datetime.datetime(2014, 1, 1)
stop = datetime.datetime(2014, 12, 31)


# One request for all indicators above, limited to the 2014 window.
df1 = wbdata.get_dataframe(indicators = indicators,
        data_date = (start, stop))


#df1.head()




# In[2]:



# Maternal deaths indicator; `start`/`stop` are rebound to calendar year 1990
# for the request that follows (not visible in this excerpt).
maternal_deaths = {'SH.STA.MMRT': "Maternal_Deaths_1990"}
start = datetime.datetime(1990, 1, 1)
stop = datetime.datetime(1990, 12, 31)
Exemple #36
0
import datetime
import wbdata
import numpy as np
import pandas as pd

# Create a pandas DataFrame from wbdata, restricted to the given date and
# indicators.
indicators = {
              "EN.ATM.CO2E.PC": "co2",
              "GC.DOD.TOTL.GD.ZS": "debt",
              "SE.ENR.TERT.FM.ZS": "gender edu",
              "SI.DST.10TH.10": "topincome"
              }
data_date = (datetime.datetime(2011,1,1), datetime.datetime(2011,1,1))  # Only year 2011
df = wbdata.get_dataframe(indicators,
                          country="all",
                          convert_date=True,
                          data_date=data_date)
df = df.fillna(df.mean())   # replace missing values with each column's mean
print ("All data:")

# GDP per capita is fetched separately, for the same countries and date.
dfgdp = wbdata.get_dataframe({"NY.GDP.PCAP.PP.KD": "gdppc"},
                             country="all",
                             convert_date=True,
                             data_date=data_date)

# Plain list of GDP-per-capita floats.  These values must come from dfgdp:
# taking the first column of df would feed one of the co2/debt/... series into
# the GDP thresholds below.  Missing values are replaced with the column mean,
# the same treatment df receives above.
gdp_numeric = dfgdp["gdppc"].fillna(dfgdp["gdppc"].mean()).tolist()
#print (gdp_numeric)
# The quartile values below are found by finding quartile info from df.describe()
q1 = 5000
q2 = 15000
q3 = 20000
Exemple #37
0
'''
gdp per capita conversion to low, med,  hi
'''

import wbdata
import pandas as pd
import datetime
import numpy as np
# Request the "gdppc" indicator for every country, for 2011 only.
countries = "all"
indicators = {"NY.GDP.PCAP.PP.KD": "gdppc"}
data_date = (datetime.datetime(2011, 1, 1),) * 2
df = wbdata.get_dataframe(
    indicators,
    country=countries,
    convert_date=True,
    data_date=data_date,
)

# Missing values are replaced with the column mean.
df = df.fillna(df.mean())

print ("All data:")
# First (and only requested) column as a plain Python list.
gdp_numeric = [row[0] for row in df.values.tolist()]
print (gdp_numeric)
print ("All GDP numeric values:")
print (df.describe())

# Class boundaries.
q1, q2, q3, q4 = 5000, 15000, 20000, 150000

# `a` is an alias of the same list object; gdp_classes is an array copy.
a = gdp_numeric
gdp_classes = np.array(gdp_numeric)
Exemple #38
0
 def testCountries(self):
     """Smoke test: request the dataframe for an explicit tuple of countries."""
     wbdata.get_dataframe(self.indicators, country=("USA", "GBR"))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import wbdata

##### Extract data from World Bank API #####

# Measure of inflation, fetched per income group for comparison purposes.
indicators = {"FP.CPI.TOTL.ZG": "value"}


def _fetch_income_group(level):
    """Return (country ids, indicator dataframe) for one income-level code."""
    ids = [entry['id'] for entry in wbdata.get_country(incomelevel=level, display=False)]
    return ids, wbdata.get_dataframe(indicators, country=ids, convert_date=False)


# Low, lower-middle, upper-middle and high income countries, in that order.
LIC_countries, LIC_df = _fetch_income_group("LIC")
LMC_countries, LMC_df = _fetch_income_group("LMC")
UMC_countries, UMC_df = _fetch_income_group("UMC")
HIC_countries, HIC_df = _fetch_income_group("HIC")

##### plot FP.CPI.TOTL.ZG ####
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(1, 1, 1)
"""
Messing around with Oliver Sherouse's wbdata, which accesses all of the 
World Bank's data API's. This follows the documentation, link below.  

Not sure this is ready for primetime, but it could be me... 

References 
* http://datacatalog.worldbank.org/
* http://blogs.worldbank.org/opendata/accessing-world-bank-data-apis-python-r-ruby-stata
* https://github.com/OliverSherouse/wbdata/blob/master/docs/source/index.rst 

Prepared for the NYU Course "Global Economy" 
* https://sites.google.com/site/nyusternglobal/home 
* https://github.com/DaveBackus/Global_Economy 

Written by Dave Backus @ NYU, September 2014  
Created with Python 3.4 
"""
import datetime

import wbdata

# List the available data sources, then the indicators of source 15.
wbdata.get_source()
wbdata.get_indicator(source=15)
# Raw records for one indicator and one country.
d   = wbdata.get_data('IC.BUS.EASE.XQ', country='USA')

# NOTE(review): 'IRSPREAD' does not follow the dotted indicator-id pattern used
# elsewhere -- confirm it is a valid indicator id.
indicators = {'IC.BUS.EASE.XQ': 'Ease', 'IRSPREAD': 'Spread'}
# data_date takes datetime objects (a single one or a (start, end) pair), not
# bare integer years.
df1 = wbdata.get_dataframe(indicators,
                           data_date=(datetime.datetime(2012, 1, 1),
                                      datetime.datetime(2013, 1, 1)),
                           country='ARG')
#df2 = wbdata.get_dataframe('IRSPREAD', country='all', convert_date=True)

#%%
Exemple #41
0
def regress(debug = False):
    """Regress one World Bank indicator on the others and describe the result.

    Only POST requests are handled; for any other method the function falls
    through and returns None.  The JSON request body must contain 'from',
    'to' and 'options', plus one entry per indicator keyed by the numeric
    strings '1', '2', ...; each entry holds the indicator id under 'ind'.
    The entry with the numerically largest key is the response variable
    ('res' in R); every other entry becomes a predictor named 'v<key>'.

    Args:
        debug: When true, return the canned ``debug_response_main`` payload
            without fetching any data.

    Returns:
        A ``jsonify`` response.  On success it carries 'desc', 'summary',
        'effects', 'error' (0), 'mapData', 'lda' and 'desc2'.  When no rows
        survive ``dropna`` a "Not enough data!" payload is returned.  Any
        exception raised while processing yields a payload with 'error' set
        to 1 and the exception text under 'trace'.
    """
    if request.method == 'POST':
        if debug:
            return jsonify(debug_response_main)
        try:
            data = json.loads(request.data)
            from_year, to_year, options = int(data.pop('from')), int(data.pop('to')), data.pop('options')
            # Keys are numeric strings; compare them as integers so that '10'
            # ranks above '9' (plain string comparison would not).
            highest = max(data, key=int)
            indicators = {data[x]['ind']:data[x]['ind'] for x in data if 'ind' in data[x]}

            # pulls the data, removes rows with any NA (making R's life better)
            df = wbdata.get_dataframe(indicators=indicators,
                                      convert_date=True,
                                      data_date=(
                                          datetime(from_year, 1, 1),
                                          datetime(to_year, 1, 1)
                                      )).dropna()

            if not len(df):
                return jsonify({"desc": "Not enough data!",
                                "summary": "Not enough data!",
                                'effects': {'info': "Not enough data!"}, 'error': 0})

            # R variable names of the predictors: v1 .. v(n-1).
            predictors = ['v' + str(i) for i in range(1, len(data))]

            mapData = {}
            for num in data:
                ind_name = data[num]['ind']
                series = df[ind_name]
                vector = FloatVector(series)
                if num != highest:
                    robjects.globalenv[str('v' + num)] = vector
                else:
                    robjects.globalenv[str('res')] = vector
                    desc = json.loads(get_ind_preview(2010, ind_name).data)
                    desc['highest'] = ind_name
                # Store it in the server side session.  A real list is built
                # (not a lazy map object) and values are paired by position.
                keys = [functions.make_key(k) for k in series.keys()]
                session[ind_name] = [dict(zip(keys, series.values))]
                # Per-year map data: {year: {country code: value}}; index
                # entries without a country code are left out.
                mapData[ind_name] = {}
                for key in series.keys():
                    country_code = functions.get_country_code(key[0])
                    if country_code:
                        mapData[ind_name].setdefault(key[1].year, {})[country_code] = series[key]

            effects = {'count': str(len(df)) + ' rows of data were used for the analysis.'}
            lmr = stats.lm("res ~ {}".format(' + '.join(predictors)))
            lmr = str(base.summary(lmr))
            # Keep the printed summary from the 'Residuals:' section onward.
            lmr = lmr[lmr.find('Residuals:'):]

            lda = {}
            if options['lda']:
                # LDA is best-effort: a failure is reported inside the 'lda'
                # payload instead of failing the whole request.
                try:
                    robjects.r('qres1 <- quantile(res)')
                    robjects.r('qres <- cut(res, qres1, labels=c(1,2,3,4), include.lowest=TRUE)')
                    importr('MASS')
                    robjects.r("mylda <- lda(qres ~ {})".format(' + '.join(predictors)))
                    lda_pie = list(robjects.r("mylda$svd^2/sum(mylda$svd^2) * 100"))
                    lda_means = list(robjects.r("mylda$means"))
                    robjects.r("lda_preds <- predict(mylda, as.table(cbind({})))".format(','.join(predictors)))
                    lda_class_success = robjects.r('mean(as.numeric(lda_preds$class) == qres)')
                    lda = {'lda_pie': lda_pie, 'lda_means': lda_means, 'lda_class_success': float(lda_class_success[0]) * 100}
                    effects['lda_success'] = "LDA classification on " + data[highest]['ind'] + " had an accuracy of  " + str(lda['lda_class_success']) + " %."
                except Exception as lda_e:
                    # str() works on both Python 2 and 3; `.message` does not.
                    lda = {'error': str(lda_e)}

            # Coefficient table lines, from '(Intercept)' up to the '---' rule.
            vals = lmr[lmr.lower().find('(intercept)'):lmr.lower().find('---')].split('\n')

            for i in range(1, len(data)):
                row, name = vals[i].split(), data[str(i)]['ind']     #is the corresponding row of this datum
                if len(row) and row[-1] in ['*', '**', '***', '.']:                          #it is significant
                    effects[name] = "{0} {1}significantly affects {2} in a {3} direction.".format(
                        name,
                        {'.':'', '*': 'quite ', '**': 'very ', '***': 'very very '}[row[-1]],
                        data[highest]['ind'],
                        {1: 'positive', 0: 'negative'}[rpy2functions.sign(row[1])]
                    )
                    effects[name] += ' A single unit increase in {0}, {1} {2} by {3} units on average.'.format(
                        name,
                        {1:'increases', 0:'decreases'}[rpy2functions.sign(row[1])],
                        data[highest]['ind'],
                        rpy2functions.unsign(row[1])
                    )
                else:
                    effects[name] = name + " was not found to be a significant factor!"

            # Built once, after every effect has been recorded.
            response = {"desc": str(df.describe()), "summary": lmr, 'effects': effects, 'error': 0,
                        'mapData': mapData, 'lda': lda, 'desc2': desc}
            return jsonify(response)
        except Exception as e:
            # Stringify every arg: args need not be strings, and `.message`
            # does not exist on Python 3 exceptions.
            return jsonify({'error': 1,
                            'err_msg': 'There was an error. Trace attached:',
                            'trace': '\n'.join(str(arg) for arg in e.args) or str(e)})
Exemple #42
0
 def testDate(self):
     """Smoke test: request the dataframe for a single datetime."""
     wbdata.get_dataframe(self.indicators,
                          data_date=datetime.datetime(2008, 1, 1))
Exemple #43
0
 def testDateRange(self):
     """Smoke test: request the dataframe for a (start, end) datetime pair."""
     start = datetime.datetime(2008, 1, 1)
     end = datetime.datetime(2010, 1, 1)
     wbdata.get_dataframe(self.indicators, data_date=(start, end))
Exemple #44
0
 def testConvertDate(self):
     """Smoke test: request the dataframe with date conversion switched on."""
     wbdata.get_dataframe(indicators=self.indicators, convert_date=True)
Exemple #45
0
# __author__ = 'david'

import wbdata
import pandas
import matplotlib.pyplot as plt

# Countries to plot (two-letter country codes).
countries = ["CL","UY","HU"]

# Indicator to fetch; add more "code: label" pairs to the dict for more series.
indicators = {'SP.DYN.LE00.IN':'Life expectancy at birth, total (years)'}

# Fetch the indicator for the selected countries only.  Without the `country`
# argument the request is not restricted to the list above.
df = wbdata.get_dataframe(indicators, country=countries, convert_date=False)

# df comes back "pivoted"; unstacking index level 0 moves it into the columns,
# which gives one plottable series per entry of that level.
dfu = df.unstack(level=0)

# A simple matplotlib plot with legend, axis labels and a title.  The title
# and y label describe the indicator that is actually requested.
dfu.plot()
plt.legend(loc='best')
plt.title("Life expectancy at birth, total (years)")
plt.xlabel('Date')
plt.ylabel('Life expectancy at birth (years)')

print(dfu)
Exemple #46
0
            for a in indicators:

                # set a pretty name for the SQL DB
                forname = a['id'].lower()
                forname = ''.join(e for e in forname if e.isalnum())
                b = {a['id']: forname}

                # update the list of all indicators
                assert isinstance(forname, unicode)
                if forname in all_ind:
                    logging.info("%i: Skipped duplicate [%s] %s" % (s_yr, forname, a['name']))
                    continue
                all_ind.append(forname)

                # save API data to a dataframe
                df_temp = wbdata.get_dataframe(b, data_date=data_date)
                if df_temp is None:
                    logging.warn("%i: No API response [%s] %s" % (s_yr, forname, a['name']))
                    continue

                # don't bother with params below completion threshold
                notnull = df_temp.count(0) / df_cref.shape[0]
                if notnull[0] < min_compl:
                    logging.warn("%i: Too little data [%s] %s" % (s_yr, forname, a['name']))
                    continue
                print "%i: Fetched [%s] %s" % (s_yr, forname, a['name'])

                # join dataframe to empty DF or add to merged DF
                if i == 0:
                    df_cmerged = df_cref.join(df_temp)
                    df_amerged = df_aref.join(df_temp)
#this code allows us to pull statistics about education and GNI for India from the World Bank API, for a comparison against the candidate information

import wbdata
import pandas as pd
import datetime

# The country we want to pull data for.
countries = ["IN"]

# The indicators we are interested in collecting data on
# (indicator code -> column label).
indicators = {'NY.GNP.PCAP.CD':'GNI per Capita',
              'MYS.PROP.15UP.NED.MF' : 'Pop % - No Education (Age: 15+)',
              'MYS.PROP.15UP.PRI.MF' : 'Pop % - Primary (Age: 15+)',
              'MYS.PROP.15UP.SEC.MF' : 'Pop % - Secondary (Age: 15+)',
              'MYS.PROP.15UP.TER.MF' : 'Pop % - Tertiary (Age: 15+)'}

# Start and end date for the data request.  2010 was chosen because it had
# data available for the indicators and sits in the middle of the election
# data we had available.
years = (datetime.datetime(2010, 1, 1), datetime.datetime(2010, 12, 12))

# Grab the indicators for the selected country and timeframe into a DataFrame.
df = wbdata.get_dataframe(indicators, country=countries, data_date=years)

# df is "pivoted"; pandas' unstack reshapes it into something plottable.
wb_df = df.unstack(level=0)

# Save the data to a CSV for access in R.  The path is a raw string so the
# backslashes are taken literally instead of being parsed as (invalid) escape
# sequences; the resulting path is unchanged.
wb_df.to_csv(r"G:\ProgrammingForAnalytics\Assignments\GroupProject\WB_data.csv")