def test_wdi_download_error_handling(self):
    """Exercise the ``errors`` modes of ``download`` with invalid input.

    Two scenarios are run in order: an invalid country code (``XX``) and an
    invalid indicator (``BAD_INDICATOR``).  With ``errors='raise'`` a
    ``ValueError`` whose message matches the scenario's pattern is expected.
    With ``errors='warn'`` (only checked when ``PANDAS_0160`` is set) a
    warning is expected and a two-row DataFrame must still be returned.
    """
    scenarios = (
        (['USA', 'XX'],
         'NY.GDP.PCAP.CD',
         "Invalid Country Code\\(s\\): XX"),
        (['USA'],
         ['NY.GDP.PCAP.CD', 'BAD_INDICATOR'],
         "The provided parameter value is not valid\\. Indicator: BAD_INDICATOR"),
    )

    for cntry_codes, inds, pattern in scenarios:
        with tm.assertRaisesRegexp(ValueError, pattern):
            download(country=cntry_codes, indicator=inds,
                     start=2003, end=2004, errors='raise')

        if not PANDAS_0160:
            # assert_produces_warning doesn't exist in prior versions
            continue

        with self.assert_produces_warning():
            result = download(country=cntry_codes, indicator=inds,
                              start=2003, end=2004, errors='warn')
        self.assertTrue(isinstance(result, pd.DataFrame))
        self.assertEqual(len(result), 2)
def test_wdi_download(self):
    """Download GDP per capita for a mixed bag of country codes and compare.

    The request mixes a valid and an invalid indicator with two-letter (US),
    three-letter (USA), standard (CA, MX), non-standard (KSV), duplicated
    (US, US, USA) and unknown (BLA) country codes.  It deliberately avoids
    a crash-inducing country code and a retired indicator; those are
    covered by other tests.  Values are rounded to the nearest thousand so
    that data revisions do not break the comparison.
    """
    cntry_codes = ['CA', 'MX', 'USA', 'US', 'US', 'KSV', 'BLA']
    inds = ['NY.GDP.PCAP.CD', 'BAD.INDICATOR']

    def ordered(frame):
        # Which sorting method is used depends on the PANDAS_0170 flag.
        return frame.sort_index() if PANDAS_0170 else frame.sort()

    def fetch(start, end):
        frame = download(country=cntry_codes, indicator=inds,
                         start=start, end=end, errors='ignore')
        # Sort first, then round, to ignore revisions to data.
        return np.round(ordered(frame), decimals=-3)

    expected = pd.DataFrame({
        'NY.GDP.PCAP.CD': {
            ('Canada', '2004'): 31829.522562759001,
            ('Canada', '2003'): 28026.006013044702,
            ('Kosovo', '2004'): 2135.3328465238301,
            ('Kosovo', '2003'): 1969.56271307405,
            ('Mexico', '2004'): 7042.0247834044303,
            ('Mexico', '2003'): 6601.0420648056606,
            ('United States', '2004'): 41928.886136479705,
            ('United States', '2003'): 39682.472247320402,
        }
    })
    # Round, to ignore revisions to data.
    expected = ordered(np.round(expected, decimals=-3))

    result = fetch(2003, 2004)

    if PANDAS_0140:
        expected.index.names = ['country', 'year']
    else:
        # Prior versions don't allow setting multiple names on a
        # MultiIndex, so overwrite the index with the one from the result.
        expected.index = result.index
    tm.assert_frame_equal(result, expected)

    # Pass start and end as strings.
    result = fetch('2003', '2004')
    tm.assert_frame_equal(result, expected)
def test_wdi_download(self):
    """Download GDP per capita for a mixed bag of country codes and compare.

    The request mixes a valid and an invalid indicator with two-letter (US),
    three-letter (USA), standard (CA, MX), non-standard (KSV), duplicated
    (US, US, USA) and unknown (BLA) country codes.  It deliberately avoids
    a crash-inducing country code and a retired indicator.
    """
    cntry_codes = ['CA', 'MX', 'USA', 'US', 'US', 'KSV', 'BLA']
    inds = ['NY.GDP.PCAP.CD', 'BAD.INDICATOR']

    expected = {
        'NY.GDP.PCAP.CD': {
            ('Canada', '2003'): 28026.006013044702,
            ('Mexico', '2003'): 6601.0420648056606,
            ('Canada', '2004'): 31829.522562759001,
            ('Kosovo', '2003'): 1969.56271307405,
            ('Mexico', '2004'): 7042.0247834044303,
            ('United States', '2004'): 41928.886136479705,
            ('United States', '2003'): 39682.472247320402,
            ('Kosovo', '2004'): 2135.3328465238301,
        }
    }
    expected = pandas.DataFrame(expected)
    # Round, to ignore revisions to data.  DataFrame.round / sort_index
    # replace the removed ``pandas.np`` alias and ``DataFrame.sort``.
    expected = expected.round(decimals=-3).sort_index()

    result = download(country=cntry_codes, indicator=inds,
                      start=2003, end=2004, errors='ignore')
    # Sort first, then round, to ignore revisions to data.
    result = result.sort_index().round(decimals=-3)

    # Only the values are compared: the expected index is overwritten with
    # the (sorted) index of the result.
    expected.index = result.index
    assert_frame_equal(result, expected)
def get_wb_df(wb_name, colname):
    """Download one World Bank indicator and expose it under ``colname``.

    ``wb.download`` is called for ``country="all"`` between the module-level
    ``start_year`` and ``today_year``.  The returned frame is the download
    result with its first column renamed to ``colname``; missing values are
    left in place.
    """
    downloaded = wb.download(
        indicator=wb_name,
        start=start_year,
        end=today_year,
        country="all",
    )
    # Give the (single) data column a sensible name.
    first_column = downloaded.columns[0]
    return downloaded.rename(columns={first_column: colname})
def test_wdi_download_w_retired_indicator(self):
    """Skip (rather than fail) when querying the retired ``GDPPCKD`` symbol.

    GDPPCKD used to be a common symbol.  It still shows up in the search
    feature and is listed online, but API calls for it work neither in the
    World Bank's own query builder nor through this module.  The test exists
    so that users keep getting a sensible error instead of, say, a missing
    key error caused by a change in the JSON message format.  If the World
    Bank ever finishes deprecating the symbol, this test should still pass.
    """
    query = dict(country=['CA', 'MX', 'US'], indicator=['GDPPCKD'],
                 start=2003, end=2004, errors='ignore')

    try:
        result = download(**query)
    except ValueError as e:
        # Expected path: no data came back for the retired indicator.
        raise nose.SkipTest("No indicators returned data: {0}".format(e))
    else:
        # Getting here means the World Bank un-retired the indicator (or the
        # API changed in an unexpected way).  If the result actually has
        # data, another bad ticker is needed for this test.
        if len(result) > 0:
            raise nose.SkipTest("Invalid results")
def datasets(dataset):
    """Return the example DataFrame selected by ``dataset``.

    ``'WB'`` downloads the ``NY.GDP.PCAP.KD`` indicator for US, CA and MX
    (2000-2015) through ``wb.download``; ``'Iris'`` reads the UCI iris data
    file without a header row.  Any other value returns ``None``.
    """
    if dataset == 'WB':
        return wb.download(indicator='NY.GDP.PCAP.KD',
                           country=['US', 'CA', 'MX'],
                           start=2000, end=2015)

    if dataset == 'Iris':
        iris_url = ('https://archive.ics.uci.edu/ml/'
                    'machine-learning-databases/iris/iris.data')
        return pd.read_csv(iris_url, header=None)

    return None
def test_wdi_download_monthly(self):
    """Compare monthly COPPER data from ``download`` and ``WorldBankReader``.

    Both entry points are queried with ``freq='M'`` for 2011-2012 and must
    match the reference values after sorting and rounding to the nearest
    thousand (the rounding absorbs data revisions).
    """
    def ordered(frame):
        # Which sorting method is used depends on the PANDAS_0170 flag.
        return frame.sort_index() if PANDAS_0170 else frame.sort()

    def prepared(frame):
        return np.round(ordered(frame), decimals=-3)

    expected = pd.DataFrame({
        'COPPER': {
            ('World', '2012M01'): 8040.47,
            ('World', '2011M12'): 7565.48,
            ('World', '2011M11'): 7581.02,
            ('World', '2011M10'): 7394.19,
            ('World', '2011M09'): 8300.14,
            ('World', '2011M08'): 9000.76,
            ('World', '2011M07'): 9650.46,
            ('World', '2011M06'): 9066.85,
            ('World', '2011M05'): 8959.90,
            ('World', '2011M04'): 9492.79,
            ('World', '2011M03'): 9503.36,
            ('World', '2011M02'): 9867.60,
            ('World', '2011M01'): 9555.70,
        }
    })
    # Round, to ignore revisions to data.
    expected = ordered(np.round(expected, decimals=-3))

    cntry_codes = 'ALL'
    inds = 'COPPER'

    result = prepared(download(country=cntry_codes, indicator=inds,
                               start=2011, end=2012, freq='M',
                               errors='ignore'))

    if PANDAS_0140:
        expected.index.names = ['country', 'year']
    else:
        # Prior versions don't allow setting multiple names on a
        # MultiIndex, so overwrite the index with the one from the result.
        expected.index = result.index
    tm.assert_frame_equal(result, expected)

    result = prepared(WorldBankReader(inds, countries=cntry_codes,
                                      start=2011, end=2012, freq='M',
                                      errors='ignore').read())
    tm.assert_frame_equal(result, expected)
def test_wdi_download_w_crash_inducing_countrycode(self):
    """Skip (rather than fail) when the bogus country code ``XXX`` is used.

    A ``ValueError`` from ``download`` turns into a skip.  If the call
    succeeds and returns rows, the code ``XXX`` is now in use by the World
    Bank or the API changed unexpectedly, so the test is skipped as well.
    """
    query = dict(country=['CA', 'MX', 'US', 'XXX'],
                 indicator=['NY.GDP.PCAP.CD'],
                 start=2003, end=2004, errors='ignore')

    try:
        result = download(**query)
    except ValueError as e:
        raise nose.SkipTest("No indicators returned data: {0}".format(e))
    else:
        if len(result) > 0:
            raise nose.SkipTest("Invalid results")
def api_wb(params):
    """Download World Bank data described by ``params`` and store it in Mongo.

    ``params`` is read as a mapping with the keys ``indicator``, ``country``,
    ``start``, ``end`` (forwarded to ``wb.download``), ``col_rename``
    (column rename mapping) and ``collection_name`` (target collection).
    The download is joined with the ``country_code`` utility table on the
    country name, reduced to four columns and written with ``erase=True``.
    """
    from data_params import DATABASE

    country_codes = mongo_to_dataframe('utilities', 'country_code')

    downloaded = wb.download(indicator=params['indicator'],
                             country=params['country'],
                             start=params['start'],
                             end=params['end'])
    renamed = downloaded.reset_index().rename(columns=params['col_rename'])

    # Attach the country-code columns by matching on the country name.
    merged = pd.merge(renamed, country_codes,
                      left_on='country', right_on='country_name')
    selected = merged[['GDP_cst_dollars', 'ISO3', 'population', 'year']]

    logger.info('inserting df with shape: ' + str(selected.shape))
    dataframe_to_mongo(selected, DATABASE, params['collection_name'],
                       erase=True)
    logger.info('insertion sucessful in db' + DATABASE
                + ' of collection: ' + params['collection_name'])
def test_wdi_download_str(self):
    """Pass country and indicator as plain strings and compare the data.

    Japanese GDP per capita for 2000-2004 is fetched through ``download``
    and through ``WorldBankReader``; both results must match the reference
    values after sorting and rounding to the nearest thousand.
    """
    def ordered(frame):
        # Which sorting method is used depends on the PANDAS_0170 flag.
        return frame.sort_index() if PANDAS_0170 else frame.sort()

    def prepared(frame):
        return np.round(ordered(frame), decimals=-3)

    expected = pd.DataFrame({
        'NY.GDP.PCAP.CD': {
            ('Japan', '2004'): 36441.50449394,
            ('Japan', '2003'): 33690.93772972,
            ('Japan', '2002'): 31235.58818439,
            ('Japan', '2001'): 32716.41867489,
            ('Japan', '2000'): 37299.64412913,
        }
    })
    # Round, to ignore revisions to data.
    expected = ordered(np.round(expected, decimals=-3))

    cntry_codes = 'JP'
    inds = 'NY.GDP.PCAP.CD'

    result = prepared(download(country=cntry_codes, indicator=inds,
                               start=2000, end=2004, errors='ignore'))

    if PANDAS_0140:
        expected.index.names = ['country', 'year']
    else:
        # Prior versions don't allow setting multiple names on a
        # MultiIndex, so overwrite the index with the one from the result.
        expected.index = result.index
    tm.assert_frame_equal(result, expected)

    result = prepared(WorldBankReader(inds, countries=cntry_codes,
                                      start=2000, end=2004,
                                      errors='ignore').read())
    tm.assert_frame_equal(result, expected)
def test_wdi_download_quarterly(self):
    """Compare quarterly debt data from ``download`` and ``WorldBankReader``.

    Albanian ``DT.DOD.PUBS.CD.US`` values for 2011-2012 (``freq='Q'``) are
    fetched through both entry points.  Expected and actual frames are
    sorted and rounded to the nearest thousand before comparison so that
    data revisions do not break the test.
    """
    expected = {
        'DT.DOD.PUBS.CD.US': {
            ('Albania', '2012Q1'): 3240539817.18,
            ('Albania', '2011Q4'): 3213979715.15,
            ('Albania', '2011Q3'): 3187681048.95,
            ('Albania', '2011Q2'): 3248041513.86,
            ('Albania', '2011Q1'): 3137210567.92,
        }
    }
    expected = pd.DataFrame(expected)
    # Round, to ignore revisions to data.
    expected = np.round(expected, decimals=-3)
    if PANDAS_0170:
        expected = expected.sort_index()
    else:
        expected = expected.sort()

    cntry_codes = 'ALB'
    inds = 'DT.DOD.PUBS.CD.US'
    result = download(country=cntry_codes, indicator=inds,
                      start=2011, end=2012, freq='Q', errors='ignore')
    if PANDAS_0170:
        result = result.sort_index()
    else:
        result = result.sort()
    result = np.round(result, decimals=-3)

    if PANDAS_0140:
        expected.index.names = ['country', 'year']
    else:
        # Prior versions don't allow setting multiple names on a
        # MultiIndex, so overwrite the index with the one from the result.
        expected.index = result.index
    tm.assert_frame_equal(result, expected)

    result = WorldBankReader(inds, countries=cntry_codes,
                             start=2011, end=2012, freq='Q',
                             errors='ignore').read()
    if PANDAS_0170:
        result = result.sort_index()
    else:
        result = result.sort()
    # Use the same precision as ``expected`` (previously decimals=-1, which
    # only matched thanks to the comparison tolerance).
    result = np.round(result, decimals=-3)
    tm.assert_frame_equal(result, expected)
def __init__(self, indicator):
    """Load ``indicator`` from the local SQLite cache, downloading on a miss.

    The cache lives in ``cache.db`` (SQLite) with one table per indicator.
    When the table exists it is read back with ``country``/``year`` as the
    index.  Otherwise the indicator is downloaded for all countries with
    ``start=1960, end=2030``, rows with missing values are dropped, and the
    result is written to the cache.

    Sets ``self.indicator``, ``self.data``, ``self.start_date`` and
    ``self.end_date``.
    """
    self.indicator = indicator

    # Build cache if it does not already exist.
    engine = create_engine('sqlite:///cache.db')
    conn = engine.connect()
    try:
        if engine.dialect.has_table(engine, self.indicator):
            self.data = pandas.read_sql(self.indicator, conn,
                                        index_col=['country', 'year'])
        else:
            self.data = wb.download(indicator=self.indicator, country='all',
                                    start=1960, end=2030).dropna()
            self.data.to_sql(self.indicator, conn, if_exists='replace')
    finally:
        # Release the connection even if reading, downloading or writing
        # the cache raises.
        conn.close()

    # After unstacking, each column label is a tuple; its second element is
    # taken from the first and last column as the covered date range.
    unstacked_columns = self.data.unstack().columns
    self.start_date = unstacked_columns[0][1]
    self.end_date = unstacked_columns[-1][1]
def test_wdi_download_quarterly(self):
    """Compare quarterly debt data from ``download`` and ``WorldBankReader``.

    Albanian ``DT.DOD.PUBS.CD.US`` values for 2011-2012 (``freq="Q"``) are
    fetched through both entry points.  Expected and actual frames are
    sorted and rounded to the nearest thousand before comparison so that
    data revisions do not break the test.
    """
    code = "DT.DOD.PUBS.CD.US"
    expected = {
        code: {
            ("Albania", "2012Q1"): 3240539817.18,
            ("Albania", "2011Q4"): 3213979715.15,
            ("Albania", "2011Q3"): 3187681048.95,
            ("Albania", "2011Q2"): 3248041513.86,
            ("Albania", "2011Q1"): 3137210567.92,
        }
    }
    expected = pd.DataFrame(expected)
    # Round, to ignore revisions to data.
    expected = np.round(expected, decimals=-3)
    expected = expected.sort_index()

    cntry_codes = "ALB"
    inds = "DT.DOD.PUBS.CD.US"
    result = download(
        country=cntry_codes,
        indicator=inds,
        start=2011,
        end=2012,
        freq="Q",
        errors="ignore",
    )
    result = result.sort_index()
    result = np.round(result, decimals=-3)

    expected.index.names = ["country", "year"]
    tm.assert_frame_equal(result, expected)

    result = WorldBankReader(
        inds, countries=cntry_codes, start=2011, end=2012, freq="Q", errors="ignore"
    ).read()
    result = result.sort_index()
    # Use the same precision as ``expected`` (previously decimals=-1, which
    # only matched thanks to the comparison tolerance).
    result = np.round(result, decimals=-3)
    tm.assert_frame_equal(result, expected)
def test_wdi_download(self):
    """Download GDP per capita for a mixed bag of country codes and compare.

    The request mixes a valid and an invalid indicator with two-letter (US),
    three-letter (USA), standard (CA, MX), non-standard (KSV), duplicated
    (US, US, USA) and unknown (BLA) country codes.  It deliberately avoids
    a crash-inducing country code and a retired indicator.
    """
    cntry_codes = ['CA', 'MX', 'USA', 'US', 'US', 'KSV', 'BLA']
    inds = ['NY.GDP.PCAP.CD', 'BAD.INDICATOR']

    expected = {
        'NY.GDP.PCAP.CD': {
            ('Canada', '2003'): 28026.006013044702,
            ('Mexico', '2003'): 6601.0420648056606,
            ('Canada', '2004'): 31829.522562759001,
            ('Kosovo', '2003'): 1969.56271307405,
            ('Mexico', '2004'): 7042.0247834044303,
            ('United States', '2004'): 41928.886136479705,
            ('United States', '2003'): 39682.472247320402,
            ('Kosovo', '2004'): 2135.3328465238301,
        }
    }
    expected = pandas.DataFrame(expected)
    # Round, to ignore revisions to data.  DataFrame.round / sort_index
    # replace the removed ``pandas.np`` alias and ``DataFrame.sort``.
    expected = expected.round(decimals=-3).sort_index()

    result = download(country=cntry_codes, indicator=inds,
                      start=2003, end=2004, errors='ignore')
    # Sort first, then round, to ignore revisions to data.
    result = result.sort_index().round(decimals=-3)

    # Only the values are compared: the expected index is overwritten with
    # the (sorted) index of the result.
    expected.index = result.index
    assert_frame_equal(result, expected)
def featureappnd(ind,nm,ft):
    """Download indicator ``ind``, fill its gaps and merge it into ``maindata``.

    ind: World Bank indicator code passed to ``wb.download``.
    nm:  column name for the feature; also appended to the global ``clmns``.
    ft:  gap-filling strategy: ``"reg"`` (values from ``lnreg``), ``"mean"``
         (per-group mean), ``"sdp"`` / ``"sdn"`` (mean plus / minus
         ``(Year - 2010) * 0.5 * std``).  Any other value leaves gaps as NaN.

    Side effects: appends to the global ``clmns`` and rebinds the global
    ``maindata`` to its merge with the new feature frame.
    """
    clmns.append(nm)
    tmpdt=wb.download(country=u"all", indicator=ind,start=strtyear, end=now.year)
    tmpdt.columns=[nm]
    # Copy both index levels into regular columns (level 0 -> Country,
    # level 1 -> Year) and make the year numeric.
    tmpdt['Country']=[i[0] for i in tmpdt.index]
    tmpdt['Year']=[i[1] for i in tmpdt.index]
    tmpdt['Year']=tmpdt['Year'].apply(int)
    # Reverse the row order within each country.
    tmpdt =tmpdt.groupby("Country").transform(lambda x: x.iloc[::-1])
    # Restore the Country column from the index after the transform.
    tmpdt['Country']=[i[0] for i in tmpdt.index]
    tmpdt=tmpdt[["Country","Year",nm]]
    # NOTE(review): drops the first 658 rows by position; presumably these
    # are aggregate/region rows that precede the countries -- confirm.
    tmpdt=tmpdt[658:]
    # Zeros are treated as missing values.
    tmpdt[nm]=tmpdt[nm].replace(0, np.nan)
    # NOTE(review): the groupby key below is lower-case "country" while the
    # column is "Country"; presumably it resolves to the index level of that
    # name -- verify.
    if ft=="reg":
        tmpdt[nm] = tmpdt.groupby("country")[nm].transform(lambda x: x.fillna(lnreg(x,tmpdt['Year']-strtyear)))
    elif ft=="mean":
        tmpdt[nm] = tmpdt.groupby("country")[nm].transform(lambda x: x.fillna(x.mean()))
    elif ft=="sdp":
        tmpdt[nm] = tmpdt.groupby("country")[nm].transform(lambda x: x.fillna(x.mean()+(tmpdt["Year"]-2010)*0.5*np.std(x)))
    elif ft=="sdn":
        tmpdt[nm] = tmpdt.groupby("country")[nm].transform(lambda x: x.fillna(x.mean()-(tmpdt["Year"]-2010)*0.5*np.std(x)))
    global maindata
    # pd.merge with no explicit keys.
    maindata=pd.merge(maindata,tmpdt)
def test_wdi_download_quarterly(self):
    """Compare quarterly debt data from ``download`` and ``WorldBankReader``.

    Albanian ``DT.DOD.PUBS.CD.US`` values for 2011-2012 (``freq='Q'``) are
    fetched through both entry points.  Expected and actual frames are
    sorted and rounded to the nearest thousand before comparison so that
    data revisions do not break the test.
    """
    code = 'DT.DOD.PUBS.CD.US'
    expected = {
        code: {
            ('Albania', '2012Q1'): 3240539817.18,
            ('Albania', '2011Q4'): 3213979715.15,
            ('Albania', '2011Q3'): 3187681048.95,
            ('Albania', '2011Q2'): 3248041513.86,
            ('Albania', '2011Q1'): 3137210567.92,
        }
    }
    expected = pd.DataFrame(expected)
    # Round, to ignore revisions to data.
    expected = np.round(expected, decimals=-3)
    expected = expected.sort_index()

    cntry_codes = 'ALB'
    inds = 'DT.DOD.PUBS.CD.US'
    result = download(country=cntry_codes, indicator=inds,
                      start=2011, end=2012, freq='Q', errors='ignore')
    result = result.sort_index()
    result = np.round(result, decimals=-3)

    expected.index.names = ['country', 'year']
    tm.assert_frame_equal(result, expected)

    result = WorldBankReader(inds, countries=cntry_codes,
                             start=2011, end=2012, freq='Q',
                             errors='ignore').read()
    result = result.sort_index()
    # Use the same precision as ``expected`` (previously decimals=-1, which
    # only matched thanks to the comparison tolerance).
    result = np.round(result, decimals=-3)
    tm.assert_frame_equal(result, expected)
def test_wdi_download_str(self):
    """Pass country and indicator as plain strings and compare the data.

    The reference values are already rounded to the nearest thousand, which
    keeps the test robust against future data revisions.  Both ``download``
    and ``WorldBankReader`` are checked against them, in that order.
    """
    cntry_codes = "JP"
    inds = "NY.GDP.PCAP.CD"

    yearly = (
        ("2004", 38000.0),
        ("2003", 35000.0),
        ("2002", 32000.0),
        ("2001", 34000.0),
        ("2000", 39000.0),
    )
    expected = pd.DataFrame(
        {"NY.GDP.PCAP.CD": {("Japan", year): value for year, value in yearly}}
    ).sort_index()
    expected.index.names = ["country", "year"]

    def tidy(frame):
        # Sort, then round to the same precision as the reference values.
        return np.round(frame.sort_index(), decimals=-3)

    fetchers = (
        lambda: download(country=cntry_codes, indicator=inds,
                         start=2000, end=2004, errors="ignore"),
        lambda: WorldBankReader(inds, countries=cntry_codes,
                                start=2000, end=2004, errors="ignore").read(),
    )
    for fetch in fetchers:
        tm.assert_frame_equal(tidy(fetch()), expected)
def to_flourish(indicator, start_yr, end_yr, country='all', save_csv=True):
    """
    Download World Bank data and reshape it for Flourish bar chart races.

    Parameters:
    * indicator: the World Bank indicator code.
    * start_yr: the first year of data to fetch.
    * end_yr: the final year of data to fetch.
    * country: a string (single) or a list (multiple) of ISO3 location codes.
    * save_csv: when true, write the result (without index) to a
      timestamped CSV file under ``flourish_data/``.

    Returns the merged frame: country name, region, an 'Image URL' column
    with a flag URL built from the ISO2 code, and one column per year.
    Rows whose region is 'Aggregates' are excluded.
    """
    long_format = wb.download(indicator=indicator, country=country,
                              start=start_yr, end=end_yr).reset_index()
    # One row per country, one column per year.
    wide_format = pd.pivot_table(long_format, values=indicator,
                                 columns='year', index='country').reset_index()
    print("Processed the Indicator Data")

    countries = wb.get_countries()
    countries = countries[countries.region != 'Aggregates']

    df_merged = pd.merge(countries[['iso2c', 'name', 'region']], wide_format,
                         left_on='name', right_on='country')
    flag_urls = df_merged['iso2c'].apply(
        lambda i: f"https://www.countryflags.io/{i}/flat/64.png")
    df_merged.insert(3, 'Image URL', flag_urls)
    df_merged = df_merged.drop(columns=['iso2c', 'country'])

    if save_csv:
        timestamp = datetime.now().strftime('%d-%m-%Y %H-%M')
        df_merged.to_csv(
            f"flourish_data/flourish_{indicator}_{timestamp}.csv",
            index=False)
    return df_merged
def test_wdi_download_str(self):
    """Pass country and indicator as plain strings and compare the data.

    The reference values are already rounded to the nearest thousand, which
    keeps the test robust against future data revisions.  Both ``download``
    and ``WorldBankReader`` are checked against them, in that order.
    """
    cntry_codes = 'JP'
    inds = 'NY.GDP.PCAP.CD'

    yearly = (
        ('2004', 38000.0),
        ('2003', 35000.0),
        ('2002', 32000.0),
        ('2001', 34000.0),
        ('2000', 39000.0),
    )
    expected = pd.DataFrame(
        {'NY.GDP.PCAP.CD': {('Japan', year): value for year, value in yearly}}
    ).sort_index()
    expected.index.names = ['country', 'year']

    def tidy(frame):
        # Sort, then round to the same precision as the reference values.
        return np.round(frame.sort_index(), decimals=-3)

    fetchers = (
        lambda: download(country=cntry_codes, indicator=inds,
                         start=2000, end=2004, errors='ignore'),
        lambda: WorldBankReader(inds, countries=cntry_codes,
                                start=2000, end=2004, errors='ignore').read(),
    )
    for fetch in fetchers:
        tm.assert_frame_equal(tidy(fetch()), expected)
def test_wdi_download_error_handling(self):
    """Exercise the ``errors`` modes of ``download`` with invalid input.

    First an invalid country code (``XX``), then an invalid indicator
    (``BAD_INDICATOR``).  ``errors="raise"`` must raise ``ValueError`` with
    a matching message; ``errors="warn"`` must emit a warning and still
    return a two-row DataFrame.
    """
    cntry_codes = ["USA", "XX"]
    inds = "NY.GDP.PCAP.CD"

    msg = "Invalid Country Code\\(s\\): XX"
    with pytest.raises(ValueError, match=msg):
        download(
            country=cntry_codes,
            indicator=inds,
            start=2003,
            end=2004,
            errors="raise",
        )

    with pytest.warns(Warning):
        result = download(
            country=cntry_codes, indicator=inds, start=2003, end=2004, errors="warn"
        )
        assert isinstance(result, pd.DataFrame)
        # Was ``assert len(result), 2``, which only checked truthiness.
        assert len(result) == 2

    cntry_codes = ["USA"]
    inds = ["NY.GDP.PCAP.CD", "BAD_INDICATOR"]

    msg = "The provided parameter value is not valid\\. " "Indicator: BAD_INDICATOR"
    with pytest.raises(ValueError, match=msg):
        download(
            country=cntry_codes,
            indicator=inds,
            start=2003,
            end=2004,
            errors="raise",
        )

    with pytest.warns(Warning):
        result = download(
            country=cntry_codes, indicator=inds, start=2003, end=2004, errors="warn"
        )
        assert isinstance(result, pd.DataFrame)
        assert len(result) == 2
def chart1(indicator1, countryCode, startY, endY):
    """Render ``home.html`` with a bar chart of a World Bank indicator.

    indicator1:  indicator code passed to ``wb.download``.
    countryCode: country (or countries) passed to ``wb.download``.
    startY/endY: first and last year of the requested range.

    The downloaded frame is unstacked, printed, plotted as a bar chart and
    saved as PNG into memory.  The template receives the image as a base64
    ``data:`` URI in the ``picture`` variable.
    """
    import base64
    from io import BytesIO

    import matplotlib
    import pandas as pd

    # NOTE(review): presumably a compatibility shim for pandas_datareader
    # versions that look up ``is_list_like`` in ``pandas.core.common`` --
    # confirm it is still needed.
    pd.core.common.is_list_like = pd.api.types.is_list_like
    from pandas_datareader import wb

    # Select the non-interactive backend before pyplot is imported.
    matplotlib.use("agg")
    import matplotlib.pyplot as plt

    dat = wb.download(indicator=indicator1, country=countryCode,
                      start=startY, end=endY)
    data = dat.unstack()
    print(data)

    sio = BytesIO()
    try:
        data.plot(kind='bar')
        plt.savefig(sio, format='png')
    finally:
        # Always release the current figure, even if plotting or saving
        # fails, so figures do not pile up in pyplot's global state.
        plt.close()

    encoded = base64.encodebytes(sio.getvalue()).decode()
    picture = "data:image/png;base64," + encoded
    return render_template("home.html", picture=picture)
def get_wb_data(indicator, start_yr, end_yr, country='all', save_csv=False):
    """
    Download World Bank data and return it indexed by year.

    Parameters:
    * indicator: the World Bank indicator code.
    * start_yr: the first year of data to fetch.
    * end_yr: the final year of data to fetch.
    * country: a string (single) or a list (multiple) of ISO3 location codes.
    * save_csv: when true, write the result to a timestamped CSV file in the
      working directory.

    Rows with missing values are dropped.  The result keeps 'Year' as its
    index, has 'Region' as a regular column and is sorted by 'Region'.
    Also stores ``indicator`` in the module-level ``indicator_name``.
    """
    global indicator_name
    indicator_name = indicator

    cleaned = wb.download(indicator=indicator, country=country,
                          start=start_yr, end=end_yr).dropna()
    cleaned.index.names = ['Region', 'Year']
    # Move only the first index level (Region) into a column.
    by_year = cleaned.reset_index(level=0).sort_values(by='Region')

    if save_csv:
        timestamp = datetime.now().strftime('%d-%m-%Y %H-%M')
        # NOTE(review): index=False means the Year index is not written to
        # the file -- confirm that this is intended.
        by_year.to_csv(f"{indicator}_{timestamp}.csv", index=False)
    return by_year
def test_wdi_download_error_handling(self):
    """Exercise the ``errors`` modes of ``download`` with invalid input.

    First an invalid country code (``XX``), then an invalid indicator
    (``BAD_INDICATOR``).  ``errors='raise'`` must raise ``ValueError`` with
    a matching message; ``errors='warn'`` must emit a warning and still
    return a two-row DataFrame.
    """
    cntry_codes = ['USA', 'XX']
    inds = 'NY.GDP.PCAP.CD'

    msg = "Invalid Country Code\\(s\\): XX"
    with assert_raises_regex(ValueError, msg):
        download(country=cntry_codes, indicator=inds,
                 start=2003, end=2004, errors='raise')

    with tm.assert_produces_warning():
        result = download(country=cntry_codes, indicator=inds,
                          start=2003, end=2004, errors='warn')
        assert isinstance(result, pd.DataFrame)
        # Was ``assert len(result), 2``, which only checked truthiness.
        assert len(result) == 2

    cntry_codes = ['USA']
    inds = ['NY.GDP.PCAP.CD', 'BAD_INDICATOR']

    msg = ("The provided parameter value is not valid\\. "
           "Indicator: BAD_INDICATOR")
    with assert_raises_regex(ValueError, msg):
        download(country=cntry_codes, indicator=inds,
                 start=2003, end=2004, errors='raise')

    with tm.assert_produces_warning():
        result = download(country=cntry_codes, indicator=inds,
                          start=2003, end=2004, errors='warn')
        assert isinstance(result, pd.DataFrame)
        assert len(result) == 2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # needed by pd.read_stata / pd.read_excel below
from pandas_datareader import wb

# Remote locations of the tutorial data files.
path = "https://github.com/omercadopopular/cgoes/blob/master/tutorial/python/statatopython/PPI_DB_082316.dta?raw=true"
cpisauce = "https://github.com/omercadopopular/cgoes/blob/master/tutorial/python/statatopython/CPIAUCSL.xls?raw=true"
gdpsauce = "https://github.com/omercadopopular/cgoes/blob/master/tutorial/python/statatopython/gdp.xlsx?raw=true"

#####################################
# 1. Retrieve Databases #############
#####################################

## 1.1 Import GDP data from the World Bank
wbdata = (wb.download(indicator='NY.GDP.MKTP.CD', country='all', start=1994, end=2015)
          .dropna()
          .rename(columns={'NY.GDP.MKTP.CD': 'gdp'})
          )

## 1.2 Read file from STATA dta
ppidf = pd.read_stata(path)

## 1.3 Import CPI data from excel file
## Note you have to skip 9 rows
cpi = pd.read_excel(cpisauce, skiprows=9, header=1)

#####################################
# 2. Adjust Databases ###############
#####################################
import pandas as pd
import datetime
from pandas_datareader import data, wb

# Download three World Bank indicators for the USA, Turkey and Great Britain
# (1970-2016) and dump them to data.csv.
INDICATOR_CODES = ['SL.UEM.TOTL.ZS', 'NY.GDP.DEFL.KD.ZG', 'NE.RSB.GNFS.ZS']
COUNTRY_CODES = ['USA', 'TUR', 'GBR']

dat = wb.download(
    indicator=INDICATOR_CODES,
    country=COUNTRY_CODES,
    start=1970,
    end=2016,
)
dat.to_csv('data.csv')

# Disabled alternative: the same kind of series for Turkey from FRED.
# start=datetime.datetime(1970, 1, 1)
# end=datetime.datetime(2016, 1, 1)
# df = data.DataReader(['BPBLTT01TRA188S','LRUN64TTTRQ156S','FPCPITOTLZGTUR'], "fred", start, end)
# df.columns = ['tbal','unemploy','inf']
# df.to_csv('tr.csv')

# Disabled alternative: the same kind of series for the UK from FRED.
# start=datetime.datetime(1970, 1, 1)
# end=datetime.datetime(2016, 1, 1)
# df = data.DataReader(['BPBLTT01GBQ188S','LMUNRRTTGBM156S','FPCPITOTLZGGBR'], "fred", start, end)
# df.columns = ['tbal','unemploy','inf']
# df.to_csv('uk.csv')
def download_data(year):
    """Download two World Bank indicators for all countries in ``year``.

    Requests ``SH.STA.ACSN`` and ``SE.PRM.CMPT.FE.ZS`` for the single year
    ``year``, drops rows with missing values and relabels the two columns
    as ``sanitation`` and ``completion`` (by position).
    """
    frame = wb.download(
        indicator=['SH.STA.ACSN', 'SE.PRM.CMPT.FE.ZS'],
        country='all',
        start=year,
        end=year,
    ).dropna()
    frame.columns = ['sanitation', 'completion']
    return frame
from pandas_datareader import data, wb #import wbdata import pandas import matplotlib.pyplot as plt # #set up the countries I want # countries = ["CL","UY","HU"] # # #set up the indicator I want (just build up the dict if you want more than one) # indicators = {'SP.DYN.LE00.IN':'Life expectancy at birth, total (years)'} # # #grab indicators above for countires above and load into data frame # df = wbdata.get_dataframe(indicators, convert_date=False) # #wbdata.get_dataframe # #df is "pivoted", pandas' unstack fucntion helps reshape it into something plottable # dfu = df.unstack(level=0) # # # a simple matplotlib plot with legend, labels and a title # dfu.plot(); # plt.legend(loc='best'); # plt.title("GNI Per Capita ($USD, Atlas Method)"); # plt.xlabel('Date'); plt.ylabel('GNI Per Capita ($USD, Atlas Method'); ind = ['SP.DYN.LE00.IN'] #countries = ['iso2c'] dat = wb.download(indicator=ind, country='all', start=2013, end=2013).dropna() dat.columns = ['cellphone'] print(dat)
# Load the same CSV twice: once with column 3 as the index, once as-is.
life = pd.read_csv('subsaharan_africa.csv', index_col=3, na_values=None)
jk = pd.read_csv('subsaharan_africa.csv')

# <codecell>
life.columns

# Get the external dataset from the World Bank.
# We have selected the indicator "SP.POP.TOTL".
df = wb.download(
    # Specify indicator to retrieve
    indicator='SP.POP.TOTL',
    country=['all'],
    # Start Year (NOTE(review): passed as a string, while end is an int)
    start='2008',
    # End Year
    end=2016
)

# <codecell>
# Create a list with all the years as strings (1960 up to, not including, 2000).
all_year = [str(x) for x in range(1960, 2000)]
# Check what's inside all_year.
# NOTE(review): Python 2 print statement; it prints the bound method
# ``list.count`` itself, not the number of elements -- presumably
# ``len(all_year)`` was intended; confirm.
print all_year.count

# <codecell>
# Drop all columns with no values (currently disabled).
#life = life.dropna(axis=0)
def p148(steps):
    """Train a small LSTM on GDP per capita series and print its predictions.

    Downloads ``NY.GDP.PCAP.KD`` for eleven countries (1970-2016), scales
    each series by its year-2000 value, trains a one-layer LSTM followed by
    a linear layer to predict the next year's value from the current one,
    and prints the rescaled predictions from 2001 onwards.

    steps: number of iterations of the training loop.
    """
    countries = [
        'BR', 'CA', 'CN', 'FR', 'DE', 'IN', 'IL', 'JP', 'SA', 'GB', 'US',
    ]
    dat = wb.download(indicator='NY.GDP.PCAP.KD', country=countries, start=1970, end=2016)
    # Transpose the unstacked frame, then drop the first index level and
    # make the remaining index integer.
    df = dat.unstack().T
    df.index = df.index.droplevel(0).astype(int)

    class Net(torch.nn.Module):
        # LSTM followed by a linear layer with a single output feature.
        def __init__(self, input_size, hidden_size):
            super(Net, self).__init__()
            self.rnn = torch.nn.LSTM(input_size, hidden_size)
            self.fc = torch.nn.Linear(hidden_size, 1)

        def forward(self, x):
            # Add a trailing feature axis for the LSTM ...
            x = x[:, :, None]
            x, _ = self.rnn(x)
            x = self.fc(x)
            # ... and remove it again after the linear layer.
            x = x[:, :, 0]
            return x

    net = Net(input_size=1, hidden_size=5)

    # Normalize the data by the year-2000 values.
    df_scaled = df / df.loc[2000]

    # Determine the training set and the test set.
    years = df.index
    train_seq_len = sum((years >= 1971) & (years <= 2000))
    test_seq_len = sum(years > 2000)
    print('训练集长度 = {}, 测试集长度 = {}'.format(train_seq_len, test_seq_len))

    # Determine the features and labels used for training: the labels are
    # the inputs shifted by one row.
    inputs = torch.tensor(df_scaled.iloc[:-1].values, dtype=torch.float32)
    labels = torch.tensor(df_scaled.iloc[1:].values, dtype=torch.float32)

    # Train the network.
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters())
    for step in range(steps):
        if step:
            # Skipped on step 0: the update uses the loss computed at the
            # end of the previous iteration, which does not exist yet.
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

        preds = net(inputs)
        train_preds = preds[:train_seq_len]
        train_labels = labels[:train_seq_len]
        train_loss = criterion(train_preds, train_labels)

        test_preds = preds[train_seq_len:]
        test_labels = labels[train_seq_len:]
        test_loss = criterion(test_preds, test_labels)

        if step % 500 == 0:
            print('第{}次迭代: loss (训练集) = {}, loss (测试集) = {}'.format(
                step, train_loss, test_loss))

    # Final forward pass; undo the normalization before printing.
    preds = net(inputs)
    df_pred_scaled = pd.DataFrame(preds.detach().numpy(),
                                  index=years[1:], columns=df.columns)
    df_pred = df_pred_scaled * df.loc[2000]
    print(df_pred.loc[2001:])
from pandas_datareader import wb
import matplotlib.pyplot as plt

# Search the indicator catalogue for matching indicator names.
mathces = wb.search('gni.*capita.*const')

# Grab the indicator, countries and period I want and load into a data frame.
df = wb.download(indicator='NY.GNP.PCAP.CD', country=['CL', 'UY', 'HU'], start=1990, end=2010)

# df is "pivoted"; pandas' unstack function helps reshape it into something plottable.
dfu = df.unstack(level=0)

# A simple matplotlib plot with legend, labels and a title.
dfu.plot()
plt.legend(loc='best')
plt.title("GNI Per Capita ($USD, Atlas Method)")
plt.xlabel('Date')
# The y-axis label was missing its closing parenthesis.
plt.ylabel('GNI Per Capita ($USD, Atlas Method)')
plt.show()
# Import packages:
from pandas_datareader import wb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

# Import GDP data from the World Bank: GDP per capita for the US, 1990-2018,
# as a flat frame with an integer ``year`` column.
gdps_wb = (
    wb.download(indicator='NY.GDP.PCAP.KD', country=['US'], start=1990, end=2018)
    .rename(columns={'NY.GDP.PCAP.KD': 'gdp'})
    .reset_index()
)
gdps_wb.year = gdps_wb.year.astype(int)
gdps_wb.head(10)

# Same treatment for the GDP growth indicator.
gdpgrowth_wb = (
    wb.download(indicator='NY.GDP.MKTP.KD.ZG', country=['US'], start=1990, end=2018)
    .rename(columns={'NY.GDP.MKTP.KD.ZG': 'gdp_growth'})
    .reset_index()
)
gdpgrowth_wb.year = gdpgrowth_wb.year.astype(int)
gdpgrowth_wb.head(10)

# Import unemployment data from the Excel file:
unempl = pd.read_excel('Data.xlsx')
print(unempl)

# Change type:
"St. Vincent and the Grenadines":"Saint Vincent and the Grenadines", "Congo, Rep.":"Republic of the Congo", "Bahamas, The":"The Bahamas", "Gambia, The":"The Gambia" } for t in trans : s["Country/Region"] = s["Country/Region"].replace(t, trans[t]) return(s) if __name__ == "__main__": dsets = datasets.load() covid = datasets.combine(dsets) if (os.path.isfile(WDI_FILE)) : warnings.warn("Reading cached WDI data from disk, delete file to download updated") wdi = pd.read_pickle(WDI_FILE) else : wdi = covid.drop(columns=["Date","Province/State","Lat","Long", datasets.CONFIRMED,"deaths","recoveries"]).drop_duplicates() for id in INDICES_USED: s = wb.download(indicator=id, country="all", start=2005, end=2019).reset_index() # use most recent non missing value s = s.dropna().groupby("country").last() s = s.drop(columns="year").reset_index() # match country names to covid data s = s.rename(columns={"country":"Country/Region"}) s = fixcountrynames(s) wdi = pd.merge(wdi, s, how='left', on='Country/Region', validate="one_to_one") wdi.to_pickle(WDI_FILE)
import pandas as pd
from pandas_datareader import wb
import torch
import torch.nn
import torch.optim

# GDP per capita for eleven countries, 1970-2016.
countries = ['BR', 'CA', 'CN', 'FR', 'DE', 'IN', 'IL', 'JP', 'SA', 'GB', 'US']
dat = wb.download(
    indicator='NY.GDP.PCAP.KD',
    country=countries,
    start=1970,
    end=2016,
)
# Transpose the unstacked frame, then drop the first index level and make
# the remaining index integer.
df = dat.unstack().T
df.index = df.index.droplevel(0).astype(int)
print(df)


class Net(torch.nn.Module):
    """LSTM followed by a linear layer with a single output feature."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = torch.nn.LSTM(input_size, hidden_size)
        self.fc = torch.nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Add a trailing feature axis, run LSTM + linear, drop the axis."""
        expanded = x[:, :, None]
        hidden, _ = self.rnn(expanded)
        projected = self.fc(hidden)
        return projected[:, :, 0]


net = Net(input_size=1, hidden_size=5)
print(net)
####DATA PROJECT#### ###Data Cleaning and Structuring### ## Set up import pandas as pd import numpy as np import pandas_datareader import datetime from pandas_datareader import wb ##Downloand Data from the Wold Data Bank countries = ["CN","JP","BR","US","DK","ES","TM","IN","NG"] indicators = {"NY.GDP.PCAP.KD":"GDP per capita", "NY.GDP.MKTP.CD":"GDP(current US $)", "SP.POP.TOTL":"Population total", "SP.URB.TOTL.IN.ZS":"Urban Population in %", "SP.DYN.TFRT.IN":"Fertility Rate", "SE.ADT.LITR.ZS": "Literacy rate, adult total in %" } data_wb = wb.download(indicator= indicators, country= countries, start=1990, end=2017) data_wb = data_wb.rename(columns = {"NY.GDP.PCAP.KD":"gdp_pC","NY.GDP.MKTP.CD":"gdp", "SP.POP.TOTL":"pop", "SP.URB.TOTL.IN.ZS":"urban_pop%", "SP.DYN.TFRT.IN":"frt", "SE.ADT.LITR.ZS":"litr"}) data_wb = data_wb.reset_index() data_wb.head(-5) writer = pd.ExcelWriter('pandas_simple.xlsx', engine='xlsxwriter') data_wb.to_excel(r"./data_wb1.xlsx") ##Overview of the data data_wb.dtypes pd.options.display.float_format = '{:,}'.format round(data_wb.head(),2) data_wb['gdp_in_bil'] = data_wb['gdp']/1000000000
# In[23]: df1.Price.mean() # In[27]: get_ipython().system("pip3 install --upgrade pandas_datareader") from pandas_datareader import data, wb df_wb = wb.download( # Specify indicator to retrieve indicator="SP.POP.TOTL", country=["all"], # Start Year start="2008", # End Year end=2016, ) # In[28]: df_wb.shape # In[29]: df_wb.head()
def get_wb(wb_name):
    """Return one World Bank indicator as a (country x year) dataframe.

    The series is requested for all countries between the module-level
    ``start_year`` and ``today_year``. Rows and columns that contain no
    data at all are removed from the result.
    """
    downloaded = wb.download(
        indicator=wb_name,
        start=start_year,
        end=today_year,
        country="all",
    )
    # Years become columns; selecting the indicator drops the outer
    # column level.
    by_year = downloaded.unstack("year")[wb_name]
    without_empty_rows = by_year.dropna(how="all")
    return without_empty_rows.dropna(how="all", axis=1)
raw_unstacked_data = raw_data.unstack(level=0)
# printing our data object
# print(raw_data)
# print(raw_unstacked_data)

# =============================================================================
# # API method 2: using from pandas.datareader import wb, convert the data object to a DataFrame
# =============================================================================
# Limit how much of a frame is shown when printing.
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 15)

df1 = wb.download(
    indicator=indicators,
    country=countries,
    start=2008,
    end=2018,
)
date_period = list(range(2008, 2019))
print(df1)

# Work on a renamed copy (df2) so that later calculations leave the values
# downloaded into df1 untouched.
df2 = df1.rename(
    columns={
        'SI.DST.05TH.20': 'Income share held by highest 20%',
        'SI.DST.FRST.20': 'Income share held by lowest 20%',
        'SL.EMP.TOTL.SP.FE.NE.ZS': 'Employment to population ratio, 15+, female (%) (national estimate)',
        'SL.EMP.TOTL.SP.MA.NE.ZS': 'Employment to population ratio, 15+, male (%) (national estimate)',
    },
    inplace=False,
)

# Data manipulation: missing values are replaced by the column mean, which
# has little impact on the data set.
df2.mean()
df2.fillna(df2.mean(), inplace=True)
print(df2)
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 24 14:07:46 2016

@author: anh
"""

import pandas as pd
from pandas_datareader import wb
from ggplot import *

# Foreign direct investment, net inflows (current US$ and % of GDP), China.
dat = wb.download(
    indicator=['BX.KLT.DINV.CD.WD', 'BX.KLT.DINV.WD.GD.ZS'],
    country='CN', start=2005, end=2011)
dat.reset_index(inplace=True)
# Key step: the year must be a datetime for scale_x_date to work.
dat['year'] = pd.to_datetime(dat['year'])

# print() with parentheses is valid on both Python 2 and Python 3.
print(ggplot(aes(x='year', y='BX.KLT.DINV.CD.WD'), data=dat) +
      geom_line() + theme_bw() +
      scale_x_date(labels=date_format("%m - %Y")))

# After reset_index the country is an ordinary column, so the rows are
# selected with a boolean mask rather than an index lookup.
ggplot(dat[dat['country'] == 'China'], aes(x='year', y='BX.KLT.DINV.CD.WD')) + \
    geom_point()
#pip install linearmodels #Run in terminal
from linearmodels import PanelOLS
import numpy as np
import matplotlib.pyplot as plt
import pandas_datareader
from pandas_datareader import wb
import seaborn as sns

###### a. Downloading inflation and unemployment data from World Bank ######

# Subset of countries affected by ECB's QE
cntr_eu = ['DK', 'SE', 'FR', 'NL', 'DE', 'GB', 'BE', 'LU', 'AT', 'FI']
# Subset of countries not affected by ECB's QE
cntr_other = ['CA', 'CH', 'AU', 'NZ', 'SG', 'NO', 'US', 'JP', 'KR']

# Indicator FP.CPI.TOTL.ZG for both groups, 1991-2017.
infl_eu = wb.download(
    indicator='FP.CPI.TOTL.ZG', country=cntr_eu, start=1991, end=2017
)
infl_other = wb.download(
    indicator='FP.CPI.TOTL.ZG', country=cntr_other, start=1991, end=2017
)

# Indicator SL.UEM.TOTL.ZS for both groups, same period.
unem_eu = wb.download(
    indicator='SL.UEM.TOTL.ZS', country=cntr_eu, start=1991, end=2017
)
unem_other = wb.download(
    indicator='SL.UEM.TOTL.ZS', country=cntr_other, start=1991, end=2017
)
df1.Price

# In[23]:

df1.Price.mean()

# In[27]:

# Upgrade pandas_datareader from inside the notebook before importing it.
get_ipython().system('pip3 install --upgrade pandas_datareader')
from pandas_datareader import data, wb

# Total population (SP.POP.TOTL) for every country, 2008-2016.
df_wb = wb.download(
    indicator='SP.POP.TOTL',  # indicator to retrieve
    country=['all'],
    start='2008',  # start year
    end=2016,  # end year
)

# In[28]:

df_wb.shape

# In[29]:

df_wb.head()

# In[ ]:

# In[36]:
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 23 12:13:00 2020

@author: howard

Comparison chart of per-capita GDP for the United States, China and Japan
over roughly the last twenty years.
"""
from pandas_datareader import wb
import matplotlib.pyplot as plt

dat = wb.download(
    indicator='NY.GDP.PCAP.KD',
    country=['US', 'CN', 'JP'],
    start=2001,
    end=2021,
)
# Move the country index level into the columns: one column per country.
dat2draw = dat.unstack(level=0)

plt.figure(figsize=(10, 4))
# Columns are picked by position; the labels assume the order
# China, Japan, United States -- TODO confirm against dat2draw.columns.
plt.plot(dat2draw.iloc[:, 0], 'r-', label="China")
plt.plot(dat2draw.iloc[:, 1], 'b-*', label="Japan")
plt.plot(dat2draw.iloc[:, 2], 'g--', label="USA")
plt.title("PER CAPITA GDP ($)", fontsize=20)
plt.legend()
plt.pause(0)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # needed by pd.read_stata / pd.read_excel below
from pandas_datareader import wb

# Remote locations of the input files.
path = "https://github.com/omercadopopular/cgoes/blob/master/tutorial/python/statatopython/PPI_DB_082316.dta?raw=true"
cpisauce = "https://github.com/omercadopopular/cgoes/blob/master/tutorial/python/statatopython/CPIAUCSL.xls?raw=true"
gdpsauce = "https://github.com/omercadopopular/cgoes/blob/master/tutorial/python/statatopython/gdp.xlsx?raw=true"

#####################################
# 1. Retrieve Databases #############
#####################################

## 1.1 Import GDP data from the World Bank
wbdata = wb.download(
    indicator='NY.GDP.MKTP.CD',
    country='all',
    start=1994,
    end=2015,
)
# Drop rows with missing values and give the series a short column name.
wbdata = wbdata.dropna()
wbdata = wbdata.rename(columns={'NY.GDP.MKTP.CD': 'gdp'})

## 1.2 Read file from STATA dta
ppidf = pd.read_stata(path)

## 1.3 Import CPI data from excel file
## Note you have to skip 9 rows
cpi = pd.read_excel(cpisauce, skiprows=9, header=1)

#####################################
# 2. Adjust Databases ###############
#####################################
import warnings

# Silence FutureWarning before the third-party imports below run.
warnings.simplefilter('ignore', FutureWarning)

from pandas_datareader import wb
import matplotlib.pyplot as plt

# Total population (SP.POP.TOTL) of Japan and the United States, 1960-2014.
df = wb.download(
    indicator='SP.POP.TOTL',
    country=['JP', 'US'],
    start=1960,
    end=2014,
)
print(df)
#                     SP.POP.TOTL
# country       year
# Japan         2014    127276000
#               2013    127445000
#               2012    127629000
#               2011    127833000
#               2010    128070000
# ...                         ...
# United States 1964    191889000
#               1963    189242000
#               1962    186538000
#               1961    183691000
#               1960    180671000
#
# [110 rows x 1 columns]

# Move the country index level into the columns: one column per country.
df2 = df.unstack(level=0)
print(df2.head())
#         SP.POP.TOTL
# country       Japan United States
# year
#---- ch05/pandas-fred-ma
# Centered moving averages of inflation over 3 and 9 observations.
price['ma3'] = price['inflation'].rolling(window=3, center=True).mean()
price['ma9'] = price['inflation'].rolling(window=9, center=True).mean()
price

#---- ch05/pandas-fred-ma-plot/plot
# Faint raw series with both moving averages drawn on the same axes.
ax = price['inflation'].plot(alpha=0.2)
price[['ma3', 'ma9']].plot(ax=ax)
plt.show()

#---- ch05/wb-gdp
from pandas_datareader import wb
gdp = wb.download(
    indicator='NY.GDP.PCAP.CD',
    country='all',
    start=1960,
    end=2010,
)
gdp

#---- ch05/wb-gdp-pivot
# Log-transform the indicator, then reshape to one row per year and one
# column per country.
gdp_pivot = gdp.reset_index()
gdp_pivot['NY.GDP.PCAP.CD'] = np.log(gdp_pivot['NY.GDP.PCAP.CD'])
gdp_pivot = gdp_pivot.pivot(
    index='year',
    columns='country',
    values='NY.GDP.PCAP.CD',
)
gdp_pivot.index = gdp_pivot.index.astype('uint64')

#---- ch05/wb-gdp-pivot/dnr
gdp_pivot
start = dt.datetime(2010, 1, 1) # start date codes = ['GDPC1', 'PCECC96'] # real GDP, real consumption fred = data.DataReader(codes, 'fred', start) fred = fred/1000 # convert billions to trillions fred.plot() #%% # World Bank from pandas_datareader import wb # World Bank api var = ['NY.GDP.PCAP.PP.KD'] # GDP per capita iso = ['USA', 'FRA', 'JPN', 'CHN', 'IND', 'BRA', 'MEX'] # country codes year = 2013 wbdf = wb.download(indicator=var, country=iso, start=year, end=year) #%% wbdf = wbdf.reset_index(level='year', drop=True) wbdf.plot(kind='barh') #%% # Fama-French equity returns from pandas_datareader import data # Package to access FF ff = data.DataReader('F-F_Research_Data_factors', 'famafrench')[0] ff.columns = ['xsm', 'smb', 'hml', 'rf'] # rename variables #%% """ Review
import pandas as pd
from pandas_datareader import wb
import matplotlib.pyplot as plt

# Look up indicators whose name matches "gdp ... capita ... const".
mathces = wb.search('gdp.*capita.*const')

# GDP per capita (indicator NY.GDP.PCAP.KD) for China, 2010-2017.
dat = wb.download(indicator='NY.GDP.PCAP.KD', country='CN', start=2010, end=2017)

# stack() keeps the rows in download order, and the World Bank API returns
# the newest year first. Sort the index so the printed series and the line
# plot both run forward in time.
data = dat.stack().sort_index()
print(data)

data.plot(kind='line')
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets

#continents = ['DK','ZA','US','GB','CN','IN','BR','CA','RU','TR','KR','VN','SE','DE','AL','FR','BG','IT','PK','ID','MX','PL']
continents = ['WLD', 'TSA', 'TMN', 'ECS', 'SSF', 'NAC', 'LCN']

from pandas_datareader import wb

# Total population (SP.POP.TOTL), 1970-2015.
pop = wb.download(
    indicator='SP.POP.TOTL', country=continents, start=1970, end=2015
)
pop.head(3)

# GDP (NY.GDP.MKTP.KD), same codes and years.
gdp = wb.download(
    indicator='NY.GDP.MKTP.KD', country=continents, start=1970, end=2015
)
gdp.head(3)

# Merging data:
merged = gdp.merge(pop, how='inner', on=['country', 'year'])
merged = merged.reset_index()
merged = merged.rename(
    columns={
        'country': 'continent',
        'NY.GDP.MKTP.KD': 'gdp',
        'SP.POP.TOTL': 'pop',
    }
)
merged['gdp_cap'] = merged['gdp'] / merged['pop']

# Sorting data:
merged = merged.sort_values(by=['continent', 'year'])
merged = merged.reset_index(drop=True)
merged.head()

# Indexing:
x11.append(j) x1=np.array(x11) y1=np.array(y11) if len(y1)>4: m=((x1.mean()*y1.mean())-(x1*y1).mean())/((x1.mean()*x1.mean())-(x1*x1).mean()) b=y1.mean()-(m*x1.mean()) return (m*p)+b else: return y1.mean() clmns=[] #Stores names of all the columns used '''Initiating Dataset with ease of doing business''' indic="Ease of Doing Business" clmns.append(indic) maindata=wb.download(country=u"all", indicator="IC.BUS.EASE.XQ",start=strtyear, end=now.year) maindata.columns=[indic] maindata['Country']=[i[0] for i in maindata.index] maindata['Year']=[i[1] for i in maindata.index] maindata['Year']=maindata['Year'].apply(int) maindata = maindata.groupby("Country").transform(lambda x: x.iloc[::-1]) maindata['Country']=[i[0] for i in maindata.index] maindata=maindata[["Country","Year",indic]] maindata=maindata[658:] maindata[indic]=maindata[indic].replace(0, np.nan) maindata[indic] = maindata.groupby("country")[indic].transform(lambda x: x.fillna(x.mean())) '''Function to append Features''' def featureappnd(ind,nm,ft): clmns.append(nm) tmpdt=wb.download(country=u"all", indicator=ind,start=strtyear, end=now.year)
def country_DataFrame_to_list(country, target_data, start=2008, end=2018):
    """Download a World Bank series and return it as a rounded, reversed list.

    Args:
        country: Country code(s), passed straight to ``wb.download``.
        target_data: Indicator code(s), passed straight to ``wb.download``.
            Only the first resulting column is returned.
        start: First year to request. Defaults to 2008, the value that was
            previously hard-coded.
        end: Last year to request. Defaults to 2018, the value that was
            previously hard-coded.

    Returns:
        The values of the first column, with missing entries replaced by
        the column mean, rounded to two decimals and in reverse download
        order.
    """
    df = wb.download(indicator=target_data, country=country,
                     start=start, end=end)
    # Fill gaps with the per-column mean; rebinding instead of
    # inplace=True avoids mutating the downloaded frame.
    df = df.fillna(df.mean())
    values = df[df.columns[0]].tolist()
    rounded = [round(value, 2) for value in values]
    # Reverse the download order (presumably newest-first becomes
    # oldest-first -- confirm against the API's ordering).
    return rounded[::-1]
#import matplotlib.pyplot as plt
from pandas_datareader import wb
import quandl

# NOTE(review): `dt` and `web` are used below but not imported in the
# visible part of the file -- confirm they are imported above.

##GETTING DATA FROM WEB#####
# Ticker "BRJ8" from the 'moex' data source, 2018-01-01 until today.
tickers = "BRJ8"
startDate = '2018-1-1'
endDate = dt.datetime.today()
mgWeb = web.DataReader(tickers, 'moex', startDate, endDate)
mgWeb.to_excel('recData/BRENT_FUTURE_from_web.xlsx')

####GETTING DATA FROM STOOQ####
# Ticker '^SPX' from the 'stooq' data source over the same period.
tickers = ['^SPX']
dataSource = 'stooq'
startDate = '2018-1-1'
endDate = dt.datetime.today()
gWeb = web.DataReader(tickers, dataSource, startDate, endDate)
gWeb.to_excel('recData/SP500_from_web.xlsx')

# World Bank indicator NY.GDP.MKTP.CD for 'RU', 2005-2008.
worldBank = wb.download(
    indicator='NY.GDP.MKTP.CD',
    country=['RU'],
    start=2005,
    end=2008,
)
worldBank.to_excel('recData/WB_from_web.xlsx')

# Quandl dataset BANKRUSSIA/KEYECIND.
keyIndicies = quandl.get('BANKRUSSIA/KEYECIND')
keyIndicies.to_excel('recData/CB_from_web.xlsx')

# Keep three columns of the MOEX frame and derive two more.
mgWeb = mgWeb[['VALUE', 'OPEN', 'LOW']]
# NOTE(review): 57 looks like a fixed RUB/USD exchange rate -- confirm.
mgWeb = mgWeb.assign(LowInUSD=mgWeb['LOW'] / 57)
mgWeb = mgWeb.assign(Numbers=mgWeb['VALUE'] / mgWeb['LOW'])
print(mgWeb)