def fetch(self, series_meta):
    """
    Initial stab at querying. Will refactor code into a subclass...

    Can only support single series queries...

    :param series_meta: econ_platform_core.SeriesMetaData
    :return: list
    """
    query_ticker = str(series_meta.ticker_query)
    df = dbnomics.fetch_series(query_ticker)
    tickers = set(df.series_code)
    if len(tickers) > 1:
        raise NotImplementedError('Multiple series queries not yet supported')
    ser = pandas.Series(df.value)
    ser.index = df.period
    ser.name = '{0}@{1}/{2}/{3}'.format(self.ProviderCode, df['provider_code'][0],
                                        df['dataset_code'][0], df['series_code'][0])
    # Convert 'NA' to NaN.
    # BUG FIX: Series.replace() returns a new Series; the original discarded
    # the result, so 'NA' strings were left in the returned data.
    ser = ser.replace('NA', numpy.nan)
    # Always return a list of series. Only the user interface will convert list to a single pandas.Series
    return [ser, ]
def OECD_data(code, indicators, dset, periods):
    """Fetch each OECD indicator series for *code* into the global ``codes`` table."""
    global codes
    print('Fetching data ' + str(indicators) + '...')
    for indicator in indicators:
        ticker = 'OECD/{0}/{1}.{2}.{3}'.format(dset, code, indicator, periods)
        codes[code][indicator] = fetch_series(ticker)
    print("Done")
def imf_WEOdata(code, indicators):
    """Fetch each IMF WEO indicator series for *code* into the global ``codes`` table."""
    global codes
    print('Fetching data ' + str(indicators) + '...')
    for indicator in indicators:
        codes[code][indicator] = fetch_series('IMF/WEO/{0}.{1}'.format(code, indicator))
    print("Done")
def imf_CPIdata(code, indicators, periods):
    """Fetch each IMF CPI indicator series for *code* into the global ``codes`` table."""
    global codes
    print('Fetching data ' + str(indicators) + '...')
    for indicator in indicators:
        codes[code][indicator] = fetch_series('IMF/CPI/{0}.{1}.{2}'.format(periods, code, indicator))
    print("Done")
def getCPI():
    """
    Fetch UK and euro-area harmonised CPI (monthly, percent change) from
    DB.nomics, align the two series on date, and return the normalised
    UK/EU CPI ratio per date.

    :return: dict mapping period (date) -> normalised UK/EU CPI ratio
    """
    ukCPI = fetch_series('IMF/CPI/M.GB.PCPIHA_PC_CP_A_PT')
    euCPI = fetch_series('IMF/CPI/M.U2.PCPIHA_PC_CP_A_PT')
    # START_DATE is a module-level constant.
    dbnomicsQuery = "period >= '" + START_DATE + "'"
    ukCPI = ukCPI.query(dbnomicsQuery)
    euCPI = euCPI.query(dbnomicsQuery)
    ukCPIDict = {
        ukCPI.period.iloc[i]: ukCPI.value.iloc[i]
        for i in range(len(ukCPI))
    }
    euCPIDict = {
        euCPI.period.iloc[i]: euCPI.value.iloc[i]
        for i in range(len(euCPI))
    }
    dates = []
    ukCPIarr = []
    euCPIarr = []
    # Match the dates; a euro-area date with no observation falls back to 0
    # (which makes the ratio inf for that date, as before).
    for k, v in ukCPIDict.items():
        match = euCPIDict.get(k, 0)
        ukCPIarr.append(v)
        euCPIarr.append(match)
        dates.append(k)
    # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented drop-in replacement.
    ukCPIarr = np.array(ukCPIarr, dtype=float)
    euCPIarr = np.array(euCPIarr, dtype=float)
    ukEuCPIRatio = ukCPIarr / euCPIarr
    # The mean and std of the inflation data that was used in training
    # This keeps the normalisation consistent
    cpi_mean = 0.3957793542714463
    cpi_std = 0.9813408912397396
    ukEuCPIRatio = (ukEuCPIRatio - cpi_mean) / cpi_std
    # Return a dictionary of all inflation data
    cpiDict = {dates[i]: ukEuCPIRatio[i] for i in range(len(dates))}
    return cpiDict
def get_dbnomics_data(self, country: str, indicator_list: list, start_date=None,
                      end_date=None, freq='A', counterparty=None, flow=None) -> pd.DataFrame:
    """
    get api data from dbnomics website
    :param country: str
    :param indicator_list: list of indicator codes
    :param freq: 'Q', 'A', or 'M'
    :param start_date: 'yyyy-mm-dd'
    :param end_date: 'yyyy-mm-dd'
    :param counterparty: str
    :param flow: 'M' for import, 'X' for export
    :return: Data Frame
    """
    # Series ids known to be absent upstream; skip them instead of retrying.
    known_missing = {
        'IMF/BOP/A.CA.BSM_BP6_USD', 'IMF/BOP/A.ID.BSTV_BP6_USD',
        'IMF/BOP/A.SA.BSR_BP6_USD', 'IMF/BOP/A.SA.BSM_BP6_USD',
        'IMF/BOP/A.SA.BSOPCR_BP6_USD', 'IMF/BOP/Q.CA.BSM_BP6_USD',
        'IMF/BOP/Q.ID.BSTV_BP6_USD', 'IMF/BOP/Q.SA.BSR_BP6_USD',
        'IMF/BOP/Q.SA.BSM_BP6_USD', 'IMF/BOP/Q.SA.BSOPCR_BP6_USD',
    }
    result = []
    for indicator in indicator_list:
        dbnomics_code = self.get_dbnomics_code(country, indicator, freq, counterparty, flow)
        # fetch data from dbnomics, update fetched data
        try:
            df = dbnomics.fetch_series(dbnomics_code)
            df = self.__update_fetched_data(df, indicator, start_date, end_date, freq=freq)
        except Exception:  # narrowed from a bare except; kept broad since network errors vary
            if dbnomics_code in known_missing:
                # BUG FIX: the original fell through and formatted a stale
                # (or, on the first iteration, undefined) df; skip instead.
                continue
            # TODO: use pass when dbnomics connection is stable!
            # BUG FIX: retry with [indicator] — passing the bare string made
            # the recursive call iterate over its characters.
            return self.get_dbnomics_data(country, [indicator], start_date, end_date,
                                          freq, counterparty, flow)
        df = self.__format_dataframe(df, country, indicator, dbnomics_code, freq, counterparty)
        result.append(df)
    return pd.concat(result, axis=0)
def test_fetch_series_of_dataset():
    """Fetching AMECO/ZUTN without a series mask returns the whole dataset."""
    df = fetch_series('AMECO', 'ZUTN')
    providers = df["provider_code"].unique()
    assert len(providers) == 1, df
    assert providers[0] == "AMECO", df
    datasets = df["dataset_code"].unique()
    assert len(datasets) == 1, df
    assert datasets[0] == "ZUTN", df
    assert len(df["series_code"].unique()) > 1, df
def test_fetch_series_by_code_mask_with_plus_in_dimension_code():
    """A quoted '+' inside a dimension code must not act as a mask separator."""
    df = fetch_series('SCB', 'AKIAM', '"J+K"+"G+H".AM0301C1')
    for column, expected in (("provider_code", "SCB"), ("dataset_code", "AKIAM")):
        values = df[column].unique()
        assert len(values) == 1, df
        assert values[0] == expected, df
    series_codes = df["series_code"].unique()
    assert set(series_codes) == {'J+K.AM0301C1', 'G+H.AM0301C1'}, series_codes
def test_fetch_series_by_code_mask():
    """A 2x2 code mask (two countries x two indicators) yields four series."""
    df = fetch_series("IMF", "CPI", "M.FR+DE.PCPIEC_IX+PCPIA_IX")
    for column, expected in (("provider_code", "IMF"), ("dataset_code", "CPI")):
        values = df[column].unique()
        assert len(values) == 1, df
        assert values[0] == expected, df
    assert len(df["series_code"].unique()) == 4, df
def test_fetch_series_by_id():
    """A full provider/dataset/series id returns exactly that one series."""
    df = fetch_series('AMECO/ZUTN/EA19.1.0.0.0.ZUTN')
    for column, expected in (("provider_code", "AMECO"),
                             ("dataset_code", "ZUTN"),
                             ("series_code", "EA19.1.0.0.0.ZUTN")):
        values = df[column].unique()
        assert len(values) == 1, df
        assert values[0] == expected, df
def test_fetch_series_by_dimension():
    """Selecting by dimension values returns series from the one dataset only."""
    dimensions = {
        "country": ["FR", "IT", "ES"],
        "indicator": ["IC.REG.COST.PC.FE.ZS.DRFN"],
    }
    df = fetch_series("WB", "DB", dimensions=dimensions)
    for column, expected in (("provider_code", "WB"), ("dataset_code", "DB")):
        values = df[column].unique()
        assert len(values) == 1, df
        assert values[0] == expected, df
    assert len(df["series_code"].unique()), df
def test_fetch_series_by_ids_in_same_dataset():
    """Two ids from one dataset come back together, in request order."""
    df = fetch_series([
        'AMECO/ZUTN/EA19.1.0.0.0.ZUTN',
        'AMECO/ZUTN/DNK.1.0.0.0.ZUTN',
    ])
    providers = df["provider_code"].unique()
    assert len(providers) == 1, df
    assert providers[0] == "AMECO", df
    datasets = df["dataset_code"].unique()
    assert len(datasets) == 1, df
    assert datasets[0] == "ZUTN", df
    series_codes = df["series_code"].unique()
    assert len(series_codes) == 2, df
    assert series_codes[0] == "EA19.1.0.0.0.ZUTN", df
    assert series_codes[1] == "DNK.1.0.0.0.ZUTN", df
def test_fetch_series_by_ids_in_different_datasets():
    """Ids spanning two providers/datasets are all fetched, in request order."""
    df = fetch_series([
        'AMECO/ZUTN/EA19.1.0.0.0.ZUTN',
        'BIS/PP-SS/Q.AU.N.628',
    ])
    assert list(df["provider_code"].unique()) == ["AMECO", "BIS"], df
    assert list(df["dataset_code"].unique()) == ["ZUTN", "PP-SS"], df
    assert list(df["series_code"].unique()) == ["EA19.1.0.0.0.ZUTN", "Q.AU.N.628"], df
def fetch(self, series_meta):
    """
    Initial stab at querying. Will refactor code into a subclass...

    Can only support single series queries...

    Side effects: fills in series_meta.frequency, series_name,
    series_description and ProviderMetadata from the fetched dataframe.

    :param series_meta: econ_platform_core.SeriesMetadata
    :return: pandas.Series
    """
    query_ticker = str(series_meta.ticker_query)
    df = dbnomics.fetch_series(query_ticker)
    tickers = set(df.series_code)
    if len(tickers) > 1:
        raise NotImplementedError('Multiple series queries not yet supported')
    ser = pandas.Series(df.value)
    ser.index = df.period
    ser.name = '{0}@{1}/{2}/{3}'.format(self.ProviderCode, df['provider_code'][0],
                                        df['dataset_code'][0], df['series_code'][0])
    # Convert 'NA' to NaN
    ser = ser.replace('NA', numpy.nan)
    # Get metadata from the first row (single-series query, so one row suffices).
    colz = df.columns
    if 'FREQUENCY' in colz:
        series_meta.frequency = df['FREQUENCY'][df.index[0]]
    if 'series_name' in colz:
        series_meta.series_name = df['series_name'][df.index[0]]
        series_meta.series_description = '{0} : DB.nomics series {1}'.format(
            series_meta.series_name, query_ticker)
    # Columns that hold the observations themselves, not provider metadata.
    excluded = ('value', 'period', 'original_period', 'indexed_at')
    for c in colz:
        if c in excluded:
            continue
        series_meta.ProviderMetadata[c] = df[c][df.index[0]]
    return ser
def getEuroFundamentals():
    """
    Fetch quarterly GDP and export series (chain-linked 2005 MEUR, NSA) for a
    set of European countries from Eurostat via DB.nomics, trim each to 2010
    onward, drop the metadata columns, and pickle the tuple of dataframes.

    Side effects: prints progress, widens pandas display options, and writes
    'euroFundamentalData.pkl'.
    """
    print()
    gdp_ticker = 'Eurostat/namq_10_gdp/Q.CLV05_MEUR.NSA.B1GQ.{0}'
    exports_ticker = 'Eurostat/namq_10_exi/Q.CLV05_MEUR.NSA.P6.{0}'
    # Same (GDP, exports) country order as the original hand-written variables:
    # UK, France, Sweden, Greece, Luxembourg, Spain, Netherlands.
    frames = []
    for geo in ('UK', 'FR', 'SE', 'EL', 'LU', 'ES', 'NL'):
        frames.append(fetch_series(gdp_ticker.format(geo)))
        frames.append(fetch_series(exports_ticker.format(geo)))
    toSave = tuple(frames)
    # Columns carrying Eurostat/DB.nomics metadata rather than observations.
    metadata_columns = [
        'National accounts indicator (ESA 2010)',
        'Geopolitical entity (reporting)',
        'Seasonal adjustment',
        'Unit of measure',
        '@frequency',
        'provider_code',
        'dataset_code',
        'dataset_name',
        'series_code',
        'series_name',
        'original_period',
        'Frequency',
        'FREQ',
        'unit',
        's_adj',
        'na_item',
        'geo',
        'original_value',
    ]
    # FIX: the original loop reused the name 'ukGDP' for every frame (an
    # acknowledged hack) and issued 18 separate drop() calls per frame.
    for frame in toSave:
        frame.query("period >= '2010'", inplace=True)
        print("running")
        frame.drop(metadata_columns, axis=1, inplace=True)
    pandas.set_option('display.max_rows', None)
    pandas.set_option('display.max_columns', None)
    print(toSave)
    # FIX: the original passed an unclosed file handle to pickle.dump.
    with open('euroFundamentalData.pkl', 'wb') as pkl_file:
        pickle.dump(toSave, pkl_file)
# Demo script: fetch a single DB.nomics series and walk the user through the
# structure of the dataframe the client returns.
try:
    import pandas
    import dbnomics
except ImportError:
    # Both packages are required; point the user at pip before re-raising.
    print('Need to install pandas and dbnomics - pip install dbnomics')
    raise

print("""Fetch a single series: Greek nominal GDP.
Got the code from the series webpage.
Code: 'Eurostat/namq_10_gdp/Q.CP_MEUR.SCA.B1GQ.EL'
URL: https://db.nomics.world/Eurostat/namq_10_gdp/Q.CP_MEUR.SCA.B1GQ.EL
""")
df1 = dbnomics.fetch_series('Eurostat/namq_10_gdp/Q.CP_MEUR.SCA.B1GQ.EL')
print('Object returned by fetch_series:')
print('Type:', type(df1))
print('Some of the data')
print(df1.head())
input('Hit Return to continue')
print('Columns of the dataframe:')
print(df1.columns)
print('This is a lot of columns. Useful, but often extraneous for analysis.')
# The observations live in 'value'; the time axis lives in 'period'.
print('The values of the time series:')
print(df1['value'])
print('This is indexed by the dataframe row number. We also need the time axis [period]')
print(df1['period'])
input('Hit any key; attempting to plot')
def getSeries(provider, dataset, df_variable, start_date, end_date, flat_list):
    '''
    Fetch the serie from the website and compute correlation between the series and the variable of interest
    :param provider: Provider of the dataset, eg: AMECO, WTO, NBB, ...
    :param dataset: Name of the dataset
    :param df_variable: Variable of interest in the format date - value
    :param start_date: start date of the variable of interest
    :param end_date: end date of the variable of interest
    :param flat_list: country list
    :return: returns a dataset containing the correlation between the variable of interest and all
        external variables in the dataset (empty DataFrame for annual-only datasets, None on failure)
    '''
    # NOTE(review): var_name and corr_threshold are read from module scope —
    # confirm they are defined before this is called.
    try:
        df_series = fetch_series(provider_code=provider, dataset_code=dataset, max_nb_series=10000)
        df_series.columns = map(str.lower, df_series.columns)
        # Annual-only datasets cannot be aligned with the monthly target.
        if set(df_series['@frequency']) == {'annual'}:
            return pd.DataFrame()
        if 'country' in df_series.columns:
            df_series['country'] = df_series['country'].str.lower()
            df_series = df_series[df_series['country'].isin(flat_list)]
        df_mapping = df_series[['provider_code', 'series_code', 'dataset_name',
                                'series_name']].drop_duplicates()
        df_series = df_series[['provider_code', 'dataset_code', 'series_code',
                               'period', 'value', '@frequency']]
        df_series = df_series[(df_series['period'] >= start_date) & (df_series['period'] <= end_date)]
        if not df_series.empty:
            df_variable = df_variable.reset_index(drop=True)
            df_monthly = make_monthly(df_series, start_date, end_date)
            df_all = pd.merge(df_variable, df_monthly, how='outer', on='date')
            df_all = df_all.sort_values('date').reset_index(drop=True)
            # Keep rows where at least one series (besides the date) has data.
            df_all = df_all[df_all.count(axis=1) > 1]
            cols = df_all.columns.difference(['date', 'provider_code', 'dataset_code'])
            cols2 = list(set(df_series['series_code']))
            df_all[cols] = df_all[cols].pct_change().fillna(0).replace([np.inf, -np.inf], 0)
            # BUG FIX: iterate series in the OUTER loop so the correlation
            # values line up with the labels below — np.repeat(cols2, 13) is
            # series-major (each code repeated for lags 0..12), but the
            # original comprehension was lag-major, misaligning value/label
            # whenever len(cols2) != 13.
            corr = [crosscorr(df_all[var_name], df_all[var], i)
                    for var in cols2 for i in range(0, 13)]
            df_corr = pd.DataFrame(data={'series_code': list(np.repeat(cols2, 13)),
                                         'Time': list(range(0, 13)) * len(cols2),
                                         'Correlation': corr})
            df_corr = df_corr[df_corr['Correlation'] > corr_threshold]
            df_all = pd.merge(df_corr, df_mapping, how='left', on='series_code')
            df_indicators_corr = df_all[['provider_code', 'series_code', 'dataset_name',
                                         'series_name', 'Time', 'Correlation']]
            df_vals = df_monthly.T
            df_vals.columns = df_vals.iloc[0]
            df_return = pd.merge(df_indicators_corr, df_vals, how='left',
                                 left_on='series_code', right_index=True)
            return df_return
    except Exception:
        # Narrowed from a bare except: still broad because upstream datasets
        # fail in many ways, but no longer swallows KeyboardInterrupt/SystemExit.
        logging.info("Dataset {} not parsed in getSeries.".format(dataset))
# Build DB.nomics series ids for annual Armenian imports (TMG_CIF_USD) by
# counterpart area, then cache the fetched data as a CSV so repeated runs
# skip the API.
tmp_list = []
codes = []
# Flatten the nested temp_list (defined earlier in the file) into tmp_list.
for sublist in temp_list:
    for item in sublist:
        tmp_list.append(item)
for i in tmp_list:
    # One series id per counterpart area code.
    codes.append("IMF/DOT/A.AM.TMG_CIF_USD." + str(i))
dt = pd.DataFrame()
data = pd.DataFrame()
# Only hit the API when no cached CSV exists yet.
if (os.path.isfile('Homework 4 & 5/Import_Data.csv') == False):
    for i in codes:
        dt = fetch_series(str(i))
        # NOTE(review): DataFrame.append was removed in pandas 2.0 —
        # this needs pd.concat on newer pandas versions.
        data = data.append(dt)
        # NOTE(review): this slice is a no-op (result discarded).
        dt.iloc[0:0]
    # NOTE(review): backslash path here vs forward slash elsewhere —
    # Windows-only; confirm intended.
    data.to_csv("Homework 4 & 5\Import_Data.csv")
if (os.path.isfile('Homework 4 & 5/Import_Data.csv') == True):
    dtt = pd.read_csv("Homework 4 & 5/Import_Data.csv")
    import_data = dtt[[
        "original_value", "original_period", "Counterpart Reference Area"
    ]]
    # Total imports per period, largest first.
    grouped_data = import_data.groupby(['original_period']).sum()
    sorted_data = grouped_data.sort_values(by="original_value", ascending=False)
    # Total imports per counterpart country.
    grouped_country = import_data.groupby(['Counterpart Reference Area']).sum()
    sorted_country = grouped_country.sort_values(by='original_value',
def test_fetch_series_with_na_values():
    """'NA' strings survive in original_value but become NaN in value."""
    df = fetch_series('AMECO', 'ZUTN', 'DEU.1.0.0.0.ZUTN')
    originals = list(df.original_value)
    assert "NA" in originals
    assert not any(df.original_value.isna())
    assert "NA" not in list(df.value)
    assert any(df.value.isna())