def makeJsonPieAgroecosystems(year):
    df = eurostat.get_data_df('aei_ps_inp', flags=False)
    df = df.query('unit == "PC_AREA"')
    df.rename(columns={r'geo\time': 'geo'}, inplace=True)
    data = []
    for i in df.index:
        if df['geo'][i] == 'EU27_2020':
            data.append([df['geo'][i], year, df[year][i], df['indic_ag'][i]])
    df = pd.DataFrame(data, columns=['geo_code', 'time', 'value', 'type'])
    df.to_csv('../output_data/headline_indicators/agroecosystems/agroecosystems.csv', index=False)
    data = {}
    data['data'] = []
    for i in df.index:
        data_part = {}
        data_part['name'] = getType(df['type'][i])
        data_part['y'] = df['value'][i]
        data['data'].append(data_part)
    with open('../output_data/headline_indicators/agroecosystems/agroecosystems.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
def fetch_health_exp(indicators):
    # Working rewrite of health care stats
    for code in indicators.keys():
        full_df = eurostat.get_data_df(code, flags=False)
        geo_entities = full_df['geo\\time'].unique()[:-7]
        units = indicators[code]['units']
        providers = indicators[code]['providers']
        indicator = indicators[code]['indicator']
        indicator_df = full_df.loc[(full_df['unit'].isin(units))
                                   & (full_df[indicator].isin(providers))
                                   & (full_df['geo\\time'].isin(geo_entities))].copy()
        year = indicators[code]['year']
        final_df = fetch_eu_states()['ISO3166_a2']
        for unit in units:
            indicator_name = f"EUS_{indicators[code]['name']}_{unit.lower()}_{year}"
            subset_df = indicator_df.loc[indicator_df['unit'] == unit, ['geo\\time', year]]
            subset_df.columns = ['ISO3166_a2', indicator_name]
            # Fix Greece and the UK
            subset_df.loc[subset_df['ISO3166_a2'] == 'EL', 'ISO3166_a2'] = 'GR'
            subset_df.loc[subset_df['ISO3166_a2'] == 'UK', 'ISO3166_a2'] = 'GB'
            final_df = pd.merge(final_df, subset_df, how='left', on='ISO3166_a2')
            # final_df.drop('ISO3166_a2', axis=1, inplace=True)
    return final_df
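# Usage sketch (not from the source): fetch_health_exp() expects `indicators`
# keyed by a Eurostat dataset code, with the fields read inside the function.
# The dataset code, units, providers dimension and year below are hypothetical
# placeholders for illustration only.
# indicators = {
#     'hlth_sha11_hp': {
#         'name': 'health_exp',
#         'units': ['PC_GDP'],
#         'providers': ['TOTAL'],
#         'indicator': 'icha11_hp',
#         'year': 2018,
#     },
# }
# health_df = fetch_health_exp(indicators)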
def fetch_tax():
    code = "sdg_17_50"  # Share of environmental taxes in total tax revenues
    df = eurostat.get_data_df(code, flags=False)
    df.rename(columns={df.columns[0]: "country"}, inplace=True)
    df = df[df['country'].isin(list(countries))]
    df = df.melt(id_vars=["country"], var_name="year", value_name="env_tax")
    df.sort_values(by=['country', 'year'], inplace=True)
    return df.query('year > 1999').query('year < 2018')
def load_percent_ict_of_gdp(db_connection: sqlite3.Connection) -> None:
    """
    Request the json-stat data from eurostat for the percentage of GDP that
    is created by the ICT sector. Load dataframe into a db table.
    """
    df = eurostat.get_data_df("tin00074")
    df = df[df["nace_r2"].isin(["ICT"])]
    # Ignore unnecessary data
    df.drop(columns="nace_r2", inplace=True)
    df.rename(columns={r"time\geo": "country_label"}, inplace=True)
    df.to_sql("percent_ict_of_gdp", db_connection, if_exists="replace", index=False)
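# Usage sketch (assumed setup): the loader only needs an open SQLite
# connection; the database file name is illustrative, not from the source.
# conn = sqlite3.connect("eurostat_indicators.db")
# load_percent_ict_of_gdp(conn)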
def make_eurostat_table(code, toc_df, path=target_dir):
    '''
    This function extracts and saves a eurostat table with a readable name
    together with a yaml with its data dictionary

    Args:
        code (str) is the code for the table
        toc_df (df) is a dataframe with a table of contents (we use it to create the schema)
        path (str) is the destination for storing data and schema
    '''
    try:
        table = eurostat.get_data_df(code)
        if 'time\\geo' in table.columns:
            struct_name = 'time\\geo'
            melt_var_name = 'geo'
            meta_cols = [
                x for x in table.columns
                if ((len(x) > 2) & (not any(name in x for name in ['EU', 'EA'])))
            ]
        else:
            struct_name = 'geo\\time'
            melt_var_name = 'time'
            meta_cols = [x for x in table.columns if type(x) != int]
        table_long = table.melt(id_vars=meta_cols, var_name=melt_var_name, value_name=code)
        # Create schema
        sch = make_schema(table_long, code, toc_df, struct_name, melt_var_name)
        # Save table and schema
        table_long.to_csv(f'{path}/{code}.csv', index=False)
        with open(f'{path}/{code}.yaml', 'w') as outfile:
            yaml.dump(sch, outfile)
    except:
        # A small number of eurostat tables don't work with this package.
        print(' API failure')
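# Usage sketch: toc_df would typically come from eurostat.get_toc_df() (used in
# a later snippet in this collection); the table code shown is illustrative.
# toc_df = eurostat.get_toc_df()
# make_eurostat_table('sdg_17_50', toc_df, path=target_dir)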
def fetch_economic_indicator(code, name, year, unit=None, unit_name='unit'):
    full_df = eurostat.get_data_df(code, flags=False)
    geo_entities = full_df['geo\\time'].unique()[:-7]
    if unit is not None:
        full_df = full_df.loc[(full_df[unit_name] == unit)
                              & (full_df['geo\\time'].isin(geo_entities))]
    else:
        full_df = full_df.loc[(full_df['geo\\time'].isin(geo_entities))]
    indicator_name = f"EUS_{name}_{year}"
    full_df = full_df[['geo\\time', year]]
    full_df.columns = ['ISO3166_a2', indicator_name]
    full_df.loc[full_df['ISO3166_a2'] == 'EL', 'ISO3166_a2'] = 'GR'
    full_df.loc[full_df['ISO3166_a2'] == 'UK', 'ISO3166_a2'] = 'GB'
    return full_df
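# Usage sketch (hypothetical arguments): real GDP per capita from sdg_08_10 in
# chain-linked volumes, mirroring the unit code used in fetch_gdp() below; the
# indicator name and year are placeholders.
# gdp_df = fetch_economic_indicator('sdg_08_10', 'gdp_pc', 2018,
#                                   unit='CLV10_EUR_HAB', unit_name='unit')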
def fetch_ghg_pc():
    # Eurostat Code
    # Codes can be found at the Eurostat data browser
    # https://ec.europa.eu/eurostat/data/database
    code = "t2020_rd300"  # Greenhouse gas emissions per capita [T2020_RD300]
    df = eurostat.get_data_df(code, flags=False)
    df.rename(columns={df.columns[0]: "country"}, inplace=True)
    # Only individual countries
    df2 = df[df['country'].isin(list(countries))]
    # Reshape to put years from the column headers (1st row) into a column
    df2 = df2.melt(id_vars=["country"], var_name="year", value_name="ghg_e_pc")
    # Sort by geo and year
    df2.sort_values(by=['country', 'year'], inplace=True)
    return df2
def load_percent_ent_using_cloud_computing(
        db_connection: sqlite3.Connection) -> None:
    """
    Request the json-stat data from eurostat for the percentage of cloud
    computing services used by enterprise companies. Load dataframe into
    a db table.
    """
    df = eurostat.get_data_df("isoc_cicce_use")
    # Filter the data to match the custom query
    df = df[df["unit"].isin(["PC_ENT"])
            & df["indic_is"].isin(["E_CC"])
            & df["sizen_r2"].isin(["10_C10_S951_XK"])]
    # Ignore unnecessary data
    df.drop(columns=["unit", "sizen_r2", "indic_is"], inplace=True)
    df.rename(columns={r"time\geo": "country_label"}, inplace=True)
    df.to_sql("percent_ent_using_cc", db_connection, if_exists="replace", index=False)
def eurostat_requests(dict_codes, filepath):
    '''
    Function to request data from Eurostat using variable codes from a dictionary.
    Then writes the data to csv.
    '''
    write_log('--- FILE CHECKS')
    existing_files = list_directory(filepath)  # Check existing files in the output folder
    for value in dict_codes:
        # For each variable in the dictionary (key), get the associated Eurostat code for the request
        df = eurostat.get_data_df(dict_codes[value])  # Assign the returned dataframe to the variable (key) of the dictionary
        # Slice large datasets
        if value == 'Gross_value_added_and_income_by_industry':
            df = df.drop(df[(df["s_adj"] != "NSA") | (df["na_item"] != "B1G")
                            | (df["unit"] != "CLV_I05") | (df["nace_r2"] != "TOTAL")].index)
        elif value == 'GDP_and_main_components':
            df = df.drop(df[(df["na_item"] != "B1GQ") | (df["unit"] != "CLV_I05")].index)
        elif value == 'Harmonized_index_of_consumer_prices_monthly_data':
            df = df.drop(df[(df["s_adj"] != "NSA") | (df["indic"] != "CP-HI00")
                            | (df["unit"] != "HICP2015")].index)
        elif value == 'Employmentby_industry':
            df = df.drop(df[(df["s_adj"] != "SCA") | (df["na_item"] != "EMP_DC")
                            | (df["unit"] != "PCH_PRE_PER") | (df["nace_r2"] != "TOTAL")].index)
        elif value == 'Unemployment_by_sex_and_age_monthly_data':
            df = df.drop(df[(df["s_adj"] != "SA") | (df["sex"] != "T")
                            | (df["unit"] != "PC_ACT") | (df["age"] != "TOTAL")].index)
        df.reset_index(drop=True, inplace=True)
        if value in existing_files:
            old_df = pd.read_csv(filepath / f"{value}.csv")
            if df.equals(old_df):
                write_log('same ' + value)
            else:
                write_log('diff ' + value)
        path_file = filepath / f"{value}.csv"  # Define the path to the file
        df.to_csv(path_file, index=False)  # Write the dataframe to csv
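# Usage sketch (hypothetical code shown): keys become the CSV file names,
# values are Eurostat dataset codes, and filepath is a pathlib.Path output
# folder.
# from pathlib import Path
# eurostat_requests({'GDP_and_main_components': 'namq_10_gdp'}, Path('output'))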
def fetch_gdp():
    # Eurostat Code - always lower case
    # Codes can be found at the Eurostat data browser
    # https://ec.europa.eu/eurostat/data/database
    code = "sdg_08_10"
    df = eurostat.get_data_df(code, flags=False)
    df.rename(columns={df.columns[2]: "country"}, inplace=True)
    # Only individual countries
    df2 = df[df['country'].isin(list(countries))]
    # Drop 2019 column before melt
    del df2[2019]
    # Melt: since this dataset contains two variables,
    # split it into two data sets by unit
    df3 = df2.melt(id_vars=["country", "unit"], var_name="year", value_name="temp_value")
    df3.sort_values(by=['country', 'year'], inplace=True)
    return (df3[df3['unit'] == 'CLV10_EUR_HAB'], df3[df3['unit'] == 'CLV_PCH_PRE_HAB'])
def createFilesFromEurostat(eurostat_id, indicator_id, type_column_id, years, calculate_share):
    df = eurostat.get_data_df(eurostat_id, flags=False)
    df.rename(columns={r'geo\time': 'geo'}, inplace=True)
    df = df.loc[df['geo'].isin(geo_category_codes_list)]
    df = filterDataFramesEurostat(eurostat_id, indicator_id, years, df)
    df = reorganizeDataFrames(indicator_id, 'geo', None, None, type_column_id, years, df)
    if calculate_share:
        df['value_share_calculated'] = ''
        for i in df.index:
            geo_category_code = df['geo_code'][i]
            time = df['time'][i]
            denominator = df.query('geo_code == @geo_category_code & time == @time & type == "TOTAL"')['value'].values[0]
            df.loc[i, 'value_share_calculated'] = df['value'][i] / denominator
    df = createCsvFiles(eurostat_id, indicator_id, df)
    if indicator_id == '2.1.d.1' and eurostat_id == 'sdg_15_60':
        return createVisualizationFiles(indicator_id, years, df)
        return pd.Series({"Latitude": location.lat, "Longitude": location.lon})
    except:
        return pd.Series({"Latitude": np.nan, "Longitude": np.nan})


def get_NutsCode(row: pd.Series) -> pd.Series:
    try:
        result = nf.find(lat=row["Latitude"], lon=row["Longitude"])
        levels = [r['LEVL_CODE'] for r in result]
        result = result[levels.index(2)]
        return pd.Series({'NUTS_ID': result['NUTS_ID'], 'NUTS_NAME': result['NUTS_NAME']})
    except:
        return pd.Series({'NUTS_ID': np.nan, 'NUTS_NAME': np.nan})


locodes = pd.read_csv(r"D:\Programming\Python\SustainabilityReport\data\general\locode-list.csv")

ports = eurostat.get_data_df('mar_go_aa')
ports = pd.DataFrame(ports.loc[ports[r"rep_mar\time"].str.contains(r'([a-zA-Z]{5})$'), r"rep_mar\time"])
ports = ports.reset_index(drop=True)
ports["Country"] = ports[r"rep_mar\time"].map(lambda x: x[-5:-3])
ports["Location"] = ports[r"rep_mar\time"].map(lambda x: x[-3:])
ports = ports.merge(locodes, on=["Country", "Location"], how="left")
ports = ports.loc[:, [r"rep_mar\time", "Country", "Location", "Name", "Coordinates"]]
ports = ports.merge(ports.apply(getLatAndLon, axis=1), left_index=True, right_index=True)
ports = ports.merge(ports.apply(get_NutsCode, axis=1), left_index=True, right_index=True)
ports = ports.dropna()
ports.loc[:, [r"rep_mar\time", "NUTS_ID", "NUTS_NAME"]].to_excel(
    r"D:\Programming\Python\SustainabilityReport\data\general\PortsDict_NUTS2_2016.xlsx", index=False)
def data_Eurostat(url, container):
    # For each [label, code] pair in `url`, fetch the dataset and append
    # a [label, dataframe] pair to `container`.
    for label, code in url:
        container.append([label, eurostat.get_data_df(code)])
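# Usage sketch (assumed call pattern): `url` is a list of [label, code] pairs
# and `container` collects the results; the codes reuse datasets requested
# elsewhere in this collection.
# tables = []
# data_Eurostat([['population', 'demo_r_gind3'], ['gdp', 'nama_10_gdp']], tables)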
    df = df.pivot(index=[r'geo\time', 'year'], columns=keep_column)['value'].reset_index()
    if option_replace_nan_with_zero == 1:
        df = df.fillna(0)
    elif option_replace_nan_with_zero == 2:
        df = df.apply(replace_nan_with_zero, axis=1, args=(1 / 2, ))
    elif option_replace_nan_with_zero == 3:
        df = df.apply(replace_nan_with_zero, axis=1, args=(1 / 100, ))
    if prefix:
        df = df.add_prefix(prefix)
    return df


## size
filter_landuse = ["TOTAL"]
df_size = eurostat.get_data_df('tgs00002')
columns_avail = list(set(time_frame).intersection(set(df_size.columns)))
columns_needed = list(set(time_frame).difference(set(df_size.columns)))
df_size = df_size.loc[df_size["landuse"].isin(filter_landuse)
                      & df_size[r"geo\time"].isin(nuts2_regions),
                      [r"geo\time"] + columns_avail]
df_size = df_size.merge(df_size.apply(create_missing_years_mean, axis=1, args=(columns_needed, )),
                        left_index=True, right_index=True)
df_size = df_size.melt(id_vars=[r"geo\time"], var_name="year", value_name="size")
df_size["size"] = df_size["size"] / 1000
Created on Fri Dec 4 12:01:54 2020

@author: joana
"""
# !pip install eurostat
import eurostat
import pandas as pd

toc = eurostat.get_toc()
df = eurostat.get_toc_df()
crime = eurostat.subset_toc_df(df, "crime")

# code: [ilc_mddw03]
crime = eurostat.get_data_df('ilc_mddw03', flags=False)
# 'PC': 'Percentage'
# 'hhtyp' (TYPE OF HOUSEHOLD): TOTAL
# 'incgrp' (INCOME SITUATION): TOTAL

pd.set_option("display.max_rows", 2193)

# Filter for TOTAL in TYPE OF HOUSEHOLD and INCOME SITUATION
crimedf = crime[(crime['hhtyp'] == 'TOTAL') & (crime['incgrp'] == 'TOTAL')]
crimedf = crimedf.drop(['hhtyp', 'incgrp', 'unit'], axis=1)
# crimedf = crimedf.reset_index(drop=True)

# Create a Country column from the geo\time column
crimedf['Country'] = crimedf.iloc[:, 0]
data_dates = (datetime.datetime(1960, 1, 1), datetime.datetime(2017, 1, 1))

# Create dataframe
df = wbdata.get_dataframe(indicators,
                          # country=country_tuple,
                          data_date=data_dates,
                          convert_date=False,
                          keep_levels=True)
df.Population = df.Population.astype(int)
df.index.names = ['country', 'year']
# conda install -c konstantinstadler country_converter
# df.to_csv('data/wb.csv', convert_dates='ty')

code = "sdg_17_50"  # Share of environmental taxes in total tax revenues
tdf = eurostat.get_data_df(code, flags=False)
tdf = tdf.rename(columns={tdf.columns[0]: "country_code"})
tdf = tdf[tdf['country_code'].isin(list(euro_country_tuple))]
del tdf[2018]
iso2_codes = tdf['country_code'].tolist()
# Change back to standard - scary!!
iso2_codes[15] = 'GB'
del tdf['country_code']
tdf['country'] = coco.convert(names=iso2_codes, to='name_short', not_found=None)
tax = tdf.melt(id_vars=["country"], var_name="year", value_name="env_tax")
tax['year'] = pd.to_numeric(tax['year'], errors='coerce')
tax.sort_values(by=['country', 'year'], inplace=True)
df_JH_confirmed.head()

# Data from UN world population
url_WPP_tot_pop = 'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_TotalPopulationBySex.csv'
url_WPP_pop_age_sex = 'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_PopulationByAgeSex_Medium.csv'
url_WPP_fertility = 'https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2019_Fertility_by_Age.csv'

df_WPP_tot_pop = pd.read_csv(url_WPP_tot_pop, index_col=0, parse_dates=[0])
df_WPP_pop_age_sex = pd.read_csv(url_WPP_pop_age_sex, index_col=0, parse_dates=[0])
df_WPP_fertility = pd.read_csv(url_WPP_fertility, index_col=0, parse_dates=[0])
df_WPP_tot_pop.head()

# Data from Eurostat
data_EU_pop = eurostat.get_data_df('demo_r_gind3')          # Population
data_EU_gdp = eurostat.get_data_df('nama_10_gdp')           # GDP
data_EU_cons = eurostat.get_data_df('nama_10_fcs')          # Consumption
data_EU_trade = eurostat.get_data_df('nama_10_exi')         # Import export
data_EU_short_rate = eurostat.get_data_df('irt_st_a')       # Short term int rates on govt bonds
data_EU_long_rate = eurostat.get_data_df('irt_lt_gby10_a')  # 10 y int rates on govt bonds
data_EU_unemp = eurostat.get_data_df('une_rt_a')            # Unemployment by sex and age
data_EU_inv = eurostat.get_data_df('nama_10_an6')           # Gross capital formation (gross investment)
data_EU_gdp.head()

# Save all data
print(os.getcwd())  # Current directory
os.chdir(data_dir)  # New directory
df_JH_confirmed.to_csv("df_JH_confirmed.csv", index=False)
#
# EUROSTAT import
#
# https://ec.europa.eu/eurostat/cache/metadata/en/demomwk_esms.htm
# https://ec.europa.eu/eurostat/web/products-datasets/-/demo_r_mweek3

import eurostat

# Import the data from Eurostat
eurost_df = eurostat.get_data_df('demo_r_mweek3', flags=False)

# Rename column 'geo\time' to 'geo'
eurost_df.rename(columns={r'geo\time': 'geo'}, inplace=True)

# Select totals for sex and age groups
trimmed_df = eurost_df[(eurost_df['sex'] == "T") & (eurost_df['age'] == "TOTAL")]

# Trim to the country totals data
ccodes = [
    'AT', 'BE', 'BG', 'CH', 'CZ', 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'HU', 'IS',