import pandas as pd
import numpy
import urllib3
import common
import credentials  # assumed project-local config module, analogous to `from lufthygiene_pm25 import credentials` in the sibling script


def main():
    url = 'https://data-bs.ch/lufthygiene/nmbs_pm25/airmet_bs_museum_pm25_aktuell.csv'
    print(f'Downloading data from {url}...')
    urllib3.disable_warnings()
    df = common.pandas_read_csv(url, sep=';', encoding='cp1252', skiprows=range(1, 2))

    print('Calculating ISO8601 time string...')
    df['timestamp'] = (pd.to_datetime(df.Anfangszeit, format='%d.%m.%Y %H:%M:%S')
                       .dt.tz_localize('Europe/Zurich', ambiguous='infer', nonexistent='shift_forward'))

    # We simplify the code and re-push all current data every time instead of checking
    # for the latest timestamp in ODS:
    # print('Reading latest timestamp from ODS dataset...')
    # urllib3.disable_warnings()
    # r = common.requests_get('https://data.bs.ch/api/records/1.0/search/?dataset=100100&q=&rows=1&sort=anfangszeit', verify=False)
    # r.raise_for_status()
    # latest_ods_timestamp = r.json()['records'][0]['fields']['anfangszeit']
    # print(f'Latest timestamp is {latest_ods_timestamp}.')
    # print(f'Filtering data after {latest_ods_timestamp} for submission to ODS via realtime API...')
    # realtime_df = df[df['timestamp'] > latest_ods_timestamp]
    # print(f'Pushing {realtime_df.timestamp.count()} rows to ODS realtime API...')
    realtime_df = df

    if len(realtime_df) == 0:
        print('No rows to push to ODS...')
    else:
        print('Dropping empty values...')
        realtime_df['PM25_Sensirion'] = realtime_df['PM25_Sensirion'].replace(' ', numpy.nan)
        realtime_df = realtime_df.dropna(subset=['PM25_Sensirion'])

        # Realtime API bootstrap data:
        # {
        #     "anfangszeit": "23.02.2021 10:30:00",
        #     "pm25": 13.3
        # }
        payload = (realtime_df
                   .rename(columns={'Anfangszeit': 'anfangszeit', 'PM25_Sensirion': 'pm25'})
                   [['anfangszeit', 'pm25']]
                   .to_json(orient='records'))
        print(f'Pushing {realtime_df.Anfangszeit.count()} rows to ODS realtime API...')
        # print(f'Pushing the following data to ODS: {json.dumps(json.loads(payload), indent=4)}')
        # Use data=payload here because payload is a string. If it were an object, we'd use json=payload.
        r = common.requests_post(url=credentials.ods_live_push_api_url, data=payload, verify=False)
        r.raise_for_status()

    print('Job successful!')
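# A minimal entry-point sketch for running this script standalone. The repository's
# actual job runner may invoke main() differently, so this guard is an assumption.
if __name__ == '__main__':
    main()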
import os
import pandas as pd
import common
import credentials  # assumed project-local config module holding the export path and FTP settings


def get_bag_data(dataset_name, url, suffix):
    print(f'Reading current csv from {url} into data frame...')
    df = common.pandas_read_csv(url)

    print('Checking which column contains the date...')
    date_column = 'datum' if 'datum' in df.columns else 'date'

    print(f'Dropping lines with empty value in date column "{date_column}"...')
    print(f'{df[date_column].isna().sum()} rows have an empty date.')
    df = df.dropna(subset=[date_column])

    print('Calculating columns...')
    if 'weekly' not in suffix:
        print('Date column is regarded as being a calendar day, calculating dayofweek, wochentag, week...')
        df['dayofweek'] = pd.to_datetime(df[date_column]).dt.dayofweek
        df['wochentag'] = df['dayofweek'].apply(lambda x: common.weekdays_german[x])
        # .dt.week is deprecated (removed in pandas 2.x); isocalendar().week is the replacement.
        df['week'] = pd.to_datetime(df[date_column]).dt.isocalendar().week
    else:
        print('Date column is regarded as being a week number. Calculating year, week...')
        df['year'] = df[date_column].astype(str).str.slice(stop=4)
        df['week'] = df[date_column].astype(str).str.slice(start=-2)

    suffix_string = f'_{suffix}' if suffix != '' else ''
    export_file_name = os.path.join(credentials.path, f'covid19_{dataset_name}{suffix_string}.csv')
    print(f'Exporting to file {export_file_name}...')
    df.to_csv(export_file_name, index=False)
    common.upload_ftp(export_file_name, credentials.ftp_server, credentials.ftp_user, credentials.ftp_pass, 'bag')
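# A hedged usage sketch: how get_bag_data might be called for a daily and a weekly
# BAG dataset. The dataset name, URLs, and suffixes below are placeholders for
# illustration, not the real values used by the pipeline.
if __name__ == '__main__':
    get_bag_data('hosp', 'https://www.covid19.admin.ch/api/data/example_daily.csv', '')
    get_bag_data('hosp', 'https://www.covid19.admin.ch/api/data/example_weekly.csv', 'weekly')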
import glob
import pandas as pd
import common
from lufthygiene_pm25 import credentials

print('Reading data from multiple csv into single dataframe...')
df = pd.concat(
    [common.pandas_read_csv(f, skiprows=range(1, 6), sep=';', encoding='cp1252')
     for f in glob.glob('c:/dev/workspace/data-processing/lufthygiene_pm25/archive/*.csv')],
    ignore_index=True)

print('Calculating columns...')
# ambiguous=True treats ambiguous DST-fallback times as DST; nonexistent spring-forward
# times are shifted forward.
df['timestamp'] = (pd.to_datetime(df.Zeit, format='%d.%m.%Y %H:%M:%S')
                   .dt.tz_localize('Europe/Zurich', ambiguous=True, nonexistent='shift_forward'))

print('Sorting...')
df = df.sort_values(by=['timestamp'], ascending=False, ignore_index=True)

print('Dropping duplicate rows...')
df = df.drop_duplicates()

print('Melting dataframe...')
ldf = df.melt(id_vars=['Zeit', 'timestamp'], var_name='station', value_name='pm_2_5')

print('Dropping rows with empty pm25 value...')
ldf = ldf.dropna(subset=['pm_2_5'])
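# To illustrate the melt step above: a toy wide frame with one column per station
# becomes one row per (timestamp, station) pair. The station names here are made up.
demo = pd.DataFrame({
    'Zeit': ['23.02.2021 10:30:00'],
    'timestamp': [pd.Timestamp('2021-02-23 10:30:00+01:00')],
    'StationA': [13.3],
    'StationB': [None],
})
demo_long = demo.melt(id_vars=['Zeit', 'timestamp'], var_name='station', value_name='pm_2_5')
# demo_long now has 2 rows; dropna(subset=['pm_2_5']) would keep only StationA's reading.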
import pandas as pd
import numpy
import common
import credentials  # assumed project-local config module holding pop_data_file_path


def get_reporting_df(df_bs_long_all):
    print('Calculating age group "Gesamtbevölkerung"...')
    df_all_ages = df_bs_long_all.copy(deep=True)
    df_all_ages = df_all_ages.groupby(['vacc_day', 'vacc_count']).sum().reset_index()
    df_all_ages['age_group'] = 'Gesamtbevölkerung'
    df_bs_long_all = pd.concat([df_bs_long_all, df_all_ages])

    print('Calculating age group "Impfberechtigte Bevölkerung"...')
    df_vacc_allowed = df_all_ages.copy()
    df_vacc_allowed['age_group'] = 'Impfberechtigte Bevölkerung'
    df_bs_long_all = pd.concat([df_bs_long_all, df_vacc_allowed])

    print('Calculating cumulative sums for long df...')
    # See https://stackoverflow.com/a/32847843
    df_bs_cum = df_bs_long_all.copy(deep=True)
    df_bs_cum['count_cum'] = df_bs_cum.groupby(['age_group', 'vacc_count'])['count'].cumsum()

    print('Calculating cumulative sums of _only_ first vaccinations...')
    df_only_first = df_bs_cum.copy(deep=True)
    # Negate cumulative numbers of 2nd vacc, then sum cum numbers of vacc 1 and 2
    # to get the cum number of _only_ 1st vacc.
    df_only_first.count_cum = numpy.where(df_only_first.vacc_count == 2,
                                          df_only_first.count_cum * -1,
                                          df_only_first.count_cum)
    df_only_first = df_only_first.groupby(['vacc_day', 'age_group'])['count_cum'].sum().reset_index()
    df_only_first['vacc_count'] = -1
    df_bs_cum = pd.concat([df_bs_cum, df_only_first])

    print('Adding explanatory text for vacc_count...')
    vacc_count_desc = pd.DataFrame.from_dict({
        'vacc_count': [-1, 1, 2],
        'vacc_count_description': ['Ausschliesslich erste Impfdosis', 'Erste Impfdosis', 'Zweite Impfdosis']
    })
    df_bs_cum = df_bs_cum.merge(vacc_count_desc, on=['vacc_count'], how='left')

    # Population data originally retrieved from https://data.bs.ch/explore/dataset/100128
    print(f'Retrieving population data from {credentials.pop_data_file_path}')
    df_pop = common.pandas_read_csv(credentials.pop_data_file_path, sep=';')

    print('Filtering 2020-12-31 data, creating age groups, and summing...')
    df_pop_2020 = df_pop.loc[df_pop['datum'] == '2020-12-31'][['person_alter', 'anzahl']]
    df_pop_2020['age_group'] = pd.cut(df_pop_2020.person_alter,
                                      bins=get_age_groups()['bins'],
                                      labels=get_age_groups()['labels'],
                                      include_lowest=True)
    df_pop_age_group = (df_pop_2020.groupby(['age_group'])['anzahl'].sum()
                        .reset_index()
                        .rename(columns={'anzahl': 'total_pop'}))

    print('Removing "Unbekannt" age group from population dataset...')
    df_pop_age_group = df_pop_age_group.query('age_group != "Unbekannt"')

    print('Calculating count of age group "Impfberechtigte Bevölkerung"...')
    df_pop_vacc_allowed = pd.DataFrame({'age_group': 'Impfberechtigte Bevölkerung',
                                        'total_pop': df_pop_age_group.total_pop.sum()}, index=[0])

    print('Calculating count of age group "Gesamtbevölkerung"...')
    df_pop_total = pd.DataFrame({'age_group': 'Gesamtbevölkerung',
                                 'total_pop': df_pop_2020.anzahl.sum()}, index=[0])

    print('Appending totals for "Impfberechtigte Bevölkerung" and "Gesamtbevölkerung"...')
    df_pop_age_group = pd.concat([df_pop_age_group, df_pop_vacc_allowed, df_pop_total])

    print('Joining pop data and calculating percentages...')
    df_bs_perc = df_bs_cum.merge(df_pop_age_group, on=['age_group'], how='left')
    df_bs_perc['count_cum_percentage_of_total_pop'] = df_bs_perc.count_cum / df_bs_perc.total_pop * 100
    return df_bs_perc
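# get_age_groups() is defined elsewhere in this module. Based on how pd.cut consumes
# it above, it presumably returns matching bin edges and labels. A hypothetical sketch
# (the actual edges and labels used in production may differ):
def get_age_groups():
    return {
        'bins': [0, 15, 49, 64, 74, 120],  # 6 edges -> 5 intervals
        'labels': ['Unbekannt', '16-49', '50-64', '65-74', '75+'],  # one label per interval
    }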
import pandas as pd
import json
import common
import urllib3
from lufthygiene_pm25 import credentials

url = 'https://data-bs.ch/lufthygiene/regionales-mikroklima/airmet_bs_sensirion_pm25_aktuell.csv'
print(f'Downloading data from {url}...')
urllib3.disable_warnings()
df = common.pandas_read_csv(url, sep=';', encoding='cp1252', skiprows=range(1, 6))

print('Calculating ISO8601 time string...')
df['timestamp'] = (pd.to_datetime(df.Zeit, format='%d.%m.%Y %H:%M:%S')
                   .dt.tz_localize('Europe/Zurich', ambiguous=True, nonexistent='shift_forward'))

# We simplify the code and re-push all current data every time instead of checking for
# the latest timestamp in ODS. Otherwise we'd need to check for the latest timestamp of
# each single sensor, instead of the latest overall.
# print('Reading latest timestamp from ODS dataset...')
# urllib3.disable_warnings()
# r = common.requests_get('https://data.bs.ch/api/records/1.0/search/?dataset=100081&q=&rows=1&sort=zeitstempel', verify=False)
# r.raise_for_status()
# latest_ods_timestamp = r.json()['records'][0]['fields']['zeitstempel']
# print(f'Latest timestamp retrieved from ODS is {latest_ods_timestamp}.')
# print(f'Latest timestamp of new data is {df.timestamp.max().strftime("%Y.%m.%dT%H:%M:%S%z")}')
# print(f'Filtering data after {latest_ods_timestamp} for submission to ODS via realtime API...')
# realtime_df = df[df['timestamp'] > latest_ods_timestamp]
realtime_df = df

if len(realtime_df) == 0:
    print('No rows to push to ODS...')
else:
    print('Melting dataframe...')
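    # The script is truncated at this point. A possible continuation (a sketch only,
    # assuming the same melt-and-push pattern as the archive and museum scripts;
    # the column names and payload shape are assumptions):
    # ldf = realtime_df.melt(id_vars=['Zeit', 'timestamp'], var_name='station', value_name='pm_2_5')
    # ldf = ldf.dropna(subset=['pm_2_5'])
    # payload = ldf.to_json(orient='records')
    # r = common.requests_post(url=credentials.ods_live_push_api_url, data=payload, verify=False)
    # r.raise_for_status()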