예제 #1
0
def main():
    """Download the current PM2.5 data of the BS museum sensor and push all
    rows with a non-empty measurement to the ODS realtime API.

    Side effects: HTTP GET of the source csv, HTTP POST to the ODS push URL
    (both via `common` helpers), progress output on stdout.
    """
    url = 'https://data-bs.ch/lufthygiene/nmbs_pm25/airmet_bs_museum_pm25_aktuell.csv'
    print(f'Downloading data from {url}...')
    # Suppress TLS warnings; the POST below is made with verify=False.
    urllib3.disable_warnings()
    df = common.pandas_read_csv(url,
                                sep=';',
                                encoding='cp1252',
                                # skip row 1 after the header — presumably a
                                # units/metadata row, TODO confirm against the csv
                                skiprows=range(1, 2))
    print('Calculating ISO8601 time string...')
    # Source timestamps are Swiss local time: localize so DST transitions are
    # handled ('infer' resolves ambiguous times from order, nonexistent ones
    # are shifted forward).
    df['timestamp'] = pd.to_datetime(
        df.Anfangszeit,
        format='%d.%m.%Y %H:%M:%S').dt.tz_localize('Europe/Zurich',
                                                   ambiguous='infer',
                                                   nonexistent='shift_forward')

    # We simplify the code and re-push all current data all the time instead
    # of checking for the latest timestamp in ODS.
    realtime_df = df

    if realtime_df.empty:
        print(f'No rows to push to ODS... ')
    else:
        print('Dropping empty values...')
        # Work on an explicit copy and use column indexing: assigning through
        # the attribute on an alias of df risks a SettingWithCopy warning (or a
        # silent no-op on a copy).
        realtime_df = realtime_df.copy()
        realtime_df['PM25_Sensirion'] = realtime_df['PM25_Sensirion'].replace(' ', numpy.nan)
        realtime_df = realtime_df.dropna(subset=['PM25_Sensirion'])
        # Realtime API bootstrap data:
        # {
        #     "anfangszeit": "23.02.2021 10:30:00",
        #     "pm25": 13.3
        # }
        payload = (realtime_df.rename(columns={
            'Anfangszeit': 'anfangszeit',
            'PM25_Sensirion': 'pm25'
        })[['anfangszeit', 'pm25']].to_json(orient="records"))
        print(
            f'Pushing {realtime_df.Anfangszeit.count()} rows to ODS realtime API...'
        )
        # use data=payload here because payload is a string. If it was an
        # object, we'd have to use json=payload.
        r = common.requests_post(url=credentials.ods_live_push_api_url,
                                 data=payload,
                                 verify=False)
        r.raise_for_status()

    print('Job successful!')
def get_bag_data(dataset_name, url, suffix):
    """Download a BAG covid19 csv, derive date-based columns, export it to a
    local csv and upload that file via FTP.

    Args:
        dataset_name: used to build the export file name covid19_<name><suffix>.csv.
        url: source csv url.
        suffix: file-name suffix; if it contains 'weekly', the date column is
            interpreted as a year+week number instead of a calendar day.
    """
    print(f'Reading current csv from {url} into data frame...')
    df = common.pandas_read_csv(url)
    print('Checking which column contains the date...')
    date_column = 'datum' if 'datum' in df.columns else 'date'
    print(f'Dropping lines with empty value in date column "{date_column}"...')
    print(f'{df[date_column].isna()}')
    df = df.dropna(subset=[date_column])
    print('Calculating columns...')
    if 'weekly' not in suffix:
        print('Date column is regarded as being a calendar day, calculating dayofweek, wochentag, week...')
        # Parse the date column once and reuse it for both derived columns.
        dates = pd.to_datetime(df[date_column])
        df['dayofweek'] = dates.dt.dayofweek
        df['wochentag'] = df['dayofweek'].apply(lambda x: common.weekdays_german[x])
        # Series.dt.week is deprecated and removed in pandas 2.0;
        # isocalendar().week yields the same ISO week numbers.
        df['week'] = dates.dt.isocalendar().week
    else:
        print('Date column is regarded as being a week number. Calculating year, week...')
        # Value format is YYYYWW: first 4 chars are the year, last 2 the week.
        df['year'] = df[date_column].astype(str).str.slice(stop=4)
        df['week'] = df[date_column].astype(str).str.slice(start=-2)
    suffix_string = f'_{suffix}' if suffix != '' else ''
    export_file_name = os.path.join(credentials.path, f'covid19_{dataset_name}{suffix_string}.csv')
    print(f'Exporting to file {export_file_name}...')
    df.to_csv(export_file_name, index=False)
    common.upload_ftp(export_file_name, credentials.ftp_server, credentials.ftp_user, credentials.ftp_pass, 'bag')
예제 #3
0
import pandas as pd
from datetime import datetime
import common
import glob
import os
from lufthygiene_pm25 import credentials

print('Reading data from multiple csv into single dataframe...')
# Rows 1-5 after each header are skipped — presumably sensor metadata rows,
# TODO confirm against an archive csv.
df = pd.concat([
    common.pandas_read_csv(f, skiprows=range(1, 6), sep=';', encoding='cp1252')
    for f in glob.glob(
        'c:/dev/workspace/data-processing/lufthygiene_pm25/archive/*.csv')
],
               ignore_index=True)
print('Calculating columns...')
# Source times are Swiss local time; ambiguous=True resolves DST-ambiguous
# times as DST, nonexistent times are shifted forward.
df['timestamp'] = pd.to_datetime(df.Zeit,
                                 format='%d.%m.%Y %H:%M:%S').dt.tz_localize(
                                     'Europe/Zurich',
                                     ambiguous=True,
                                     nonexistent='shift_forward')
# Log message moved here: it used to be printed before 'Calculating columns...'
# although the sort only happens now, which made the log misleading.
print('Sorting...')
df = df.sort_values(by=['timestamp'], ascending=False, ignore_index=True)
print('Dropping duplicate rows...')
df = df.drop_duplicates()
print('Melting dataframe...')
# Wide -> long: one row per (time, station) with the PM2.5 reading as value.
ldf = df.melt(id_vars=['Zeit', 'timestamp'],
              var_name='station',
              value_name='pm_2_5')
print('Dropping rows with empty pm25 value...')
ldf = ldf.dropna(subset=['pm_2_5'])
def get_reporting_df(df_bs_long_all):
    """Build the vaccination reporting frame.

    Adds the synthetic age groups "Gesamtbevölkerung" and "Impfberechtigte
    Bevölkerung", computes cumulative vaccination counts (including the
    derived vacc_count == -1 series "only first dose"), joins population
    totals from the file referenced by credentials.pop_data_file_path and
    returns a dataframe with per-age-group percentages of the total
    population (column count_cum_percentage_of_total_pop).
    """
    print('Calculating age group "Gesamtbevölkerung"...')
    df_all_ages = df_bs_long_all.copy(deep=True)
    df_all_ages = df_all_ages.groupby(['vacc_day',
                                       'vacc_count']).sum().reset_index()
    df_all_ages['age_group'] = 'Gesamtbevölkerung'
    print('calculating age group "Impfberechtigte Bevölkerung"...')
    # NOTE(review): this group currently carries the same counts as
    # "Gesamtbevölkerung" — it is a copy with a different label.
    df_vacc_allowed = df_all_ages.copy()
    df_vacc_allowed['age_group'] = 'Impfberechtigte Bevölkerung'
    # DataFrame.append is deprecated and removed in pandas 2.0; pd.concat
    # produces the same result (same row order).
    df_bs_long_all = pd.concat([df_bs_long_all, df_all_ages, df_vacc_allowed])

    print('Calculating cumulative sums for long df...')
    # See https://stackoverflow.com/a/32847843
    df_bs_cum = df_bs_long_all.copy(deep=True)
    df_bs_cum['count_cum'] = df_bs_cum.groupby(['age_group', 'vacc_count'
                                                ])['count'].cumsum()

    print('Calculating cumulative sums of _only_ first vaccinations...')
    df_only_first = df_bs_cum.copy(deep=True)
    # Negate cumulative numbers of 2nd vacc, then sum cum numbers of vacc 1
    # and 2 to get the cum number of _only_ 1st vacc
    df_only_first.count_cum = numpy.where(df_only_first.vacc_count == 2,
                                          df_only_first.count_cum * -1,
                                          df_only_first.count_cum)
    df_only_first = df_only_first.groupby(['vacc_day', 'age_group'
                                           ])['count_cum'].sum().reset_index()
    # Sentinel value -1 marks the derived "only first dose" series.
    df_only_first['vacc_count'] = -1
    df_bs_cum = pd.concat([df_bs_cum, df_only_first])

    print('Adding explanatory text for vacc_count...')
    vacc_count_desc = pd.DataFrame.from_dict({
        'vacc_count': [-1, 1, 2],
        'vacc_count_description': [
            'Ausschliesslich erste Impfdosis', 'Erste Impfdosis',
            'Zweite Impfdosis'
        ]
    })
    df_bs_cum = df_bs_cum.merge(vacc_count_desc, on=['vacc_count'], how='left')

    # Retrieve data from https://data.bs.ch/explore/dataset/100128
    print(f'Retrieving population data from {credentials.pop_data_file_path}')
    df_pop = common.pandas_read_csv(credentials.pop_data_file_path, sep=';')
    print('Filter 2020-12-31 data, create age groups, and sum')
    df_pop_2020 = df_pop.loc[df_pop['datum'] == '2020-12-31'][[
        'person_alter', 'anzahl'
    ]]
    # Call get_age_groups() once and reuse bins and labels.
    age_groups = get_age_groups()
    df_pop_2020['age_group'] = pd.cut(df_pop_2020.person_alter,
                                      bins=age_groups['bins'],
                                      labels=age_groups['labels'],
                                      include_lowest=True)
    df_pop_age_group = df_pop_2020.groupby([
        'age_group'
    ])['anzahl'].sum().reset_index().rename(columns={'anzahl': 'total_pop'})
    print('Removing "Unbekannt" age group from population dataset...')
    df_pop_age_group = df_pop_age_group.query('age_group != "Unbekannt"')
    print('Calculating count of age group "Impfberechtige Bevölkerung"...')
    # Sum of the remaining (known, vaccination-eligible) age groups.
    df_pop_vacc_allowed = pd.DataFrame(
        {
            'age_group': 'Impfberechtigte Bevölkerung',
            'total_pop': df_pop_age_group.total_pop.sum()
        },
        index=[0])
    print('Calculating count of age group "Gesamtbevölkerung"...')
    # Total population including the "Unbekannt" group removed above.
    df_pop_total = pd.DataFrame(
        {
            'age_group': 'Gesamtbevölkerung',
            'total_pop': df_pop_2020.anzahl.sum()
        },
        index=[0])
    print(
        f'Appending totals for "Impfberechtigte Bevölkerung" and "Gesamtbevölkerung" '
    )
    df_pop_age_group = pd.concat(
        [df_pop_age_group, df_pop_vacc_allowed, df_pop_total])

    print('Joining pop data and calculating percentages...')
    df_bs_perc = df_bs_cum.merge(df_pop_age_group,
                                 on=['age_group'],
                                 how='left')
    df_bs_perc[
        'count_cum_percentage_of_total_pop'] = df_bs_perc.count_cum / df_bs_perc.total_pop * 100

    return df_bs_perc
예제 #5
0
import pandas as pd
import json
import common
import urllib3
from lufthygiene_pm25 import credentials

# Download the current Sensirion PM2.5 csv and prepare it for the ODS
# realtime push (the push itself follows below this chunk).
url = 'https://data-bs.ch/lufthygiene/regionales-mikroklima/airmet_bs_sensirion_pm25_aktuell.csv'
print(f'Downloading data from {url}...')
# Suppress TLS warnings — presumably because requests are later made with
# verify=False, as in the disabled block below; TODO confirm.
urllib3.disable_warnings()
# skiprows=range(1, 6): rows 1-5 after the header are skipped — presumably
# metadata rows, not measurements; TODO confirm against the csv.
df = common.pandas_read_csv(url, sep=';', encoding='cp1252', skiprows=range(1, 6))
print(f'Calculating ISO8601 time string...')
# Source timestamps are Swiss local time; ambiguous=True resolves
# DST-ambiguous times as DST, nonexistent times are shifted forward.
df['timestamp'] = pd.to_datetime(df.Zeit, format='%d.%m.%Y %H:%M:%S').dt.tz_localize('Europe/Zurich', ambiguous=True, nonexistent='shift_forward')

# we simplify the code and re-push all current data all the time instead of checking for the latest timestamp in ODS.
# Otherwise we'd need to check for the latest timestamp of each single sensor, instead of the latest overall.

# print(f'Reading latest timestamp from ODS dataset...')
# urllib3.disable_warnings()
# r = common.requests_get('https://data.bs.ch/api/records/1.0/search/?dataset=100081&q=&rows=1&sort=zeitstempel', verify=False)
# r.raise_for_status()
# latest_ods_timestamp = r.json()['records'][0]['fields']['zeitstempel']
# print(f'Latest timestamp retrieved from ODS is {latest_ods_timestamp}.')
# print(f'Latest timestamp retrieved new data is {df.timestamp.max().strftime("%Y.%m.%dT%H:%M:%S%z")}')
# print(f'Filtering data after {latest_ods_timestamp} for submission to ODS via realtime API...')
# realtime_df = df[df['timestamp'] > latest_ods_timestamp]

# Push everything: the incremental-filter path above is intentionally disabled.
realtime_df = df
if len(realtime_df) == 0:
    print(f'No rows to push to ODS... ')
else:
    print(f'Melting dataframe...')