# Now we can simply join them into the same table
df = confirmed.merge(deaths)

# Parse date into a datetime object
df['Date'] = df['Date'].apply(lambda date: datetime.fromisoformat(date).date())

# Offset date by 1 day to match the ECDC report
if not is_region:
    df['Date'] = df['Date'].apply(lambda date: date + timedelta(days=1))

# Convert dates to ISO format
df['Date'] = df['Date'].apply(lambda date: date.isoformat())

# Add the country code to all records
df['CountryCode'] = 'ES'

# Country-level data is embedded as "Total" in the CSV files
if is_region:
    df = df[df['_RegionLabel'] != 'Total']
else:
    # RegionCode must exist (as null) for the filter function below
    df['RegionCode'] = None
    df = df[df['_RegionLabel'] == 'Total']
    df = df.drop(columns=['_RegionLabel'])

# Merge the new data with the existing data (prefer new data if duplicates)
if not is_region:
    filter_function = lambda row: row['CountryCode'] == 'ES' and pandas.isna(row['RegionCode'])
    df = merge_previous(df, ['Date', 'CountryCode'], filter_function)

# Output the results
dataframe_output(df, ROOT, 'ES' if is_region else None)
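# A minimal sketch of the assumed `merge_previous` helper, based on the comment
# above: combine the freshly parsed records with previously published data and
# prefer the new rows when keys collide. `read_previous_output` is a
# hypothetical loader, named here only for illustration.
import pandas as pd

def merge_previous(data: pd.DataFrame, index_columns: list, filter_function) -> pd.DataFrame:
    ''' Merge new records with prior output, preferring new data on key collisions. '''
    previous = read_previous_output()  # hypothetical: loads the prior dataset
    # Keep only the prior rows that belong to this data source
    previous = previous[previous.apply(filter_function, axis=1)]
    # New rows come last, so keep='last' prefers them over the prior data
    merged = pd.concat([previous, data], sort=False)
    return merged.drop_duplicates(subset=index_columns, keep='last').sort_values(index_columns)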
previous = read_file(sys.argv[4])

# Confirmed cases are split into age groups, add up all groups
keys = ['RegionCode', 'Date']
confirmed = confirmed.set_index(keys)
confirmed = confirmed.groupby(['Date', 'RegionCode']).sum()
confirmed = confirmed.reset_index()

# Join the confirmed and deaths tables
data = confirmed.merge(deaths, how='outer')

# Map the department to the region
data['RegionCode'] = data['RegionCode'].apply(lambda dep: dep_map.get(dep))

# Data is new cases, perform the cumsum to get total
data = cumsum_table(data.dropna(subset=keys).set_index(keys)).reset_index()

# Merge with the prior data
previous = previous[previous['CountryCode'] == 'FR']
previous = previous[~previous['RegionCode'].isna()]
data = merge_previous(data, previous, ['Date', 'RegionCode'])

# New data is incomplete for Confirmed, so use the prior data when available
data = data.set_index(keys)
previous = previous.set_index(keys).dropna()
data.loc[previous.index] = previous

# Output the results
dataframe_output(data.reset_index(), 'FR')
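# A minimal sketch of the assumed `cumsum_table` helper: given a table of new
# daily values indexed by (RegionCode, Date), accumulate each value column per
# region so that rows hold running totals. The exact behavior is an assumption
# inferred from the "perform the cumsum to get total" comment above.
import pandas as pd

def cumsum_table(data: pd.DataFrame) -> pd.DataFrame:
    ''' Convert daily increments into cumulative totals, per region. '''
    # Sort by date so the cumulative sum is chronological
    data = data.sort_index(level='Date')
    # Accumulate every value column independently within each region
    return data.groupby(level='RegionCode').cumsum()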
region = region[region.index <= forecast_date]

# Early exit: if there are fewer than DATAPOINT_COUNT_MIN datapoints
if len(region) < DATAPOINT_COUNT_MIN:
    continue

# Define the subfolder that will hold the output assets
forecast_chart = ROOT / 'output' / 'charts' / ('%s_US_%s.svg' % (forecast_date, key))

# Perform forecast
forecast_data = forecast(region['Confirmed'], predict_window)

# Output charts as SVG files
plot_forecast(forecast_chart, region['Confirmed'], forecast_data)

# Aggregate forecast data
for idx in forecast_data.index:
    forecast_df.loc[(key, idx), 'CountryCode'] = country_code
    forecast_df.loc[(key, idx), 'CountryName'] = country_name
    forecast_df.loc[(key, idx), 'ForecastDate'] = forecast_date
    forecast_df.loc[(key, idx), 'Estimated'] = '%.03f' % forecast_data[idx]
    forecast_df.loc[(key, idx), 'ForecastChart'] = forecast_chart.relative_to(ROOT / 'output')
for idx in region['Confirmed'].index:
    forecast_df.loc[(key, idx), 'Confirmed'] = int(region.loc[idx, 'Confirmed'])

# Save output to CSV and JSON
dataframe_output(forecast_df, ROOT, 'usa')
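# Every script here funnels its results through `dataframe_output`. A minimal
# sketch of what it presumably does, inferred from the call sites and comments
# ("Save output to CSV and JSON", metadata_merge='left'): join the records with
# the region/country metadata and write CSV plus JSON under output/. The
# metadata path, join keys, and file layout are all assumptions.
import pandas as pd
from pathlib import Path

def dataframe_output(data: pd.DataFrame, root: Path, name: str = None,
                     metadata_merge: str = 'inner'):
    ''' Merge records with metadata and write them as CSV and JSON. '''
    metadata = pd.read_csv(root / 'input' / 'metadata.csv')  # assumed location
    data = data.merge(metadata, how=metadata_merge)
    stem = root / 'output' / (name or 'world')
    data.to_csv('%s.csv' % stem, index=False)
    data.to_json('%s.json' % stem, orient='records')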
import os
from pathlib import Path
from utils import github_raw_dataframe, dataframe_output, timezone_adjust

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Read DXY CSV file from website
df = github_raw_dataframe('BlankerL/DXY-COVID-19-Data', 'csv/DXYArea.csv')

# Adjust 7 hour difference between China's GMT+8 and GMT+1
df['Date'] = df['updateTime'].apply(lambda date: timezone_adjust(date, 7))

# Rename the appropriate columns
df = df.rename(
    columns={
        'countryEnglishName': 'CountryName',
        'provinceEnglishName': 'RegionName',
        'province_confirmedCount': 'Confirmed',
        'province_deadCount': 'Deaths',
        'province_curedCount': 'Recovered'
    })

# Filter China data only
df = df[df['CountryName'] == 'China']

# This is time series data, get only the last snapshot of each day
df = df.sort_values('updateTime').groupby(
    ['Date', 'CountryName', 'RegionName']).last().reset_index()

# Output the results
dataframe_output(df, ROOT, 'CN')
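# A minimal sketch of the assumed `timezone_adjust` helper: shift a timestamp
# by a fixed number of hours and return the resulting date as an ISO string.
# The input format handling and the sign convention are assumptions.
from datetime import datetime, timedelta

def timezone_adjust(timestamp: str, hour_adjust: int) -> str:
    ''' Adjust a timestamp by the given number of hours and return the ISO date. '''
    # Sign convention assumed: a positive adjustment moves the clock forward
    adjusted = datetime.fromisoformat(timestamp) + timedelta(hours=hour_adjust)
    return adjusted.date().isoformat()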
#!/usr/bin/env python
from pandas import DataFrame
from utils import github_raw_dataframe, dataframe_output

# Read data from GitHub repo
df = github_raw_dataframe('tomwhite/covid-19-uk-data', 'data/covid-19-indicators-uk.csv')

# Aggregate time series data into relational format
records = []
for idx, rows in df.groupby(['Date', 'Country']):
    records.append({
        'Date': idx[0],
        'Country': idx[1],
        **{record.loc['Indicator']: record.loc['Value'] for _, record in rows.iterrows()}
    })
df = DataFrame.from_records(records).rename(columns={
    'Country': '_RegionLabel',
    'ConfirmedCases': 'Confirmed'
})

# Output the results
dataframe_output(df, 'GB')
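# The groupby-and-rebuild loop above can also be expressed with pandas' own
# pivoting, which turns the Indicator/Value pairs into columns directly. This
# is a sketch of an equivalent approach, not the project's actual code; the
# same rename would follow it.
import pandas as pd

def indicators_to_columns(df: pd.DataFrame) -> pd.DataFrame:
    ''' Pivot (Date, Country, Indicator, Value) rows into one column per indicator. '''
    wide = df.pivot_table(index=['Date', 'Country'], columns='Indicator',
                          values='Value', aggfunc='last')
    return wide.reset_index()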
from datetime import datetime
from covid_io import read_argv
from utils import dataframe_output, merge_previous

# Confirmed and deaths come from different CSV files, parse them separately first
confirmed, deaths, prev_data = read_argv()
confirmed = confirmed.rename(columns={
    'fecha': 'Date',
    'CCAA': '_RegionLabel',
    'total': 'Confirmed'
})
deaths = deaths.rename(columns={
    'fecha': 'Date',
    'CCAA': '_RegionLabel',
    'total': 'Deaths'
})

# Now we can simply join them into the same table
df = confirmed.merge(deaths)

# Parse date into a datetime object
df['Date'] = df['Date'].apply(lambda date: datetime.fromisoformat(date).date())

# Add the country code to all records
df['CountryCode'] = 'ES'

# Convert dates to ISO format
df['Date'] = df['Date'].apply(lambda date: date.isoformat())

# Output the results
dataframe_output(df, 'ES')
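# A minimal sketch of the assumed `covid_io.read_argv` helper: load each file
# path passed on the command line into a dataframe, forwarding keyword options
# to the reader (the Spain ISCIII script calls it with encoding='ISO-8859-1').
# The CSV-only assumption and the single-input unpacking are guesses.
import sys
import pandas as pd

def read_argv(**read_opts):
    ''' Read every command-line argument as a CSV file and return the dataframes. '''
    frames = [pd.read_csv(path, **read_opts) for path in sys.argv[1:]]
    # A single input is returned bare so that `df = read_argv()` also works
    return frames[0] if len(frames) == 1 else tuple(frames)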
#!/usr/bin/env python
from datetime import datetime
from covid_io import read_argv
from utils import datetime_isoformat, pivot_table, dataframe_output


def parse_date(date):
    return datetime_isoformat('%s-%d' % (date, datetime.now().year), '%d-%b-%Y')


# Read data from Google Sheets
df = read_argv()
df.columns = df.iloc[0]
df = df.rename(columns={'Provinsi': 'Date'})
df = df.iloc[1:].set_index('Date')
df = df[df.columns.dropna()]
df = pivot_table(df.transpose(), pivot_name='RegionName')
df['Date'] = df['Date'].apply(parse_date)
df = df.dropna(subset=['Date'])
df = df.rename(columns={'Value': 'Confirmed'})
df['Deaths'] = None
df = df.dropna(how='all', subset=['Confirmed', 'Deaths'])

# Output the results
dataframe_output(df, 'ID')
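# A minimal sketch of the assumed `datetime_isoformat` helper: parse a value
# with the given strptime format and return the date in ISO format. Returning
# None on failure is an assumption, inferred from the dropna(subset=['Date'])
# call that follows its use above.
from datetime import datetime

def datetime_isoformat(value: str, date_format: str):
    ''' Parse a date string with the given format and return it as YYYY-MM-DD. '''
    try:
        return datetime.strptime(value, date_format).date().isoformat()
    except ValueError:
        return None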
if tokens[0] == 'Total' or (tokens[0] == 'ESPAÑA' and table_marker):
    break
# Only process tokens from a known region
if tokens[0] in region_list:
    records += parse_record(tokens)
# Exit if we have covered all regions
if len(records) == len(region_list):
    break

# Early exit: no records in the report (2020-03-16 onwards)
if not records:
    print('No records from region found in report')
    sys.exit(1)

# Put resulting records into a dataframe
df = pd.DataFrame.from_records(records).merge(regions, on='_RegionLabel')
df['Date'] = date

# Merge the new data with the existing data (prefer new data if duplicates)
filter_function = lambda row: row['CountryCode'] == 'ES' and not pd.isna(row['RegionCode'])
df = merge_previous(df, ['Date', 'RegionCode'], filter_function)

# Only keep the necessary columns prior to merging with metadata
df = df[['Date', 'RegionCode', 'Confirmed', 'Deaths']]

# Output the results
dataframe_output(df, ROOT, 'es')
# We must use the requests package directly because covidtracking returns 403 otherwise
df = pd.read_json(
    requests.get('https://covidtracking.com/api/states/daily',
                 headers={'User-agent': 'Mozilla/5.0'}).text)

# Rename the appropriate columns
df = df.rename(
    columns={
        'date': 'Date',
        'state': 'Region',
        'positive': 'Confirmed',
        'death': 'Deaths',
        'total': 'Tested'
    })

# Null values are not the same as zero, so keep the numbers as string objects
for col in ('Confirmed', 'Deaths', 'Tested'):
    df[col] = df[col].dropna().astype(int).astype(str)

# Convert date to ISO format
df['Date'] = df['Date'].apply(lambda date: datetime.datetime.strptime(
    str(date), '%Y%m%d').strftime('%Y-%m-%d'))

# Include the country name in the data
df['CountryName'] = 'United States of America'

# Output the results
dataframe_output(df, ROOT, 'usa')
    non_null = [value for value in group if not (isna(value) or isnull(value))]
    return None if not non_null else sum(non_null)


# Add up all the rows with same Date and RegionName
data = data.sort_values(['Date', 'RegionName'])
data = data.drop(columns=['Value']).groupby(
    ['RegionName', 'Date']).agg(aggregate_region_values)
data = data.reset_index().sort_values(['Date', 'RegionName'])

# Compute the cumsum of the values region by region, unless the input is already cumulative
value_columns = ['Confirmed', 'Deaths']
if not args.cumsum:
    for region in data['RegionName'].unique():
        mask = data['RegionName'] == region
        data.loc[mask, value_columns] = data.loc[mask, value_columns].cumsum()

# Get rid of rows which have all null values
data = data.dropna(how='all', subset=value_columns)

# If we don't have deaths data, then make them null rather than zero
if args.null_deaths:
    data['Deaths'] = None

if args.debug:
    print('\nOutput:')
    print(data.head(50))

# Output the results
dataframe_output(data, args.country_code)
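# The `args` flags used above are not defined in this fragment. A plausible
# argparse setup, reconstructed from the attributes the script reads
# (cumsum, null_deaths, debug, country_code); the option names match usage,
# but the help text and defaults are assumptions.
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('country_code', help='ISO code of the country being parsed')
parser.add_argument('--cumsum', action='store_true',
                    help='Set when the input values are already cumulative')
parser.add_argument('--null-deaths', action='store_true',
                    help='Output null instead of zero when deaths are not reported')
parser.add_argument('--debug', action='store_true', help='Print debug output')
args = parser.parse_args()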
from covid_io import read_argv
from utils import datetime_isoformat, dataframe_output

# Retrieve the CSV files from https://covid19.isciii.es
df = read_argv(encoding='ISO-8859-1').rename(columns={
    'FECHA': 'Date',
    'CCAA': 'RegionCode',
    'Fallecidos': 'Deaths'
}).dropna(subset=['Date'])

# Add the country code to all records
df['CountryCode'] = 'ES'

# Confirmed cases are split across 3 columns
confirmed_columns = ['CASOS', 'PCR+', 'TestAc+']
for col in confirmed_columns:
    df[col] = df[col].fillna(0)
df['Confirmed'] = df.apply(lambda x: sum([x[col] for col in confirmed_columns]), axis=1)

# Convert dates to ISO format
df['Date'] = df['Date'].apply(
    lambda date: datetime_isoformat(date, '%d/%m/%Y'))

# Country-wide data is the sum of all regions
region_level = df
country_level = df.groupby(['Date', 'CountryCode']).sum().reset_index()

# Output the results
dataframe_output(country_level)
dataframe_output(region_level, 'ES')
import os
from pathlib import Path
from pandas import DataFrame
from utils import github_raw_dataframe, dataframe_output

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Read data from GitHub repo
df = github_raw_dataframe('tomwhite/covid-19-uk-data', 'data/covid-19-indicators-uk.csv')

# Aggregate time series data into relational format
records = []
for idx, rows in df.groupby(['Date', 'Country']):
    records.append({
        'Date': idx[0],
        'Country': idx[1],
        **{record.loc['Indicator']: record.loc['Value'] for _, record in rows.iterrows()}
    })
df = DataFrame.from_records(records).rename(
    columns={'Country': '_RegionLabel', 'ConfirmedCases': 'Confirmed'})

# Output the results
dataframe_output(df, ROOT, 'GB')
# NOTE: imports reconstructed for this fragment; `unique` is assumed to be numpy's
from datetime import datetime
from numpy import unique
from pandas import DataFrame
from covid_io import read_argv
from utils import dataframe_output

df = read_argv()

# Rename the appropriate columns
df = df.rename(columns={'time_iso8601': 'Date'})

# Convert dates to ISO format
df['Date'] = df['Date'].apply(
    lambda date: datetime.fromisoformat(date).date().isoformat())

# Get a list of all regions
regions = unique([col[3:5] for col in df.columns if col.startswith('DE-')])

# Transform the data from non-tabulated format to our record format
records = []
for idx, row in df.iterrows():
    record = {'Date': row['Date']}
    for region_code in regions:
        records.append({
            'RegionCode': region_code,
            'Confirmed': row['DE-%s_cases' % region_code],
            'Deaths': row['DE-%s_deaths' % region_code],
            **record
        })
df = DataFrame.from_records(records)

# Ensure we only take one record from the table
df = df.groupby(['Date', 'RegionCode']).last().reset_index()

# Output the results
dataframe_output(df, 'DE')
'''
from covid_io import read_argv
from utils import dataframe_output, timezone_adjust

# Read DXY CSV file from website
df = read_argv()

# Adjust 7 hour difference between China's GMT+8 and GMT+1
df['Date'] = df['updateTime'].apply(lambda date: timezone_adjust(date, 7))

# Rename the appropriate columns
df = df.rename(
    columns={
        'countryEnglishName': 'CountryName',
        'provinceEnglishName': 'RegionName',
        'province_confirmedCount': 'Confirmed',
        'province_deadCount': 'Deaths',
        'province_curedCount': 'Recovered'
    })

# Filter China data only
df = df[df['CountryName'] == 'China']

# This is time series data, get only the last snapshot of each day
df = df.sort_values('updateTime').groupby(
    ['Date', 'CountryName', 'RegionName']).last().reset_index()

# Output the results
dataframe_output(df, 'CN')
df = df.reset_index()

# Create a dummy record to be inserted where there is missing data
sample_record = df.iloc[0].copy()
sample_record['Confirmed'] = None
sample_record['Deaths'] = None

# Build the full range of dates, which must be unique in the dataset index
date_range = pd.date_range(FIRST_DATE, df['Date'].max())
date_range = [date.date().isoformat() for date in date_range]

# Backfill the first date with a zero
if FIRST_DATE not in df['Date'].values:
    df = df.set_index('Date')
    df.loc[FIRST_DATE, 'Confirmed'] = 0
    df.loc[FIRST_DATE, 'Deaths'] = 0
    df = df.reset_index()

# Fill all of the country's missing data where numbers did not change
for date in [date for date in date_range if date not in df['Date'].values]:
    inserted_record = sample_record.copy()
    inserted_record['Date'] = date
    df = df.append(inserted_record, ignore_index=True)
df = df.reset_index().sort_values('Date')
for column in ('Confirmed', 'Deaths'):
    df[column] = df[column].ffill()

# Output the results
dataframe_output(df, ROOT, 'world')
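# The insert-and-forward-fill loop above can be expressed more compactly with
# pandas' reindex, which adds the missing dates in one step. A sketch of an
# equivalent approach, not the project's actual code; the zero seeding of the
# first date is omitted here.
import pandas as pd

def fill_missing_dates(df: pd.DataFrame, first_date: str) -> pd.DataFrame:
    ''' Reindex on the full date range and carry the last known totals forward. '''
    full_range = [d.date().isoformat() for d in pd.date_range(first_date, df['Date'].max())]
    df = df.set_index('Date').reindex(full_range)
    # Non-value columns of inserted rows are left empty in this sketch
    df[['Confirmed', 'Deaths']] = df[['Confirmed', 'Deaths']].ffill()
    return df.rename_axis('Date').reset_index()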
import os
from pathlib import Path
from datetime import datetime
import pandas
from utils import github_raw_dataframe, dataframe_output

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Read data from GitHub repo
df = github_raw_dataframe('dssg-pt/covid19pt-data', 'data.csv')
df['Date'] = df['data'].apply(
    lambda date: datetime.strptime(date, '%d-%m-%Y').date().isoformat())

# Extract regions from the data
regions = [col.split('_')[-1] for col in df.columns if col.startswith('confirmados_')]
regions = [region for region in regions
           if len(region) > 2 and region not in ('novos', 'estrangeiro')]

# Aggregate regions into a single data frame
subsets = []
for region in regions:
    subset = df[['Date', 'confirmados_%s' % region, 'obitos_%s' % region]]
    subset = subset.copy()
    subset['_RegionLabel'] = region.replace('ars', '')
    subset = subset.rename(
        columns={'confirmados_%s' % region: 'Confirmed',
                 'obitos_%s' % region: 'Deaths'})
    subsets.append(subset)
df = pandas.concat(subsets)

# Output the results
dataframe_output(df, ROOT, 'PT')
region = region[region.index <= forecast_date]

# Early exit: if there are fewer than DATAPOINT_COUNT_MIN datapoints
# TODO: Draw simple chart with data for visualization without forecast
if len(region) < DATAPOINT_COUNT_MIN:
    continue

# Define the subfolder that will hold the output assets
forecast_chart = ROOT / 'output' / 'charts' / ('%s_%s.svg' % (forecast_date, key))

# Perform forecast
forecast_data = forecast(region['Confirmed'], predict_window)

# Output charts as SVG files
plot_forecast(forecast_chart, region['Confirmed'], forecast_data)

# Output text data to CSV file
for idx in forecast_data.index:
    forecast_df.loc[(key, idx), 'CountryName'] = key_map[key]
    forecast_df.loc[(key, idx), 'ForecastDate'] = forecast_date
    forecast_df.loc[(key, idx), 'Estimated'] = '%.03f' % forecast_data[idx]
    forecast_df.loc[(key, idx), 'ForecastChart'] = forecast_chart.relative_to(ROOT / 'output')
for idx in region['Confirmed'].index:
    forecast_df.loc[(key, idx), 'Confirmed'] = int(region.loc[idx, 'Confirmed'])

# Save output to CSV and JSON
dataframe_output(forecast_df, ROOT, 'world')
import os
from pathlib import Path
from datetime import datetime
from pandas import DataFrame
from utils import github_raw_dataframe, dataframe_output, merge_previous

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

df = github_raw_dataframe('covid-19-au/covid-19-au.github.io',
                          'src/data/state.json', branch='prod').transpose()

# Transform the data from non-tabulated format to record format
records = []
for idx, row in df.iterrows():
    for code in df.columns:
        data = row[code]
        record = {
            'Date': idx.date().isoformat(),
            'RegionCode': code,
            'Confirmed': data[0]
        }
        if len(data) > 1:
            record['Deaths'] = data[1]
        if len(data) > 2:
            record['Recovered'] = data[2]
        if len(data) > 3:
            record['Tested'] = data[3]
        records.append(record)
df = DataFrame.from_records(records)

# Output the results
dataframe_output(df, ROOT, 'AU')
# NOTE: imports reconstructed for this fragment; `unique` is assumed to be numpy's
import os
from pathlib import Path
from datetime import datetime
from numpy import unique
from pandas import DataFrame
from utils import github_raw_dataframe, dataframe_output

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Read CSV file from GitHub project
df = github_raw_dataframe('jgehrcke/covid-19-germany-gae', 'data.csv')

# Rename the appropriate columns
df = df.rename(columns={'time_iso8601': 'Date'})

# Convert dates to ISO format
df['Date'] = df['Date'].apply(
    lambda date: datetime.fromisoformat(date).date().isoformat())

# Get a list of all region codes
regions = unique([col[3:5] for col in df.columns if col.startswith('DE-')])

# Transform the data from non-tabulated format to our record format
records = []
for idx, row in df.iterrows():
    record = {'Date': row['Date'], 'CountryCode': 'DE'}
    for region in regions:
        records.append({
            'RegionCode': region,
            'Confirmed': row['DE-%s_cases' % region],
            'Deaths': row['DE-%s_deaths' % region],
            **record
        })
df = DataFrame.from_records(records)

# Output the results
dataframe_output(df, ROOT, 'de')
import os
import sys
import datetime
from pathlib import Path
import pandas as pd
from utils import github_raw_dataframe, dataframe_output, pivot_table, ROOT

df = github_raw_dataframe('carranco-sga/Mexico-COVID-19', 'Mexico_COVID19.csv')
df = df.rename(columns={'Fecha': 'Date'}).set_index('Date')

# Deaths are reported in <region code>_D columns alongside the confirmed counts
deaths_columns = [col for col in df.columns if col.endswith('_D')]
confirmed_columns = [col[:-2] for col in deaths_columns]
deaths = df[deaths_columns]
confirmed = df[confirmed_columns]
deaths.columns = confirmed.columns

# Pivot the wide date-by-region tables into long format
deaths = pivot_table(
    deaths, pivot_name='RegionCode').rename(columns={'Value': 'Deaths'})
confirmed = pivot_table(
    confirmed, pivot_name='RegionCode').rename(columns={'Value': 'Confirmed'})
df = confirmed.merge(deaths).sort_values(['Date', 'RegionCode'])

# Output the results
dataframe_output(df, 'MX')
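# A minimal sketch of the assumed `pivot_table` helper used above and in the
# Indonesia script: melt a date-indexed, one-column-per-category table into
# long (Date, <pivot_name>, Value) records. The exact semantics are an
# assumption inferred from the call sites.
import pandas as pd

def pivot_table(data: pd.DataFrame, pivot_name: str = 'Pivot') -> pd.DataFrame:
    ''' Convert a wide date-by-category table into (Date, Pivot, Value) rows. '''
    records = []
    for idx, row in data.iterrows():
        for pivot in data.columns:
            records.append({'Date': idx, pivot_name: pivot, 'Value': row[pivot]})
    return pd.DataFrame.from_records(records)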
df = confirmed.merge(deaths)

# Parse date into a datetime object
df['Date'] = df['Date'].apply(lambda date: datetime.fromisoformat(date).date())

# Offset date by 1 day to match the ECDC report
if not is_region:
    df['Date'] = df['Date'].apply(lambda date: date + timedelta(days=1))

# Convert dates to ISO format
df['Date'] = df['Date'].apply(lambda date: date.isoformat())

# Add the country code to all records
df['CountryCode'] = 'ES'

# Country-level data is embedded as "Total" in the CSV files
if is_region:
    df = df[df['_RegionLabel'] != 'Total']
else:
    df['RegionCode'] = None
    df = df[df['_RegionLabel'] == 'Total']
    df = df.drop(columns=['_RegionLabel'])

# Merge the new data with the existing data (prefer new data if duplicates)
if not is_region:
    filter_function = lambda row: row['CountryCode'] == 'ES' and pandas.isna(row['RegionCode'])
    df = merge_previous(df, ['Date', 'CountryCode'], filter_function)

# Output the results
dataframe_output(df, ROOT, 'es' if is_region else 'world')
import os
import sys
import datetime
from pathlib import Path
from utils import github_raw_dataframe, dataframe_output

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Read CSV file from covidtracking's GitHub project
df = github_raw_dataframe('COVID19Tracking/covid-tracking-data',
                          'data/states_daily_4pm_et.csv')

# Rename the appropriate columns
df = df.rename(
    columns={
        'date': 'Date',
        'state': 'RegionCode',
        'positive': 'Confirmed',
        'death': 'Deaths',
        'total': 'Tested'
    })

# Convert date to ISO format
df['Date'] = df['Date'].apply(lambda date: datetime.datetime.strptime(
    str(date), '%Y%m%d').date().isoformat())

# Output the results
dataframe_output(df, ROOT, 'US')
# ECDC reports Greece under the code EL instead of the ISO code GR
df['GeoId'] = df['GeoId'].apply(lambda code: 'GR' if code == 'EL' else code)

# Workaround for https://github.com/open-covid-19/data/issues/13
# ECDC mistakenly labels the United Kingdom country code as UK instead of GB
df['GeoId'] = df['GeoId'].apply(lambda code: 'GB' if code == 'UK' else code)

# Workaround for https://github.com/open-covid-19/data/issues/12
# ECDC data for Italy is simply wrong, so Italy's data is parsed from a different source
# ECDC data for Spain is delayed by two days because of a reporting time mismatch, so it is parsed separately
df = df[(df['GeoId'] != 'IT') & (df['GeoId'] != 'ES')]

# Compute the cumsum of values
columns = ['DateRep', 'GeoId', 'Confirmed', 'Deaths']
df_ = pd.DataFrame(columns=columns)
for country in df['GeoId'].unique():
    subset = df[df['GeoId'] == country].copy()
    subset['Confirmed'] = subset['Cases'].cumsum()
    subset['Deaths'] = subset['Deaths'].cumsum()
    df_ = pd.concat([df_, subset[columns]])
df_ = df_[columns]
df_.columns = ['Date', 'CountryCode', 'Confirmed', 'Deaths']
df = df_

# Make sure all data types are appropriately casted
df['Confirmed'] = df['Confirmed'].fillna(0).astype(int)
df['Deaths'] = df['Deaths'].fillna(0).astype(int)

# Output the results
dataframe_output(df, ROOT, 'world', metadata_merge='left')
from datetime import datetime
import pandas
from covid_io import read_argv
from utils import dataframe_output

# Read data from GitHub repo
# https://raw.github.com/dssg-pt/covid19pt-data/master/data.csv
df = read_argv()
df['Date'] = df['data'].apply(
    lambda date: datetime.strptime(date, '%d-%m-%Y').date().isoformat())

# Extract regions from the data
regions = [col.split('_')[-1] for col in df.columns if col.startswith('confirmados_')]
regions = [region for region in regions
           if len(region) > 2 and region not in ('novos', 'estrangeiro')]

# Aggregate regions into a single data frame
subsets = []
for region in regions:
    subset = df[['Date', 'confirmados_%s' % region, 'obitos_%s' % region]]
    subset = subset.copy()
    subset['_RegionLabel'] = region.replace('ars', '')
    subset = subset.rename(
        columns={'confirmados_%s' % region: 'Confirmed',
                 'obitos_%s' % region: 'Deaths'})
    subsets.append(subset)
df = pandas.concat(subsets)

# Output the results
dataframe_output(df, 'PT')
This script loads the latest data from the covidtracking.com team's GitHub
repository and extracts the confirmed cases, deaths and total tests for each
state.

Credit to the covidtracking.com team for scraping the data from each state.
'''
import sys
import datetime
from utils import github_raw_dataframe, dataframe_output

# Read CSV file from covidtracking's GitHub project
df = github_raw_dataframe('COVID19Tracking/covid-tracking-data',
                          'data/states_daily_4pm_et.csv')

# Rename the appropriate columns
df = df.rename(
    columns={
        'date': 'Date',
        'state': 'RegionCode',
        'positive': 'Confirmed',
        'death': 'Deaths',
        'total': 'Tested'
    })

# Convert date to ISO format
df['Date'] = df['Date'].apply(lambda date: datetime.datetime.strptime(
    str(date), '%Y%m%d').date().isoformat())

# Output the results
dataframe_output(df, 'US')
# Workaround for https://github.com/open-covid-19/data/issues/13
# ECDC mistakenly labels the United Kingdom country code as UK instead of GB
df['geoId'] = df['geoId'].apply(lambda code: 'GB' if code == 'UK' else code)

# Workaround for https://github.com/open-covid-19/data/issues/12
# ECDC data for Italy is simply wrong, so Italy's data is parsed from a different source
# ECDC data for Spain is delayed by two days because of a reporting time mismatch, so it is parsed separately
df = df[df['geoId'] != 'ES']
df = df[df['geoId'] != 'IT']

# Compute the cumsum of values
columns = ['Date', 'CountryCode', 'Confirmed', 'Deaths']
df_ = pd.DataFrame(columns=columns)
for country in df['geoId'].unique():
    subset = df[df['geoId'] == country].copy()
    subset['CountryCode'] = subset['geoId']
    subset['Date'] = subset['dateRep'].apply(
        lambda date: datetime.strptime(date, '%d/%m/%Y').date().isoformat())
    subset = subset.sort_values('Date')
    subset['Confirmed'] = subset['cases'].cumsum()
    subset['Deaths'] = subset['deaths'].cumsum()
    df_ = pd.concat([df_, subset[columns]])
df = df_

# Make sure all data types are appropriately casted
df['Confirmed'] = df['Confirmed'].fillna(0).astype(int)
df['Deaths'] = df['Deaths'].fillna(0).astype(int)

# Output the results
dataframe_output(df)
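# The per-country loop above can be written without explicit iteration using
# pandas' grouped cumulative sums. A sketch of an equivalent approach under
# the same column names, not the project's actual code:
import pandas as pd
from datetime import datetime

def ecdc_cumsum(df: pd.DataFrame) -> pd.DataFrame:
    ''' Turn ECDC daily case/death counts into cumulative totals per country. '''
    df = df.copy()
    df['CountryCode'] = df['geoId']
    df['Date'] = df['dateRep'].apply(
        lambda date: datetime.strptime(date, '%d/%m/%Y').date().isoformat())
    # Sort chronologically within each country before accumulating
    df = df.sort_values(['CountryCode', 'Date'])
    grouped = df.groupby('CountryCode')
    df['Confirmed'] = grouped['cases'].cumsum()
    df['Deaths'] = grouped['deaths'].cumsum()
    return df[['Date', 'CountryCode', 'Confirmed', 'Deaths']]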
#!/usr/bin/env python
import pandas
import datetime
from covid_io import read_argv
from utils import dataframe_output

# Read the CSV file passed as a command-line argument
data = read_argv()

# Rename the appropriate columns
data = data.rename(
    columns={
        'date': 'Date',
        'prname': '_RegionLabel',
        'numconf': 'Confirmed',
        'numdeaths': 'Deaths',
        'numtested': 'Tested'
    })

# Convert date to ISO format
data['Date'] = data['Date'].apply(lambda date: datetime.datetime.strptime(
    date, '%d-%m-%Y').date().isoformat())

# Output the results
dataframe_output(data, 'CA')
#!/usr/bin/env python
from datetime import datetime
from pandas import DataFrame
from covid_io import read_argv
from utils import dataframe_output

# Read data from GitHub repo
confirmed, deaths = read_argv()
for df in (confirmed, deaths):
    df.rename(columns={'Unnamed: 1': 'RegionCode'}, inplace=True)
    df.set_index('RegionCode', inplace=True)

# Transform the data from non-tabulated format to record format
records = []
for region_code in confirmed.index.unique():
    for col in confirmed.columns[1:]:
        date = col + '/' + str(datetime.now().year)
        date = datetime.strptime(date, '%d/%m/%Y').date().isoformat()
        records.append({
            'Date': date,
            'RegionCode': region_code,
            'Confirmed': confirmed.loc[region_code, col],
            'Deaths': deaths.loc[region_code, col]
        })
df = DataFrame.from_records(records)

# Output the results
dataframe_output(df, 'BR')
# ECDC mistakenly labels the United Kingdom country code as UK instead of GB
df['geoId'] = df['geoId'].apply(lambda code: 'GB' if code == 'UK' else code)

# Workaround for https://github.com/open-covid-19/data/issues/12
# ECDC data for Italy is simply wrong, so Italy's data is parsed from a different source
# ECDC data for Spain is delayed by two days because of a reporting time mismatch, so it is parsed separately
df = df[df['geoId'] != 'ES']
df = df[df['geoId'] != 'IT']

# Compute the cumsum of values
columns = ['Date', 'CountryCode', 'Confirmed', 'Deaths']
df_ = pd.DataFrame(columns=columns)
for country in df['geoId'].unique():
    subset = df[df['geoId'] == country].copy()
    subset['CountryCode'] = subset['geoId']
    subset['Date'] = subset['dateRep'].apply(
        lambda date: datetime.strptime(date, '%d/%m/%Y').date().isoformat())
    subset = subset.sort_values('Date')
    subset['Confirmed'] = subset['cases'].cumsum()
    subset['Deaths'] = subset['deaths'].cumsum()
    df_ = pd.concat([df_, subset[columns]])
df = df_

# Make sure all data types are appropriately casted
df['Confirmed'] = df['Confirmed'].fillna(0).astype(int)
df['Deaths'] = df['Deaths'].fillna(0).astype(int)

# Output the results
df['RegionCode'] = None
dataframe_output(df, ROOT)
import os
from pathlib import Path
from datetime import datetime
from pandas import DataFrame
from utils import github_raw_dataframe, dataframe_output

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Read data from GitHub repo
confirmed = github_raw_dataframe('elhenrico/covid19-Brazil-timeseries', 'confirmed-cases.csv')
deaths = github_raw_dataframe('elhenrico/covid19-Brazil-timeseries', 'deaths.csv')
for df in (confirmed, deaths):
    df.rename(columns={'Unnamed: 1': 'RegionCode'}, inplace=True)
    df.set_index('RegionCode', inplace=True)

# Transform the data from non-tabulated format to record format
records = []
for region_code in confirmed.index.unique():
    for col in confirmed.columns[1:]:
        date = col + '/' + str(datetime.now().year)
        date = datetime.strptime(date, '%d/%m/%Y').date().isoformat()
        records.append({
            'Date': date,
            'RegionCode': region_code,
            'Confirmed': confirmed.loc[region_code, col],
            # Bug fix: deaths must come from the deaths table, not confirmed
            'Deaths': deaths.loc[region_code, col]
        })
df = DataFrame.from_records(records)

# Output the results
dataframe_output(df, ROOT, 'BR')