Ejemplo n.º 1
0
# Join the confirmed and deaths tables on the columns they share
df = confirmed.merge(deaths)


def _to_output_date(date):
    # Parse the ISO string; country-level dates are moved one day forward to
    # match the ECDC report, region-level dates are left untouched
    parsed = datetime.fromisoformat(date).date()
    if not is_region:
        parsed = parsed + timedelta(days=1)
    return parsed.isoformat()


# Normalize every date to an ISO formatted string
df['Date'] = df['Date'].apply(_to_output_date)

# Tag every record with the country code
df['CountryCode'] = 'ES'

# Country-level data is embedded as a "Total" pseudo-region in the CSV files
is_total = df['_RegionLabel'] == 'Total'
if is_region:
    df = df[~is_total]
else:
    df = df[is_total].drop(columns=['_RegionLabel'])

    # Merge the new data with the existing data (prefer new data if duplicates)
    def filter_function(row):
        return row['CountryCode'] == 'ES' and pandas.isna(row['RegionCode'])

    df = merge_previous(df, ['Date', 'CountryCode'], filter_function)

# Output the results
output_code = 'ES' if is_region else None
dataframe_output(df, ROOT, output_code)
Ejemplo n.º 2
0
# Previously published data, read from the file given as the fourth argument
previous = read_file(sys.argv[4])

# Confirmed cases are split into age groups: add all groups up per date and region
keys = ['RegionCode', 'Date']
confirmed = (confirmed.set_index(keys)
             .groupby(['Date', 'RegionCode'])
             .sum()
             .reset_index())

# Outer join, so rows present in only one of the two tables are kept
data = confirmed.merge(deaths, how='outer')

# Map the department to the region
data['RegionCode'] = data['RegionCode'].apply(lambda department: dep_map.get(department))

# Data is new cases: drop rows lacking a key, then accumulate into totals
indexed = data.dropna(subset=keys).set_index(keys)
data = cumsum_table(indexed).reset_index()

# Keep only the prior records for France that have a region code
is_france = previous['CountryCode'] == 'FR'
has_region = ~previous['RegionCode'].isna()
previous = previous[is_france & has_region]
data = merge_previous(data, previous, ['Date', 'RegionCode'])

# New data is incomplete for Confirmed, so fully populated prior rows
# overwrite the rows with the same key in the new data
data = data.set_index(keys)
previous = previous.set_index(keys).dropna()
data.loc[previous.index] = previous

# Output the results
dataframe_output(data.reset_index(), 'FR')
Ejemplo n.º 3
0
    region = region[region.index <= forecast_date]

    # Early exit: If there are less than DATAPOINT_COUNT_MIN datapoints
    if len(region) < DATAPOINT_COUNT_MIN: continue

    # Define the subfolder that will hold the output assets
    forecast_chart = ROOT / 'output' / 'charts' / ('%s_US_%s.svg' %
                                                   (forecast_date, key))

    # Perform forecast
    forecast_data = forecast(region['Confirmed'], predict_window)

    # Output charts as SVG files
    plot_forecast(forecast_chart, region['Confirmed'], forecast_data)

    # Aggregate forecast data
    for idx in forecast_data.index:
        forecast_df.loc[(key, idx), 'CountryCode'] = country_code
        forecast_df.loc[(key, idx), 'CountryName'] = country_name
        forecast_df.loc[(key, idx), 'ForecastDate'] = forecast_date
        forecast_df.loc[(key, idx), 'Estimated'] = '%.03f' % forecast_data[idx]
        forecast_df.loc[(key, idx),
                        'ForecastChart'] = forecast_chart.relative_to(ROOT /
                                                                      'output')
    for idx in region['Confirmed'].index:
        forecast_df.loc[(key, idx), 'Confirmed'] = int(region.loc[idx,
                                                                  'Confirmed'])

# Save output to CSV and JSON
dataframe_output(forecast_df, ROOT, 'usa')
Ejemplo n.º 4
0
from utils import github_raw_dataframe, dataframe_output, timezone_adjust

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Download the DXY area CSV from the BlankerL/DXY-COVID-19-Data repository
df = github_raw_dataframe('BlankerL/DXY-COVID-19-Data', 'csv/DXYArea.csv')

# Adjust the 7 hour difference between China's GMT+8 and GMT+1
df['Date'] = df['updateTime'].apply(lambda stamp: timezone_adjust(stamp, 7))

# Rename the source columns to the names used in the output
column_names = {
    'countryEnglishName': 'CountryName',
    'provinceEnglishName': 'RegionName',
    'province_confirmedCount': 'Confirmed',
    'province_deadCount': 'Deaths',
    'province_curedCount': 'Recovered',
}
df = df.rename(columns=column_names)

# Keep the records for China only
df = df[df['CountryName'] == 'China']

# This is time series data: after ordering by update time, the last row of
# each (Date, CountryName, RegionName) group is that day's final snapshot
snapshot_keys = ['Date', 'CountryName', 'RegionName']
df = df.sort_values('updateTime')
df = df.groupby(snapshot_keys).last().reset_index()

# Output the results
dataframe_output(df, ROOT, 'CN')
Ejemplo n.º 5
0
#!/usr/bin/env python

from pandas import DataFrame
from utils import github_raw_dataframe, dataframe_output

# Download the UK indicators table from the tomwhite/covid-19-uk-data repository
df = github_raw_dataframe('tomwhite/covid-19-uk-data',
                          'data/covid-19-indicators-uk.csv')

# Aggregate time series data into relational format: one record per
# (Date, Country) pair, with one field per indicator found in that group
records = []
for (date, country), rows in df.groupby(['Date', 'Country']):
    record = {'Date': date, 'Country': country}
    for _, row in rows.iterrows():
        record[row.loc['Indicator']] = row.loc['Value']
    records.append(record)

renamed_columns = {'Country': '_RegionLabel', 'ConfirmedCases': 'Confirmed'}
df = DataFrame.from_records(records).rename(columns=renamed_columns)

# Output the results
dataframe_output(df, 'GB')
Ejemplo n.º 6
0
from covid_io import read_argv
from utils import dataframe_output, merge_previous

# Confirmed and deaths come from different CSV files; a third input is read
# as well but is not used below
confirmed, deaths, prev_data = read_argv()

# Both files share their date and region columns; only the meaning of
# 'total' differs between them
shared_columns = {'fecha': 'Date', 'CCAA': '_RegionLabel'}
confirmed = confirmed.rename(columns={**shared_columns, 'total': 'Confirmed'})
deaths = deaths.rename(columns={**shared_columns, 'total': 'Deaths'})

# Now we can simply join them into the same table
df = confirmed.merge(deaths)

# Parse each date and write it back as an ISO formatted date (time dropped)
df['Date'] = df['Date'].apply(
    lambda date: datetime.fromisoformat(date).date().isoformat())

# Add the country code to all records
df['CountryCode'] = 'ES'

# Output the results
dataframe_output(df, 'ES')
Ejemplo n.º 7
0
#!/usr/bin/env python

from datetime import datetime
from datetime import datetime
from covid_io import read_argv
from utils import datetime_isoformat, pivot_table, dataframe_output


def parse_date(date):
    """Append the current year to `date` and hand it to `datetime_isoformat`.

    The combined text is '<date>-<year>' and is passed along with the
    '%d-%b-%Y' format; the helper's result is returned unchanged.

    NOTE(review): the year is always taken from the clock at run time, so
    labels belonging to a different year would get the wrong year -- confirm
    this is acceptable for the input sheet.
    """
    current_year = datetime.now().year
    labelled = '%s-%d' % (date, current_year)
    return datetime_isoformat(labelled, '%d-%b-%Y')


# Read the input table (exported from Google Sheets, per the original note)
df = read_argv()

# The first data row holds the real header; its 'Provinsi' column is used as
# the date index and the header row itself is discarded
header_row = df.iloc[0]
df.columns = header_row
df = df.rename(columns={'Provinsi': 'Date'}).iloc[1:].set_index('Date')

# Discard the columns whose header is null
named_columns = df.columns.dropna()
df = df[named_columns]

# Transpose, then pivot into one row per value with a RegionName column
df = pivot_table(df.transpose(), pivot_name='RegionName')

# Convert the date labels and drop the rows whose date came back null
df['Date'] = df['Date'].apply(parse_date)
df = df.dropna(subset=['Date']).rename(columns={'Value': 'Confirmed'})

# No deaths are provided by this source; rows without any value are dropped
df['Deaths'] = None
df = df.dropna(how='all', subset=['Confirmed', 'Deaths'])

# Output the results
dataframe_output(df, 'ID')
Ejemplo n.º 8
0
    if tokens[0] == 'Total' or tokens[0] == 'ESPAÑA' and table_marker:
        break

    # Only process tokens from known region
    if tokens[0] in region_list:
        records += parse_record(tokens)

    # Exit if we have covered all regions
    if len(records) == len(region_list):
        break

# Early exit: no records in the report (2020-03-16 onwards)
if not records:
    print('No records from region found in report')
    sys.exit(1)

# Put resulting records into a dataframe and attach the region metadata
parsed = pd.DataFrame.from_records(records)
df = parsed.merge(regions, on='_RegionLabel')
df['Date'] = date


# Selects the existing Spanish rows that carry a region code
def filter_function(row):
    return row['CountryCode'] == 'ES' and not pd.isna(row['RegionCode'])


# Merge the new data with the existing data (prefer new data if duplicates)
df = merge_previous(df, ['Date', 'RegionCode'], filter_function)

# Only keep the necessary columns prior to merging with metadata
output_columns = ['Date', 'RegionCode', 'Confirmed', 'Deaths']
df = df[output_columns]

# Output the results
dataframe_output(df, ROOT, 'es')
# We must use the requests package directly (with a browser-like User-agent)
# because covidtracking returns 403 otherwise
response = requests.get('https://covidtracking.com/api/states/daily',
                        headers={'User-agent': 'Mozilla/5.0'})
df = pd.read_json(response.text)

# Rename the appropriate columns
column_names = {
    'date': 'Date',
    'state': 'Region',
    'positive': 'Confirmed',
    'death': 'Deaths',
    'total': 'Tested',
}
df = df.rename(columns=column_names)

# Null values are not the same as zero: only the non-null values are cast to
# integer strings, and the assignment leaves the remaining rows null
for col in ('Confirmed', 'Deaths', 'Tested'):
    present = df[col].dropna()
    df[col] = present.astype(int).astype(str)


def _iso_date(date):
    # Dates arrive as YYYYMMDD values; reformat them as YYYY-MM-DD
    return datetime.datetime.strptime(str(date), '%Y%m%d').strftime('%Y-%m-%d')


# Convert date to ISO format
df['Date'] = df['Date'].apply(_iso_date)

# Include the country name in the data
df['CountryName'] = 'United States of America'

# Output the results
dataframe_output(df, ROOT, 'usa')
Ejemplo n.º 10
0
    non_null = [value for value in group if not (isna(value) or isnull(value))]
    return None if not non_null else sum(non_null)


# Add up all the rows with same Date and RegionName
data = data.sort_values(['Date', 'RegionName'])
# The 'Value' column is dropped before grouping; each remaining column is
# reduced per (RegionName, Date) group by aggregate_region_values
data = data.drop(columns=['Value']).groupby(['RegionName', 'Date'
                                             ]).agg(aggregate_region_values)
# Back to a flat table in date order, the order the per-region cumsum below
# walks through
data = data.reset_index().sort_values(['Date', 'RegionName'])

# Compute cumsum of the values region by region
# NOTE(review): presumably args.cumsum means the input is already
# cumulative, so the running total is only built when it is unset -- confirm
value_columns = ['Confirmed', 'Deaths']
if not args.cumsum:
    for region in data['RegionName'].unique():
        # Rows of the current region only
        mask = data['RegionName'] == region
        data.loc[mask, value_columns] = data.loc[mask, value_columns].cumsum()

# Get rid of rows which have all null values
data = data.dropna(how='all', subset=value_columns)

# If we don't have deaths data, then make them null rather than zero
if args.null_deaths:
    data['Deaths'] = None

# Optional preview of the first 50 output rows
if args.debug:
    print('\nOutput:')
    print(data.head(50))

# Output the results
dataframe_output(data, args.country_code)
Ejemplo n.º 11
0
# Retrieve the CSV files from https://covid19.isciii.es
column_names = {'FECHA': 'Date', 'CCAA': 'RegionCode', 'Fallecidos': 'Deaths'}
df = read_argv(encoding='ISO-8859-1')
df = df.rename(columns=column_names).dropna(subset=['Date'])

# Add the country code to all records
df['CountryCode'] = 'ES'

# Confirmed cases are split across 3 columns; missing values count as zero
confirmed_columns = ['CASOS', 'PCR+', 'TestAc+']
for col in confirmed_columns:
    df[col] = df[col].fillna(0)


def _total_confirmed(row):
    # Sum of the three confirmed-case columns for a single row
    return sum([row[col] for col in confirmed_columns])


df['Confirmed'] = df.apply(_total_confirmed, axis=1)

# Convert dates (given as day/month/year) to ISO format
df['Date'] = df['Date'].apply(
    lambda date: datetime_isoformat(date, '%d/%m/%Y'))

# Country-wide is the sum of all regions for each date
region_level = df
country_level = df.groupby(['Date', 'CountryCode']).sum().reset_index()

# Output the results: country level first, then region level under 'ES'
dataframe_output(country_level)
dataframe_output(region_level, 'ES')
import os
from pathlib import Path

from pandas import DataFrame

from utils import github_raw_dataframe, dataframe_output

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Download the UK indicators table from the tomwhite/covid-19-uk-data repository
df = github_raw_dataframe('tomwhite/covid-19-uk-data', 'data/covid-19-indicators-uk.csv')


def _to_record(group_key, rows):
    # One record per (Date, Country) group, with one field per indicator row
    record = {'Date': group_key[0], 'Country': group_key[1]}
    for _, row in rows.iterrows():
        record[row.loc['Indicator']] = row.loc['Value']
    return record


# Aggregate time series data into relational format
records = [_to_record(idx, rows) for idx, rows in df.groupby(['Date', 'Country'])]
renamed_columns = {'Country': '_RegionLabel', 'ConfirmedCases': 'Confirmed'}
df = DataFrame.from_records(records).rename(columns=renamed_columns)

# Output the results
dataframe_output(df, ROOT, 'GB')
Ejemplo n.º 13
0
# Load the input table and rename the timestamp column
df = read_argv().rename(columns={'time_iso8601': 'Date'})

# Reduce each ISO timestamp to its ISO formatted date
df['Date'] = df['Date'].apply(
    lambda stamp: datetime.fromisoformat(stamp).date().isoformat())

# Region codes are characters 3-4 of every column named 'DE-..'
regions = unique([col[3:5] for col in df.columns if col.startswith('DE-')])

# Transform the wide table (one pair of columns per region) into one record
# per row and region
records = [
    {
        'RegionCode': region_code,
        'Confirmed': row['DE-%s_cases' % region_code],
        'Deaths': row['DE-%s_deaths' % region_code],
        'Date': row['Date'],
    }
    for _, row in df.iterrows()
    for region_code in regions
]
df = DataFrame.from_records(records)

# Several rows can share a date; keep the last one per date and region
df = df.groupby(['Date', 'RegionCode']).last().reset_index()

# Output the results
dataframe_output(df, 'DE')
Ejemplo n.º 14
0
'''

from covid_io import read_argv
from utils import dataframe_output, timezone_adjust

# Read the DXY CSV data passed to the script
df = read_argv()

# Adjust the 7 hour difference between China's GMT+8 and GMT+1
df['Date'] = df['updateTime'].apply(lambda stamp: timezone_adjust(stamp, 7))

# Rename the source columns to the names used in the output
df = df.rename(columns={
    'countryEnglishName': 'CountryName',
    'provinceEnglishName': 'RegionName',
    'province_confirmedCount': 'Confirmed',
    'province_deadCount': 'Deaths',
    'province_curedCount': 'Recovered',
})

# Keep the records for China only
is_china = df['CountryName'] == 'China'
df = df[is_china]

# This is time series data: order by update time so that the last row of
# each (Date, CountryName, RegionName) group is that day's final snapshot
ordered = df.sort_values('updateTime')
grouped = ordered.groupby(['Date', 'CountryName', 'RegionName'])
df = grouped.last().reset_index()

# Output the results
dataframe_output(df, 'CN')
Ejemplo n.º 15
0
    df = df.reset_index()

# Create a dummy record to be inserted where there is missing data: a copy of
# the first row with its counts blanked out
sample_record = df.iloc[0].copy()
sample_record['Confirmed'] = None
sample_record['Deaths'] = None

# Every date that must be present, as ISO strings, from FIRST_DATE up to the
# latest date found in the data
date_range = pd.date_range(FIRST_DATE, df['Date'].max())
date_range = [date.date().isoformat() for date in date_range]

# Backfill the first date with a zero
if FIRST_DATE not in df['Date'].values:
    df = df.set_index('Date')
    df.loc[FIRST_DATE, 'Confirmed'] = 0
    df.loc[FIRST_DATE, 'Deaths'] = 0
    df = df.reset_index()

# Fill all of country's missing data where numbers did not change.
# All placeholder rows are built first and added with a single concat:
# DataFrame.append no longer exists in pandas 2.0+, and appending one row at
# a time copies the whole table for every missing date.
known_dates = set(df['Date'].values)
missing_dates = [date for date in date_range if date not in known_dates]
if missing_dates:
    placeholders = pd.DataFrame([sample_record] * len(missing_dates))
    placeholders = placeholders.reset_index(drop=True)
    placeholders['Date'] = missing_dates
    df = pd.concat([df, placeholders], ignore_index=True)

# In date order, carry the last known counts forward into the placeholders
df = df.reset_index().sort_values('Date')
for column in ('Confirmed', 'Deaths'):
    df[column] = df[column].ffill()

# Output the results
dataframe_output(df, ROOT, 'world')
Ejemplo n.º 16
0
from datetime import datetime

import pandas

from utils import github_raw_dataframe, dataframe_output

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Read data from GitHub repo; dates arrive as day-month-year
df = github_raw_dataframe('dssg-pt/covid19pt-data', 'data.csv')
df['Date'] = df['data'].apply(lambda date: datetime.strptime(date, '%d-%m-%Y').date().isoformat())

# Extract regions from the data: the last '_' separated token of every
# 'confirmados_' column, skipping short tokens and the non-region ones
excluded_suffixes = ('novos', 'estrangeiro')
suffixes = [col.split('_')[-1] for col in df.columns if col.startswith('confirmados_')]
regions = [suffix for suffix in suffixes
           if len(suffix) > 2 and suffix not in excluded_suffixes]


def _region_subset(table, region):
    # Date plus this region's two columns, renamed and labelled
    confirmed_col = 'confirmados_%s' % region
    deaths_col = 'obitos_%s' % region
    subset = table[['Date', confirmed_col, deaths_col]].copy()
    subset['_RegionLabel'] = region.replace('ars', '')
    return subset.rename(columns={confirmed_col: 'Confirmed', deaths_col: 'Deaths'})


# Aggregate regions into a single data frame
subsets = [_region_subset(df, region) for region in regions]
df = pandas.concat(subsets)

# Output the results
dataframe_output(df, ROOT, 'PT')
Ejemplo n.º 17
0
    region = region[region.index <= forecast_date]

    # Early exit: If there are less than DATAPOINT_COUNT_MIN datapoints
    # TODO: Draw simple chart with data for visualization without forecast
    if len(region) < DATAPOINT_COUNT_MIN: continue

    # Define the subfolder that will hold the output assets
    forecast_chart = ROOT / 'output' / 'charts' / ('%s_%s.svg' %
                                                   (forecast_date, key))

    # Perform forecast
    forecast_data = forecast(region['Confirmed'], predict_window)

    # Output charts as SVG files
    plot_forecast(forecast_chart, region['Confirmed'], forecast_data)

    # Output text data to CSV file
    for idx in forecast_data.index:
        forecast_df.loc[(key, idx), 'CountryName'] = key_map[key]
        forecast_df.loc[(key, idx), 'ForecastDate'] = forecast_date
        forecast_df.loc[(key, idx), 'Estimated'] = '%.03f' % forecast_data[idx]
        forecast_df.loc[(key, idx),
                        'ForecastChart'] = forecast_chart.relative_to(ROOT /
                                                                      'output')
    for idx in region['Confirmed'].index:
        forecast_df.loc[(key, idx), 'Confirmed'] = int(region.loc[idx,
                                                                  'Confirmed'])

# Save output to CSV and JSON
dataframe_output(forecast_df, ROOT, 'world')
Ejemplo n.º 18
0
from datetime import datetime

from pandas import DataFrame

from utils import github_raw_dataframe, dataframe_output, merge_previous

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Download the state data from the 'prod' branch and transpose it
df = github_raw_dataframe('covid-19-au/covid-19-au.github.io',
                          'src/data/state.json',
                          branch='prod').transpose()

# Each cell holds a sequence of values: the first one is always read as
# Confirmed, the following ones are only present for some cells
optional_fields = ('Deaths', 'Recovered', 'Tested')

# Transform the data from non-tabulated format to record format
records = []
for idx, row in df.iterrows():
    for code in df.columns:
        data = row[code]
        record = {
            'Date': idx.date().isoformat(),
            'RegionCode': code,
            'Confirmed': data[0],
        }
        for position, field in enumerate(optional_fields, start=1):
            if len(data) > position:
                record[field] = data[position]
        records.append(record)
df = DataFrame.from_records(records)

# Output the results
dataframe_output(df, ROOT, 'AU')
Ejemplo n.º 19
0
ROOT = Path(os.path.dirname(__file__)) / '..'

# Read CSV file from GitHub project and rename the timestamp column
df = github_raw_dataframe('jgehrcke/covid-19-germany-gae', 'data.csv')
df = df.rename(columns={'time_iso8601': 'Date'})

# Reduce each ISO timestamp to its ISO formatted date
df['Date'] = df['Date'].apply(
    lambda stamp: datetime.fromisoformat(stamp).date().isoformat())

# Region codes are characters 3-4 of every column named 'DE-..'
regions = unique([col[3:5] for col in df.columns if col.startswith('DE-')])

# Transform the wide table (one pair of columns per region) into one record
# per row and region
records = [
    {
        'RegionCode': region,
        'Confirmed': row['DE-%s_cases' % region],
        'Deaths': row['DE-%s_deaths' % region],
        'Date': row['Date'],
        'CountryCode': 'DE',
    }
    for _, row in df.iterrows()
    for region in regions
]
df = DataFrame.from_records(records)

# Output the results
dataframe_output(df, ROOT, 'de')
Ejemplo n.º 20
0
import os
import sys
import datetime
from pathlib import Path
import pandas as pd
from utils import github_raw_dataframe, dataframe_output, pivot_table, ROOT

# Download the Mexico time series and index it by date
df = github_raw_dataframe('carranco-sga/Mexico-COVID-19', 'Mexico_COVID19.csv')
df = df.rename(columns={'Fecha': 'Date'}).set_index('Date')

# Deaths columns end in '_D'; the matching confirmed column has the same
# name without that two-character suffix
deaths_columns = [col for col in df.columns if col.endswith('_D')]
confirmed_columns = [col[:-2] for col in deaths_columns]

# Split into two tables that share the same column labels
deaths = df[deaths_columns]
confirmed = df[confirmed_columns]
deaths.columns = confirmed.columns

# Pivot each table, naming the pivoted column RegionCode and the values
# after the table they came from
deaths = pivot_table(deaths, pivot_name='RegionCode')
deaths = deaths.rename(columns={'Value': 'Deaths'})
confirmed = pivot_table(confirmed, pivot_name='RegionCode')
confirmed = confirmed.rename(columns={'Value': 'Confirmed'})

# Join both tables on their shared columns and order the result
df = confirmed.merge(deaths)
df = df.sort_values(['Date', 'RegionCode'])

# Output the results
dataframe_output(df, 'MX')
Ejemplo n.º 21
0
# Join the confirmed and deaths tables on the columns they share
df = confirmed.merge(deaths)


def _to_output_date(date):
    # Parse the ISO string; country-level dates are moved one day forward to
    # match the ECDC report, region-level dates are left untouched
    parsed = datetime.fromisoformat(date).date()
    if not is_region:
        parsed = parsed + timedelta(days=1)
    return parsed.isoformat()


# Normalize every date to an ISO formatted string
df['Date'] = df['Date'].apply(_to_output_date)

# Tag every record with the country code
df['CountryCode'] = 'ES'

# Country-level data is embedded as a "Total" pseudo-region in the CSV files
if is_region:
    df = df[df['_RegionLabel'] != 'Total']
else:
    # Country-level rows carry an explicit null region code
    df['RegionCode'] = None
    country_rows = df[df['_RegionLabel'] == 'Total']
    df = country_rows.drop(columns=['_RegionLabel'])

    # Merge the new data with the existing data (prefer new data if duplicates)
    def filter_function(row):
        return row['CountryCode'] == 'ES' and pandas.isna(row['RegionCode'])

    df = merge_previous(df, ['Date', 'CountryCode'], filter_function)

# Output the results
output_name = 'es' if is_region else 'world'
dataframe_output(df, ROOT, output_name)
Ejemplo n.º 22
0
import os
import sys
import datetime
from pathlib import Path

from utils import github_raw_dataframe, dataframe_output

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Read CSV file from covidtracking's GitHub project
df = github_raw_dataframe('COVID19Tracking/covid-tracking-data',
                          'data/states_daily_4pm_et.csv')

# Rename the appropriate columns
column_names = {
    'date': 'Date',
    'state': 'RegionCode',
    'positive': 'Confirmed',
    'death': 'Deaths',
    'total': 'Tested',
}
df = df.rename(columns=column_names)


def _iso_date(date):
    # Dates arrive as YYYYMMDD values
    parsed = datetime.datetime.strptime(str(date), '%Y%m%d')
    return parsed.date().isoformat()


# Convert date to ISO format
df['Date'] = df['Date'].apply(_iso_date)

# Output the results
dataframe_output(df, ROOT, 'US')
Ejemplo n.º 23
0
# ECDC labels Greece as EL; use GR instead
df['GeoId'] = df['GeoId'].apply(lambda code: code if code != 'EL' else 'GR')

# Workaround for https://github.com/open-covid-19/data/issues/13
# ECDC labels the United Kingdom as UK instead of GB
df['GeoId'] = df['GeoId'].apply(lambda code: code if code != 'UK' else 'GB')

# Workaround for https://github.com/open-covid-19/data/issues/12
# ECDC data for Italy is simply wrong, so Italy's data will be parsed from a different source
# ECDC data for Spain is two days delayed because original reporting time mismatch, parse separately
df = df[~df['GeoId'].isin(['IT', 'ES'])]

# Compute the cumsum of values, one country at a time
columns = ['DateRep', 'GeoId', 'Confirmed', 'Deaths']
df_ = pd.DataFrame(columns=columns)
for country in df['GeoId'].unique():
    subset = df[df['GeoId'] == country].copy()
    # Daily figures become running totals
    subset['Confirmed'] = subset['Cases'].cumsum()
    subset['Deaths'] = subset['Deaths'].cumsum()
    df_ = pd.concat([df_, subset[columns]])

# Keep the four computed columns under their output names
df_ = df_[columns]
df_.columns = ['Date', 'CountryCode', 'Confirmed', 'Deaths']
df = df_

# Make sure all data types are appropriately casted (nulls count as zero)
for column in ('Confirmed', 'Deaths'):
    df[column] = df[column].fillna(0).astype(int)

# Output the results
dataframe_output(df, ROOT, 'world', metadata_merge='left')
Ejemplo n.º 24
0
from datetime import datetime
import pandas
from covid_io import read_argv
from utils import dataframe_output


# Read data from GitHub repo
# https://raw.github.com/dssg-pt/covid19pt-data/master/data.csv
df = read_argv()


def _iso_date(date):
    # Dates arrive as day-month-year
    return datetime.strptime(date, '%d-%m-%Y').date().isoformat()


df['Date'] = df['data'].apply(_iso_date)

# Extract regions from the data: the last '_' separated token of every
# 'confirmados_' column, skipping short tokens and the non-region ones
regions = []
for col in df.columns:
    if not col.startswith('confirmados_'):
        continue
    suffix = col.split('_')[-1]
    if len(suffix) > 2 and suffix not in ('novos', 'estrangeiro'):
        regions.append(suffix)

# Aggregate regions into a single data frame
subsets = []
for region in regions:
    confirmed_col = 'confirmados_%s' % region
    deaths_col = 'obitos_%s' % region
    subset = df[['Date', confirmed_col, deaths_col]].copy()
    subset['_RegionLabel'] = region.replace('ars', '')
    subsets.append(
        subset.rename(columns={confirmed_col: 'Confirmed', deaths_col: 'Deaths'}))
df = pandas.concat(subsets)

# Output the results
dataframe_output(df, 'PT')
Ejemplo n.º 25
0
This script loads the latest JSON from covidtracking.com website and extracts
the confirmed cases, deaths and total tests for each state.

Credit to the covidtracking.com team for scraping the data from each state.
'''

import sys
import datetime
from utils import github_raw_dataframe, dataframe_output

# Read CSV file from covidtracking's GitHub project and rename the
# appropriate columns
df = github_raw_dataframe(
    'COVID19Tracking/covid-tracking-data',
    'data/states_daily_4pm_et.csv',
).rename(columns={
    'date': 'Date',
    'state': 'RegionCode',
    'positive': 'Confirmed',
    'death': 'Deaths',
    'total': 'Tested',
})


def _to_iso(value):
    # Dates arrive as YYYYMMDD values
    return datetime.datetime.strptime(str(value), '%Y%m%d').date().isoformat()


# Convert date to ISO format
df['Date'] = df['Date'].apply(_to_iso)

# Output the results
dataframe_output(df, 'US')
Ejemplo n.º 26
0
# Workaround for https://github.com/open-covid-19/data/issues/13
# ECDC labels the United Kingdom as UK instead of GB
df['geoId'] = df['geoId'].apply(lambda code: code if code != 'UK' else 'GB')

# Workaround for https://github.com/open-covid-19/data/issues/12
# ECDC data for Italy is simply wrong, so Italy's data will be parsed from a different source
# ECDC data for Spain is two days delayed because original reporting time mismatch, parse separately
df = df[~df['geoId'].isin(['ES', 'IT'])]


def _report_date_to_iso(date):
    # Report dates arrive as day/month/year
    return datetime.strptime(date, '%d/%m/%Y').date().isoformat()


# Compute the cumsum of values, one country at a time
columns = ['Date', 'CountryCode', 'Confirmed', 'Deaths']
df_ = pd.DataFrame(columns=columns)
for country in df['geoId'].unique():
    subset = df[df['geoId'] == country].copy()
    subset['CountryCode'] = subset['geoId']
    subset['Date'] = subset['dateRep'].apply(_report_date_to_iso)
    # Rows must be in date order before the daily figures are accumulated
    subset = subset.sort_values('Date')
    subset['Confirmed'] = subset['cases'].cumsum()
    subset['Deaths'] = subset['deaths'].cumsum()
    df_ = pd.concat([df_, subset[columns]])
df = df_

# Make sure all data types are appropriately casted (nulls count as zero)
for column in ('Confirmed', 'Deaths'):
    df[column] = df[column].fillna(0).astype(int)

# Output the results
dataframe_output(df)
Ejemplo n.º 27
0
#!/usr/bin/env python

import pandas
import datetime
from covid_io import read_argv
from utils import dataframe_output

# Load the input data via read_argv
# NOTE(review): the previous comment named covidtracking's GitHub project,
# but the column names and the 'CA' output code suggest a Canadian source --
# confirm
data = read_argv()

# Rename the appropriate columns
column_names = {
    'date': 'Date',
    'prname': '_RegionLabel',
    'numconf': 'Confirmed',
    'numdeaths': 'Deaths',
    'numtested': 'Tested',
}
data = data.rename(columns=column_names)


def _iso_date(date):
    # Dates arrive as day-month-year
    parsed = datetime.datetime.strptime(date, '%d-%m-%Y')
    return parsed.date().isoformat()


# Convert date to an ISO formatted string
data['Date'] = data['Date'].apply(_iso_date)

# Output the results
dataframe_output(data, 'CA')
Ejemplo n.º 28
0
#!/usr/bin/env python

from datetime import datetime
from pandas import DataFrame
from covid_io import read_argv
from utils import dataframe_output


# Read data from GitHub repo; both tables get indexed by region code, which
# arrives in the column labelled 'Unnamed: 1'
confirmed, deaths = read_argv()
for table in (confirmed, deaths):
    table.rename(columns={'Unnamed: 1': 'RegionCode'}, inplace=True)
    table.set_index('RegionCode', inplace=True)


def _column_to_iso(col):
    # Column labels are day/month; the current year is appended before
    # parsing. NOTE(review): labels from another year would be misdated.
    labelled = col + '/' + str(datetime.now().year)
    return datetime.strptime(labelled, '%d/%m/%Y').date().isoformat()


# Transform the data from non-tabulated format to record format: one record
# per region and date column (the first remaining column is skipped)
records = [
    {
        'Date': _column_to_iso(col),
        'RegionCode': region_code,
        'Confirmed': confirmed.loc[region_code, col],
        'Deaths': deaths.loc[region_code, col],
    }
    for region_code in confirmed.index.unique()
    for col in confirmed.columns[1:]
]
df = DataFrame.from_records(records)

# Output the results
dataframe_output(df, 'BR')
Ejemplo n.º 29
0
# ECDC labels the United Kingdom as UK instead of GB
df['geoId'] = df['geoId'].apply(lambda code: code if code != 'UK' else 'GB')

# Workaround for https://github.com/open-covid-19/data/issues/12
# ECDC data for Italy is simply wrong, so Italy's data will be parsed from a different source
# ECDC data for Spain is two days delayed because original reporting time mismatch, parse separately
excluded_countries = ['ES', 'IT']
df = df[~df['geoId'].isin(excluded_countries)]

# Compute the cumsum of values, one country at a time
columns = ['Date', 'CountryCode', 'Confirmed', 'Deaths']
df_ = pd.DataFrame(columns=columns)
for country in df['geoId'].unique():
    subset = df[df['geoId'] == country].copy()
    subset['CountryCode'] = subset['geoId']
    # Report dates arrive as day/month/year
    subset['Date'] = subset['dateRep'].apply(
        lambda report_date: datetime.strptime(report_date, '%d/%m/%Y').date().isoformat())
    # Rows must be in date order before the daily figures are accumulated
    subset = subset.sort_values('Date')
    subset['Confirmed'] = subset['cases'].cumsum()
    subset['Deaths'] = subset['deaths'].cumsum()
    df_ = pd.concat([df_, subset[columns]])
df = df_

# Make sure all data types are appropriately casted (nulls count as zero)
for column in ('Confirmed', 'Deaths'):
    df[column] = df[column].fillna(0).astype(int)

# Output the results; country-level rows carry a null region code
df['RegionCode'] = None
dataframe_output(df, ROOT)
from utils import github_raw_dataframe, dataframe_output

# Root path of the project
ROOT = Path(os.path.dirname(__file__)) / '..'

# Read data from GitHub repo: one table of confirmed cases, one of deaths
confirmed = github_raw_dataframe('elhenrico/covid19-Brazil-timeseries',
                                 'confirmed-cases.csv')
deaths = github_raw_dataframe('elhenrico/covid19-Brazil-timeseries',
                              'deaths.csv')

# Index both tables by region code, which arrives in the column labelled
# 'Unnamed: 1'
for df in (confirmed, deaths):
    df.rename(columns={'Unnamed: 1': 'RegionCode'}, inplace=True)
    df.set_index('RegionCode', inplace=True)

# Transform the data from non-tabulated format to record format: one record
# per region and date column (the first remaining column is skipped)
records = []
for region_code in confirmed.index.unique():
    for col in confirmed.columns[1:]:
        # Column labels are day/month; the current year is appended before
        # parsing
        date = col + '/' + str(datetime.now().year)
        date = datetime.strptime(date, '%d/%m/%Y').date().isoformat()
        records.append({
            'Date': date,
            'RegionCode': region_code,
            'Confirmed': confirmed.loc[region_code, col],
            # Deaths must come from the deaths table; reading it from
            # `confirmed` would report the confirmed count as deaths
            'Deaths': deaths.loc[region_code, col]
        })
df = DataFrame.from_records(records)

# Output the results
dataframe_output(df, ROOT, 'BR')