orient='records') def dataframe_add_key(data: pandas.DataFrame): ''' Adds the Key column to all rows of the dataframe ''' data['Key'] = data.apply(compute_record_key, axis=1) return data.set_index('Key') # Root path of the project ROOT = Path(os.path.dirname(__file__)) / '..' # Read the full data file data = read_csv(ROOT / 'output' / 'data.csv') for col in data.columns: data[col] = series_converter(data[col]) # Backwards compatibility: China dataset and Region -> RegionName china = data[data['CountryCode'] == 'CN'] china = china[~china['RegionCode'].isna()] dataframe_split(china, ('RegionCode', ), ROOT, 'cn') china['Region'] = china['RegionName'] dataframe_split(china, ('RegionCode', ), ROOT, 'china') # Backwards compatibility: Usa dataset and RegionName -> RegionCode usa = data[data['CountryCode'] == 'US'] usa = usa[~usa['RegionCode'].isna()] dataframe_split(usa, ('RegionCode', ), ROOT, 'us') usa['Region'] = usa['RegionCode'] dataframe_split(usa, ('RegionCode', ), ROOT, 'usa')
# Estimate daily counts per category assuming ratio is constant
df['NewCases'] = df['Confirmed'].diff().astype('Int64')
df['NewDeaths'] = df['Deaths'].diff().astype('Int64')
df['NewMild'] = df['NewCases'] * mild_ratio
df['NewSevere'] = df['NewCases'] * severe_ratio
df['NewCritical'] = df['NewCases'] * critical_ratio
df = df[['NewCases', 'NewDeaths', 'NewMild', 'NewSevere', 'NewCritical']]

# Compute the rolling windows for count of active (current) categories;
# the window length is the (rounded) number of days a case stays in each
# category before recovery
df['CurrentlyMild'] = df['NewMild'].rolling(round(mild_recovery_days)).sum()
df['CurrentlySevere'] = df['NewSevere'].rolling(round(severe_recovery_days)).sum()
df['CurrentlyCritical'] = df['NewCritical'].rolling(round(critical_recovery_days)).sum()

# Get rid of the first rows, which are useless because the rolling windows
# above have not yet seen a full window of data.
# round() keeps the slice start an int: the recovery-day values are rounded
# for .rolling above too, so they may be floats, and .iloc requires integers.
df = df.iloc[round(max(mild_recovery_days, severe_recovery_days, critical_recovery_days)):]

# Make sure all columns have the appropriate type
for col in df.columns:
    df[col] = series_converter(df[col])

# Output resulting dataframe; emit the CSV header only on the first call,
# so consecutive outputs concatenate into a single valid CSV stream
if print_header_flag:
    df.to_csv(sys.stdout)
    print_header_flag = False
else:
    df.to_csv(sys.stdout, header=False)
# Early exit: If there are less than DATAPOINT_COUNT output datapoints if len(subset) < DATAPOINT_COUNT - PREDICT_WINDOW: continue # Perform forecast forecast_data = compute_forecast(subset['Confirmed'], PREDICT_WINDOW) # Capture only the last DATAPOINT_COUNT days forecast_data = forecast_data.sort_index().iloc[-DATAPOINT_COUNT:] # Fill out the corresponding index in the output forecast for idx in forecast_data.index: df_forecast.loc[(idx, key), 'ForecastDate'] = forecast_date df_forecast.loc[(idx, key), 'Estimated'] = '%.03f' % forecast_data.loc[idx] if idx in subset.index: df_forecast.loc[(idx, key), 'Confirmed'] = int(subset.loc[idx, 'Confirmed']) # Do data cleanup here data = df_forecast.reset_index() forecast_columns = ['ForecastDate', 'Date', 'Key', 'Estimated', 'Confirmed'] data = data.sort_values(['Key', 'Date'])[forecast_columns] # Make sure the core columns have the right data type for col in data.columns: data[col] = series_converter(data[col]) # Output resulting dataframe data.to_csv(sys.stdout, index=False)