def main(database=MASTER_DATABASE, siteData=SITE_DATA, bundleOutputFileName=DEFAULT_OUTPUT_JSON_FILE_NAME):
    """Bundle the case counts of every eligible country into a JSON file.

    A country record is eligible when it has 'info', 'confirmed' and
    'deaths' entries and a non-empty info['region'].  Each eligible record
    is handed to _applyCountFor once per case type.

    Parameters
    ----------
    database: Path handed to Cryostation
    siteData: Directory where the bundle file is written
    bundleOutputFileName: Name of the bundle file inside siteData

    Returns
    -------
    bundle: dict with 'confirmed' and 'deaths' keys (presumably filled in
        place by _applyCountFor -- its return value is not used here)
    bundleFileName: Full path of the JSON file that was written
    """
    bundle = {
        'confirmed': {},
        'deaths': {},
    }
    casesType = ('confirmed', 'deaths')
    requiredAttributes = ('info',) + casesType

    # The context manager closes the database even if a record makes
    # _applyCountFor raise; the manual open/close pair leaked it.
    with Cryostation(database) as cryostation:
        for element in cryostation.items():
            country = element[1]
            if not all(attribute in country for attribute in requiredAttributes):
                continue
            if not country['info'].get('region', None):
                continue
            for caseType in casesType:
                _applyCountFor(bundle, country, caseType)

    bundleFileName = os.path.join(siteData, bundleOutputFileName)
    with open(bundleFileName, 'w') as outputStream:
        json.dump(bundle, outputStream)

    return bundle, bundleFileName
def _updateUSData():
    """Write today's scraped US state counts into the master database.

    Cases and deaths are fetched with _fetchCurrentUpdatesUS, normalized
    with _homologizeUpdateData, and stored under SCRAPED_TODAY in the
    'confirmed' and 'deaths' series of each state of the 'US' record.
    Locations listed in NIXED_ROWS_INDEX are skipped; locations that cannot
    be resolved are reported and skipped.
    """
    # 'Cases'     -- TSV ref
    # 'confirmed' -- VirusTrack DB key
    print(' updating US...')
    updateUSCases = _fetchCurrentUpdatesUS(columnRef='Cases')
    updateUSCases = _homologizeUpdateData(updateUSCases, US_STATE_NAMES)
    updateUSDeaths = _fetchCurrentUpdatesUS(columnRef='Deaths')
    updateUSDeaths = _homologizeUpdateData(updateUSDeaths, US_STATE_NAMES)

    # Context manager: the database is closed even if the update fails.
    with Cryostation(MASTER_DATABASE) as cryostation:
        country = cryostation['US']

        for location in tqdm.tqdm(sorted(updateUSCases.keys())):
            if location in NIXED_ROWS_INDEX:
                # TODO: Eugene - what do we do about these uncharted locations?
                continue
            try:
                country['provinces'][location]['confirmed'][SCRAPED_TODAY] = \
                    updateUSCases[location][SCRAPED_TODAY]
                country['provinces'][location]['deaths'][SCRAPED_TODAY] = \
                    updateUSDeaths[location][SCRAPED_TODAY]
            except (KeyError, TypeError):
                # Only lookup failures are treated as "unknown location"; a
                # bare except would also hide Ctrl-C and programming errors.
                print(' || Invalid location: %s' % location)
                continue

        cryostation['US'] = country
def makeTestDatabase():
    """Create a small test database from a few records of the master one.

    Copies into TEMP_DATABASE_NAME the 'confirmed' series of the US (plus
    those of California, New York and New Jersey), Italy and the United
    Kingdom, reading them from MASTER_DATABASE.
    """
    sampledStates = ('California', 'New York', 'New Jersey')
    sampledCountries = ('Italy', 'United Kingdom')

    # First pass: the US record together with its sampled states
    with Cryostation(TEMP_DATABASE_NAME) as testStore, \
            Cryostation(MASTER_DATABASE) as masterStore:
        us = masterStore['US']
        sampledProvinces = {
            stateName: {'confirmed': us['provinces'][stateName]['confirmed']}
            for stateName in sampledStates
        }
        testStore['US'] = {
            'confirmed': us['confirmed'],
            'provinces': sampledProvinces,
            'key': 'US',
        }

    # Second pass: the remaining countries, read first and written afterwards
    with Cryostation(TEMP_DATABASE_NAME) as testStore, \
            Cryostation(MASTER_DATABASE) as masterStore:
        records = {
            name: {'confirmed': masterStore[name]['confirmed'], 'key': name}
            for name in sampledCountries
        }
        for name, record in records.items():
            testStore[name] = record
def _updateWorldData():
    """Write today's scraped country counts into the master database.

    Cases and deaths are fetched with _fetchCurrentUpdates, normalized with
    _homologizeUpdateData, and stored under SCRAPED_TODAY in the 'confirmed'
    and 'deaths' series of every country already present in the database.
    Countries missing from the database are reported and skipped.
    """
    # 'Cases'     -- TSV ref
    # 'confirmed' -- VirusTrack DB key
    print(' updating world...')
    updateWorldCases = _fetchCurrentUpdates('Cases')
    updateWorldCases = _homologizeUpdateData(updateWorldCases, COUNTRY_NAMES)
    updateWorldDeaths = _fetchCurrentUpdates('Deaths')
    updateWorldDeaths = _homologizeUpdateData(updateWorldDeaths, COUNTRY_NAMES)

    # Context manager: the database is closed even if an update raises.
    with Cryostation(MASTER_DATABASE) as cryostation:
        for countryName in tqdm.tqdm(sorted(updateWorldCases.keys())):
            if countryName not in cryostation:
                print('## country %s not found in database' % countryName)
                continue

            country = cryostation[countryName]
            try:
                country['confirmed'][SCRAPED_TODAY] = \
                    updateWorldCases[countryName][SCRAPED_TODAY]
                country['deaths'][SCRAPED_TODAY] = \
                    updateWorldDeaths[countryName][SCRAPED_TODAY]
            except KeyError:
                # TODO: Eugene - Define a mechanism to add new countries reporting to the database
                pass
            # Written back even after a KeyError, so a partially updated
            # record is still stored.
            cryostation[countryName] = country
def packGlobal(siteData = SITE_DATA):
    """Dump the 'confirmed' and 'deaths' series of every record to JSON.

    Records without a 'confirmed' entry are skipped.  The bundle is written
    to BUNDLE_GLOBAL_JSON inside siteData.

    Parameters
    ----------
    siteData: Directory where the bundle file is written

    Raises
    ------
    KeyError: if a record has 'confirmed' but no 'deaths' entry
    """
    bundle = {
        'confirmed': dict(),
        'deaths': dict(),
    }

    # Context manager: the database is closed even if a record is malformed.
    with Cryostation(MASTER_DATABASE) as cryostation:
        for key in tqdm.tqdm(cryostation.keys()):
            record = cryostation[key]  # read once instead of three times per key
            if 'confirmed' in record:
                bundle['confirmed'][key] = record['confirmed']
                bundle['deaths'][key] = record['deaths']

    fileName = os.path.join(siteData, BUNDLE_GLOBAL_JSON)
    with open(fileName, 'w') as outputStream:
        json.dump(bundle, outputStream)
def computeGrowth(regionType = 'country', countryName = None, casesType = 'confirmed', windowSize = WINDOW_SIZE, disableProgressBar = True, todayDate = TODAY_DATE, databasePath = MASTER_DATABASE, ):
    """Compute growth factors and store them back in the database.

    Parameters
    ----------
    regionType: 'country' or 'province'
    countryName: Country whose provinces are processed (forwarded to
        Cryostation.timeSeriesFor and to _appendGrowthToProvinces)
    casesType: Series to load, forwarded to Cryostation.timeSeriesFor
    windowSize: Window size forwarded to _computeGrowthFor
    disableProgressBar: Forwarded to the time-series loader and to the
        append helpers
    todayDate: Reference date forwarded to _getGrowthGaugeData
    databasePath: Path handed to Cryostation

    Raises
    ------
    ValueError: if regionType is neither 'country' nor 'province'
    """
    with Cryostation(databasePath) as cryostation:
        print('Loading time series for countries...')
        regions = cryostation.timeSeriesFor(regionType = regionType,
                                            countryName = countryName,
                                            casesType = casesType,
                                            disableProgressBar = disableProgressBar,
                                            )

    growth = _computeGrowthFor(regions, windowSize = windowSize)
    growthGaugeData = _getGrowthGaugeData(growth, todayDate)

    print('Computing/writing growth factors...')
    if regionType == 'country':
        _appendGrowthToCountries(growthGaugeData,
                                 databasePath,
                                 disableProgressBar = disableProgressBar)
    elif regionType == 'province':
        # Honor the caller's choice; this branch used to force the progress
        # bar off regardless of the argument.
        _appendGrowthToProvinces(growthGaugeData,
                                 databasePath,
                                 countryName = countryName,
                                 disableProgressBar = disableProgressBar)
    else:
        raise ValueError(f'regionType = {regionType} not understood')
    print('Done.')
def _appendGrowthToCountries(growthGaugeDataCountries, databasePath, disableProgressBar=True):
    """Store the growth gauge values of each country under its 'growth' key.

    Parameters
    ----------
    growthGaugeDataCountries: Mapping from gauge name to a per-country
        series; the index of the 'yesterday' entry lists the countries
    databasePath: Path handed to Cryostation
    disableProgressBar: Forwarded to tqdm as `disable`
    """
    countryNameAll = list(growthGaugeDataCountries['yesterday'].index)

    # Open the database once for the whole batch rather than once per country.
    with Cryostation(databasePath) as cryostation:
        for countryName in tqdm(countryNameAll, disable=disableProgressBar):
            country = cryostation[countryName]
            country['growth'] = {gauge: growthGaugeDataCountries[gauge][countryName]
                                 for gauge in growthGaugeDataCountries.keys()}
            cryostation[countryName] = country
def packRegions(countryName = 'US', siteData = SITE_DATA):
    """Dump the 'confirmed' and 'deaths' series of a country's regions to JSON.

    Regions without a 'confirmed' entry are skipped.  The bundle is written
    to BUNDLE_US_REGIONS_JSON inside siteData, whatever countryName is.

    Parameters
    ----------
    countryName: Key of the country record in the database
    siteData: Directory where the bundle file is written
    """
    bundle = {
        'confirmed': dict(),
        'deaths': dict(),
    }

    # Context manager: the database is closed even if the record is malformed.
    with Cryostation(MASTER_DATABASE) as cryostation:
        regions = cryostation[countryName]['regions']

    for regionName in regions:
        region = regions[regionName]
        if 'confirmed' in region:
            bundle['confirmed'][regionName] = region['confirmed']
            bundle['deaths'][regionName] = region['deaths']

    fileName = os.path.join(siteData, BUNDLE_US_REGIONS_JSON)
    with open(fileName, 'w') as outputStream:
        json.dump(bundle, outputStream)
def _main(
        siteDataDirectory=SITE_RESOURCES,
        database=MASTER_DATABASE,
        nStateLimit=1000,  # unreachable "infinite" limit
):
    """Store the total hospital beds count of every US state.

    For each entry of the US record's 'provinceCodes' that is also a known
    province, the value returned by _getTotalBedsForPostalCode is saved
    under 'hospitalBedsCount'.

    Parameters
    ----------
    siteDataDirectory: Not used by this function
    database: Path handed to Cryostation
    nStateLimit: Stop after visiting this many postal-code entries

    Returns
    -------
    The updated 'US' record
    """
    print('vuhospitals - getting the total hospital beds count per state')

    with Cryostation(database) as cryostation:
        country = cryostation['US']
        postCodes = country['provinceCodes']

        for visited, state in enumerate(tqdm(postCodes.keys()), start=1):
            if state in country['provinces']:
                bedsCount = _getTotalBedsForPostalCode(postCodes[state]['postalCode'])
                country['provinces'][state]['hospitalBedsCount'] = bedsCount

            # Artificial break for unit tests
            if visited == nStateLimit:
                break

        cryostation['US'] = country

    return country
def test__appendGrowthToCountries():
    """_appendGrowthToCountries stores a 'growth' dict on every test country."""
    try:
        # presumably also creates the test database -- verify against
        # test__getGrowthGaugeData
        growthGaugeData = test__getGrowthGaugeData()
        _appendGrowthToCountries(growthGaugeData, TEMP_DATABASE_NAME)
        with Cryostation(TEMP_DATABASE_NAME) as cryostationTest:
            assert isinstance(cryostationTest['US']['growth'], dict)
            assert isinstance(cryostationTest['Italy']['growth'], dict)
            assert isinstance(cryostationTest['United Kingdom']['growth'], dict)
    finally:
        # Remove the temporary database even when an assertion fails, so a
        # failed run cannot leak state into later tests.
        if os.path.exists(TEMP_DATABASE_NAME):
            os.remove(TEMP_DATABASE_NAME)
def _appendGrowthToProvinces(growthGaugeDataProvinces, databasePath, countryName = 'US', disableProgressBar=True):
    """Store the growth gauge values of each province under its 'growth' key.

    Parameters
    ----------
    growthGaugeDataProvinces: Mapping from gauge name to a per-province
        series; the index of the 'yesterday' entry lists the provinces
    databasePath: Path handed to Cryostation
    countryName: Key of the country record that owns the provinces
    disableProgressBar: Forwarded to tqdm as `disable`

    Raises
    ------
    KeyError: if a province is not present in the country record
    """
    provinceNameAll = list(growthGaugeDataProvinces['yesterday'].index)

    # Read the country once, update every province in memory and write it
    # back once, instead of reopening the database for every province.
    with Cryostation(databasePath) as cryostation:
        country = cryostation[countryName]
        # provinceNameAll is a plain list: iterate it directly (it has no
        # `.columns` attribute, which made the old loop raise AttributeError).
        for stateName in tqdm(provinceNameAll, disable=disableProgressBar):
            country['provinces'][stateName]['growth'] = {
                gauge: growthGaugeDataProvinces[gauge][stateName]
                for gauge in growthGaugeDataProvinces.keys()
            }
        cryostation[countryName] = country
def _bundleHospitalBeds(countryName = 'US'):
    """Collect the 'hospitalBedsCount' of every province that has one.

    Parameters
    ----------
    countryName: Key of the country record in MASTER_DATABASE

    Returns
    -------
    dict mapping province name to its 'hospitalBedsCount' value
    """
    with Cryostation(MASTER_DATABASE) as cryostation:
        # Fetch the record once; it used to be re-read from the database up
        # to three times per province.
        provinces = cryostation[countryName]['provinces']
        return {
            state: provinces[state]['hospitalBedsCount']
            for state in tqdm.tqdm(provinces.keys())
            if 'hospitalBedsCount' in provinces[state]
        }
def updateDatabaseWith(dataset):
    """Store today's county counts from `dataset` in the US record.

    For every county already present in the database,
    dataset[state][county]['confirmed'] and ['deaths'] are converted to
    float and saved under SCRAPED_TODAY.  Counties that are missing from
    `dataset`, or whose values cannot be converted, are skipped.

    Parameters
    ----------
    dataset: Nested mapping state -> county -> {'confirmed', 'deaths'}
    """
    # A single session covers both the read and the write-back.
    with Cryostation(MASTER_DATABASE) as cryostation:
        country = cryostation['US']

        for state in country['provinces'].keys():
            if 'counties' not in country['provinces'][state]:
                continue
            counties = country['provinces'][state]['counties']

            for county in counties.keys():
                try:
                    counties[county]['confirmed'][SCRAPED_TODAY] = \
                        float(dataset[state][county]['confirmed'])
                    counties[county]['deaths'][SCRAPED_TODAY] = \
                        float(dataset[state][county]['deaths'])
                except (KeyError, TypeError, ValueError):
                    # Best effort: missing or non-numeric data is skipped.  A
                    # bare except would also swallow KeyboardInterrupt.
                    continue

        cryostation['US'] = country
def test_computeGrowth():
    """computeGrowth stores a 'growth' dict on every country of the test database."""
    makeTestDatabase()
    try:
        computeGrowth(regionType='country',
                      casesType='confirmed',
                      todayDate=TEST_TODAY_DATE,
                      databasePath=TEMP_DATABASE_NAME)
        with Cryostation(TEMP_DATABASE_NAME) as cryostationTest:
            assert isinstance(cryostationTest['US']['growth'], dict)
            assert isinstance(cryostationTest['Italy']['growth'], dict)
            assert isinstance(cryostationTest['United Kingdom']['growth'], dict)
    finally:
        # Remove the temporary database even when the test fails, so a failed
        # run cannot leak state into later tests.
        if os.path.exists(TEMP_DATABASE_NAME):
            os.remove(TEMP_DATABASE_NAME)
def packCountry(countryName = 'US', siteData = SITE_DATA):
    """Dump a country's per-state series, counties and hospital beds to JSON.

    The bundle holds the 'confirmed' and 'deaths' series of every state that
    has a 'confirmed' entry, the 'counties' of every state that has them,
    and the result of _bundleHospitalBeds.  It is written to BUNDLE_US_JSON
    inside siteData, whatever countryName is.

    Parameters
    ----------
    countryName: Key of the country record in the database
    siteData: Directory where the bundle file is written
    """
    print(' processing states')
    bundle = {
        'confirmed': dict(),
        'deaths': dict(),
        'allCounties': dict(),
    }

    # Context manager: the database is closed even if the record is malformed.
    with Cryostation(MASTER_DATABASE) as cryostation:
        provinces = cryostation[countryName]['provinces']

    for state in provinces:
        province = provinces[state]
        if 'confirmed' in province:
            bundle['confirmed'][state] = province['confirmed']
            bundle['deaths'][state] = province['deaths']
        if 'counties' in province:
            bundle['allCounties'][state] = province['counties']

    print(' processing hospital beds counts by state')
    # Called after the database is closed: the helper opens it on its own.
    bundle['hospitalBeds'] = _bundleHospitalBeds(countryName)

    fileName = os.path.join(siteData, BUNDLE_US_JSON)
    with open(fileName, 'w') as outputStream:
        json.dump(bundle, outputStream)
def test_plotDataAndPredictionsWithCI():
    """Smoke test: plotting stored predictions against real data must not raise."""
    predictionsDirectory = join(TEST_SITE_DATA, 'test-predictions')
    meanPredictionTSAll, percentilesTSAll = loadAll(siteData=predictionsDirectory)

    with Cryostation(REAL_DATABASE_FILE_NAME) as cs:
        confirmedCasesAll = cs.timeSeriesFor()  # take defaults

    countriesToPlot = ['Albania', 'Algeria']
    # The return value is not inspected; only the absence of an exception matters
    plotDataAndPredictionsWithCI(
        meanPredictionTSAll,
        confirmedCasesAll,
        percentilesTSAll,
        countriesToPlot,
    )
def _updateUSRegionsData(target):
    """Aggregate today's per-state values of `target` into the US regions.

    Every state (except those in NIXED_ROWS_INDEX) contributes its
    SCRAPED_TODAY value to the region given by US_REGIONS.  When a state has
    no usable value for today, its second-to-last known date is used
    instead.  The regional sums are stored under SCRAPED_TODAY in
    country['regions'][region][target].

    Parameters
    ----------
    target: Series key inside each province/region, e.g. 'confirmed'
    """
    # 'confirmed' -- VirusTrack DB key
    print(' updating US regions...')
    updateUSRegions = dict()

    # Context manager: the database is closed even if the update fails.
    with Cryostation(MASTER_DATABASE) as cryostation:
        country = cryostation['US']
        allTime = list(country['provinces'][TOTAL_US_NAME][target].keys())

        for location in tqdm.tqdm(country['provinces']):
            if location in NIXED_ROWS_INDEX:
                continue
            try:
                region = US_REGIONS[location]
                if region not in updateUSRegions:
                    updateUSRegions[region] = {SCRAPED_TODAY: 0.0}

                series = country['provinces'][location][target]
                try:
                    contribution = float(series[SCRAPED_TODAY])
                except (KeyError, TypeError, ValueError):
                    # No usable value for today: fall back to the previous
                    # date.  assumes allTime is in chronological order and
                    # already ends with today -- TODO confirm.
                    # The lookup must go through the target series; indexing
                    # the province record by date always raised KeyError.
                    contribution = float(series[allTime[len(allTime) - 2]])

                # Always accumulate: assigning the fallback used to discard
                # the sum of the states processed before this one.
                updateUSRegions[region][SCRAPED_TODAY] += contribution
            except (KeyError, TypeError, ValueError):
                print(' >> Invalid location: %s' % location)
                continue

        for region in sorted(updateUSRegions.keys()):
            country['regions'][region][target][SCRAPED_TODAY] = \
                updateUSRegions[region][SCRAPED_TODAY]

        cryostation['US'] = country
def test_predictCountries():
    """predictRegions writes valid JSON for a country, a US state and 'all'."""
    nLimitRegions = 2
    # try/finally is enough: failures propagate unchanged and the generated
    # JSON files are purged either way (the old `except ... raise e` was a no-op).
    try:
        predictRegions('US',
                       nDaysPredict=10,
                       siteData=TEST_SITE_DATA,
                       logGrowthModel=logGrowthModel,
                       nSamples=TEST_N_SAMPLES,
                       nChains=TEST_N_CHAINS,
                       databasePath=TEST_DATABASE_PATH,
                       )
        _assertValidJSON(join(TEST_SITE_DATA, 'prediction-world-mean-US.json'))
        _assertValidJSON(join(TEST_SITE_DATA, 'prediction-world-conf-int-US.json'))

        predictRegions('Alabama',
                       regionType='stateUS',
                       nDaysPredict=10,
                       siteData=TEST_SITE_DATA,
                       logGrowthModel=logGrowthModel,
                       nSamples=TEST_N_SAMPLES,
                       nChains=TEST_N_CHAINS,
                       databasePath=TEST_DATABASE_PATH,
                       )
        # NOTE(review): these re-check the files written by the 'US' country
        # run above, not the output of the Alabama run -- confirm the
        # expected state-level file names and assert on those instead.
        _assertValidJSON(join(TEST_SITE_DATA, 'prediction-world-mean-US.json'))
        _assertValidJSON(join(TEST_SITE_DATA, 'prediction-world-conf-int-US.json'))

        with Cryostation(TEST_DATABASE_PATH) as cs:
            countries = cs.allCountryNames()

        predictRegions('all',
                       regionType='country',
                       nDaysPredict=10,
                       siteData=TEST_SITE_DATA,
                       logGrowthModel=logGrowthModel,
                       nSamples=TEST_N_SAMPLES,
                       nChains=TEST_N_CHAINS,
                       nLimitRegions=nLimitRegions,
                       databasePath=TEST_DATABASE_PATH,
                       )
        # Slicing also copes with a database holding fewer countries than the limit
        for countryName in countries[:nLimitRegions]:
            _assertValidJSON(join(TEST_SITE_DATA,
                                  f'prediction-world-mean-{countryName}.json'))
    finally:
        _purge(TEST_SITE_DATA, '.json')
def test__computeGrowthFor():
    """_computeGrowthFor returns a DataFrame whose last value matches a manual check.

    Returns the computed growth so that other tests can reuse it.
    """
    makeTestDatabase()
    with Cryostation(TEMP_DATABASE_NAME) as cryostation:
        print('Loading time series for countries...')
        regions = cryostation.timeSeriesFor(regionType='country', casesType='confirmed')

    growth = _computeGrowthFor(regions, WINDOW_SIZE)
    assert isinstance(growth, DataFrame)

    firstRegion = regions.iloc[:, 0]

    def _windowMean(shift):
        # Mean of the WINDOW_SIZE-long window that ends `shift` rows before the end
        stop = -shift if shift else None
        return firstRegion.iloc[-WINDOW_SIZE - shift:stop].mean()

    latest, previous, earliest = (_windowMean(shift) for shift in range(3))
    growthFactorExpectedFinal = (latest - previous) / (previous - earliest)

    assert abs(growth.iloc[-1, 0] - growthFactorExpectedFinal) < 1e-4
    return growth
def predictLogisticGrowth(logGrowthModel: StanModel,
                          regionName,
                          target = 'confirmed',
                          regionType = 'country',
                          nSamples = N_SAMPLES,
                          nChains = N_CHAINS,
                          nDaysPredict = N_DAYS_PREDICT,
                          minCasesFilter = MIN_CASES_FILTER,
                          minNumberDaysWithCases = MIN_NUMBER_DAYS_WITH_CASES,
                          predictionsPercentiles = PREDICTIONS_PERCENTILES,
                          randomSeed = 2020,
                          databasePath = DATABASE_PATH,
                          maxTreeDepth = MAX_TREEDEPTH,
                          ):
    """Fit the logistic growth model to one region and forecast it

    Parameters
    ----------
    logGrowthModel: A compiled pystan model
    regionName: Name of the region to train, which must be a country or US state in Cryostation
    target: 'confirmed' or 'deaths'
    regionType: 'country' or 'stateUS'
    nSamples: Number of samples per chain of MCMC
    nChains: Number of independent chains MCMC
    nDaysPredict: Number of days ahead to predict
    minCasesFilter: Minimum number of cases for prediction
    minNumberDaysWithCases: Minimum number of days with at least minCasesFilter
    predictionsPercentiles: Bayesian confidence intervals to evaluate
    randomSeed: Seed for stan sampler
    databasePath: Path to virustrack.db
    maxTreeDepth: max_treedepth for pystan

    Returns
    -------
    None when the region has no `target` series, never exceeds
    minCasesFilter, or has fewer than minNumberDaysWithCases days from the
    first day above the filter.  Otherwise a dict with the keys:

    regionTS: All data for the queried region
    predictionsMeanTS: Posterior mean prediction
    predictionsPercentilesTS: Posterior percentiles
    trace: Posterior samples, as returned by fit.to_dataframe()
    regionTSClean: Data used for training
    regionName: The queried region name
    t: Day offsets of regionTSClean relative to its first day

    Raises
    ------
    NotImplementedError: if regionType is neither 'country' nor 'stateUS'
    """
    with Cryostation(databasePath) as storage:
        # Read the record once; it is needed for both the membership test
        # and the series itself.
        if regionType == 'country':
            record = storage[regionName]
        elif regionType == 'stateUS':
            record = storage['US']['provinces'][regionName]
        else:
            raise NotImplementedError(f'regionType = {regionType} not supported')

        if target not in record.keys():
            return None
        regionTS = pd.Series(record[target])

    regionTS.index = pd.to_datetime(regionTS.index)
    regionTS.sort_index(inplace=True)

    # argmax() of an all-False mask is 0, which would silently select the
    # whole series; a region that never exceeds the filter has no data to fit.
    aboveFilter = regionTS > minCasesFilter
    if not aboveFilter.any():
        return None
    regionTSClean = regionTS.iloc[aboveFilter.argmax():]
    if regionTSClean.shape[0] < minNumberDaysWithCases:
        return None

    # Days elapsed since the first training day (the first diff is NaT -> 0)
    t = regionTSClean.index.to_series().diff().map(lambda d: d.days).fillna(0).cumsum().values
    # +1 keeps the logarithm finite for zero counts
    regionTSCleanLog = np.log(regionTSClean.values + 1)

    logisticGrowthData = {'nDays': regionTSClean.shape[0],
                          't': list(t),
                          'casesLog': list(regionTSCleanLog),
                          }
    fit = logGrowthModel.sampling(data=logisticGrowthData,
                                  iter=nSamples,
                                  chains=nChains,
                                  seed=randomSeed,
                                  control={'max_treedepth': maxTreeDepth},
                                  )
    trace = fit.to_dataframe()

    predictionsMean, predictionsPercentilesTS = _getPredictionsFromPosteriorSamples(t,
                                                                                    trace,
                                                                                    nDaysPredict,
                                                                                    predictionsPercentiles,
                                                                                    )
    predictionsMeanTS, predictionsPercentilesTS = _castPredictionsAsTS(regionTSClean,
                                                                       nDaysPredict,
                                                                       predictionsMean,
                                                                       predictionsPercentilesTS,
                                                                       )

    prediction = {
        'regionTS': regionTS,
        'predictionsMeanTS': predictionsMeanTS,
        'predictionsPercentilesTS': predictionsPercentilesTS,
        'trace': trace,
        'regionTSClean': regionTSClean,
        'regionName': regionName,
        't': t,
    }
    return prediction
def predictRegions(regionName,
                   regionType='country',
                   target='confirmed',
                   predictionsPercentiles=PREDICTIONS_PERCENTILES,
                   siteData=SITE_DATA,
                   priorLogCarryingCapacity=PRIOR_LOG_CARRYING_CAPACITY,
                   priorMidPoint=PRIOR_MID_POINT,
                   priorGrowthRate=PRIOR_GROWTH_RATE,
                   priorSigma=PRIOR_SIGMA,
                   logGrowthModel=None,
                   databasePath=DATABASE_PATH,
                   nLimitRegions=None,
                   **kwargs
                   ):
    """Generate forecasts for regions

    Parameters
    ----------
    regionName: A country key of Cryostation, or 'all'
    target: 'confirmed' or 'deaths'
    predictionsPercentiles: The posterior percentiles to compute
    siteData: The directory for output data
    regionType: 'country' or 'stateUS'
    priorLogCarryingCapacity
    priorMidPoint
    priorGrowthRate
    priorSigma
    logGrowthModel: A compiled pystan model
    databasePath: Path to virustrack.db
    nLimitRegions: Maximum number of regions to train, in the order returned
        by Cryostation; only used when regionName is 'all'
    kwargs: Optional named arguments for covidvu.predictLogisticGrowth

    Returns
    -------
    JSON dump of mean prediction and confidence intervals

    Raises
    ------
    ValueError: if regionName is 'all' and regionType is not understood
    """
    if logGrowthModel is None:
        print('Building model. This may take a few moments...')
        logGrowthModel = buildLogisticModel(priorLogCarryingCapacity=priorLogCarryingCapacity,
                                            priorMidPoint=priorMidPoint,
                                            priorGrowthRate=priorGrowthRate,
                                            priorSigma=priorSigma,
                                            )
        print('Done.')
    else:
        assert isinstance(logGrowthModel, StanModel)

    # Output file names per region type: (mean, confidence interval)
    outputFilenames = {
        'country': (PREDICTION_MEAN_JSON_FILENAME_WORLD, PREDICTION_CI_JSON_FILENAME_WORLD),
        'stateUS': (PREDICTION_MEAN_JSON_FILENAME_US, PREDICTION_CI_JSON_FILENAME_US),
    }

    if regionName == 'all':
        if regionType not in outputFilenames:
            raise ValueError(f'regionType = {regionType} not understood')
        with Cryostation(databasePath) as cs:
            if regionType == 'country':
                regionNames = cs.allCountryNames()
            else:
                regionNames = cs.allProvincesOf('US')
        limit = nLimitRegions
    else:
        regionNames = [regionName]
        limit = None  # the limit only applies to 'all'

    for i, name in enumerate(regionNames):
        # Same cut-off for both region types: train at most `limit` regions
        # (the stateUS loop used to train one more than requested).
        if limit and i >= limit:
            break
        print(f'Training {name}')
        prediction = predictLogisticGrowth(logGrowthModel,
                                           name,
                                           regionType=regionType,
                                           predictionsPercentiles=predictionsPercentiles,
                                           target=target,
                                           # Train on the database the caller asked
                                           # for, not on the callee's default path.
                                           databasePath=databasePath,
                                           **kwargs
                                           )
        if regionType in outputFilenames:
            meanFilename, confIntFilename = outputFilenames[regionType]
            _dumpRegionPrediction(prediction,
                                  siteData,
                                  predictionsPercentiles,
                                  meanFilename=meanFilename,
                                  confIntFilename=confIntFilename,
                                  )
    print('Done.')