def _get_partial_trajectory(self, epiweek, valid=True):
    """Return the wILI trajectory from week 30 of the season through `epiweek`.

    When `valid` is True, stable values within the most recent 5 weeks are
    excluded, so recent weeks must be covered by unstable (as-of-issue)
    values; a gap then raises. When False, stable values may fill any week.

    Raises Exception if any week in the range has no wILI value.
    """
    y, w = EW.split_epiweek(epiweek)
    if w < 30:
        y -= 1
    ew1 = EW.join_epiweek(y, 30)  # season start (week 30)
    ew2 = epiweek
    limit = EW.add_epiweeks(ew2, -5)  # cutoff for stable data when `valid`
    weeks = Epidata.range(ew1, ew2)
    stable = Epidata.check(Epidata.fluview(self.region, weeks))
    try:
        unstable = Epidata.check(Epidata.fluview(self.region, weeks, issues=ew2))
    except Exception:
        # best-effort: as-of-issue (preliminary) data may not exist;
        # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit escape
        unstable = []
    wili = {}
    for row in stable:
        ew, value = row['epiweek'], row['wili']
        if not valid or ew < limit:
            wili[ew] = value
    # unstable values take precedence over stable ones
    for row in unstable:
        ew, value = row['epiweek'], row['wili']
        wili[ew] = value
    curve = []
    for ew in EW.range_epiweeks(ew1, ew2, inclusive=True):
        if ew not in wili:
            t = 'unstable' if valid else 'any'
            raise Exception('wILI (%s) not available for week %d' % (t, ew))
        curve.append(wili[ew])
    # defensive double-check of the expected trajectory length
    n1 = EW.delta_epiweeks(ew1, ew2) + 1
    n2 = len(curve)
    if n1 != n2:
        raise Exception('missing data (expected %d, found %d)' % (n1, n2))
    return curve
def __init__(self, region):
    """Load and cache complete historical wILI season curves for `region`.

    Seasons are keyed by the year containing week 30; only seasons with all
    52 weekly values are kept. The tail of the 2009 season replaces the tail
    of 2008, and 2009 is then discarded (presumably a splice around the 2009
    H1N1 pandemic — confirm against project docs).
    """
    self.region = region
    weeks = Epidata.range(200330, 202330)
    rows = Epidata.check(Epidata.fluview(self.region, weeks))
    self.seasons = {}
    for row in rows:
        ew, wili = row['epiweek'], row['wili']
        y, w = EW.split_epiweek(ew)
        if w < 30:
            y -= 1
        # index of this week within its season, where week 30 is index 0
        i = EW.delta_epiweeks(EW.join_epiweek(y, 30), ew)
        season = self.seasons.setdefault(y, {})
        if 0 <= i < 52:
            season[i] = wili
    # drop seasons with missing weeks
    for year in sorted(self.seasons.keys()):
        if len(self.seasons[year]) != 52:
            del self.seasons[year]
    # splice the end of 2009 onto 2008, then drop 2009
    if 2008 in self.seasons and 2009 in self.seasons:
        for i in range(40, 52):
            self.seasons[2008][i] = self.seasons[2009][i]
        del self.seasons[2009]
    self.years = sorted(self.seasons.keys())
    self.curves = {y: [self.seasons[y][i] for i in range(52)] for y in self.years}
def _forecast(self, ageGroup, epiweek):
    """Sample forecast curves for `ageGroup` as of `epiweek`.

    Returns a list of full-season curves, each the concatenation of observed
    FluSurv values (through `epiweek`) and one user-submitted future segment.
    """
    # season setup and sanity check
    ew1 = flu.join_epiweek(self.test_season, 40)
    ew2 = flu.join_epiweek(self.test_season + 1, 17)
    print("test season:", self.test_season, "ew1:", ew1, "epiweek:", epiweek)
    if not ew1 <= epiweek <= ew2:
        raise Exception('`epiweek` outside of `test_season`')
    # get past values (left half) from the Epidata API
    response = Epidata.flusurv('network_all', Epidata.range(ew1, epiweek), issues=epiweek)
    epidata = Forecaster.Utils.decode(response)
    pinned = [row[ageGroup] for row in epidata]
    if len(pinned) != flu.delta_epiweeks(ew1, epiweek) + 1:
        # NOTE(review): message says "ILINet" but the data above come from
        # FluSurv — confirm whether the message should be updated
        raise Exception('missing ILINet data')
    # get the user submissions (right half) from the database
    print("ageGroup", ageGroup, "epiweek", epiweek)
    submissions = self.fetch_submissions(ageGroup, epiweek)
    self._num_users = len(submissions)
    if self.verbose:
        print(' [EC] %d users found for %s on %d' % (len(submissions), ageGroup, epiweek))
    # concatenate observed data and user submissions
    return [pinned + sub for sub in submissions]
def get_training_set(location, epiweek, signal, valid):
    """Pair `signal` values with (w)ILI to build a regression training set.

    Unstable (as-of-`ew2`) wILI is preferred; stable values fill in older
    weeks. When `valid` is True, a missing unstable value within the most
    recent 5 weeks raises. Weeks with no (w)ILI at all are dropped with a
    warning, and the target week `ew3` itself is always excluded.
    """
    ew1, ew2, ew3, weeks0, weeks1 = get_weeks(epiweek)
    auth = secrets.api.fluview
    try:
        result = Epidata.fluview(location, weeks0, issues=ew2, auth=auth)
        rows = Epidata.check(result)
        unstable = extract(rows, ['wili'])
    except Exception:
        # best-effort: as-of-issue data may be unavailable; narrowed from a
        # bare `except:` so KeyboardInterrupt/SystemExit still escape
        unstable = {}
    rows = Epidata.check(Epidata.fluview(location, weeks0, auth=auth))
    stable = extract(rows, ['wili'])
    data = {}
    num_dropped = 0
    for ew in signal.keys():
        if ew == ew3:
            continue
        sig = signal[ew]
        if ew not in unstable:
            # recent weeks must have as-of-issue data when `valid`
            if valid and flu.delta_epiweeks(ew, ew3) <= 5:
                raise Exception('unstable wILI is not available on %d' % ew)
            if ew not in stable:
                num_dropped += 1
                continue
            wili = stable[ew]
        else:
            wili = unstable[ew]
        data[ew] = {'x': sig, 'y': wili}
    if num_dropped:
        msg = 'warning: dropped %d/%d signal weeks because (w)ILI was unavailable'
        print(msg % (num_dropped, len(signal)))
    return get_training_set_data(data)
def get_lag_and_ili(issue, epiweek, num_ili, num_patients):
    """
    Compute and return reporting lag and percent ILI from imputed ILINet data.
    """
    lag = delta_epiweeks(epiweek, issue)
    if num_patients == 0:
        # no patients reported: define percent ILI as zero
        ili = 0.0
    else:
        ili = 100.0 * (num_ili / num_patients)
    return lag, ili
def test_find_csv_files(self):
    """Recursively explore and find CSV files.

    Feeds a fixed list of paths through a mocked glob and checks that valid
    daily/weekly covidcast filenames are parsed into details tuples, invalid
    CSV names map to None, and non-CSV files are ignored entirely.
    """
    path_prefix = 'prefix/to/the/data/'
    glob_paths = [
        # valid weekly
        path_prefix + 'fb_survey/weekly_202015_county_cli.csv',
        # valid daily
        path_prefix + 'ght/20200408_state_rawsearch.csv',
        # valid national
        path_prefix + 'valid/20200408_nation_sig.csv',
        # valid hhs
        path_prefix + 'valid/20200408_hhs_sig.csv',
        # invalid
        path_prefix + 'invalid/hello_world.csv',
        # invalid day
        path_prefix + 'invalid/22222222_b_c.csv',
        # invalid week
        path_prefix + 'invalid/weekly_222222_b_c.csv',
        # invalid geography
        path_prefix + 'invalid/20200418_province_c.csv',
        # ignored (not a CSV, so expected to be absent from the results)
        path_prefix + 'ignored/README.md',
    ]
    mock_glob = MagicMock()
    mock_glob.glob.return_value = glob_paths
    found = set(CsvImporter.find_csv_files(path_prefix, glob=mock_glob))
    # issue/lag for valid files are derived from "today" at call time
    expected_issue_day = int(date.today().strftime("%Y%m%d"))
    expected_issue_week = int(str(epi.Week.fromdate(date.today())))
    time_value_day = 20200408
    expected = set([
        (glob_paths[0], ('fb_survey', 'cli', 'week', 'county', 202015, expected_issue_week, delta_epiweeks(202015, expected_issue_week))),
        (glob_paths[1], ('ght', 'rawsearch', 'day', 'state', time_value_day, expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days)),
        (glob_paths[2], ('valid', 'sig', 'day', 'nation', time_value_day, expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days)),
        (glob_paths[3], ('valid', 'sig', 'day', 'hhs', time_value_day, expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days)),
        (glob_paths[4], None),
        (glob_paths[5], None),
        (glob_paths[6], None),
        (glob_paths[7], None),
    ])
    self.assertEqual(found, expected)
def update_from_file_clinical(issue, date, filename, test_mode=False):
    """
    Read WHO/NREVSS data from a zipped CSV and insert into (or update) the
    database.

    `issue`: epiweek under which the data was issued
    `date`: release date stored alongside each row
    `test_mode`: when True, changes are not committed
    """
    # database connection
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    rows1 = get_rows(cnx, CL_TABLE)
    print('rows before: %d' % (rows1))
    # single cursor for all inserts (previously a second cursor was created
    # and the first one leaked)
    insert = cnx.cursor()
    # load the data, ignoring empty rows
    print('loading data from %s as issued on %d' % (filename, issue))
    rows = load_zipped_csv(filename, CL_SHEET)
    print(' loaded %d rows' % len(rows))
    data = [get_clinical_data(row) for row in rows]
    entries = [obj for obj in data if obj]
    print(' found %d entries' % len(entries))
    sql = '''
    INSERT INTO `fluview_clinical` (`release_date`, `issue`, `epiweek`,
    `region`, `lag`, `total_specimens`, `total_a`, `total_b`,
    `percent_positive`, `percent_a`, `percent_b`)
    VALUES
    (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
    `release_date` = least(`release_date`, %s),
    `total_specimens` = %s,
    `total_a` = %s,
    `total_b` = %s,
    `percent_positive` = %s,
    `percent_a` = %s,
    `percent_b` = %s
    '''
    # insert each row
    for row in entries:
        lag = delta_epiweeks(row['epiweek'], issue)
        args = [
            row['total_specimens'], row['total_a'], row['total_b'],
            row['percent_positive'], row['percent_a'], row['percent_b']
        ]
        ins_args = [date, issue, row['epiweek'], row['location'], lag] + args
        upd_args = [date] + args
        insert.execute(sql, ins_args + upd_args)
    # cleanup
    insert.close()
    if test_mode:
        print('test mode, not committing')
        rows2 = rows1
    else:
        cnx.commit()
        # count the same table as `rows1` (previously the table arg was
        # omitted, so a different/default table was counted)
        rows2 = get_rows(cnx, CL_TABLE)
    print('rows after: %d (added %d)' % (rows2, rows2 - rows1))
    cnx.close()
def fetch_submissions(self, region, epiweek_now):
    """Fetch user-submitted future wILI curves for `region` as of `epiweek_now`.

    Returns a list of curves (one per user) covering the weeks after
    `epiweek_now` through week 20 of the next year; users with an unexpected
    number of submitted weeks are skipped with a warning.
    """
    final_week = flu.join_epiweek(self.test_season + 1, 20)
    self.cur = self.cnx.cursor()
    self.cur.execute(
        """
    SELECT u.`id` `user_id`, f.`epiweek`, f.`wili` FROM (
      SELECT u.* FROM `ec_fluv_users_mturk_2019` u JOIN `ec_fluv_defaults` d
      ON TRUE LEFT JOIN `ec_fluv_user_preferences_mturk` p ON p.`user_id` =
      u.`id` AND p.`name` = d.`name` WHERE d.`name` = '_debug' AND
      coalesce(p.`value`, d.`value`) = '0'
    ) u JOIN `ec_fluv_submissions_mturk` s ON s.`user_id` = u.`id` JOIN
    `ec_fluv_forecast_mturk` f ON f.`user_id` = u.`id` AND f.`region_id` =
    s.`region_id` AND f.`epiweek_now` = s.`epiweek_now` JOIN
    `ec_fluv_regions` r ON r.`id` = s.`region_id` WHERE r.`fluview_name` = %s
    AND s.`epiweek_now` = %s AND f.`epiweek` <= %s AND f.`wili` > 0 ORDER BY
    u.`id` ASC, f.`epiweek` ASC
    """, (region, epiweek_now, final_week))
    submissions = {}
    for (user, epiweek, wili) in self.cur:
        # optionally restrict to a whitelisted set of users
        if self.users is not None and user not in self.users:
            continue
        if user not in submissions:
            submissions[user] = []
        submissions[user].append(wili)
    self.cur.close()
    curves = []
    expected_weeks = flu.delta_epiweeks(epiweek_now, final_week)
    for user in submissions:
        if len(submissions[user]) != expected_weeks:
            # fixed typo in the warning message ("sumission" -> "submission")
            print(
                ' [EC] warning: missing data in user submission [%d|%s|%d]' %
                (user, region, epiweek_now))
        else:
            curves.append(submissions[user])
    return curves
def update_from_file(issue, date, dir, test_mode=False):
    """Read ECDC ILI data from CSVs in `dir` and insert/update `ecdc_ili`.

    `issue`: epiweek under which the data was issued
    `date`: release date stored alongside each row
    `test_mode`: when True, changes are not committed
    """
    # database connection
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    rows1 = get_rows(cnx, 'ecdc_ili')
    print('rows before: %d' % (rows1))
    insert = cnx.cursor()
    # load the data, ignoring empty rows
    files = glob.glob(os.path.join(dir, "*.csv"))
    rows = []
    for filename in files:
        with open(filename, 'r') as f:
            f.readline()  # skip the header line (previously read but unused)
            for l in f:
                data = list(map(lambda s: s.strip().replace('"', ''), l.split(',')))
                row = {}
                # epiweek like '2020-12' -> 202012 (drops the separator
                # character) — TODO confirm the source file's week format
                row['epiweek'] = int(data[1][:4] + data[1][5:])
                row['region'] = data[4]
                row['incidence_rate'] = data[3]
                rows.append(row)
    print(' loaded %d rows' % len(rows))
    entries = [obj for obj in rows if obj]
    print(' found %d entries' % len(entries))
    # parameterized query: the previous version interpolated values with `%`
    # string formatting, which breaks on quotes and is injection-prone
    sql = '''
    INSERT INTO `ecdc_ili`
    (`release_date`, `issue`, `epiweek`, `region`, `lag`, `incidence_rate`)
    VALUES
    (%s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
    `release_date` = least(`release_date`, %s),
    `incidence_rate` = %s
    '''
    for row in entries:
        lag = delta_epiweeks(row['epiweek'], issue)
        data_args = [row['incidence_rate']]
        insert_args = [date, issue, row['epiweek'], row['region'], lag] + data_args
        update_args = [date] + data_args
        try:
            insert.execute(sql, tuple(insert_args + update_args))
        except Exception:
            # deliberate best-effort: skip rows that fail to insert
            pass
    # cleanup
    insert.close()
    if test_mode:
        print('test mode, not committing')
        rows2 = rows1
    else:
        cnx.commit()
        # count the same table as `rows1` (table arg was previously omitted)
        rows2 = get_rows(cnx, 'ecdc_ili')
    print('rows after: %d (added %d)' % (rows2, rows2 - rows1))
    cnx.close()
def get_data(self, start_week, end_week, location, term, resolution='week', country='US'):
    """Query Google Health Trends for `term` over an epiweek range.

    Returns a dict with the request parameters, the raw API response under
    'data', and the extracted point values under 'values' (None if the
    response had no points).
    """
    start_date = GHT._ew2date(start_week)
    end_date = GHT._ew2date(end_week)
    num_weeks = flu.delta_epiweeks(start_week, end_week) + 1
    # getTimelinesForHealth parameters
    params = {
        'terms': term,
        'time_startDate': start_date,
        'time_endDate': end_date,
        'timelineResolution': resolution,
    }
    # We have a special check for the US for backwards compatibility.
    # i.e. if the country is 'US' AND the location is 'US', just put the
    # geo-restriction for country. In contrast, another country might have a
    # sub-region with initials 'US' and we want the region restriction instead.
    if country == 'US':
        if location == 'US' or location == NO_LOCATION_STR:
            params['geoRestriction_country'] = 'US'
        else:
            params['geoRestriction_region'] = 'US-' + location
    else:
        if location == NO_LOCATION_STR:
            params['geoRestriction_country'] = country
        else:
            params['geoRestriction_region'] = country + '-' + location
    # make the API call
    data = self.service.getTimelinesForHealth(**params).execute()
    # extract the values
    try:
        values = [p['value'] for p in data['lines'][0]['points']]
    except Exception:
        # narrowed from a bare `except:`; any malformed/empty response yields None
        values = None
    # throttle request rate
    time.sleep(self.delay)
    # return the results
    return {
        'start_week': start_week,
        'end_week': end_week,
        'num_weeks': num_weeks,
        'location': location,
        'country': country,
        'term': term,
        'resolution': resolution,
        'data': data,
        'values': values,
    }
def drop_invalid_predictions(epiweek, user_predictions):
    """Remove, in place, user time-series whose length is not the expected
    number of weeks between `epiweek` and `Constants.MAX_EPIWEEK`; locations
    left with no users are removed too. Prints a note if anything was dropped.
    """
    required_length = epiweek_lib.delta_epiweeks(epiweek, Constants.MAX_EPIWEEK)
    dropped = 0
    # iterate over copies of the key sets since the dict is modified in-place
    for location in list(user_predictions.keys()):
        by_user = user_predictions[location]
        for user in list(by_user.keys()):
            if len(by_user[user]) != required_length:
                del by_user[user]
                dropped += 1
        if not by_user:
            del user_predictions[location]
    if dropped:
        print('NOTE: dropped %d time-series with invalid length' % dropped)
def _forecast(self, region, epiweek):
    """Splice sample curves from the past-model and future-model at the
    current week, cycling through the shorter list as needed."""
    print('inside hybrid._forecast, region, epiweek:', region, epiweek)
    past_curves = self.past._forecast(region, epiweek)
    future_curves = self.future._forecast(region, epiweek)
    print('inside hybrid._forecast, len P, len F', len(past_curves), len(future_curves))
    # index of the current week within the season (week 40 is index 0)
    split = flu.delta_epiweeks(flu.join_epiweek(self.test_season, 40), epiweek)
    num_curves = max(len(past_curves), len(future_curves))
    curves = [
        list(past_curves[k % len(past_curves)][:split]) +
        list(future_curves[k % len(future_curves)][split:])
        for k in range(num_curves)
    ]
    if self._callback is not None:
        self._callback()
    return curves
def _forecast(self, region, epiweek):
    """Sample forecast curves by pinning observed weeks onto the empirical prior.

    Starts from the empirical per-week mean/variance for `region`, overwrites
    the already-observed weeks with the observed values (using backfill
    variance by lag), and samples curves from the resulting distribution.
    """
    ew1 = flu.join_epiweek(self.test_season + 0, 40)
    ew2 = flu.join_epiweek(self.test_season + 1, 24)
    num_weeks = flu.delta_epiweeks(ew1, ew2)
    print('fetching past data until week %d' % (epiweek))
    observed = self._get_current(region, epiweek, self.forecast_type)
    # copy so the cached empirical stats are not mutated
    mean, var = self.emp_mean[region].copy(), self.emp_var[region].copy()
    for ew in flu.range_epiweeks(ew1, flu.add_epiweeks(epiweek, 1)):
        i = flu.delta_epiweeks(ew1, ew)
        # lag of this week's observation relative to the current epiweek,
        # clamped to the available backfill-variance table
        lag = flu.delta_epiweeks(ew1, epiweek) - i
        lag = min(lag, len(self.bf_var[region]) - 1)
        mean[i] = observed[i]
        var[i] = self.bf_var[region][lag]
    curves = Forecaster.Utils.sample_normal_var(mean, var, self.num_samples)
    if not self.do_sampling:
        # replace the unobserved tail of each curve with an empirical curve
        # instead of sampled values
        offset = flu.delta_epiweeks(ew1, epiweek) + 1
        for (i, curve) in enumerate(curves):
            index = i % len(self.emp_curves[region])
            curve[offset:] = self.emp_curves[region][index][offset:]
    return curves
def get_weight(ew1, ew2):
    """Weight of epiweek `ew1` relative to reference epiweek `ew2`.

    Designed to: drop sharply over the most recent ~3 weeks, decay
    exponentially with age, emphasize weeks at the same time of year, and
    never assign a weight of exactly zero.
    """
    weeks_ago = flu.delta_epiweeks(ew1, ew2)
    year_len = 52.2
    half_life_slow, half_life_fast, bandwidth = year_len, 1, 4
    floor = 0.05
    # distance (in weeks) to the nearest same-time-of-year point
    phase = min(weeks_ago % year_len, year_len - weeks_ago % year_len)
    seasonal = np.exp(-((phase / bandwidth) ** 2))
    slow_decay = 2 ** -(weeks_ago / half_life_slow)
    recency_drop = 1 - 2 ** -(weeks_ago / half_life_fast)
    return (floor + (1 - floor) * seasonal) * slow_decay * recency_drop
def _forecast(self, region, epiweek):
    """Sample forecast curves for `region` as of `epiweek`.

    Returns a list of full-season curves, each the concatenation of observed
    wILI (through `epiweek`, as issued at `epiweek`) and one user-submitted
    future segment.
    """
    # season setup and sanity check
    ew1 = flu.join_epiweek(self.test_season, 40)
    ew2 = flu.join_epiweek(self.test_season + 1, 20)
    if not ew1 <= epiweek <= ew2:
        raise Exception('`epiweek` outside of `test_season`')
    # get past values (left half) from the Epidata API
    epidata = Forecaster.Utils.decode(Epidata.fluview(region, Epidata.range(ew1, epiweek), issues=epiweek))
    pinned = [row['wili'] for row in epidata]
    if len(pinned) != flu.delta_epiweeks(ew1, epiweek) + 1:
        raise Exception('missing ILINet data')
    # get the user submissions (right half) from the database
    submissions = self.fetch_submissions(region, epiweek)
    self._num_users = len(submissions)
    print(' [EC] %d users found for %s on %d' % (len(submissions), region, epiweek))
    # concatenate observed data and user submissions
    return [pinned + sub for sub in submissions]
def get_weight(ew1, ew2):
    """
    Return the weight of epiweek `ew1` relative to reference epiweek `ew2`,
    based on a function that:
    - drops sharply over the most recent ~3 weeks
    - falls off exponentially with time
    - puts extra emphasis on the past weeks at the same time of year
      (seasonality)
    - gives no week a weight of zero
    """
    dw = flu.delta_epiweeks(ew1, ew2)  # age of ew1, in weeks
    yr = 52.2  # average number of epiweeks per year
    hl1, hl2, bw = yr, 1, 4  # slow/fast half-lives and seasonal bandwidth
    a = 0.05  # weight floor so b never zeroes the seasonal term
    # b = (np.cos(2 * np.pi * (dw / yr)) + 1) / 2
    # Gaussian bump centered on the same time of year
    b = np.exp(-((min(dw % yr, yr - dw % yr) / bw) ** 2))
    c = 2 ** -(dw / hl1)  # slow exponential decay with age
    d = 1 - 2 ** -(dw / hl2)  # sharp drop for the most recent weeks
    return (a + (1 - a) * b) * c * d
def get_model(ew2, epiweeks, X, Y):
    """Fit a weighted least-squares model of Y on X with recency weights.

    Rows are weighted by `get_weight` relative to `ew2`. A constant bias
    column is always appended; a periodic (sin/cos) bias is also appended
    when the training window is long enough to span a full year.
    Returns the fitted coefficient vector.
    """
    ne, nx1, nx2, ny = len(epiweeks), len(X), len(X[0]), len(Y)
    if ne != nx1 or nx1 != ny:
        raise Exception('length mismatch e=%d X=%d Y=%d' % (ne, nx1, ny))
    weights = np.diag([get_weight(ew1, ew2) for ew1 in epiweeks])
    X = np.array(X).reshape((nx1, nx2))
    Y = np.array(Y).reshape((ny, 1))
    bias0 = np.ones(Y.shape)
    if ne >= 26 and flu.delta_epiweeks(epiweeks[0], epiweeks[-1]) >= 52:
        # constant and periodic bias
        bias1 = np.array([get_periodic_bias(ew) for ew in epiweeks])
        X = np.hstack((X, bias0, bias1))
    else:
        # constant bias only
        X = np.hstack((X, bias0))
    # normal equations: (X' W X)^-1 X' W Y
    XtXi = np.linalg.inv(dot(X.T, weights, X))
    XtY = dot(X.T, weights, Y)
    return np.dot(XtXi, XtY)
def _get_current(self, region, epiweek, forecast_type):
    """Fetch observed values for `region` through `epiweek`, as issued at
    `epiweek` (wILI from fluview, or FluSurv rates otherwise).

    NOTE(review): the `forecast_type` parameter is unused; the branch below
    reads `self.forecast_type` instead — confirm which is intended.
    """
    ew1 = flu.join_epiweek(self.test_season + 0, 40)
    ew2 = flu.join_epiweek(self.test_season + 1, 20)
    weeks = Epidata.range(ew1, ew2)
    if self.forecast_type == ForecastType.WILI:
        print('fetching history data for:')
        print(region, epiweek, weeks)
        epidata = Forecaster.Utils.decode(
            Epidata.fluview(region, weeks, issues=epiweek))
        data = [row['wili'] for row in epidata]
        # print (data)
    else:
        # for hospitalization forecasts, `region` names a FluSurv age group
        epidata = Forecaster.Utils.decode(
            Epidata.flusurv('network_all', weeks, issues=epiweek))
        data = [row[region] for row in epidata]
    if len(data) != flu.delta_epiweeks(ew1, epiweek) + 1:
        raise Exception('missing data')
    return data
def update_from_data(ews, ilis, date, issue, test_mode=False):
    """Insert/update KCDC ILI values (region 'ROK') into `kcdc_ili`.

    `ews` and `ilis` are parallel lists of epiweeks and ILI values.
    `test_mode`: when True, changes are not committed.
    """
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    rows1 = get_rows(cnx)
    print('rows before: %d' % (rows1))
    insert = cnx.cursor()
    # parameterized query: the previous version interpolated values with `%`
    # string formatting, which breaks on quotes and is injection-prone
    sql = '''
    INSERT INTO `kcdc_ili`
    (`release_date`, `issue`, `epiweek`, `region`, `lag`, `ili`)
    VALUES
    (%s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
    `release_date` = least(`release_date`, %s),
    `ili` = %s
    '''
    for ew, ili in zip(ews, ilis):
        lag = delta_epiweeks(ew, issue)
        insert_args = [date, issue, ew, 'ROK', lag, ili]
        update_args = [date, ili]
        try:
            insert.execute(sql, tuple(insert_args + update_args))
        except Exception:
            # deliberate best-effort: skip rows that fail to insert
            pass
    # cleanup
    insert.close()
    if test_mode:
        print('test mode, not committing')
        rows2 = rows1
    else:
        cnx.commit()
        rows2 = get_rows(cnx)
    print('rows after: %d (added %d)' % (rows2, rows2 - rows1))
    cnx.close()
def get_data(self, start_week, end_week, location, term, resolution='week'):
    """Query Google Health Trends for `term` over a US epiweek range.

    Returns a dict with the request parameters, the raw API response under
    'data', and the extracted point values under 'values' (None if the
    response had no points).
    """
    start_date = GHT._ew2date(start_week)
    end_date = GHT._ew2date(end_week)
    num_weeks = flu.delta_epiweeks(start_week, end_week) + 1
    # getTimelinesForHealth parameters
    params = {
        'terms': term,
        'time_startDate': start_date,
        'time_endDate': end_date,
        'timelineResolution': resolution,
    }
    if location == 'US':
        params['geoRestriction_country'] = location
    else:
        params['geoRestriction_region'] = 'US-' + location
    # make the API call
    data = self.service.getTimelinesForHealth(**params).execute()
    # extract the values
    try:
        values = [p['value'] for p in data['lines'][0]['points']]
    except Exception:
        # narrowed from a bare `except:`; any malformed/empty response yields None
        values = None
    # throttle request rate
    time.sleep(self.delay)
    # return the results
    return {
        'start_week': start_week,
        'end_week': end_week,
        'num_weeks': num_weeks,
        'location': location,
        'term': term,
        'resolution': resolution,
        'data': data,
        'values': values,
    }
def train(self, epiweek):
    """Fit the regression model using data strictly before `epiweek`.

    Builds a (features X, log-target Y) training matrix from historical
    weeks, skipping weeks whose features are unavailable or non-finite, then
    solves ordinary least squares. Returns (X, Y, model).
    """
    if epiweek not in self.ew2i:
        raise Exception('not predicting during this period')
    most_recent_issue = self.dds.get_most_recent_issue(self.region)
    # last usable training index: at least 5 weeks before the target and
    # strictly before the most recent available issue
    i2 = min(self.ew2i[epiweek] - 5, self.ew2i[most_recent_issue] - 1)
    # how far the signal lags behind the target week (never negative)
    signal_to_truth_shift = max(
        0, EW.delta_epiweeks(most_recent_issue, epiweek))
    self.stts = signal_to_truth_shift
    i1 = self.weeks[2 + signal_to_truth_shift]
    ew1, ew2 = self.i2ew[i1], self.i2ew[i2]
    num_weeks = i2 - i1
    if num_weeks <= 0:
        raise Exception('not predicting during this period')
    feature_indices = self.feature_indices(
        epiweek, signal_to_truth_shift=signal_to_truth_shift, valid=False)
    X, Y = np.zeros((num_weeks, np.sum(feature_indices))), np.zeros(
        (num_weeks, 1))
    r = 0  # number of usable training rows collected so far
    for i in range(i1, i2):
        try:
            newx = self._get_features(
                self.i2ew[i], signal_to_truth_shift=signal_to_truth_shift,
                valid=False, mask=feature_indices)
            # target is next week's stable value
            newy = self.data[i + 1]['stable']
            if np.all(np.isfinite(newx)):
                X[r, :] = newx
                Y[r, 0] = newy
                r += 1
        except Exception:
            # best-effort: weeks with missing features/targets are skipped
            pass
    # keep only the rows actually filled
    X = X[:r, :]
    Y = Y[:r, :]
    # model the log of the target, clamped away from zero
    Y = np.log(np.maximum(Y, 0.01))
    self.model = ISCH.dot(np.linalg.inv(ISCH.dot(X.T, X)), X.T, Y)
    self.training_week = epiweek
    return (X, Y, self.model)
def update_from_file(issue, date, filename, test_mode=False):
    """
    Read ILINet data from a zipped CSV and insert into (or update) the
    database.

    `issue`: epiweek under which the data was issued
    `date`: release date stored alongside each row
    `test_mode`: when True, changes are not committed
    """
    # database connection
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    rows1 = get_rows(cnx)
    print('rows before: %d' % (rows1))
    # single cursor for all inserts (previously a second cursor was created
    # and the first one leaked)
    insert = cnx.cursor()
    # load the data, ignoring empty rows
    print('loading data from %s as issued on %d' % (filename, issue))
    rows = load_zipped_csv(filename)
    print(' loaded %d rows' % len(rows))
    data = [get_ilinet_data(row) for row in rows]
    entries = [obj for obj in data if obj]
    print(' found %d entries' % len(entries))
    sql = '''
    INSERT INTO `fluview` (`release_date`, `issue`, `epiweek`, `region`,
    `lag`, `num_ili`, `num_patients`, `num_providers`, `wili`, `ili`,
    `num_age_0`, `num_age_1`, `num_age_2`, `num_age_3`, `num_age_4`,
    `num_age_5`)
    VALUES
    (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
    `release_date` = least(`release_date`, %s),
    `num_ili` = %s,
    `num_patients` = %s,
    `num_providers` = %s,
    `wili` = %s,
    `ili` = %s,
    `num_age_0` = coalesce(%s, `num_age_0`),
    `num_age_1` = coalesce(%s, `num_age_1`),
    `num_age_2` = coalesce(%s, `num_age_2`),
    `num_age_3` = coalesce(%s, `num_age_3`),
    `num_age_4` = coalesce(%s, `num_age_4`),
    `num_age_5` = coalesce(%s, `num_age_5`)
    '''
    # insert each row
    for row in entries:
        lag = delta_epiweeks(row['epiweek'], issue)
        args = [
            row['n_ili'], row['n_patients'], row['n_providers'], row['wili'],
            row['ili'], row['age0'], row['age1'], row['age2'], row['age3'],
            row['age4'], row['age5']
        ]
        ins_args = [date, issue, row['epiweek'], row['location'], lag] + args
        upd_args = [date] + args
        insert.execute(sql, ins_args + upd_args)
    # cleanup
    insert.close()
    if test_mode:
        print('test mode, not committing')
        rows2 = rows1
    else:
        cnx.commit()
        rows2 = get_rows(cnx)
    print('rows after: %d (added %d)' % (rows2, rows2 - rows1))
    cnx.close()
def get_xy(data):
    """Convert a {epiweek: value} mapping into parallel numpy arrays.

    x holds week offsets from epiweek 201030; y holds the corresponding
    values; both are ordered by epiweek.
    """
    ordered_weeks = sorted(data)
    x = np.array([Epiweek.delta_epiweeks(201030, week) for week in ordered_weeks])
    y = np.array([data[week] for week in ordered_weeks])
    return [x, y]
def update(issue, location_name, test_mode=False):
    """Fetch and store the currently available weekly FluSurv dataset.

    `issue`: epiweek under which the data is being stored
    `location_name`: human-readable FluSurv location (mapped to an API code)
    `test_mode`: when True, changes are not committed
    """
    # fetch data
    location_code = flusurv.location_codes[location_name]
    print('fetching data for', location_name, location_code)
    data = flusurv.get_data(location_code)
    # metadata
    epiweeks = sorted(data.keys())
    location = location_name
    release_date = str(EpiDate.today())
    # connect to the database
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    cur = cnx.cursor()
    rows1 = get_rows(cur)
    print('rows before: %d' % rows1)
    # SQL for insert/update
    sql = '''
    INSERT INTO `flusurv` (
      `release_date`, `issue`, `epiweek`, `location`, `lag`, `rate_age_0`,
      `rate_age_1`, `rate_age_2`, `rate_age_3`, `rate_age_4`, `rate_overall`,
      `rate_age_5`, `rate_age_6`, `rate_age_7`
    )
    VALUES (
      %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
    )
    ON DUPLICATE KEY UPDATE
      `release_date` = least(`release_date`, %s),
      `rate_age_0` = coalesce(%s, `rate_age_0`),
      `rate_age_1` = coalesce(%s, `rate_age_1`),
      `rate_age_2` = coalesce(%s, `rate_age_2`),
      `rate_age_3` = coalesce(%s, `rate_age_3`),
      `rate_age_4` = coalesce(%s, `rate_age_4`),
      `rate_overall` = coalesce(%s, `rate_overall`),
      `rate_age_5` = coalesce(%s, `rate_age_5`),
      `rate_age_6` = coalesce(%s, `rate_age_6`),
      `rate_age_7` = coalesce(%s, `rate_age_7`)
    '''
    # insert/update each row of data (one per epiweek)
    for epiweek in epiweeks:
        lag = delta_epiweeks(epiweek, issue)
        if lag > 52:
            # Ignore values older than one year, as (1) they are assumed not to
            # change, and (2) it would adversely affect database performance if all
            # values (including duplicates) were stored on each run.
            continue
        args_meta = [release_date, issue, epiweek, location, lag]
        args_insert = data[epiweek]
        args_update = [release_date] + data[epiweek]
        cur.execute(sql, tuple(args_meta + args_insert + args_update))
    # commit and disconnect
    rows2 = get_rows(cur)
    print('rows after: %d (+%d)' % (rows2, rows2 - rows1))
    cur.close()
    if test_mode:
        print('test mode: not committing database changes')
    else:
        cnx.commit()
    cnx.close()
def update_from_file(issue, date, filename, test_mode=False):
    """Read PAHO dengue data from CSV and insert/update `paho_dengue`.

    Behavior with issue: PAHO has a drop-down menu for week, and selecting a
    given week from that menu gives the data for that issue, not that EW.
    Unsure what revisions, if any, that data goes through. Current code
    ignores the PAHO-given issue and is based on the `issue` argument; rows
    whose own issue is in the future relative to `issue` are skipped.
    """
    # database connection
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    rows1 = get_rows(cnx, 'paho_dengue')
    print('rows before: %d' % (rows1))
    insert = cnx.cursor()
    # load the data, ignoring empty rows
    print('loading data from %s as issued on %d' % (filename, issue))
    with open(filename, 'r', encoding='utf-8') as f:
        c = f.read()
    rows = []
    for l in csv.reader(StringIO(c), delimiter=','):
        rows.append(get_paho_row(l))
    print(' loaded %d rows' % len(rows))
    entries = [obj for obj in rows if obj]
    print(' found %d entries' % len(entries))
    # parameterized query: the previous version interpolated values with `%`
    # string formatting, which breaks on quotes and is injection-prone
    sql = '''
    INSERT INTO `paho_dengue`
    (`release_date`, `issue`, `epiweek`, `region`, `lag`, `total_pop`,
    `serotype`, `num_dengue`, `incidence_rate`, `num_severe`, `num_deaths`)
    VALUES
    (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
    `release_date` = least(`release_date`, %s),
    `total_pop` = %s,
    `serotype` = %s,
    `num_dengue` = %s,
    `incidence_rate` = %s,
    `num_severe` = %s,
    `num_deaths` = %s
    '''
    for row in entries:
        if row['issue'] > issue:
            # Issued in a week that hasn't happened yet
            continue
        lag = delta_epiweeks(row['epiweek'], issue)
        data_args = [
            row['total_pop'], row['serotype'], row['num_dengue'],
            row['incidence_rate'], row['num_severe'], row['num_deaths']
        ]
        insert_args = [date, issue, row['epiweek'], row['region'], lag
                       ] + data_args
        update_args = [date] + data_args
        insert.execute(sql, tuple(insert_args + update_args))
    # cleanup
    insert.close()
    if test_mode:
        print('test mode, not committing')
        rows2 = rows1
    else:
        cnx.commit()
        # count the same table as `rows1` (table arg was previously omitted)
        rows2 = get_rows(cnx, 'paho_dengue')
    print('rows after: %d (added %d)' % (rows2, rows2 - rows1))
    cnx.close()
def forecast(self, epiweek):
    """Produce a complete Forecast for every configured location.

    `epiweek`: the most recent epiweek for which ILINet data is available

    For each location, draws sample curves from `self._forecast`, derives
    targets (onset, peak week, peak, 1-4 week lookaheads), and converts each
    into smoothed, uniform-blended probability distributions.
    """
    # sanity checks
    flu.check_epiweek(epiweek)
    season = flu.split_epiweek(flu.get_season(epiweek)[0])[0]
    week = flu.split_epiweek(epiweek)[1]
    first_epiweek = flu.join_epiweek(season, 40)
    offset = flu.delta_epiweeks(first_epiweek, epiweek)
    if season != self.test_season:
        raise Exception('unable to forecast season %d' % season)
    if 20 < week < 40:
        raise Exception('unable to forecast week %02d' % week)
    # initialize forecast
    forecast = Forecast(self.test_season, datetime.now(), self.name, epiweek, self.forecast_type)
    # aliases for readability
    num_week_bins = forecast.season_length
    num_wili_bins = forecast.num_ili_bins
    wili_bin_size = forecast.ili_bin_size
    # if (forecast_type == ForecastType.HOSP):
    #   num_wili_bins = 601
    # uniform blending weights
    week_weight = self.min_week_prob * (num_week_bins + 1)  # include `none` "bin"
    wili_weight = self.min_wili_prob * num_wili_bins
    if week_weight > 1:
        raise Exception('`min_week_prob` is impossibly high')
    if wili_weight > 1:
        raise Exception('`min_wili_prob` is impossibly high')
    # forecast each region
    for region in self.locations:
        # draw sample curves
        curves = self._forecast(region, epiweek)
        # regional info (onset baselines exist only for regions)
        if Locations.is_region(region):
            baseline = Targets.baselines[self.test_season][region]
        else:
            baseline = None
        # get all targets
        targets = [Targets.get_all_targets(c, baseline, offset, rule_season=self.test_season) for c in curves]
        onsets = [t['onset'] for t in targets]
        peakweeks = [t['peakweek'] for t in targets]
        peaks = [t['peak'] for t in targets]
        x1s = [t['x1'] for t in targets]
        x2s = [t['x2'] for t in targets]
        x3s = [t['x3'] for t in targets]
        x4s = [t['x4'] for t in targets]
        # forecast each target
        allow_no_pw = self.test_season < 2016
        if Locations.is_region(region):
            # skip onset for states and hospitalization, and do it only for regions
            onset = self.forecast_weeks(first_epiweek, num_week_bins, onsets, week_weight, self.smooth_weeks_bw, True)
        peakweek = self.forecast_weeks(first_epiweek, num_week_bins, peakweeks, week_weight, self.smooth_weeks_bw, allow_no_pw)
        peak = self.forecast_wili(wili_bin_size, num_wili_bins, peaks, wili_weight, self.smooth_wili_bw)
        x1 = self.forecast_wili(wili_bin_size, num_wili_bins, x1s, wili_weight, self.smooth_wili_bw)
        x2 = self.forecast_wili(wili_bin_size, num_wili_bins, x2s, wili_weight, self.smooth_wili_bw)
        x3 = self.forecast_wili(wili_bin_size, num_wili_bins, x3s, wili_weight, self.smooth_wili_bw)
        x4 = self.forecast_wili(wili_bin_size, num_wili_bins, x4s, wili_weight, self.smooth_wili_bw)
        # fill in the forecast data
        fc = forecast.get_or_create_forecast(region)
        if Locations.is_region(region):
            fc.set_onset(*onset)
        fc.set_peakweek(*peakweek)
        fc.set_peak(*peak)
        fc.set_lookahead(1, *x1)
        fc.set_lookahead(2, *x2)
        fc.set_lookahead(3, *x3)
        fc.set_lookahead(4, *x4)
    # sanity check completed forecast
    forecast.sanity_check()
    return forecast
def plot(forecasts, prefix, fig_label=''):
    """Plot one multi-panel figure per location comparing several forecasts.

    `forecasts`: list of (forecast, label, color) tuples
    `prefix`: filename prefix for saved PNGs; None shows the first figure
      interactively and stops
    `fig_label`: extra text prepended to the center panel's y-axis label
    """
    # timing
    epiweek = forecasts[0][0].epiweek
    ew0, ew1 = flu.get_season(epiweek)
    num_weeks = flu.delta_epiweeks(ew0, ew1) + 1
    year = flu.split_epiweek(ew0)[0]
    # plot settings
    x_ticks = [i for i in range(0, num_weeks, 3)]
    x_tick_labels = ['%02d' % ForecastIO.get_index_week(i) for i in x_ticks]
    y_ticks = [i for i in range(0, 14, 2)]
    regions = ['nat'] + ['hhs%s' % i for i in range(1, 11)]
    # TODO: avoid hardcoding these values everywhere
    baseline_values_2019 = [2.4, 1.9, 3.2, 1.9, 2.4, 1.9, 3.8, 1.7, 2.7, 2.4, 1.5]
    baselines = dict((r, v) for (r, v) in zip(regions, baseline_values_2019))
    bin_size = forecasts[0][0].ili_bin_size
    # get the somewhat sorted list of all unique locations
    locations = []
    for info in forecasts:
        fc = info[0]
        for loc in fc.get_locations():
            if loc not in locations:
                locations.append(loc)
    # plot each region
    for region in locations:
        # only consider forecasts that include this location
        region_forecasts = []
        for info in forecasts:
            if info[0].has_forecast(region):
                region_forecasts.append(info)
        # center subplot: observed wILI plus each forecast's 1-4 week points
        plt.figure(figsize=(12, 12))
        ax2 = plt.subplot(3, 2, 3)
        if region in baselines:
            plt.axhline(baselines[region], color='#888888')
        weeks = [i for i in range(flu.delta_epiweeks(ew0, epiweek) + 1)]
        values = Plotter.get_unstable_wILI(region, ew0, epiweek)
        plt.plot(weeks, values, color='#000000', linewidth=2)
        weeks = [flu.delta_epiweeks(ew0, epiweek) + i for i in range(1, 5)]
        for (forecast, label, color) in region_forecasts:
            fc = forecast.get_forecast(region)
            values = [fc.get_lookahead(i)['point'] for i in range(1, 5)]
            plt.plot(weeks, values, color=color, linewidth=2)
        ax2.set_xbound(0, 33)
        ax2.set_ybound(0, 12)
        ax2.set_xticks(x_ticks)
        ax2.set_yticks(y_ticks)
        ax2.set_xticklabels(x_tick_labels)
        ax2.get_xaxis().set_tick_params(labelbottom='on', labeltop='on')
        ax2.get_yaxis().set_tick_params(labelleft='on', labelright='on')
        # top subplot: peakweek
        top = Plotter.weekly_subplot(region_forecasts, region, plt.subplot(3, 2, 1), ax2, False)
        # bottom subplot: onset
        bottom = Plotter.weekly_subplot(region_forecasts, region, plt.subplot(3, 2, 5), ax2, True)
        # right subplot: peakheight
        right = Plotter.wili_subplot(region_forecasts, region, plt.subplot(3, 2, 4), ax2, bin_size)
        # top-right subplot: legend
        leg = plt.subplot(3, 2, 2)
        for (forecast, label, color) in forecasts:
            plt.plot([0], [0], color=color, label=label)
        plt.legend(loc='lower left')
        # other stuff: axis labels for the probability panels
        top.set_ylabel('Pr(Peak Week)')
        top.get_yaxis().set_label_position('right')
        bottom.set_ylabel('Pr(Onset Week)')
        bottom.get_yaxis().set_label_position('right')
        right.set_xlabel('Pr(Peak Height)')
        right.get_xaxis().set_label_position('top')
        ax2.set_ylabel('%s %s' % (fig_label, region.upper()))
        ax2.get_yaxis().set_label_position('left')
        # show the finished figure
        if prefix is None:
            plt.show()
            break
        else:
            filename = '%s_%s.png' % (prefix, region)
            plt.savefig(filename, bbox_inches='tight')
            print('saved %s' % filename)
def find_csv_files(scan_dir, issue=None, glob=glob):
    """Recursively search for and yield covidcast-format CSV files.

    scan_dir: the directory to scan (recursively)
    issue: optional (issue_date, issue_epiweek) pair used to stamp results;
      defaults to today's date/epiweek, computed at call time. (Previously
      the default was evaluated once at module import, which pinned the
      issue to the process start date in long-running processes.)

    The return value is a tuple of (path, details), where, if the path was
    valid, details is a tuple of (source, signal, time_type, geo_type,
    time_value, issue, lag) (otherwise None).
    """
    logger = get_structured_logger('find_csv_files')
    if issue is None:
        today = date.today()
        issue = (today, epi.Week.fromdate(today))
    issue_day, issue_epiweek = issue
    issue_day_value = int(issue_day.strftime("%Y%m%d"))
    issue_epiweek_value = int(str(issue_epiweek))
    issue_value = -1
    lag_value = -1
    for path in sorted(glob.glob(os.path.join(scan_dir, '*', '*'))):
        if not path.lower().endswith('.csv'):
            # safe to ignore this file
            continue
        # match a daily or weekly naming pattern
        daily_match = CsvImporter.PATTERN_DAILY.match(path.lower())
        weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower())
        if not daily_match and not weekly_match:
            logger.warning(event='invalid csv path/filename', detail=path, file=path)
            yield (path, None)
            continue
        # extract and validate time resolution
        if daily_match:
            time_type = 'day'
            time_value = int(daily_match.group(2))
            match = daily_match
            time_value_day = CsvImporter.is_sane_day(time_value)
            if not time_value_day:
                logger.warning(event='invalid filename day', detail=time_value, file=path)
                yield (path, None)
                continue
            issue_value = issue_day_value
            lag_value = (issue_day - time_value_day).days
        else:
            time_type = 'week'
            time_value = int(weekly_match.group(2))
            match = weekly_match
            time_value_week = CsvImporter.is_sane_week(time_value)
            if not time_value_week:
                logger.warning(event='invalid filename week', detail=time_value, file=path)
                yield (path, None)
                continue
            issue_value = issue_epiweek_value
            lag_value = delta_epiweeks(time_value_week, issue_epiweek_value)
        # extract and validate geographic resolution
        geo_type = match.group(3).lower()
        if geo_type not in CsvImporter.GEOGRAPHIC_RESOLUTIONS:
            logger.warning(event='invalid geo_type', detail=geo_type, file=path)
            yield (path, None)
            continue
        # extract additional values, lowercased for consistency
        source = match.group(1).lower()
        signal = match.group(4).lower()
        if len(signal) > 64:
            logger.warning(event='invalid signal name (64 char limit)', detail=signal, file=path)
            yield (path, None)
            continue
        yield (path, (source, signal, time_type, geo_type, time_value, issue_value, lag_value))
def get_periodic_bias(epiweek):
    """Return [sin, cos] features encoding the time of year of `epiweek`,
    measured as the phase within a 52.2-week year relative to epiweek 200001.
    """
    weeks_per_year = 52.2
    offset = flu.delta_epiweeks(200001, epiweek) % weeks_per_year
    # keep the original operation order to preserve floating-point results
    angle = np.pi * 2 * offset / weeks_per_year
    return [np.sin(angle), np.cos(angle)]