def update(sensors, first_week=None, last_week=None, valid=False, test_mode=False):
    """Compute sensor readings for each (name, loc) pair and store them.

    Args:
        sensors: iterable of (name, loc) tuples; `loc` may be a group alias
            ('hhs', 'cen', 'state'/'sta') or a single location code.
        first_week: first epiweek to update (default: resume after the most
            recently stored reading for each sensor/location).
        last_week: last epiweek to update (default: one week past the most
            recent FluView issue).
        valid: passed through to each sensor implementation.
        test_mode: when True, the transaction is not committed (dry run).
    """
    # most recent issue
    last_issue = get_most_recent_issue()
    # location information
    loc_info = StateInfo()
    # connect
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    cur = cnx.cursor()
    # update each sensor
    for (name, loc) in sensors:
        # expand group aliases into their member locations
        if loc == 'hhs':
            locations = loc_info.hhs
        elif loc == 'cen':
            locations = loc_info.cen
        elif loc == 'state' or loc == 'sta':
            locations = loc_info.sta
        else:
            locations = [loc]
        # update each location
        print(locations)
        for location in locations:
            # timing
            ew1, ew2 = first_week, last_week
            if ew1 is None:
                ew1 = get_last_update(cur, name, location)
            if ew2 is None:
                ew2 = flu.add_epiweeks(last_issue, +1)
            print('Updating %s-%s from %d to %d.' % (name, location, ew1, ew2))
            for test_week in flu.range_epiweeks(ew1, ew2, inclusive=True):
                # sensors train on data through the week before the test week
                train_week = flu.add_epiweeks(test_week, -1)
                try:
                    # dispatch to the sensor implementation by name
                    value = {
                        'gft': get_gft,
                        'ght': get_ght,
                        'ghtj': get_ghtj,
                        'twtr': get_twtr,
                        'wiki': get_wiki,
                        'cdc': get_cdc,
                        'epic': get_epic,
                        'sar3': get_sar3,
                        'arch': get_arch,
                        'quid': get_quid,
                    }[name](location, train_week, valid)
                    print(' %4s %5s %d -> %.3f' % (name, location, test_week, value))
                    # upload
                    store_value(cur, name, location, test_week, value)
                except Exception as ex:
                    # best-effort: log the failure and continue with the next week
                    print(' failed: %4s %5s %d' % (name, location, test_week), ex)
                    #raise ex
                sys.stdout.flush()
    # disconnect
    cur.close()
    if not test_mode:
        cnx.commit()
    cnx.close()
def _train(self, region):
    """Fit empirical mean/variance curves and backfill variance for `region`.

    Results are cached on self.bf_var / self.emp_mean / self.emp_var /
    self.emp_curves; a region that is already trained is skipped.
    """
    if region in self.bf_var:
        # already trained
        return
    if len(region) == 2:
        # TODO: this is a hack for state ILI
        # assume backfill of region 4
        print('FIXME: setting backfill for %s as hhs4' % region)
        self.bf_var[region] = self.bf_var['hhs4']
        self.emp_mean[region] = self.emp_mean['hhs4']
        self.emp_var[region] = self.emp_var['hhs4']
        self.emp_curves[region] = self.emp_curves['hhs4']
        return
    stable = self._get_stable(region)
    start_weeks = [flu.get_season(ew)[0] for ew in stable.keys()]
    curves = []
    # distinct seasons (by starting year) present in the stable data
    seasons = set(
        [flu.split_epiweek(ew)[0] for ew in start_weeks if ew is not None])
    for s in seasons:
        # each season's curve starts at week 40; its length depends on the
        # forecast type (wILI seasons are longer than flusurv seasons)
        ew1 = flu.join_epiweek(s + 0, 40)
        if self.forecast_type == ForecastType.WILI:
            ew2 = flu.add_epiweeks(ew1, 37)
        else:
            ew2 = flu.add_epiweeks(ew1, 29)
        # print("stable: ", stable)
        # print("range_epiweeks: ", [i for i in flu.range_epiweeks(ew1, ew2)])
        curve = [stable[ew] for ew in flu.range_epiweeks(ew1, ew2)]
        curves.append(curve)
    # per-week empirical mean and (sample) variance across seasons
    self.emp_mean[region] = np.mean(curves, axis=0)
    self.emp_var[region] = np.var(curves, axis=0, ddof=1)
    self.emp_curves[region] = curves
    if self.backfill_weeks is None:
        self.bf_var[region] = [0]
    else:
        self.bf_var[region] = []
        for lag in range(self.backfill_weeks):
            unstable = self._get_unstable(region, lag)
            # differences between final and lagged values on overlapping weeks
            changes = [
                stable[ew] - unstable[ew]
                for ew in stable.keys() & unstable.keys()
            ]
            if len(changes) < 2:
                raise Exception('not enough data')
            self.bf_var[region].append(np.var(changes, ddof=1))
    print(
        ' %5s: %s' %
        (region, ' '.join(['%.3f' % (b**0.5) for b in self.bf_var[region]])))
def _get_features(self, ew, signal_to_truth_shift=0, valid=False, mask=None):
    """Build the 1x10 feature row for epiweek `ew`, optionally column-masked.

    Features: [intercept, log-wILI at lags 0-2, four holiday-week
    indicators, sin/cos of within-year seasonal position].

    Args:
        ew: epiweek to featurize.
        signal_to_truth_shift: index offset applied when reading 'stable'.
        valid: if True, raise when unstable wILI is missing at any lag.
        mask: boolean array selecting which of the 10 columns to return;
            defaults to all columns.

    Returns:
        numpy array of shape (1, number of selected columns).

    Raises:
        Exception: if `valid` and an unstable wILI value is missing.
    """
    # Fix: the original used a mutable default argument
    # (mask=np.ones((10), dtype=bool)), evaluated once at definition time
    # and shared across calls; build the default per call instead.
    if mask is None:
        mask = np.ones(10, dtype=bool)
    X = np.zeros((1, 10))
    i = self.ew2i[ew]
    X[0, 0] = 1  # intercept
    for lag in range(3):
        if valid and not self.valid[i - lag][lag]:
            w = self.i2ew[i - lag]
            raise Exception('missing unstable wILI (ew=%d|lag=%d)' % (w, lag))
        try:
            # log of the stable signal, floored at 0.01 to keep log finite
            X[0, 1 + lag] = np.log(
                np.maximum(
                    0.01,
                    self.data[i - lag - signal_to_truth_shift]['stable']))
        except Exception:
            X[0, 1 + lag] = np.nan
    for holiday in range(4):
        # indicator for week 1 (new year) within the next four weeks
        if EW.split_epiweek(EW.add_epiweeks(ew, holiday))[1] == 1:
            X[0, 4 + holiday] = 1
    # seasonal position encoded on the unit circle
    y, w = EW.split_epiweek(ew)
    N = EW.get_num_weeks(y)
    offset = np.pi * 2 * w / N
    X[0, 8] = np.sin(offset)
    X[0, 9] = np.cos(offset)
    # todo linear time trend covariate?
    return X[:, mask]
def get_kcdc_data():
    """Fetch ILI time series from the Korean CDC (KCDC) web service.

    Returns:
        (ews, ilis): parallel lists of epiweeks and ILI values, starting at
        epiweek 200436.
    """
    issue = EpiDate.today().get_ew()
    # season year of the current issue; a new season starts after week 35
    last_season = issue // 100 + (1 if issue % 100 > 35 else 0)
    url = 'http://www.cdc.go.kr/npt/biz/npp/iss/influenzaListAjax.do'
    params = {
        'icdNm': 'influenza',
        'startYear': '2004',  # Started in 2004
        'endYear': str(last_season)
    }
    response = requests.post(url, params)
    datas = response.json()
    data = datas['data']
    ews = []
    ilis = []
    ew1 = 200436
    # NOTE(review): range(2004, last_season) excludes `last_season` itself --
    # confirm whether the current season is intentionally omitted
    for year in range(2004, last_season):
        year_data = data[year - 2004]
        if year > 2004:
            # subsequent seasons continue where the previous one ended
            ew1 = ews[-1] + 1
        # values are backtick-delimited in the "VALUE" field
        ili_yr = year_data["VALUE"].split('`')
        ili_yr = [float(f) for f in ili_yr if f != '']
        ew2 = add_epiweeks(ew1, len(ili_yr))
        new_ews = list(range_epiweeks(ew1, ew2))
        for i in range(len(new_ews)):
            j = float(ili_yr[i])
            ilis.append(j)
            ews.append(new_ews[i])
    return ews, ilis
def get_training_set_datasetname(location, epiweek, signal, target, signal_to_truth_ew_shift):
    """Assemble a supervised training set pairing signal readings with
    ground-truth target values.

    Signal weeks are shifted by `signal_to_truth_ew_shift` to align them
    with ground truth; the week being predicted (ew3) and weeks without
    ground truth are dropped.

    Returns:
        (epiweeks, X, Y): sorted ground-truth epiweeks with the matching
        feature and label lists.
    """
    ew1, ew2, ew3, weeks0, weeks1 = get_weeks(epiweek)
    auth = secrets.api.datasetname_targets
    rows = Epidata.check(
        Epidata.datasetname_targets(auth, target, location, weeks0))
    # ground truth values, keyed by epiweek
    truth = {row['epiweek']: row['value'] for row in rows}
    samples = {}
    missing = 0
    for sig_week, features in signal.items():
        truth_week = flu.add_epiweeks(sig_week, signal_to_truth_ew_shift)
        # skip the week we're trying to predict
        if truth_week == ew3:
            continue
        if truth_week not in truth:
            missing += 1
            continue
        samples[truth_week] = (features, truth[truth_week])
    if missing:
        msg = 'warning: dropped %d/%d signal weeks because ground truth / target was unavailable'
        print(msg % (missing, len(signal)))
    epiweeks = sorted(samples)
    X = [samples[week][0] for week in epiweeks]
    Y = [samples[week][1] for week in epiweeks]
    return (epiweeks, X, Y)
def _forecast(first_epiweek, num_bins, indices, uniform_weight, smooth_bw, allow_none):
    """Turn user-submitted week-bin indices into a forecast.

    Returns:
        (dist, none, point): bin probabilities, probability of the None
        outcome (or None when not allowed), and the point-prediction epiweek.
    """
    if smooth_bw > 0:
        # week bins are deliberately not smoothed; warn if smoothing was requested
        print(
            ' [EC] warning: epicast doesnt smooth week bins, but smooth_bw = %.3f'
            % smooth_bw)
    num_none = indices.count(None)
    if num_none > 0 and not allow_none:
        raise Exception('target does not allow None, but None given')
    # NOTE(review): `num_users` is not defined in this function -- presumably
    # a module- or class-level name; confirm it is in scope at call time
    dist = Epicast.fit_distribution(indices, num_bins, 1, -0.5, False, num_users)
    dist *= len(indices) - num_none
    # the "none" count is appended as an extra bin before normalizing
    extra = [num_none] if allow_none else []
    dist = Forecaster.Utils.normalize(list(dist) + extra)
    dist = Forecaster.Utils.blend(dist, uniform_weight)
    if allow_none:
        # split the appended "none" bin back out of the distribution
        dist, none = dist[:-1], dist[-1]
    else:
        none = None
    # point prediction: median of the submitted (non-None) indices
    possibilities = [i for i in indices if i is not None]
    if len(possibilities) == 0:
        possibilities = [0]
    point = flu.add_epiweeks(first_epiweek, int(np.median(possibilities)))
    return (dist, none, point)
def _get_partial_trajectory(self, epiweek, valid=True):
    """Return the wILI trajectory from week 30 of the current season through
    `epiweek`, preferring unstable (as-of-issue) values where available.

    Args:
        epiweek: last epiweek of the trajectory.
        valid: if True, only trust stable values strictly before
            `epiweek - 5` (recent weeks must come from the issue snapshot).

    Returns:
        list of wILI values, one per epiweek from week 30 through `epiweek`.

    Raises:
        Exception: if any week in the range has no usable wILI value.
    """
    y, w = EW.split_epiweek(epiweek)
    if w < 30:
        # weeks 1-29 belong to the season that started the previous year
        y -= 1
    ew1 = EW.join_epiweek(y, 30)
    ew2 = epiweek
    # stable values within 5 weeks of ew2 are still subject to backfill
    limit = EW.add_epiweeks(ew2, -5)
    weeks = Epidata.range(ew1, ew2)
    stable = Epidata.check(Epidata.fluview(self.region, weeks))
    try:
        unstable = Epidata.check(Epidata.fluview(self.region, weeks, issues=ew2))
    except Exception:
        # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; missing unstable data is still tolerated.
        unstable = []
    wili = {}
    for row in stable:
        ew, value = row['epiweek'], row['wili']
        if not valid or ew < limit:
            wili[ew] = value
    # unstable (as-of-issue) values, when present, take precedence
    for row in unstable:
        ew, value = row['epiweek'], row['wili']
        wili[ew] = value
    curve = []
    for ew in EW.range_epiweeks(ew1, ew2, inclusive=True):
        if ew not in wili:
            t = 'unstable' if valid else 'any'
            raise Exception('wILI (%s) not available for week %d' % (t, ew))
        curve.append(wili[ew])
    # sanity check: exactly one value per week in the range
    n1 = EW.delta_epiweeks(ew1, ew2) + 1
    n2 = len(curve)
    if n1 != n2:
        raise Exception('missing data (expected %d, found %d)' % (n1, n2))
    return curve
def get_weeks(epiweek):
    """Return the standard week bounds and Epidata ranges for training.

    Returns:
        (ew1, ew2, ew3, weeks0, weeks1): the fixed start week (200330),
        `epiweek`, the following epiweek, and the Epidata range objects
        ending at ew2 and ew3 respectively.
    """
    first = 200330
    next_week = flu.add_epiweeks(epiweek, 1)
    return (
        first,
        epiweek,
        next_week,
        Epidata.range(first, epiweek),
        Epidata.range(first, next_week),
    )
def get_most_recent_issue(self):
    """Return the most recent epiweek for which FluView data is available."""
    # search the 10-week window ending at the current epiweek
    today = EpiDate.today().get_ew()
    window = self.epidata.range(add_epiweeks(today, -9), today)
    rows = self.epidata.check(self.epidata.fluview('nat', window))
    return max(row['issue'] for row in rows)
def update(self, sensors, first_week, last_week): """ Compute sensor readings and store them in the database. """ # most recent issue if last_week is None: last_issue = get_most_recent_issue(self.epidata) last_week = flu.add_epiweeks(last_issue, +1) # connect with self.database as database: # update each sensor for (name, loc) in sensors: # update each location for location in get_location_list(loc): # timing ew1 = first_week if ew1 is None: ew1 = database.get_most_recent_epiweek(name, location) if ew1 is None: # If an existing sensor reading wasn't found in the database and # no start week was given, just assume that readings should start # at 2014w01. ew1 = 201401 print('%s-%s not found, starting at %d' % (name, location, ew1)) args = (name, location, ew1, last_week) print('Updating %s-%s from %d to %d.' % args) for test_week in flu.range_epiweeks(ew1, last_week, inclusive=True): self.update_single(database, test_week, name, location)
def get_naive_nowcast(self, loc):
    """Return a naive nowcast for `loc`: final wILI shifted by `delta` weeks."""
    # Final wILI isn't known for months, so a true real-time random-walk
    # naive nowcaster is impossible. Two stand-ins:
    #  - Naive Oracle (delta=1): pretend final wILI is known at runtime and
    #    nowcast last week's final value. Unrealistic (ignores backfill), so
    #    it is unfairly advantaged.
    #  - Seasonal Naive (delta=52): nowcast final wILI from the same week one
    #    year earlier. Only loosely correlated with current wILI, so it is
    #    unfairly disadvantaged.
    # (Ideally we'd use preliminary wILI on the previous week, but that data
    # is generally unavailable outside certain locations and seasons.) It's
    # not obvious which definition is better; the constant below selects the
    # one used throughout this analysis.
    delta = 1
    truth = self.get_truth(loc)
    return {
        week: truth[Epiweek.add_epiweeks(week, -delta)]
        for week in truth
        if Epiweek.add_epiweeks(week, -delta) in truth
    }
def get_most_recent_issue(self, location):
    """Return the most recent epiweek for which paho_dengue data is available
    in given location."""
    # search the 52-week window ending at the current epiweek
    today = EpiDate.today().get_ew()
    year_ago = add_epiweeks(today, -52)
    window = self.epidata.range(year_ago, today)
    rows = self.epidata.check(self.epidata.paho_dengue(location, window))
    return max(row['epiweek'] for row in rows)
def get_weeks(self):
    """Return a list of weeks on which truth and sensors are both available."""
    # data for the current week is not yet complete, so stop at last week
    last_complete = add_epiweeks(EpiDate.today().get_ew(), -1)
    return list(
        range_epiweeks(self.FIRST_DATA_EPIWEEK, last_complete, inclusive=True))
def extract(rows, fields, signal_to_truth_ew_shift):
    """Index row values by shifted epiweek.

    Returns:
        dict mapping each row's epiweek (shifted forward by
        `signal_to_truth_ew_shift`) to that row's `fields` values as floats.
    """
    return {
        flu.add_epiweeks(row['epiweek'], signal_to_truth_ew_shift):
            [float(row[name]) for name in fields]
        for row in rows
    }
def get_update_range(self, first_week, last_week): """Return the range of epiweeks to update.""" # default to most recent issue if a week range isn't given if not last_week: # repeat previous nowcast in case new data is available first_week = self.data_source.get_most_recent_issue() # nowcast the first week without ilinet data last_week = add_epiweeks(first_week, 1) return first_week, last_week
def update_single(self, database, test_week, name, location):
    """Compute one sensor reading and store it; failures are logged, not raised."""
    # sensors train on data available through the week before the test week
    train_week = flu.add_epiweeks(test_week, -1)
    # an unknown sensor name raises KeyError here, outside the try block
    impl = self.implementations[name]
    value = None
    try:
        value = impl(location, train_week, self.valid, self.target)
        print(' %4s %5s %d -> %.3f' % (name, location, test_week, value))
    except Exception as ex:
        print(' failed: %4s %5s %d' % (name, location, test_week), ex)
    if value is not None:
        database.insert(self.target, name, location, test_week, value)
    sys.stdout.flush()
def get_dengue_data(first_week, last_week):
    """Fetch and parse NIDSS dengue counts by epiweek and location.

    Args:
        first_week, last_week: inclusive epiweek bounds (order-insensitive).

    Returns:
        dict of {epiweek: {location: count}} covering the requested range;
        unreported weeks/locations are zero.

    Raises:
        Exception: on out-of-range weeks, download failure, or data at an
        unexpected epiweek/location.
    """
    # Check week order
    if first_week > last_week:
        first_week, last_week = last_week, first_week
    # Bounds check
    if first_week < 200301 or last_week < 200301:
        raise Exception('week out of range')
    # Initialize data by week and location (zeroes are not reported)
    data = {}
    for week in range_epiweeks(first_week, add_epiweeks(last_week, 1)):
        data[week] = {}
        for location in NIDSS.LOCATION_TO_REGION.keys():
            data[week][location] = 0
    # Download CSV
    response = requests.get(NIDSS.DENGUE_URL)
    if response.status_code != 200:
        raise Exception('export Dengue failed [%d]' % response.status_code)
    # the feed is encoded as Traditional Chinese (Big5)
    csv = response.content.decode('big5-tw')
    # Parse the data, skipping the header row and blank lines
    lines = [l.strip() for l in csv.split('\n')[1:] if l.strip() != '']
    for line in lines:
        fields = line.split(',')
        # location names are matched via their base64-encoded UTF-8 bytes
        location_b64 = base64.b64encode(fields[3].encode('utf-8'))
        location = NIDSS._TRANSLATED[location_b64]
        region = NIDSS.LOCATION_TO_REGION[location]
        imported_b64 = base64.b64encode(fields[6].encode('utf-8'))
        # b'5piv' is the base64 encoding of the character meaning "yes"
        imported = imported_b64 == b'5piv'
        sex = fields[5]
        age = fields[7]
        count = int(fields[8])
        year = int(fields[1])
        week = int(fields[2])
        # Week 53 was reported each year in 2003-2007
        if year < 2008 and year != 2003 and week > 52:
            week = 52
        # Epiweek system change in 2009
        # See also: http://research.undefinedx.com/forum/index.php?topic=300.0
        if year == 2009:
            week -= 1
            if week == 0:
                year, week = 2008, 53
        epiweek = year * 100 + week
        if epiweek < first_week or epiweek > last_week:
            # Outside of the requested range
            continue
        if epiweek not in data or location not in data[epiweek]:
            # Not a valid epiweek/location combination
            raise Exception('data missing %d-%s' % (epiweek, location))
        # Add the counts to the location on this epiweek
        data[epiweek][location] += count
    # Return results indexed by week and location
    return data
def _get_features(self, ew, valid=True):
    """Build the 1x7 feature row for epiweek `ew`:
    [intercept, four holiday-week indicators, sin/cos seasonal position]."""
    features = np.zeros((1, 7))
    # lookup preserved for its side effect: KeyError on an unknown epiweek
    idx = self.ew2i[ew]
    features[0, 0] = 1  # intercept
    # indicator for week 1 (new year) falling within the next four weeks
    for k in range(4):
        if EW.split_epiweek(EW.add_epiweeks(ew, k))[1] == 1:
            features[0, 1 + k] = 1
    # seasonal position encoded on the unit circle
    year, week = EW.split_epiweek(ew)
    theta = np.pi * 2 * week / EW.get_num_weeks(year)
    features[0, 5] = np.sin(theta)
    features[0, 6] = np.cos(theta)
    # todo linear time trend covariate?
    return features
def _get_stable(self, region):
    """Fetch stable (final) values for `region` across all training seasons.

    Covers weeks 40 through 40+37 of each season from 2003 up to (but not
    including) the test season; the 2009 pandemic season is excluded.

    Returns:
        dict mapping epiweek -> value.
    """
    ranges = []
    for season in range(2003, self.test_season):
        if season == 2009:
            # skip the pandemic season
            continue
        start = flu.join_epiweek(season, 40)
        ranges.append(Epidata.range(start, flu.add_epiweeks(start, 37)))
    if self.forecast_type == ForecastType.WILI:
        rows = Forecaster.Utils.decode(Epidata.fluview(region, ranges))
        return {row['epiweek']: row['wili'] for row in rows}
    rows = Forecaster.Utils.decode(
        Epidata.flusurv('network_all', ranges))
    return {row['epiweek']: row[region] for row in rows}
def _get_features(self, ew, valid=True):
    """Build the 1x8 feature row for epiweek `ew`:
    [intercept, wILI at lags 0-2, four holiday-week indicators]."""
    features = np.zeros((1, 8))
    idx = self.ew2i[ew]
    features[0, 0] = 1  # intercept
    for lag in range(3):
        if valid and not self.valid[idx - lag][lag]:
            week = self.i2ew[idx - lag]
            raise Exception('missing unstable wILI (ew=%d|lag=%d)' % (week, lag))
        features[0, 1 + lag] = self.data[idx - lag][lag]
    # indicator for week 1 (new year) falling within the next four weeks
    for k in range(4):
        if EW.split_epiweek(EW.add_epiweeks(ew, k))[1] == 1:
            features[0, 4 + k] = 1
    # y, w = EW.split_epiweek(ew)
    # N = EW.get_num_weeks(y)
    # offset = np.pi * 2 * w / N
    # X[0, 8] = np.sin(offset)
    # X[0, 9] = np.cos(offset)
    return features
def _forecast(self, region, epiweek):
    """Sample wILI curves for the remainder of the season.

    Combines observed data through `epiweek` (with lag-dependent backfill
    variance) with the empirical mean/variance beyond it, then draws
    `self.num_samples` curves.
    """
    # the season spans week 40 through week 24 of the following year
    ew1 = flu.join_epiweek(self.test_season + 0, 40)
    ew2 = flu.join_epiweek(self.test_season + 1, 24)
    num_weeks = flu.delta_epiweeks(ew1, ew2)
    print('fetching past data until week %d' % (epiweek))
    observed = self._get_current(region, epiweek, self.forecast_type)
    mean, var = self.emp_mean[region].copy(), self.emp_var[region].copy()
    # overwrite the empirical prior with observations up to `epiweek`; each
    # observed week's variance reflects backfill uncertainty at its lag
    for ew in flu.range_epiweeks(ew1, flu.add_epiweeks(epiweek, 1)):
        i = flu.delta_epiweeks(ew1, ew)
        lag = flu.delta_epiweeks(ew1, epiweek) - i
        # clamp to the deepest lag for which backfill variance was trained
        lag = min(lag, len(self.bf_var[region]) - 1)
        mean[i] = observed[i]
        var[i] = self.bf_var[region][lag]
    curves = Forecaster.Utils.sample_normal_var(mean, var, self.num_samples)
    if not self.do_sampling:
        # replace the sampled future with historical curves, cycling through
        # the empirical curves when there are more samples than seasons
        offset = flu.delta_epiweeks(ew1, epiweek) + 1
        for (i, curve) in enumerate(curves):
            index = i % len(self.emp_curves[region])
            curve[offset:] = self.emp_curves[region][index][offset:]
    return curves
def extract_epiweek_and_team(filename):
    """
    Extract the submission epiweek (epiweek of most recently published report)
    and the team name from the file name of a flu contest submission.

    The return value is a tuple of:
      1. the submission epiweek (e.g. 201751)
      2. the team name (e.g. "delphi-epicast")

    Raises:
        Exception: if the filename doesn't match the naming convention or
        contains an impossible week number.
    """
    # this is the naming convention for 2017 flu contest submissions
    # (fix: the trailing dot was unescaped and matched any character)
    pattern = re.compile(r'^EW(\d{2})-(.*)-(\d{4})-(\d{2})-(\d{2})\.csv$')
    match = pattern.match(os.path.basename(filename))
    if match is None:
        # only able to parse this specific naming convention
        # (fix: was a bare `raise Exception()` with no message)
        raise Exception('unable to parse filename: %s' % filename)
    week = int(match.group(1))
    team = match.group(2)
    year = int(match.group(3))
    month = int(match.group(4))
    day = int(match.group(5))
    epiweek = EpiDate(year, month, day).get_ew()
    # We know the week number, but the year has to be inferred from the
    # submission date. Since the week of submission is never less than the week
    # of the most recent report, we can step backwards from the week of
    # submission until we find the expected week number. Ordinarily, this will
    # take exactly two steps. For example, data collected on 2017w51 is
    # reported on 2017w52, and our forecast is submitted on 2018w01; so we
    # start with 2018w01 and step backwards until find the first week 51, which
    # is 2017w51.
    if not 1 <= week <= 53:
        # prevent an infinite loop
        raise Exception('invalid week number: %d' % week)
    while Epiweek.split_epiweek(epiweek)[1] != week:
        epiweek = Epiweek.add_epiweeks(epiweek, -1)
    return epiweek, team
def get_week_forecast(first_epiweek, num_bins, indices, uniform_weight, smooth_bw, allow_none):
    """Turn submitted week-bin indices into a forecast.

    Args:
        first_epiweek: epiweek corresponding to bin index 0.
        num_bins: number of week bins in the distribution.
        indices: list of bin indices (or None) chosen by users.
        uniform_weight: weight of the uniform component in the final blend.
        smooth_bw: smoothing bandwidth; > 0 enables smoothing.
        allow_none: whether a None ("no onset") outcome is permitted.

    Returns:
        (dist, none, point): bin probabilities, probability of the None
        outcome (or None when not allowed), and the point-prediction epiweek.
    """
    # histogram of votes per bin
    dist = [indices.count(i) for i in range(num_bins)]
    none = indices.count(None)
    if none > 0 and not allow_none:
        raise Exception(
            'target does not allow None, but None was provided')
    # the "none" count is appended as an extra bin before normalizing
    extra = [none] if allow_none else []
    temp = Forecaster.Utils.normalize(np.array(dist + extra))
    if smooth_bw > 0:
        # TODO: don't smooth across dist and norm
        temp = Forecaster.Utils.smooth(temp, smooth_bw)
    temp = Forecaster.Utils.blend(temp, uniform_weight)
    if allow_none:
        # split the appended "none" bin back out of the distribution
        dist, none = temp[:-1], temp[-1]
    else:
        dist, none = temp, None
    # point prediction: low median of the submitted (non-None) indices
    possibilities = [i for i in indices if i is not None]
    if len(possibilities) == 0:
        possibilities = [0]
    point = flu.add_epiweeks(first_epiweek, int(median_low(possibilities)))
    return (dist, none, point)
def update(locations, first=None, last=None, force_update=False, load_email=True):
    """Update the `quidel` table with the latest Quidel measurements.

    Args:
        locations: location codes to update.
        first: first epiweek to check (default: 4 weeks before last data).
        last: last epiweek to check (default: the current week).
        force_update: process even if no new source data is available.
        load_email: passed through to quidel.QuidelData.
    """
    # download and prepare data first
    qd = quidel.QuidelData(DATAPATH, load_email)
    if not qd.need_update and not force_update:
        print('Data not updated, nothing needs change.')
        return
    qd_data = qd.load_csv()
    qd_measurements = qd.prepare_measurements(qd_data, start_weekday=4)
    qd_ts = quidel.measurement_to_ts(qd_measurements, 7, startweek=first, endweek=last)
    # connect to the database
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    cur = cnx.cursor()

    def get_num_rows():
        # total number of rows currently in the `quidel` table
        cur.execute('SELECT count(1) `num` FROM `quidel`')
        for (num, ) in cur:
            pass
        return num

    # check from 4 weeks preceeding the last week with data through this week
    cur.execute(
        'SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `quidel`')
    for (ew0, ew1) in cur:
        ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4)
    # explicit bounds override the computed defaults
    ew0 = ew0 if first is None else first
    ew1 = ew1 if last is None else last
    print('Checking epiweeks between %d and %d...' % (ew0, ew1))
    # keep track of how many rows were added
    rows_before = get_num_rows()
    # check Quidel for new and/or revised data
    sql = '''
    INSERT INTO
      `quidel` (`location`, `epiweek`, `value`)
    VALUES
      (%s, %s, %s)
    ON DUPLICATE KEY UPDATE
      `value` = %s
    '''
    total_rows = 0
    for location in locations:
        if location not in qd_ts:
            continue
        ews = sorted(qd_ts[location].keys())
        num_missing = 0
        for ew in ews:
            v = qd_ts[location][ew]
            sql_data = (location, ew, v, v)
            cur.execute(sql, sql_data)
            total_rows += 1
            if v == 0:
                num_missing += 1
        if num_missing > 0:
            print(' [%s] missing %d/%d value(s)' % (location, num_missing, len(ews)))
    # keep track of how many rows were added
    rows_after = get_num_rows()
    print('Inserted %d/%d row(s)' % (rows_after - rows_before, total_rows))
    # cleanup
    cur.close()
    cnx.commit()
    cnx.close()
        # NOTE(review): these two lines are the tail of a method whose `def`
        # is outside this chunk (by usage, apparently SAR3.predict)
        X = self._get_features(epiweek, valid=valid)
        return float(SAR3.dot(X, self.model)[0, 0])


if __name__ == '__main__':
    # args and usage
    parser = argparse.ArgumentParser()
    parser.add_argument('epiweek', type=int, help='most recently published epiweek (best 201030+)')
    parser.add_argument('region', type=str, help='region (nat, hhs, cen)')
    args = parser.parse_args()
    # options
    ew1, reg = args.epiweek, args.region
    # the week being predicted is the one after the given issue
    ew2 = EW.add_epiweeks(ew1, 1)
    # train and predict
    print('Most recent issue: %d' % ew1)
    prediction = SAR3(reg).predict(ew1, True)
    print('Predicted wILI in %s on %d: %.3f' % (reg, ew2, prediction))
    # compare against the actual value when it is already available
    res = Epidata.fluview(reg, ew2, auth=secrets.api.fluview)
    if res['result'] == 1:
        row = res['epidata'][0]
        issue = row['issue']
        wili = row['wili']
        err = prediction - wili
        print('Actual wILI as of %d: %.3f (err=%+.3f)' % (issue, wili, err))
    else:
        print('Actual wILI: unknown')
    # NOTE(review): this chunk begins mid-statement -- the opening of the
    # parser.add_argument('--first', ...) call is outside the visible chunk
                        help='first epiweek override')
    parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override')
    parser.add_argument('--test', '-t', default=False, action='store_true', help='dry run only')
    args = parser.parse_args()
    # epiweeks and timing
    first, last = None, None
    if args.first is not None:
        first = args.first
    if args.last is not None:
        last = args.last
    # default: end at the most recent issue, start 52 weeks earlier
    if last is None:
        last = get_most_recent_issue()
    if first is None:
        first = flu.add_epiweeks(last, -52)
    if last < first:
        raise Exception('epiweeks in the wrong order')
    flu.check_epiweek(first, last)
    print('Updating epiweeks from %d to %d.' % (first, last))
    # make it happen
    update(first, last, args.test)
def update_quid_db(qd_ts, update_field='value'):
    """Insert/update Quidel time series values in the `quidel` table.

    NOTE(review): this function references `cur`, `first`, `last`,
    `locations`, and `get_num_rows` without defining them -- presumably it is
    a nested function (or relies on module globals); confirm those names are
    in scope at call time.

    Args:
        qd_ts: dict of {location: {epiweek: value}} time series.
        update_field: column to upsert -- 'value' (default), 'num_rows', or
            'num_devices'.
    """
    # check from 4 weeks preceeding the last week with data through this week
    cur.execute(
        'SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `quidel`'
    )
    for (ew0, ew1) in cur:
        ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4)
    # explicit bounds override the computed defaults
    ew0 = ew0 if first is None else first
    ew1 = ew1 if last is None else last
    print('Checking epiweeks between %d and %d...' % (ew0, ew1))
    # keep track of how many rows were added
    rows_before = get_num_rows()
    # check Quidel for new and/or revised data
    # Default update field is 'value'.
    sql = '''
    INSERT INTO
      `quidel` (`location`, `epiweek`, `value`)
    VALUES
      (%s, %s, %s)
    ON DUPLICATE KEY UPDATE
      `value` = %s
    '''
    if update_field == 'num_rows':
        sql = '''
    INSERT INTO
      `quidel` (`location`, `epiweek`, `num_rows`)
    VALUES
      (%s, %s, %s)
    ON DUPLICATE KEY UPDATE
      `num_rows` = %s
    '''
    elif update_field == 'num_devices':
        sql = '''
    INSERT INTO
      `quidel` (`location`, `epiweek`, `num_devices`)
    VALUES
      (%s, %s, %s)
    ON DUPLICATE KEY UPDATE
      `num_devices` = %s
    '''
    total_rows = 0
    for location in locations:
        if location not in qd_ts:
            continue
        ews = sorted(qd_ts[location].keys())
        num_missing = 0
        for ew in ews:
            v = qd_ts[location][ew]
            sql_data = (location, ew, v, v)
            cur.execute(sql, sql_data)
            total_rows += 1
            if v == 0:
                num_missing += 1
        if num_missing > 0:
            print(' [%s] missing %d/%d value(s)' % (location, num_missing, len(ews)))
    # keep track of how many rows were added
    rows_after = get_num_rows()
    print('Inserted %d/%d row(s)' % (rows_after - rows_before, total_rows))
def update(locations, terms, first=None, last=None, countries=['US']):
    """Update the `ght` table with Google Health Trends data for each term,
    location, and country.

    NOTE(review): `countries=['US']` is a mutable default argument; it is not
    mutated here, but a None sentinel would be safer.

    Args:
        locations: locations to query, paired index-wise with `countries`
            (the shorter list's first element is reused).
        terms: search terms to query.
        first, last: optional epiweek bounds; defaults to the span from 4
            weeks before the last stored week through the current week.
        countries: country codes, paired with `locations`.
    """
    # connect to the database
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    cur = cnx.cursor()

    def get_num_rows():
        # total number of rows currently in the `ght` table
        cur.execute('SELECT count(1) `num` FROM `ght`')
        for (num, ) in cur:
            pass
        return num

    # check from 4 weeks preceeding the last week with data through this week
    cur.execute(
        'SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `ght`')
    for (ew0, ew1) in cur:
        ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4)
    # explicit bounds override the computed defaults
    ew0 = ew0 if first is None else first
    ew1 = ew1 if last is None else last
    print('Checking epiweeks between %d and %d...' % (ew0, ew1))
    # keep track of how many rows were added
    rows_before = get_num_rows()
    # check Google Trends for new and/or revised data
    sql = '''
    INSERT INTO
      `ght` (`query`, `location`, `epiweek`, `value`)
    VALUES
      (%s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
      `value` = %s
    '''
    total_rows = 0
    ght = GHT(API_KEY)
    for term in terms:
        print(' [%s] using term' % term)
        # walk locations and countries in parallel, reusing the first element
        # of the shorter list
        ll, cl = len(locations), len(countries)
        for i in range(max(ll, cl)):
            location = locations[i] if i < ll else locations[0]
            country = countries[i] if i < cl else countries[0]
            try:
                #term2 = ('"%s"' % term) if ' ' in term else term
                term2 = term
                # retry with exponential backoff, up to 5 attempts
                attempt = 0
                while True:
                    attempt += 1
                    try:
                        result = ght.get_data(ew0, ew1, location, term2, country=country)
                        break
                    except Exception as ex:
                        if attempt >= 5:
                            raise ex
                        else:
                            delay = 2**attempt
                            print(
                                ' [%s|%s] caught exception (will retry in %ds):'
                                % (term, location, delay), ex)
                            time.sleep(delay)
                values = [
                    p['value'] for p in result['data']['lines'][0]['points']
                ]
                ew = result['start_week']
                num_missing = 0
                for v in values:
                    # Default SQL location value for US country for backwards compatibility
                    # i.e. California's location is still stored as 'CA',
                    # and having location == 'US' is still stored as 'US'
                    sql_location = location if location != NO_LOCATION_STR else country
                    # Change SQL location for non-US countries
                    if country != 'US':
                        # Underscore added to distinguish countries from 2-letter US states
                        sql_location = country + "_"
                        if location != NO_LOCATION_STR:
                            sql_location = sql_location + location
                    sql_data = (term, sql_location, ew, v, v)
                    cur.execute(sql, sql_data)
                    total_rows += 1
                    if v == 0:
                        num_missing += 1
                        #print(' [%s|%s|%d] missing value' % (term, location, ew))
                    ew = flu.add_epiweeks(ew, 1)
                if num_missing > 0:
                    print(' [%s|%s] missing %d/%d value(s)' %
                          (term, location, num_missing, len(values)))
            except Exception as ex:
                # best-effort: log and move on to the next location
                print(
                    ' [%s|%s] caught exception (will NOT retry):' %
                    (term, location), ex)
    # keep track of how many rows were added
    rows_after = get_num_rows()
    print('Inserted %d/%d row(s)' % (rows_after - rows_before, total_rows))
    # cleanup
    cur.close()
    cnx.commit()
    cnx.close()
def update(locations, terms, first=None, last=None):
    """Update the `ght` table with Google Health Trends data for each term
    and location.

    Args:
        locations: locations to query.
        terms: search terms to query.
        first, last: optional epiweek bounds; defaults to the span from 4
            weeks before the last stored week through the current week.
    """
    # connect to the database
    u, p = secrets.db.epi
    cnx = mysql.connector.connect(user=u, password=p, database='epidata')
    cur = cnx.cursor()

    def get_num_rows():
        # total number of rows currently in the `ght` table
        cur.execute('SELECT count(1) `num` FROM `ght`')
        for (num,) in cur:
            pass
        return num

    # check from 4 weeks preceeding the last week with data through this week
    cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `ght`')
    for (ew0, ew1) in cur:
        ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4)
    # explicit bounds override the computed defaults
    ew0 = ew0 if first is None else first
    ew1 = ew1 if last is None else last
    print('Checking epiweeks between %d and %d...' % (ew0, ew1))
    # keep track of how many rows were added
    rows_before = get_num_rows()
    # check Google Trends for new and/or revised data
    sql = '''
    INSERT INTO
      `ght` (`query`, `location`, `epiweek`, `value`)
    VALUES
      (%s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
      `value` = %s
    '''
    total_rows = 0
    ght = GHT(API_KEY)
    for term in terms:
        print(' [%s] using term' % term)
        for location in locations:
            try:
                #term2 = ('"%s"' % term) if ' ' in term else term
                term2 = term
                # retry with exponential backoff, up to 5 attempts
                attempt = 0
                while True:
                    attempt += 1
                    try:
                        result = ght.get_data(ew0, ew1, location, term2)
                        break
                    except Exception as ex:
                        if attempt >= 5:
                            raise ex
                        else:
                            delay = 2 ** attempt
                            print(' [%s|%s] caught exception (will retry in %ds):' % (term, location, delay), ex)
                            time.sleep(delay)
                values = [p['value'] for p in result['data']['lines'][0]['points']]
                ew = result['start_week']
                num_missing = 0
                for v in values:
                    sql_data = (term, location, ew, v, v)
                    cur.execute(sql, sql_data)
                    total_rows += 1
                    if v == 0:
                        num_missing += 1
                        #print(' [%s|%s|%d] missing value' % (term, location, ew))
                    ew = flu.add_epiweeks(ew, 1)
                if num_missing > 0:
                    print(' [%s|%s] missing %d/%d value(s)' % (term, location, num_missing, len(values)))
            except Exception as ex:
                # best-effort: log and move on to the next location
                print(' [%s|%s] caught exception (will NOT retry):' % (term, location), ex)
    # keep track of how many rows were added
    rows_after = get_num_rows()
    print('Inserted %d/%d row(s)' % (rows_after - rows_before, total_rows))
    # cleanup
    cur.close()
    cnx.commit()
    cnx.close()
def get_most_recent_issue():
    """Return the most recent FluView issue, searching the last 10 weeks."""
    today = EpiDate.today().get_ew()
    window = Epidata.range(flu.add_epiweeks(today, -9), today)
    rows = Epidata.check(Epidata.fluview('nat', window))
    return max(row['issue'] for row in rows)