def generate_date_ranges(start, end):
    """Partition [start, end] into non-overlapping 30-day Epidata ranges.

    The windows are (start, start+30), (start+31, start+61), ... and the
    final window may be shorter, ending exactly at `end`. Both endpoints
    are included and no day appears in two windows.

    Parameters
    ----------
    start: datetime.date
        First day of the overall range.
    end: datetime.date
        Last day of the overall range.

    Returns
    -------
    Ordered list of dictionaries produced by Epidata.range, one per window.
    """
    ranges = []
    window_start = start
    window_end = window_start + timedelta(30)
    while window_end < end:
        ranges.append(
            Epidata.range(_date_to_int(window_start), _date_to_int(window_end)))
        # Next window begins the day after the previous one ended.
        window_start = window_end + timedelta(1)
        window_end = window_start + timedelta(30)
    # Final (possibly short) window closes out the range at `end`.
    ranges.append(Epidata.range(_date_to_int(window_start), _date_to_int(end)))
    return ranges
def grabDataFromEpicast(self):
    """Fetch fluview data from epiweek 201040 through self.todaysEW and
    store the raw response, its message, and its epidata on the instance.

    The location queried is self.state, self.region, or their
    concatenation, depending on which attributes are non-empty.
    """
    weeks = [Epidata.range(201040, self.todaysEW)]
    if self.region == '':
        location = self.state
    elif self.state == '':
        location = self.region
    else:
        # Both set: query the combined region+state identifier.
        location = self.region + self.state
    self.fludata = Epidata.fluview(location, weeks)
    self.fludata_message = self.fludata['message']
    self.fludata_data = self.fludata['epidata']
def get_season(season, location):
    """Return the list of weekly wILI values for one season at `location`.

    The season is taken to start at epiweek 30 of `season` and to span 52
    epiweeks (the start week plus 51 more).
    """
    first_week = season * 100 + 30
    week_range = Epidata.range(first_week, flu.add_epiweeks(first_week, 51))
    rows = AF_Utils._get(Epidata.ilinet(location, week_range))
    return [r['wili'] for r in rows]
def get_wiki(ew1, ew2):
    """Return weekly wiki pageview rates between epiweeks ew1 and ew2.

    Each output row is one epiweek, containing (for every tracked article,
    sorted by name) 1e6 * total_count / total_views summed over HOURS.
    Raises if any (epiweek, article) pair is missing one of the hours.
    """
    week_range = Epidata.range(ew1, ew2)
    raw = api_fetch(Epidata.wiki(ARTICLES, epiweeks=week_range, hours=HOURS))
    # Index rows by (epiweek, article) for fast grouping across hours.
    indexed = {}
    for row in raw:
        per_article = indexed.setdefault(row['epiweek'], {})
        cell = per_article.setdefault(row['article'], {'c': [], 't': []})
        cell['c'].append(row['count'])
        cell['t'].append(row['total'])
    # Combine hours into one value per epiweek and article.
    data = []
    for epiweek in sorted(indexed):
        week_row = []
        for article in sorted(ARTICLES):
            counts = indexed[epiweek][article]['c']
            totals = indexed[epiweek][article]['t']
            if len(counts) != len(HOURS) or len(totals) != len(HOURS):
                raise Exception('wiki is missing hours')
            week_row.append(1e6 * sum(counts) / sum(totals))
        data.append(week_row)
    return data
def get_unstable_wILI(region, ew1, ew2):
    """Return wILI values for `region` between ew1 and ew2, as issued at ew2.

    Raises if any epiweek in the inclusive range is missing from the response.
    """
    span = Epidata.range(ew1, ew2)
    rows = AF_Utils._get(Epidata.fluview(region, span, issues=ew2))
    values = [r['wili'] for r in rows]
    # Every epiweek in [ew1, ew2] must be present.
    expected = flu.delta_epiweeks(ew1, ew2) + 1
    if len(values) != expected:
        raise Exception('missing data')
    return values
def get_historical_sensor_data(sensor: SensorConfig,
                               geo_value: str,
                               geo_type: str,
                               start_date: date,
                               end_date: date) -> Tuple[LocationSeries, list]:
    """
    Query Epidata API for historical sensorization data.

    Will only return values if they are not null. If any days are null or are
    not available, they will be listed as missing.

    Parameters
    ----------
    sensor
        SensorConfig specifying which sensor to retrieve.
    geo_value
        Geo value to retrieve.
    geo_type
        Geo type to retrieve.
    start_date
        First day to retrieve (inclusive).
    end_date
        Last day to retrieve (inclusive).

    Returns
    -------
        Tuple of (LocationSeries containing non-na data, list of dates without valid data).
        If no data was found, an empty LocationSeries is returned.
    """
    # One API call covering the whole [start_date, end_date] window.
    response = Epidata.covidcast_nowcast(
        data_source=sensor.source,
        signals=sensor.signal,
        time_type="day",
        geo_type=geo_type,
        time_values=Epidata.range(start_date.strftime("%Y%m%d"),
                                  end_date.strftime("%Y%m%d")),
        geo_value=geo_value,
        sensor_names=sensor.name,
        lag=sensor.lag)
    # Every day in the requested window; used to compute which are missing.
    all_dates = [i.date() for i in date_range(start_date, end_date)]
    if response["result"] == 1:
        # Keep only non-NaN values, keyed by calendar date.
        # NOTE(review): isnan() assumes numeric values; a None value here
        # would raise TypeError — confirm the API never returns None when
        # result == 1.
        output = LocationSeries(
            geo_value=geo_value,
            geo_type=geo_type,
            data={datetime.strptime(str(i["time_value"]), "%Y%m%d").date(): i["value"]
                  for i in response.get("epidata", [])
                  if not isnan(i["value"])})
        missing_dates = [i for i in all_dates if i not in output.dates]
        return output, missing_dates
    if response["result"] == -2:  # no results
        print("No historical results found")
        output = LocationSeries(geo_value=geo_value, geo_type=geo_type)
        return output, all_dates
    # Any other result code is unexpected — surface the API's message.
    raise Exception(f"Bad result from Epidata: {response['message']}")
def load_us(states, latest=False):
    """Download 7-day-average COVID case and death counts for each state from
    the Delphi Epidata "covidcast" endpoint (jhu-csse source) and write one
    Cases_USA_<state>.csv file per state under config.base_data_dir.

    Parameters
    ----------
    states: iterable of str
        State abbreviations to process.
    latest: bool
        Unused here; kept for interface compatibility with callers.
    """
    us_covid19_cases_path = os.path.join(config.base_data_dir, config.us_covid19_cases)
    import sys
    sys.path.append('src/')
    from delphi_epidata import Epidata
    start_date = 20200401
    from datetime import datetime
    stop_date = int(datetime.today().strftime('%Y%m%d'))
    for target_state in states:
        print(f'Processing data for state: {target_state} ' + ' *' * 10)
        print('Start date = ', start_date, ' End date = ', stop_date)
        res_incidence = Epidata.covidcast('jhu-csse', 'confirmed_7dav_incidence_num', 'day', 'state',
                                          [start_date, Epidata.range(start_date, stop_date)], target_state)
        res_death = Epidata.covidcast('jhu-csse', 'deaths_7dav_incidence_num', 'day', 'state',
                                      [start_date, Epidata.range(start_date, stop_date)], target_state)
        df_state = pd.DataFrame(columns=['Confirmed', 'Deceased', 'Recovered'])
        # BUG FIX: the original guard was `len(res_incidence) > 0 and
        # len(res_death) > 0`, which is always true because the client returns
        # a dict that always carries 'result'/'message' keys — so API failures
        # fell through and crashed with a KeyError on 'epidata'. Check the API
        # result code instead (1 == success).
        if res_incidence.get('result') == 1 and res_death.get('result') == 1:
            df_jhu_7day = pd.DataFrame(res_incidence['epidata'])
            df_jhu_7day_deaths = pd.DataFrame(res_death['epidata'])
            df_state['Date'] = pd.to_datetime(df_jhu_7day['time_value'], format='%Y%m%d')
            df_state['Confirmed'] = df_jhu_7day['value']
            df_state['Deceased'] = df_jhu_7day_deaths['value']
            df_state['Recovered'].fillna(value=0, inplace=True)
            # Ensures sorting with respect to date.
            df_state.index = pd.to_datetime(df_state.Date)
            df_state[['Total_Confirmed', 'Total_Deceased', 'Total_Recovered']] \
                = df_state[['Confirmed', 'Deceased', 'Recovered']].cumsum(axis=0, skipna=True)
            df_state.to_csv(os.path.join(config.base_data_dir, f'Cases_USA_{target_state}.csv'),
                            index=False)
        else:
            print(' *** Error: Can not import data from Delphi database. Check src/state_data_loader.py')
            exit()
def get_fluview_data(states, start, end):
    """ return a dictionary of dataframe with the different epiweeks """
    frames = {}
    week_range = [Epidata.range(start, end)]
    for state in states:
        print("State {}".format(state))
        # One fluview request per state over the full epiweek range.
        res = Epidata.fluview(regions=state, epiweeks=week_range)
        if res['result'] == 1:
            print(res['result'], res['message'], len(res['epidata']))
            frames[state] = pd.DataFrame(res['epidata'])
        else:
            print("(-2, u'no success')")
    return frames
def pull_data() -> pd.DataFrame:
    """
    Pull HHS data from Epidata API for all states and dates and convert to a DataFrame.

    Returns
    -------
    DataFrame of HHS data.
    """
    end_day = int(date.today().strftime("%Y%m%d"))
    # First available date in DB.
    start_day = int(date(2020, 1, 1).strftime("%Y%m%d"))
    states = GeoMapper().get_geo_values("state_id")
    rows = pull_data_iteratively(states, Epidata.range(start_day, end_day))
    # Normalize the API's NaN sentinels, then derive a proper timestamp column.
    frame = pd.DataFrame(rows).replace(NAN_VALUES, np.nan)
    frame["timestamp"] = pd.to_datetime(frame["collection_week"], format="%Y%m%d")
    return frame
def get_influenza_counts_df():
    """Load influenza counts from the CMU Delphi API, return a pandas dataframe"""
    # Current date as "YYYY-mm-dd", converted to an approximate epiweek.
    now = datetime.today()
    current_epiweek = DataLoader.get_approx_epiweek_from_date(now.strftime("%Y-%m-%d"))
    # National fluview data for every epiweek from the start of 2020.
    response = Epidata.fluview(["nat"], [Epidata.range(202001, current_epiweek)])
    frame = pd.DataFrame.from_records(response["epidata"]).sort_values(by=["epiweek"])
    frame = frame[["epiweek", "lag", "num_ili", "num_patients",
                   "num_providers", "wili", "ili"]]
    # Approximate calendar date per epiweek, for graphing.
    frame["date"] = frame["epiweek"].apply(DataLoader.get_approx_date_from_epiweek)
    return frame
def get_ili(location, issue, ew1, ew2):
    """Return weekly wILI for `location` between ew1 and ew2.

    When `issue` is given, prefer data as reported at that issue (unstable),
    and fall back to stable data for any weeks the unstable call did not
    cover. Output is a list of single-element lists, sorted by epiweek.
    """
    week_range = Epidata.range(ew1, ew2)
    num_weeks = flu.delta_epiweeks(ew1, ew2) + 1
    wili_by_week = {}
    res = None
    # Try unstable data first, but fall back gracefully if unavailable.
    if issue is not None:
        res = Epidata.fluview(location, week_range, issues=issue)
        if res['result'] == 1:
            for row in res['epidata']:
                wili_by_week[row['epiweek']] = row['wili']
    # A second (stable) call is needed if unstable was skipped, failed, or
    # came back incomplete.
    if issue is None or res['result'] != 1 or len(res['epidata']) < num_weeks:
        for row in api_fetch(Epidata.fluview(location, week_range)):
            # Unstable values, where present, take precedence.
            wili_by_week.setdefault(row['epiweek'], row['wili'])
    return [[wili_by_week[ew]] for ew in sorted(wili_by_week)]
def get_gft(location, ew1, ew2):
    """Return weekly Google Flu Trends values for `location`, scaled by 1e-3.

    Output is a list of single-element lists, one per epiweek in [ew1, ew2].
    """
    rows = api_fetch(Epidata.gft(location, Epidata.range(ew1, ew2)))
    return [[1e-3 * row['num']] for row in rows]
    # Tail of a data-source -> signal-list mapping; the opening of the dict
    # literal lies above this chunk, so the leading comma continues it.
        ,'quidel' :['smoothed_pct_negative','smoothed_tests_per_device']}
    return d


if __name__ == "__main__":
    # Snapshot "today" once, as an epiweek and as YYYYMMDD.
    todaysEW = fromToday2EpiWeek()
    # NOTE(review): this rebinds the name over the helper function itself.
    todayYMD = todayYMD()
    # Columns kept from each API row.
    variables = ['geo_value','time_value','value','stderr','sample_size']
    # Same rebinding pattern: the mapping replaces its builder function.
    fromDataSource2Signal = fromDataSource2Signal()
    fips2name = listPACounties()
    # One dataset per (source, signal); one API call per PA county.
    for datasource in ['fb-survey','ght','doctor-visits','google-survey','quidel']:
        for signal in fromDataSource2Signal[datasource]:
            dataSet = DS(variables,datasource,signal)
            for county in fips2name:
                # Progress indicator on a single rewritten console line.
                sys.stdout.write('\r{:s}--{:s}--{:06d}\r'.format(datasource,signal,county))
                sys.stdout.flush()
                dataFromAPI = Epidata.covidcast(datasource,signal,'day','county',Epidata.range(20200101,todayYMD),county)
                if dataFromAPI["message"] == "no results":
                    continue
                if dataFromAPI['message'] == "success":
                    for data in dataFromAPI['epidata']:
                        dataSet.appendData(data)
            # Export only if at least one county returned data.
            if dataSet.has_data():
                dataSet.convert2pandasDF().exportDF()
def get_twitter(location, ew1, ew2):
    """Return weekly Twitter ILI percentages for `location` in [ew1, ew2].

    Output is a list of single-element lists, one per epiweek.
    """
    week_range = Epidata.range(ew1, ew2)
    rows = api_fetch(
        Epidata.twitter(secrets.api.twitter, location, epiweeks=week_range))
    return [[r['percent']] for r in rows]
def get_ght(ew1, ew2):
    """Return weekly US Google Health Trends values for topic '/m/0cycc'
    between epiweeks ew1 and ew2, as single-element lists."""
    rows = api_fetch(
        Epidata.ght(secrets.api.ght, 'US', Epidata.range(ew1, ew2), '/m/0cycc'))
    return [[r['value']] for r in rows]
# Snapshot "today" once, as an epiweek and as a YYYYMMDD integer; note that
# both assignments rebind helper-function names to their return values.
todaysEW = fromToday2EpiWeek()
todayYMD = todayYMD()
# Columns retained from each API row.
variables = ['geo_value', 'time_value', 'value', 'stderr', 'sample_size']
fromDataSource2Signal = fromDataSource2Signal()
fips2name = listPACounties()

# One dataset per (source, signal) pair; one covidcast call per PA county.
sources = ['fb-survey', 'ght', 'doctor-visits', 'google-survey', 'quidel']
for datasource in sources:
    for signal in fromDataSource2Signal[datasource]:
        dataSet = DS(variables, datasource, signal)
        for county in fips2name:
            # Single-line progress indicator on the console.
            progress = '\r{:s}--{:s}--{:06d}\r'.format(datasource, signal, county)
            sys.stdout.write(progress)
            sys.stdout.flush()
            dataFromAPI = Epidata.covidcast(datasource, signal, 'day', 'county',
                                            Epidata.range(20200101, todayYMD),
                                            county)
            if dataFromAPI["message"] == "no results":
                continue
            if dataFromAPI['message'] == "success":
                for data in dataFromAPI['epidata']:
                    dataSet.appendData(data)
        # Export only when at least one county contributed rows.
        if dataSet.has_data():
            dataSet.convert2pandasDF().exportDF()
cur = cnx.cursor(buffered=True) # Get ground truth history = {} regions = [ "nat", "hhs1", "hhs2", "hhs3", "hhs4", "hhs5", "hhs6", "hhs7", "hhs8", "hhs9", "hhs10", "ga", "pa", "dc", "tx", "or" ] # for 2017-18 season, 201744 is the first ground truth data we get after the competition starts (i.e., users forecasted for it in 201743) ############################################################# season_start, season_end = 201744, 201820 for r in range(1, len(regions) + 1): history[r] = {} rows = Epidata.check( Epidata.fluview(regions[r - 1], Epidata.range(season_start, season_end))) truth = [(row['epiweek'], row['wili']) for row in rows] availableWeeks = [row[0] for row in truth] for row in truth: (epiweek, wili) = row history[r][epiweek] = wili print(regions[r - 1], epiweek, wili) epiweek = availableWeeks[-1] print("epiweek", epiweek) if (epiweek == 201801): forecast_made = 201752 else: forecast_made = epiweek - 1 # debug print print("availableWeeks", availableWeeks) expected_weeks = epi_utils.delta_epiweeks(season_start, epiweek) + 1
Collect actual wili data using the delphi API
"""
from delphi_epidata import Epidata
from datetime import datetime
import pandas as pd
import pymmwr

# CDC wILI baselines used alongside the collected data.
BASELINE_URL = "https://raw.githubusercontent.com/cdcepi/FluSight-forecasts/master/wILI_Baseline.csv"

# Current MMWR epiweek; bounds the end of the collection range.
current_epiweek = pymmwr.date_to_mmwr_week()

# Range of epiweeks to gather data for
epiweek_start = 199710
# Concatenate year + zero-padded week into the YYYYWW integer form.
epiweek_end = int(str(current_epiweek["year"]) +
                  str(current_epiweek["week"]).zfill(2))
epiweek_range = Epidata.range(epiweek_start, epiweek_end)

# National region plus HHS regions 1-10.
regions = ["nat", *["hhs" + str(i) for i in range(1, 11)]]

# NOTE Lag value
# A lag of 0 means that the data for each week collected will be
# as observed at that point in time.
# Pass None as lag will let us collect the most recent data
# available

# Accumulator for the long-format wili table built below.
df = {
    "epiweek": [],
    "region": [],
    "wili": []
}