def prescribe(start_date_str: str,
              end_date_str: str,
              path_to_prior_ips_file: str,
              path_to_cost_file: str,
              output_file_path) -> None:
    """Produce intervention-plan (IP) prescriptions for the requested date range.

    Loads prior interventions and per-IP costs, fills any gap between the
    last historical case date and start_date using the predictor, then runs
    two prescribe_loop passes that write results to output_file_path.

    Parameters
    ----------
    start_date_str, end_date_str : str
        Inclusive prescription window, formatted '%Y-%m-%d'.
    path_to_prior_ips_file : str
        CSV of historical intervention plans per geo.
    path_to_cost_file : str
        CSV of per-geo, per-IP stringency costs used to condition prescriptions.
    output_file_path :
        Destination passed through to prescribe_loop, which writes the results.
    """
    info(f"prescription started @ {datetime.now()}")
    info(f"prescription from {start_date_str} to {end_date_str}")
    info(f"prescription with past IPS: {path_to_prior_ips_file}")
    info(f"prescription with past costs: {path_to_cost_file}")
    info(f"prescription with output to: {output_file_path}")

    start_date = pd.to_datetime(start_date_str, format='%Y-%m-%d')
    end_date = pd.to_datetime(end_date_str, format='%Y-%m-%d')
    # Number of days to prescribe for (both endpoints inclusive).
    n_days = (end_date - start_date).days + 1

    # Load the past IPs data; the geos found here drive the whole run.
    past_ips_df = load_ips_file(path_to_prior_ips_file)
    geos = past_ips_df['GeoID'].unique()

    # Load historical data with basic preprocessing
    df = prepare_historical_df()

    # Restrict it to dates up to (and including) the start_date
    df = df[df['Date'] <= start_date]

    # Create past case data arrays for all geos (clipped at 0 to drop
    # negative corrections that can appear in raw case counts).
    past_cases = {}
    for geo in geos:
        geo_df = df[df['GeoID'] == geo]
        past_cases[geo] = np.maximum(0, np.array(geo_df[CASES_COL]))

    # Create past ip data arrays for all geos
    past_ips = {}
    for geo in geos:
        geo_df = past_ips_df[past_ips_df['GeoID'] == geo]
        past_ips[geo] = np.array(geo_df[IP_COLS])

    # Fill in any missing case data before start_date using the predictor
    # given past_ips_df.
    # Note that the following assumes that the df returned by
    # prepare_historical_df() has the same final date for all regions. This
    # has been true so far, but relies on it being true for the Oxford data
    # csv loaded by prepare_historical_df().
    last_historical_data_date_str = df['Date'].max()
    last_historical_data_date = pd.to_datetime(last_historical_data_date_str,
                                               format='%Y-%m-%d')
    if last_historical_data_date + pd.Timedelta(days=1) < start_date:
        info("Filling in missing data...")
        missing_data_start_date = last_historical_data_date + pd.Timedelta(days=1)
        missing_data_start_date_str = datetime.strftime(missing_data_start_date,
                                                        format='%Y-%m-%d')
        missing_data_end_date = start_date - pd.Timedelta(days=1)
        missing_data_end_date_str = datetime.strftime(missing_data_end_date,
                                                      format='%Y-%m-%d')
        pred_df = get_predictions(missing_data_start_date_str,
                                  missing_data_end_date_str,
                                  past_ips_df)
        pred_df = add_geo_id(pred_df)
        for geo in geos:
            # Sort by date so predicted cases are appended chronologically.
            geo_df = pred_df[pred_df['GeoID'] == geo].sort_values(by='Date')
            pred_cases_arr = np.array(geo_df[PRED_CASES_COL])
            past_cases[geo] = np.append(past_cases[geo], pred_cases_arr)
    else:
        info("No missing data.")

    # Load IP costs to condition prescriptions
    cost_df = pd.read_csv(path_to_cost_file)
    cost_df['RegionName'] = cost_df['RegionName'].fillna("")
    cost_df = add_geo_id(cost_df)
    geo_costs = {}
    for geo in geos:
        costs = cost_df[cost_df['GeoID'] == geo]
        # One cost row per geo is expected; take the first (only) row.
        cost_arr = np.array(costs[IP_COLS])[0]
        geo_costs[geo] = cost_arr

    # perform iterations while we have a time-budget
    info(f"initializing prescriptions generator")
    # NOTE(review): `NPI_LIMITS * n_days` is list repetition if NPI_LIMITS
    # is a Python list (per-day limits), but elementwise scaling if it is a
    # numpy array — confirm which is intended against PrescriptionGenerator.
    limits = NPI_LIMITS * n_days
    prescription_generator = PrescriptionGenerator(n_days, limits)

    info(f"prescription run 1 started @ {datetime.now()}")
    prescribe_loop(geos, geo_costs, past_cases, past_ips, n_days,
                   output_file_path, start_date,
                   PRESCRIPTION_CANDIDATES_PER_INDEX_RUN_1,
                   prescription_generator, limits)
    info(f"prescription run 2 started @ {datetime.now()}")
    prescribe_loop(geos, geo_costs, past_cases, past_ips, n_days,
                   output_file_path, start_date,
                   PRESCRIPTION_CANDIDATES_PER_INDEX_RUN_2,
                   prescription_generator, limits)
    info(f"prescription run 2 ended @ {datetime.now()}")
def prescribe(start_date_str: str,
              end_date_str: str,
              path_to_prior_ips_file: str,
              path_to_cost_file: str,
              output_file_path) -> None:
    """Generate NPI prescriptions with NEAT prescriptors and save them as a CSV.

    Restores up to NB_PRESCRIPTIONS genomes from the PRESCRIPTORS_FILE
    checkpoint and, for each one, rolls out prescriptions over
    [start_date_str, end_date_str] in ACTION_DURATION-day chunks, feeding
    the predictor's case estimates back into the prescriptor. The combined
    result (one PrescriptionIndex per genome) is written to output_file_path.

    Parameters
    ----------
    start_date_str, end_date_str : str
        Inclusive prescription window, formatted '%Y-%m-%d'.
    path_to_prior_ips_file : str
        CSV of historical intervention plans per geo.
    path_to_cost_file : str
        CSV of per-geo, per-IP stringency costs used to condition prescriptions.
    output_file_path :
        Path of the CSV file to write.
    """
    start_date = pd.to_datetime(start_date_str, format='%Y-%m-%d')
    end_date = pd.to_datetime(end_date_str, format='%Y-%m-%d')

    # Load the past IPs data
    print("Loading past IPs data...")
    past_ips_df = load_ips_file(path_to_prior_ips_file)
    geos = past_ips_df['GeoID'].unique()

    # Load historical data with basic preprocessing
    print("Loading historical data...")
    df = prepare_historical_df()

    # Restrict it to dates up to (and including) the start_date
    df = df[df['Date'] <= start_date]

    # Create past case data arrays for all geos (clipped at 0 to drop
    # negative corrections that can appear in raw case counts).
    past_cases = {}
    for geo in geos:
        geo_df = df[df['GeoID'] == geo]
        past_cases[geo] = np.maximum(0, np.array(geo_df[CASES_COL]))

    # Create past ip data arrays for all geos
    past_ips = {}
    for geo in geos:
        geo_df = past_ips_df[past_ips_df['GeoID'] == geo]
        past_ips[geo] = np.array(geo_df[IP_COLS])

    # Fill in any missing case data before start_date using predictor given
    # past_ips_df.
    # Note that the following assumes that the df returned by
    # prepare_historical_df() has the same final date for all regions. This
    # has been true so far, but relies on it being true for the Oxford data
    # csv loaded by prepare_historical_df().
    last_historical_data_date_str = df['Date'].max()
    last_historical_data_date = pd.to_datetime(last_historical_data_date_str,
                                               format='%Y-%m-%d')
    if last_historical_data_date + pd.Timedelta(days=1) < start_date:
        print("Filling in missing data...")
        missing_data_start_date = last_historical_data_date + pd.Timedelta(days=1)
        missing_data_start_date_str = datetime.strftime(missing_data_start_date,
                                                        format='%Y-%m-%d')
        missing_data_end_date = start_date - pd.Timedelta(days=1)
        missing_data_end_date_str = datetime.strftime(missing_data_end_date,
                                                      format='%Y-%m-%d')
        pred_df = get_predictions(missing_data_start_date_str,
                                  missing_data_end_date_str,
                                  past_ips_df)
        pred_df = add_geo_id(pred_df)
        for geo in geos:
            # Sort by date so predicted cases are appended chronologically.
            geo_df = pred_df[pred_df['GeoID'] == geo].sort_values(by='Date')
            pred_cases_arr = np.array(geo_df[PRED_CASES_COL])
            past_cases[geo] = np.append(past_cases[geo], pred_cases_arr)
    else:
        print("No missing data.")

    # Gather values for scaling network output (net outputs are multiplied
    # by these maxima and rounded to integer IP levels below).
    ip_max_values_arr = np.array([IP_MAX_VALUES[ip] for ip in IP_COLS])

    # Load prescriptors
    checkpoint = neat.Checkpointer.restore_checkpoint(PRESCRIPTORS_FILE)
    prescriptors = list(checkpoint.population.values())[:NB_PRESCRIPTIONS]
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         NEAT_CONFIG_FILE)

    # Load IP costs to condition prescriptions
    cost_df = pd.read_csv(path_to_cost_file)
    cost_df['RegionName'] = cost_df['RegionName'].fillna("")
    cost_df = add_geo_id(cost_df)
    geo_costs = {}
    for geo in geos:
        costs = cost_df[cost_df['GeoID'] == geo]
        # One cost row per geo is expected; take the first (only) row.
        cost_arr = np.array(costs[IP_COLS])[0]
        geo_costs[geo] = cost_arr

    # Generate prescriptions
    prescription_dfs = []
    for prescription_idx, prescriptor in enumerate(prescriptors):
        print("Generating prescription", prescription_idx, "...")

        # Create net from genome
        net = neat.nn.FeedForwardNetwork.create(prescriptor, config)

        # Set up dictionary for keeping track of prescription
        df_dict = {'CountryName': [], 'RegionName': [], 'Date': []}
        for ip_col in sorted(IP_MAX_VALUES.keys()):
            df_dict[ip_col] = []

        # Set initial data (deep-copied so each prescriptor's rollout starts
        # from the same unmodified history).
        eval_past_cases = deepcopy(past_cases)
        eval_past_ips = deepcopy(past_ips)

        # Generate prescriptions iteratively, feeding resulting
        # predictions from the predictor back into the prescriptor.
        action_start_date = start_date
        while action_start_date <= end_date:

            # Get prescription for all regions
            for geo in geos:

                # Prepare input data. Here we use log to place cases
                # on a reasonable scale; many other approaches are possible.
                X_cases = np.log(eval_past_cases[geo][-NB_LOOKBACK_DAYS:] + 1)
                X_ips = eval_past_ips[geo][-NB_LOOKBACK_DAYS:]
                X_costs = geo_costs[geo]
                X = np.concatenate([X_cases.flatten(),
                                    X_ips.flatten(),
                                    X_costs])

                # Get prescription
                prescribed_ips = net.activate(X)

                # Map prescription to integer outputs
                prescribed_ips = (prescribed_ips * ip_max_values_arr).round()

                # Add it to prescription dictionary for the full ACTION_DURATION
                country_name, region_name = geo.split('__')
                if region_name == 'nan':
                    region_name = np.nan
                for date in pd.date_range(action_start_date,
                                          periods=ACTION_DURATION):
                    # The final chunk may run past end_date; stop there.
                    if date > end_date:
                        break
                    date_str = date.strftime("%Y-%m-%d")
                    df_dict['CountryName'].append(country_name)
                    df_dict['RegionName'].append(region_name)
                    df_dict['Date'].append(date_str)
                    for ip_col, prescribed_ip in zip(IP_COLS, prescribed_ips):
                        df_dict[ip_col].append(prescribed_ip)

            # Create dataframe from prescriptions
            pres_df = pd.DataFrame(df_dict)

            # Make prediction given prescription for all countries
            # (date_str here is the last prescribed date from the loop above).
            pred_df = get_predictions(start_date_str, date_str, pres_df)

            # Update past data with new days of prescriptions and predictions
            pres_df = add_geo_id(pres_df)
            pred_df = add_geo_id(pred_df)
            for date in pd.date_range(action_start_date,
                                      periods=ACTION_DURATION):
                if date > end_date:
                    break
                date_str = date.strftime("%Y-%m-%d")
                new_pres_df = pres_df[pres_df['Date'] == date_str]
                new_pred_df = pred_df[pred_df['Date'] == date_str]
                for geo in geos:
                    geo_pres = new_pres_df[new_pres_df['GeoID'] == geo]
                    geo_pred = new_pred_df[new_pred_df['GeoID'] == geo]

                    # Append array of prescriptions
                    pres_arr = np.array([geo_pres[ip_col].values[0]
                                         for ip_col in IP_COLS]).reshape(1, -1)
                    eval_past_ips[geo] = np.concatenate([eval_past_ips[geo],
                                                         pres_arr])

                    # It is possible that the predictor does not return values
                    # for some regions. To make sure we generate full
                    # prescriptions, this script continues anyway. This should
                    # not happen, but is included here for robustness.
                    if len(geo_pred) != 0:
                        eval_past_cases[geo] = np.append(
                            eval_past_cases[geo],
                            geo_pred[PRED_CASES_COL].values[0])

            # Move on to next action date
            action_start_date += pd.DateOffset(days=ACTION_DURATION)

        # Add prescription df to list of all prescriptions for this submission
        pres_df['PrescriptionIndex'] = prescription_idx
        prescription_dfs.append(pres_df)

    # Combine dfs for all prescriptions into a single df for the submission
    prescription_df = pd.concat(prescription_dfs)
    prescription_df = prescription_df.drop(columns='GeoID')

    # Create the output directory if necessary.
    output_dir = os.path.dirname(output_file_path)
    if output_dir != '':
        os.makedirs(output_dir, exist_ok=True)

    # Save to a csv file
    prescription_df.to_csv(output_file_path, index=False)
    print('Prescriptions saved to', output_file_path)

    return
def eval_genomes(genomes, config):
    """NEAT fitness function: evaluate each genome as an NPI prescriptor.

    For every genome, rolls out day-by-day prescriptions over
    [eval_start_date, eval_end_date] for all eval_geos, feeding the
    predictor's case estimates back into the prescriptor, then sets
    genome.fitness = -(a * new_cases**2 + b * stringency**2).

    NOTE(review): relies on module-level state (eval_geos, past_cases,
    past_ips, eval_start_date, eval_end_date, EVAL_START_DATE,
    ip_max_values_arr, a, b) — confirm these are initialized before the
    NEAT run starts.
    """
    # Every generation sample a different set of costs per geo,
    # so that over time solutions become robust to different costs.
    cost_df = generate_costs(distribution='uniform')
    cost_df = add_geo_id(cost_df)
    geo_costs = {}
    for geo in eval_geos:
        costs = cost_df[cost_df['GeoID'] == geo]
        # One cost row per geo is expected; take the first (only) row.
        cost_arr = np.array(costs[IP_COLS])[0]
        geo_costs[geo] = cost_arr

    # Evaluate each individual
    for genome_id, genome in genomes:

        # Create net from genome
        net = neat.nn.FeedForwardNetwork.create(genome, config)

        # Set up dictionary to keep track of prescription
        df_dict = {'CountryName': [], 'RegionName': [], 'Date': []}
        for ip_col in IP_COLS:
            df_dict[ip_col] = []

        # Set initial data (deep-copied so each genome's rollout starts from
        # the same unmodified history).
        eval_past_cases = deepcopy(past_cases)
        eval_past_ips = deepcopy(past_ips)

        # Compute prescribed stringency incrementally
        stringency = 0.

        # Make prescriptions one day at a time, feeding resulting
        # predictions from the predictor back into the prescriptor.
        for date in pd.date_range(eval_start_date, eval_end_date):
            date_str = date.strftime("%Y-%m-%d")

            # Prescribe for each geo
            for geo in eval_geos:

                # Prepare input data. Here we use log to place cases
                # on a reasonable scale; many other approaches are possible.
                X_cases = np.log(eval_past_cases[geo][-NB_LOOKBACK_DAYS:] + 1)
                X_ips = eval_past_ips[geo][-NB_LOOKBACK_DAYS:]
                X_costs = geo_costs[geo]
                X = np.concatenate(
                    [X_cases.flatten(), X_ips.flatten(), X_costs])

                # Get prescription
                prescribed_ips = net.activate(X)

                # Map prescription to integer outputs
                prescribed_ips = (prescribed_ips * ip_max_values_arr).round()

                # Add it to prescription dictionary
                country_name, region_name = geo.split('__')
                if region_name == 'nan':
                    region_name = np.nan
                df_dict['CountryName'].append(country_name)
                df_dict['RegionName'].append(region_name)
                df_dict['Date'].append(date_str)
                for ip_col, prescribed_ip in zip(IP_COLS, prescribed_ips):
                    df_dict[ip_col].append(prescribed_ip)

                # Update stringency. This calculation could include division by
                # the number of IPs and/or number of geos, but that would have
                # no effect on the ordering of candidate solutions.
                stringency += np.sum(geo_costs[geo] * prescribed_ips)

            # Create dataframe from prescriptions.
            pres_df = pd.DataFrame(df_dict)

            # Make prediction given prescription for all countries
            pred_df = get_predictions(EVAL_START_DATE, date_str, pres_df)

            # Update past data with new day of prescriptions and predictions
            pres_df['GeoID'] = pres_df['CountryName'] + '__' + pres_df[
                'RegionName'].astype(str)
            pred_df['RegionName'] = pred_df['RegionName'].fillna("")
            pred_df['GeoID'] = pred_df['CountryName'] + '__' + pred_df[
                'RegionName'].astype(str)
            new_pres_df = pres_df[pres_df['Date'] == date_str]
            new_pred_df = pred_df[pred_df['Date'] == date_str]
            for geo in eval_geos:
                geo_pres = new_pres_df[new_pres_df['GeoID'] == geo]
                geo_pred = new_pred_df[new_pred_df['GeoID'] == geo]

                # Append array of prescriptions
                pres_arr = np.array([
                    geo_pres[ip_col].values[0] for ip_col in IP_COLS
                ]).reshape(1, -1)
                eval_past_ips[geo] = np.concatenate(
                    [eval_past_ips[geo], pres_arr])

                # Append predicted cases
                eval_past_cases[geo] = np.append(
                    eval_past_cases[geo],
                    geo_pred[PRED_CASES_COL].values[0])

        # Compute fitness. There are many possibilities for computing fitness
        # and ranking candidates. Here fitness is the negated weighted sum of
        # squared mean predicted new cases and squared total stringency
        # (negated because NEAT maximizes fitness). Note that this fitness
        # function can lead directly to the degenerate solution of all ips 0,
        # i.e., stringency zero. To achieve more interesting behavior, a
        # different fitness function may be required.
        new_cases = pred_df[PRED_CASES_COL].mean().mean()
        genome.fitness = -(a * (new_cases**2) + b * (stringency**2))

        print('Evaluated Genome', genome_id)
        print('New cases:', new_cases)
        print('Stringency:', stringency)
        print('Fitness:', genome.fitness)
def prescribe(
        start_date_str: str,
        end_date_str: str,
        path_to_prior_ips_file: str,
        path_to_cost_file: str,
        output_file_path,
        prescriptors_file,
) -> None:
    """Generate NPI prescriptions with NEAT prescriptors and save them as a CSV.

    Restores every genome from the prescriptors_file checkpoint and, for
    each one, rolls out prescriptions one day at a time over
    [start_date_str, end_date_str], feeding the predictor's case estimates
    back into the prescriptor. The combined result (one PrescriptionIndex
    per genome) is written to output_file_path.

    Parameters
    ----------
    start_date_str, end_date_str : str
        Inclusive prescription window, formatted '%Y-%m-%d'.
    path_to_prior_ips_file : str
        CSV of historical intervention plans (currently unused; see todo below).
    path_to_cost_file : str
        CSV of per-geo, per-IP stringency costs used to condition prescriptions.
    output_file_path :
        Path of the CSV file to write.
    prescriptors_file :
        Path of the NEAT checkpoint to restore prescriptor genomes from.

    Fix: directory creation is now skipped when output_file_path has no
    directory component — os.makedirs('') raises FileNotFoundError, so a
    bare filename such as 'prescriptions.csv' used to crash at save time.
    """
    print('output file:', output_file_path, ' file:', prescriptors_file)
    start_date = pd.to_datetime(start_date_str, format='%Y-%m-%d')
    end_date = pd.to_datetime(end_date_str, format='%Y-%m-%d')

    # Load historical data with basic preprocessing
    print("Loading historical data...")
    df = prepare_historical_df()

    # Restrict it to dates up to (and including) the start_date
    df = df[df['Date'] <= start_date]

    # Fill in any missing case data using predictor given ips_df.
    # todo: ignore ips_df for now, and instead assume we have case
    # data for all days and geos up until the start_date.

    # Create historical data arrays for all geos (cases clipped at 0 to drop
    # negative corrections that can appear in raw case counts).
    past_cases = {}
    past_ips = {}
    for geo in df['GeoID'].unique():
        geo_df = df[df['GeoID'] == geo]
        past_cases[geo] = np.maximum(0, np.array(geo_df[CASES_COL]))
        past_ips[geo] = np.array(geo_df[IP_COLS])

    # Gather values for scaling network output (net outputs are multiplied
    # by these maxima and rounded to integer IP levels below).
    ip_max_values_arr = np.array([IP_MAX_VALUES[ip] for ip in IP_COLS])

    # Load prescriptors
    checkpoint = neat.Checkpointer.restore_checkpoint(prescriptors_file)
    prescriptors = checkpoint.population.values()
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         'config-prescriptor')

    # Load IP costs to condition prescriptions
    cost_df = pd.read_csv(path_to_cost_file)
    cost_df['RegionName'] = cost_df['RegionName'].fillna("")
    cost_df = add_geo_id(cost_df)
    geo_costs = {}
    for geo in cost_df['GeoID'].unique():
        costs = cost_df[cost_df['GeoID'] == geo]
        # One cost row per geo is expected; take the first (only) row.
        cost_arr = np.array(costs[IP_COLS])[0]
        geo_costs[geo] = cost_arr
    # NOTE(review): geos are taken from the historical df below but costs
    # from cost_df here — a historical geo missing from the cost file would
    # raise KeyError at geo_costs[geo]; confirm the cost file covers all geos.

    # Generate prescriptions
    prescription_dfs = []
    for prescription_idx, prescriptor in enumerate(prescriptors):
        print("Generating prescription", prescription_idx, "...")

        # Create net from genome
        net = neat.nn.FeedForwardNetwork.create(prescriptor, config)

        # Set up dictionary for keeping track of prescription
        df_dict = {'CountryName': [], 'RegionName': [], 'Date': []}
        for ip_col in sorted(IP_MAX_VALUES.keys()):
            df_dict[ip_col] = []

        # Set initial data (deep-copied so each prescriptor's rollout starts
        # from the same unmodified history).
        eval_past_cases = deepcopy(past_cases)
        eval_past_ips = deepcopy(past_ips)

        # Generate prescriptions one day at a time, feeding resulting
        # predictions from the predictor back into the prescriptor.
        for date in pd.date_range(start_date, end_date):
            date_str = date.strftime("%Y-%m-%d")

            # Get prescription for all regions
            for geo in df['GeoID'].unique():

                # Prepare input data. Here we use log to place cases
                # on a reasonable scale; many other approaches are possible.
                X_cases = np.log(eval_past_cases[geo][-NB_LOOKBACK_DAYS:] + 1)
                X_ips = eval_past_ips[geo][-NB_LOOKBACK_DAYS:]
                X_costs = geo_costs[geo]
                X = np.concatenate(
                    [X_cases.flatten(), X_ips.flatten(), X_costs])

                # Get prescription
                prescribed_ips = net.activate(X)

                # Map prescription to integer outputs
                prescribed_ips = (prescribed_ips * ip_max_values_arr).round()

                # Add it to prescription dictionary
                country_name, region_name = geo.split('__')
                if region_name == 'nan':
                    region_name = np.nan
                df_dict['CountryName'].append(country_name)
                df_dict['RegionName'].append(region_name)
                df_dict['Date'].append(date_str)
                for ip_col, prescribed_ip in zip(IP_COLS, prescribed_ips):
                    df_dict[ip_col].append(prescribed_ip)

            # Create dataframe from prescriptions
            pres_df = pd.DataFrame(df_dict)

            # Make prediction given prescription for all countries
            pred_df = get_predictions(start_date_str, date_str, pres_df)

            # Update past data with new day of prescriptions and predictions
            pres_df['GeoID'] = pres_df['CountryName'] + '__' + pres_df[
                'RegionName'].astype(str)
            pred_df['RegionName'] = pred_df['RegionName'].fillna("")
            pred_df['GeoID'] = pred_df['CountryName'] + '__' + pred_df[
                'RegionName'].astype(str)
            new_pres_df = pres_df[pres_df['Date'] == date_str]
            new_pred_df = pred_df[pred_df['Date'] == date_str]
            for geo in df['GeoID'].unique():
                geo_pres = new_pres_df[new_pres_df['GeoID'] == geo]
                geo_pred = new_pred_df[new_pred_df['GeoID'] == geo]

                # Append array of prescriptions
                pres_arr = np.array([
                    geo_pres[ip_col].values[0] for ip_col in IP_COLS
                ]).reshape(1, -1)
                eval_past_ips[geo] = np.concatenate(
                    [eval_past_ips[geo], pres_arr])

                # It is possible that the predictor does not return values for
                # some regions. To make sure we generate full prescriptions,
                # this script continues anyway. Geos that are ignored in this
                # way by the predictor, will not be used in quantitative
                # evaluation. A list of such geos can be found in
                # unused_geos.txt.
                if len(geo_pred) != 0:
                    eval_past_cases[geo] = np.append(
                        eval_past_cases[geo],
                        geo_pred[PRED_CASES_COL].values[0])

        # Add prescription df to list of all prescriptions for this submission
        pres_df['PrescriptionIndex'] = prescription_idx
        prescription_dfs.append(pres_df)

    # Combine dfs for all prescriptions into a single df for the submission
    prescription_df = pd.concat(prescription_dfs)

    # Create the output directory if necessary. os.path.dirname returns ''
    # for a bare filename, and os.makedirs('') raises FileNotFoundError,
    # so only create the directory when one is actually present.
    output_dir = os.path.dirname(output_file_path)
    if output_dir != '':
        os.makedirs(output_dir, exist_ok=True)

    # Save to a csv file
    prescription_df.to_csv(output_file_path, index=False)
    print('Prescriptions saved to', output_file_path)

    return