Ejemplo n.º 1
0
def prescribe(start_date_str: str,
              end_date_str: str,
              path_to_prior_ips_file: str,
              path_to_cost_file: str,
              output_file_path) -> None:
    """Prepare per-geo inputs and run two prescription passes.

    Logs the run parameters, builds per-geo arrays of past cases, past
    intervention plans (IPs) and IP costs, fills any gap between the end of
    the historical data and the start date with predictor output, and then
    calls ``prescribe_loop`` twice with different candidate budgets.

    Args:
        start_date_str: First prescription day, formatted ``YYYY-MM-DD``.
        end_date_str: Last prescription day (inclusive), ``YYYY-MM-DD``.
        path_to_prior_ips_file: File read by ``load_ips_file``; the ``GeoID``
            values it contains define the geos that are processed.
        path_to_cost_file: CSV with a ``RegionName`` column and the
            ``IP_COLS`` columns.
        output_file_path: Forwarded unchanged to ``prescribe_loop``.
    """
    date_fmt = '%Y-%m-%d'

    info(f"prescription started @ {datetime.now()}")
    info(f"prescription from {start_date_str} to {end_date_str}")
    info(f"prescription with past IPS:   {path_to_prior_ips_file}")
    info(f"prescription with past costs: {path_to_cost_file}")
    info(f"prescription with output to:  {output_file_path}")

    start_date = pd.to_datetime(start_date_str, format=date_fmt)
    end_date = pd.to_datetime(end_date_str, format=date_fmt)
    # Both endpoints count as prescription days.
    n_days = (end_date - start_date).days + 1

    # Past IPs; the geos present in this file drive everything below.
    past_ips_df = load_ips_file(path_to_prior_ips_file)
    geos = past_ips_df['GeoID'].unique()

    # Historical data with basic preprocessing, keeping only rows dated on
    # or before start_date.
    hist_df = prepare_historical_df()
    hist_df = hist_df[hist_df['Date'] <= start_date]

    # Per-geo arrays of past cases, with negative values clamped to zero.
    past_cases = {
        geo: np.maximum(0, np.array(hist_df[hist_df['GeoID'] == geo][CASES_COL]))
        for geo in geos
    }

    # Per-geo arrays of past IPs.
    past_ips = {
        geo: np.array(past_ips_df[past_ips_df['GeoID'] == geo][IP_COLS])
        for geo in geos
    }

    # Fill in any missing case data before start_date using the predictor and
    # past_ips_df. This assumes that the frame returned by
    # prepare_historical_df() ends on the same date for every region, which in
    # turn depends on the Oxford data csv it loads.
    one_day = pd.Timedelta(days=1)
    last_known_date = pd.to_datetime(hist_df['Date'].max(), format=date_fmt)
    gap_start = last_known_date + one_day
    if gap_start < start_date:
        info("Filling in missing data...")
        gap_end = start_date - one_day
        pred_df = get_predictions(gap_start.strftime(date_fmt),
                                  gap_end.strftime(date_fmt),
                                  past_ips_df)
        pred_df = add_geo_id(pred_df)
        for geo in geos:
            geo_pred = pred_df[pred_df['GeoID'] == geo].sort_values(by='Date')
            past_cases[geo] = np.append(past_cases[geo],
                                        np.array(geo_pred[PRED_CASES_COL]))
    else:
        info("No missing data.")

    # IP costs used to condition the prescriptions; the first matching row of
    # the cost file is taken for each geo.
    cost_df = pd.read_csv(path_to_cost_file)
    cost_df['RegionName'] = cost_df['RegionName'].fillna("")
    cost_df = add_geo_id(cost_df)
    geo_costs = {
        geo: np.array(cost_df[cost_df['GeoID'] == geo][IP_COLS])[0]
        for geo in geos
    }

    info("initializing prescriptions generator")
    limits = NPI_LIMITS * n_days
    prescription_generator = PrescriptionGenerator(n_days, limits)

    def run_pass(candidates_per_index):
        # Both passes share every argument except the candidate budget.
        prescribe_loop(geos,
                       geo_costs,
                       past_cases,
                       past_ips,
                       n_days,
                       output_file_path,
                       start_date,
                       candidates_per_index,
                       prescription_generator,
                       limits)

    info(f"prescription run 1 started @ {datetime.now()}")
    run_pass(PRESCRIPTION_CANDIDATES_PER_INDEX_RUN_1)

    info(f"prescription run 2 started @ {datetime.now()}")
    run_pass(PRESCRIPTION_CANDIDATES_PER_INDEX_RUN_2)
    info(f"prescription run 2 ended @ {datetime.now()}")
Ejemplo n.º 2
0
def prescribe(start_date_str: str,
              end_date_str: str,
              path_to_prior_ips_file: str,
              path_to_cost_file: str,
              output_file_path) -> None:
    """Generate prescriptions with saved NEAT prescriptors and write a CSV.

    For each of the first ``NB_PRESCRIPTIONS`` genomes restored from
    ``PRESCRIPTORS_FILE``, the function rolls forward from the start date to
    the end date in windows of ``ACTION_DURATION`` days: the network
    prescribes IPs per geo, the predictor is queried with the prescriptions
    so far, and both are appended to the per-geo history used as the next
    window's input. All prescriptions are concatenated and saved to
    ``output_file_path``.

    Args:
        start_date_str: First prescription day, formatted ``YYYY-MM-DD``.
        end_date_str: Last prescription day (inclusive), ``YYYY-MM-DD``.
        path_to_prior_ips_file: File read by ``load_ips_file``; the ``GeoID``
            values it contains define the geos that are processed.
        path_to_cost_file: CSV with a ``RegionName`` column and the
            ``IP_COLS`` columns.
        output_file_path: Destination CSV; its parent directory is created
            when the path has a directory component.

    Raises:
        ValueError: If ``end_date_str`` is earlier than ``start_date_str``.
    """
    date_fmt = '%Y-%m-%d'
    start_date = pd.to_datetime(start_date_str, format=date_fmt)
    end_date = pd.to_datetime(end_date_str, format=date_fmt)

    # An inverted range would skip the prescription loop entirely and leave
    # nothing to save, so reject it before doing any expensive loading.
    if end_date < start_date:
        raise ValueError(
            f"end_date_str ({end_date_str}) must not be earlier than "
            f"start_date_str ({start_date_str})")

    # Load the past IPs data
    print("Loading past IPs data...")
    past_ips_df = load_ips_file(path_to_prior_ips_file)
    geos = past_ips_df['GeoID'].unique()

    # Load historical data with basic preprocessing
    print("Loading historical data...")
    df = prepare_historical_df()

    # Keep only rows dated on or before start_date
    df = df[df['Date'] <= start_date]

    # Create past case data arrays for all geos (negative values clamped to 0)
    past_cases = {}
    for geo in geos:
        geo_df = df[df['GeoID'] == geo]
        past_cases[geo] = np.maximum(0, np.array(geo_df[CASES_COL]))

    # Create past ip data arrays for all geos
    past_ips = {}
    for geo in geos:
        geo_df = past_ips_df[past_ips_df['GeoID'] == geo]
        past_ips[geo] = np.array(geo_df[IP_COLS])

    # Fill in any missing case data before start_date
    # using predictor given past_ips_df.
    # Note that the following assumes that the df returned by prepare_historical_df()
    # has the same final date for all regions. This has been true so far, but relies
    # on it being true for the Oxford data csv loaded by prepare_historical_df().
    one_day = pd.Timedelta(days=1)
    last_historical_data_date = pd.to_datetime(df['Date'].max(), format=date_fmt)
    missing_data_start_date = last_historical_data_date + one_day
    if missing_data_start_date < start_date:
        print("Filling in missing data...")
        missing_data_end_date = start_date - one_day
        pred_df = get_predictions(missing_data_start_date.strftime(date_fmt),
                                  missing_data_end_date.strftime(date_fmt),
                                  past_ips_df)
        pred_df = add_geo_id(pred_df)
        for geo in geos:
            geo_df = pred_df[pred_df['GeoID'] == geo].sort_values(by='Date')
            pred_cases_arr = np.array(geo_df[PRED_CASES_COL])
            past_cases[geo] = np.append(past_cases[geo], pred_cases_arr)
    else:
        print("No missing data.")

    # Gather values for scaling network output
    ip_max_values_arr = np.array([IP_MAX_VALUES[ip] for ip in IP_COLS])

    # Load prescriptors
    checkpoint = neat.Checkpointer.restore_checkpoint(PRESCRIPTORS_FILE)
    prescriptors = list(checkpoint.population.values())[:NB_PRESCRIPTIONS]
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         NEAT_CONFIG_FILE)

    # Load IP costs to condition prescriptions; the first matching row of the
    # cost file is used for each geo.
    cost_df = pd.read_csv(path_to_cost_file)
    cost_df['RegionName'] = cost_df['RegionName'].fillna("")
    cost_df = add_geo_id(cost_df)
    geo_costs = {}
    for geo in geos:
        costs = cost_df[cost_df['GeoID'] == geo]
        geo_costs[geo] = np.array(costs[IP_COLS])[0]

    # Generate prescriptions
    prescription_dfs = []
    for prescription_idx, prescriptor in enumerate(prescriptors):
        print("Generating prescription", prescription_idx, "...")

        # Create net from genome
        net = neat.nn.FeedForwardNetwork.create(prescriptor, config)

        # Set up dictionary for keeping track of prescription. It accumulates
        # rows over all windows, so pres_df below always holds every
        # prescription made so far for this prescriptor.
        df_dict = {'CountryName': [], 'RegionName': [], 'Date': []}
        for ip_col in sorted(IP_MAX_VALUES.keys()):
            df_dict[ip_col] = []

        # Work on copies so each prescriptor starts from the same history
        eval_past_cases = deepcopy(past_cases)
        eval_past_ips = deepcopy(past_ips)

        # Generate prescriptions iteratively, feeding resulting
        # predictions from the predictor back into the prescriptor.
        action_start_date = start_date
        while action_start_date <= end_date:

            # Days covered by this action, clipped so they never pass end_date
            window_date_strs = [
                date.strftime(date_fmt)
                for date in pd.date_range(action_start_date,
                                          periods=ACTION_DURATION)
                if date <= end_date
            ]

            # Get prescription for all regions
            for geo in geos:

                # Prepare input data. Here we use log to place cases
                # on a reasonable scale; many other approaches are possible.
                X_cases = np.log(eval_past_cases[geo][-NB_LOOKBACK_DAYS:] + 1)
                X_ips = eval_past_ips[geo][-NB_LOOKBACK_DAYS:]
                X_costs = geo_costs[geo]
                X = np.concatenate([X_cases.flatten(),
                                    X_ips.flatten(),
                                    X_costs])

                # Get prescription
                prescribed_ips = net.activate(X)

                # Map prescription to integer outputs
                prescribed_ips = (prescribed_ips * ip_max_values_arr).round()

                # Add it to prescription dictionary for the full ACTION_DURATION
                country_name, region_name = geo.split('__')
                if region_name == 'nan':
                    region_name = np.nan
                for date_str in window_date_strs:
                    df_dict['CountryName'].append(country_name)
                    df_dict['RegionName'].append(region_name)
                    df_dict['Date'].append(date_str)
                    for ip_col, prescribed_ip in zip(IP_COLS, prescribed_ips):
                        df_dict[ip_col].append(prescribed_ip)

            # Create dataframe from prescriptions
            pres_df = pd.DataFrame(df_dict)

            # Make prediction given prescription for all countries, up to the
            # last day of the current window
            pred_df = get_predictions(start_date_str, window_date_strs[-1],
                                      pres_df)

            # Update past data with new days of prescriptions and predictions
            pres_df = add_geo_id(pres_df)
            pred_df = add_geo_id(pred_df)
            for date_str in window_date_strs:
                new_pres_df = pres_df[pres_df['Date'] == date_str]
                new_pred_df = pred_df[pred_df['Date'] == date_str]
                for geo in geos:
                    geo_pres = new_pres_df[new_pres_df['GeoID'] == geo]
                    geo_pred = new_pred_df[new_pred_df['GeoID'] == geo]
                    # Append array of prescriptions
                    pres_arr = np.array([geo_pres[ip_col].values[0] for
                                         ip_col in IP_COLS]).reshape(1, -1)
                    eval_past_ips[geo] = np.concatenate([eval_past_ips[geo], pres_arr])

                    # It is possible that the predictor does not return values for some regions.
                    # To make sure we generate full prescriptions, this script continues anyway.
                    # This should not happen, but is included here for robustness.
                    if len(geo_pred) != 0:
                        eval_past_cases[geo] = np.append(eval_past_cases[geo],
                                                         geo_pred[PRED_CASES_COL].values[0])

            # Move on to next action date
            action_start_date += pd.DateOffset(days=ACTION_DURATION)

        # Add prescription df to list of all prescriptions for this submission
        pres_df['PrescriptionIndex'] = prescription_idx
        prescription_dfs.append(pres_df)

    # Combine dfs for all prescriptions into a single df for the submission
    prescription_df = pd.concat(prescription_dfs)
    prescription_df = prescription_df.drop(columns='GeoID')

    # Create the output directory if necessary.
    output_dir = os.path.dirname(output_file_path)
    if output_dir != '':
        os.makedirs(output_dir, exist_ok=True)

    # Save to a csv file
    prescription_df.to_csv(output_file_path, index=False)
    print('Prescriptions saved to', output_file_path)
Ejemplo n.º 3
0
    def eval_genomes(genomes, config):
        """Assign a fitness to every genome by simulating its prescriptions.

        Relies on names from the enclosing scope that are not defined here
        (e.g. ``eval_geos``, ``past_cases``, ``past_ips``, ``ip_max_values_arr``,
        ``eval_start_date``, ``eval_end_date``, ``a``, ``b``).

        Args:
            genomes: Iterable of ``(genome_id, genome)`` pairs; each genome's
                ``fitness`` attribute is set in place.
            config: Passed to ``neat.nn.FeedForwardNetwork.create`` to build
                the network for each genome.
        """

        # Every generation sample a different set of costs per geo,
        # so that over time solutions become robust to different costs.
        cost_df = generate_costs(distribution='uniform')
        cost_df = add_geo_id(cost_df)
        geo_costs = {}
        for geo in eval_geos:
            costs = cost_df[cost_df['GeoID'] == geo]
            # Use the first matching cost row for this geo.
            cost_arr = np.array(costs[IP_COLS])[0]
            geo_costs[geo] = cost_arr

        # Evaluate each individual
        for genome_id, genome in genomes:

            # Create net from genome
            net = neat.nn.FeedForwardNetwork.create(genome, config)

            # Set up dictionary to keep track of prescription. Rows accumulate
            # over all simulated days for this genome.
            df_dict = {'CountryName': [], 'RegionName': [], 'Date': []}
            for ip_col in IP_COLS:
                df_dict[ip_col] = []

            # Set initial data (copies, so every genome starts from the same history)
            eval_past_cases = deepcopy(past_cases)
            eval_past_ips = deepcopy(past_ips)

            # Compute prescribed stringency incrementally
            stringency = 0.

            # Make prescriptions one day at a time, feeding resulting
            # predictions from the predictor back into the prescriptor.
            for date in pd.date_range(eval_start_date, eval_end_date):
                date_str = date.strftime("%Y-%m-%d")

                # Prescribe for each geo
                for geo in eval_geos:

                    # Prepare input data. Here we use log to place cases
                    # on a reasonable scale; many other approaches are possible.
                    X_cases = np.log(eval_past_cases[geo][-NB_LOOKBACK_DAYS:] +
                                     1)
                    X_ips = eval_past_ips[geo][-NB_LOOKBACK_DAYS:]
                    X_costs = geo_costs[geo]
                    X = np.concatenate(
                        [X_cases.flatten(),
                         X_ips.flatten(), X_costs])

                    # Get prescription
                    prescribed_ips = net.activate(X)

                    # Map prescription to integer outputs
                    prescribed_ips = (prescribed_ips *
                                      ip_max_values_arr).round()

                    # Add it to prescription dictionary
                    country_name, region_name = geo.split('__')
                    if region_name == 'nan':
                        region_name = np.nan
                    df_dict['CountryName'].append(country_name)
                    df_dict['RegionName'].append(region_name)
                    df_dict['Date'].append(date_str)
                    for ip_col, prescribed_ip in zip(IP_COLS, prescribed_ips):
                        df_dict[ip_col].append(prescribed_ip)

                    # Update stringency. This calculation could include division by
                    # the number of IPs and/or number of geos, but that would have
                    # no effect on the ordering of candidate solutions.
                    stringency += np.sum(geo_costs[geo] * prescribed_ips)

                # Create dataframe from prescriptions.
                pres_df = pd.DataFrame(df_dict)

                # Make prediction given prescription for all countries
                # NOTE(review): uses the constant EVAL_START_DATE while the loop
                # iterates from eval_start_date; presumably the same date as a
                # string. Confirm against the enclosing scope.
                pred_df = get_predictions(EVAL_START_DATE, date_str, pres_df)

                # Update past data with new day of prescriptions and predictions
                pres_df['GeoID'] = pres_df['CountryName'] + '__' + pres_df[
                    'RegionName'].astype(str)
                pred_df['RegionName'] = pred_df['RegionName'].fillna("")
                pred_df['GeoID'] = pred_df['CountryName'] + '__' + pred_df[
                    'RegionName'].astype(str)
                new_pres_df = pres_df[pres_df['Date'] == date_str]
                new_pred_df = pred_df[pred_df['Date'] == date_str]
                for geo in eval_geos:
                    geo_pres = new_pres_df[new_pres_df['GeoID'] == geo]
                    geo_pred = new_pred_df[new_pred_df['GeoID'] == geo]

                    # Append array of prescriptions
                    pres_arr = np.array([
                        geo_pres[ip_col].values[0] for ip_col in IP_COLS
                    ]).reshape(1, -1)
                    eval_past_ips[geo] = np.concatenate(
                        [eval_past_ips[geo], pres_arr])

                    # Append predicted cases. This assumes the predictor
                    # returned a row for every geo on this date; .values[0]
                    # raises IndexError otherwise.
                    eval_past_cases[geo] = np.append(
                        eval_past_cases[geo],
                        geo_pred[PRED_CASES_COL].values[0])

            # Compute fitness. There are many possibilities for computing fitness and ranking
            # candidates. Here the fitness is the negated weighted sum of the squared mean
            # predicted cases and the squared accumulated stringency, with weights a and b
            # taken from the enclosing scope; the negation means that lower cases and lower
            # stringency give a higher fitness. new_cases is the mean of the predicted-cases
            # column of the last pred_df computed above, while stringency is a running sum
            # over all geos and days (not a mean). Note that this kind of fitness function can
            # favor the degenerate solution of all ips 0, i.e., stringency zero. To achieve
            # more interesting behavior, a different fitness function may be required.
            new_cases = pred_df[PRED_CASES_COL].mean().mean()

            genome.fitness = -(a * (new_cases**2) + b * (stringency**2))

            print('Evaluated Genome', genome_id)
            print('New cases:', new_cases)
            print('Stringency:', stringency)
            print('Fitness:', genome.fitness)
Ejemplo n.º 4
0
def prescribe(
    start_date_str: str,
    end_date_str: str,
    path_to_prior_ips_file: str,
    path_to_cost_file: str,
    output_file_path,
    prescriptors_file,
) -> None:
    """Generate day-by-day prescriptions for every saved prescriptor.

    Restores a NEAT checkpoint from ``prescriptors_file`` and, for each genome
    in its population, prescribes IPs for every geo in the historical data one
    day at a time, feeding the predictor's output back in as input for the
    next day. All prescriptions are concatenated and written to
    ``output_file_path`` as CSV (including the helper ``GeoID`` column).

    Args:
        start_date_str: First prescription day, formatted ``YYYY-MM-DD``.
        end_date_str: Last prescription day (inclusive), ``YYYY-MM-DD``.
        path_to_prior_ips_file: Currently unused; see the TODO below.
        path_to_cost_file: CSV with a ``RegionName`` column and the
            ``IP_COLS`` columns.
        output_file_path: Destination CSV; its parent directory is created
            when the path has a directory component.
        prescriptors_file: NEAT checkpoint to restore the prescriptors from.

    Raises:
        ValueError: If ``end_date_str`` is earlier than ``start_date_str``.
    """
    print('output file:', output_file_path, '   file:', prescriptors_file)
    start_date = pd.to_datetime(start_date_str, format='%Y-%m-%d')
    end_date = pd.to_datetime(end_date_str, format='%Y-%m-%d')

    # An inverted range would produce no prescription days and leave nothing
    # to save, so reject it before doing any expensive loading.
    if end_date < start_date:
        raise ValueError(
            f"end_date_str ({end_date_str}) must not be earlier than "
            f"start_date_str ({start_date_str})")

    # Load historical data with basic preprocessing
    print("Loading historical data...")
    df = prepare_historical_df()

    # Keep only rows dated on or before start_date
    df = df[df['Date'] <= start_date]

    # Fill in any missing case data using predictor given ips_df.
    # TODO: ignore ips_df for now, and instead assume we have case
    # data for all days and geos up until the start_date.

    # df is not modified below, so the geo list is computed once.
    geos = df['GeoID'].unique()

    # Create historical data arrays for all geos (negative cases clamped to 0)
    past_cases = {}
    past_ips = {}
    for geo in geos:
        geo_df = df[df['GeoID'] == geo]
        past_cases[geo] = np.maximum(0, np.array(geo_df[CASES_COL]))
        past_ips[geo] = np.array(geo_df[IP_COLS])

    # Gather values for scaling network output
    ip_max_values_arr = np.array([IP_MAX_VALUES[ip] for ip in IP_COLS])

    # Load prescriptors
    checkpoint = neat.Checkpointer.restore_checkpoint(prescriptors_file)
    prescriptors = checkpoint.population.values()
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         'config-prescriptor')

    # Load IP costs to condition prescriptions; the first matching row of the
    # cost file is used for each geo.
    cost_df = pd.read_csv(path_to_cost_file)
    cost_df['RegionName'] = cost_df['RegionName'].fillna("")
    cost_df = add_geo_id(cost_df)
    geo_costs = {}
    for geo in cost_df['GeoID'].unique():
        costs = cost_df[cost_df['GeoID'] == geo]
        geo_costs[geo] = np.array(costs[IP_COLS])[0]

    # Generate prescriptions
    prescription_dfs = []
    for prescription_idx, prescriptor in enumerate(prescriptors):
        print("Generating prescription", prescription_idx, "...")

        # Create net from genome
        net = neat.nn.FeedForwardNetwork.create(prescriptor, config)

        # Set up dictionary for keeping track of prescription. It accumulates
        # rows over all days, so pres_df below always holds every
        # prescription made so far for this prescriptor.
        df_dict = {'CountryName': [], 'RegionName': [], 'Date': []}
        for ip_col in sorted(IP_MAX_VALUES.keys()):
            df_dict[ip_col] = []

        # Work on copies so each prescriptor starts from the same history
        eval_past_cases = deepcopy(past_cases)
        eval_past_ips = deepcopy(past_ips)

        # Generate prescriptions one day at a time, feeding resulting
        # predictions from the predictor back into the prescriptor.
        for date in pd.date_range(start_date, end_date):
            date_str = date.strftime("%Y-%m-%d")

            # Get prescription for all regions
            for geo in geos:

                # Prepare input data. Here we use log to place cases
                # on a reasonable scale; many other approaches are possible.
                X_cases = np.log(eval_past_cases[geo][-NB_LOOKBACK_DAYS:] + 1)
                X_ips = eval_past_ips[geo][-NB_LOOKBACK_DAYS:]
                X_costs = geo_costs[geo]
                X = np.concatenate(
                    [X_cases.flatten(),
                     X_ips.flatten(), X_costs])

                # Get prescription
                prescribed_ips = net.activate(X)

                # Map prescription to integer outputs
                prescribed_ips = (prescribed_ips * ip_max_values_arr).round()

                # Add it to prescription dictionary
                country_name, region_name = geo.split('__')
                if region_name == 'nan':
                    region_name = np.nan
                df_dict['CountryName'].append(country_name)
                df_dict['RegionName'].append(region_name)
                df_dict['Date'].append(date_str)
                for ip_col, prescribed_ip in zip(IP_COLS, prescribed_ips):
                    df_dict[ip_col].append(prescribed_ip)

            # Create dataframe from prescriptions
            pres_df = pd.DataFrame(df_dict)

            # Make prediction given prescription for all countries
            pred_df = get_predictions(start_date_str, date_str, pres_df)

            # Update past data with new day of prescriptions and predictions
            pres_df['GeoID'] = pres_df['CountryName'] + '__' + pres_df[
                'RegionName'].astype(str)
            pred_df['RegionName'] = pred_df['RegionName'].fillna("")
            pred_df['GeoID'] = pred_df['CountryName'] + '__' + pred_df[
                'RegionName'].astype(str)
            new_pres_df = pres_df[pres_df['Date'] == date_str]
            new_pred_df = pred_df[pred_df['Date'] == date_str]
            for geo in geos:
                geo_pres = new_pres_df[new_pres_df['GeoID'] == geo]
                geo_pred = new_pred_df[new_pred_df['GeoID'] == geo]

                # Append array of prescriptions
                pres_arr = np.array([
                    geo_pres[ip_col].values[0] for ip_col in IP_COLS
                ]).reshape(1, -1)
                eval_past_ips[geo] = np.concatenate(
                    [eval_past_ips[geo], pres_arr])

                # It is possible that the predictor does not return values for some regions.
                # To make sure we generate full prescriptions, this script continues anyway.
                # Geos that are ignored in this way by the predictor, will not be used in
                # quantitative evaluation. A list of such geos can be found in unused_geos.txt.
                if len(geo_pred) != 0:
                    eval_past_cases[geo] = np.append(
                        eval_past_cases[geo],
                        geo_pred[PRED_CASES_COL].values[0])

        # Add prescription df to list of all prescriptions for this submission
        pres_df['PrescriptionIndex'] = prescription_idx
        prescription_dfs.append(pres_df)

    # Combine dfs for all prescriptions into a single df for the submission
    prescription_df = pd.concat(prescription_dfs)

    # Create the output directory if necessary. A bare file name has an empty
    # dirname, and os.makedirs('') raises FileNotFoundError, so skip it then.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save to a csv file
    prescription_df.to_csv(output_file_path, index=False)
    print('Prescriptions saved to', output_file_path)