Example #1
0
def get_forecasts(df,
                  outcome,
                  method,
                  output_key,
                  target_day=np.array([1]),
                  demographic_vars=[]
                  ):
    """
    Tentative interface for extracting cases/deaths forecasts of future days.

    NOTE: the original author marked this function as "not tested yet".

    Params
    ------
    df
        county-level df; for every method except 'exponential' the forecast
        column is written into this df in place
    outcome
        'cases' or 'deaths'
    method
        one of 'exponential', 'shared_exponential', 'shared_demographic',
        'ensemble'
    output_key
        name of the column that receives the forecasts
    target_day
        days ahead to forecast; forwarded to the underlying model for every
        method so that the requested horizon is honoured consistently
    demographic_vars
        demographic columns; only read by 'shared_demographic', where it must
        be non-empty

    Returns
    -------
    For 'exponential', whatever exponential_modeling.get_exponential_forecasts
    returns; for the other methods, df with the forecasts stored in output_key.

    Raises
    ------
    AssertionError
        if method is 'shared_demographic' and demographic_vars is empty
    ValueError
        if method is not one of the supported names
    """
    if method == 'exponential':
        return exponential_modeling.get_exponential_forecasts(
            df=df,
            outcome=outcome,
            target_day=target_day,
            output_key=output_key)

    if method in ('shared_exponential', 'shared_demographic'):
        if method == 'shared_demographic':
            assert len(demographic_vars) > 0, \
                "'shared_demographic' requires at least one demographic variable"
            shared_vars = demographic_vars
        else:
            # The plain shared model deliberately ignores demographic_vars.
            shared_vars = []
        df[output_key] = exponential_modeling.fit_and_predict_shared_exponential(
            df,
            mode='predict_future',
            demographic_vars=shared_vars,
            outcome=outcome,
            target_day=target_day)
        return df

    if method == 'ensemble':
        # target_day must be forwarded: the column looked up below is named
        # after target_day[-1], so the callee has to use the same horizon.
        ensemble_df = fit_and_predict.fit_and_predict(
            df,
            method='ensemble',
            mode='predict_future',
            demographic_vars=[],
            outcome=outcome,
            target_day=target_day)
        df[output_key] = ensemble_df[
            f'predicted_{outcome}_{method}_{target_day[-1]}']
        return df

    raise ValueError(f'Unknown method: {method!r}')
def fit_and_predict(df,
                    outcome: str = 'deaths',
                    method: str = 'exponential',
                    mode: str = 'predict_future',
                    target_day: np.ndarray = np.array([1]),
                    output_key: str = None,
                    demographic_vars=[],
                    verbose: bool = False):
    """
    Trains a method (method) to predict a number of days ahead (target_day)
    and writes the predictions for `outcome` into a new column of df.

    Params
    ------
    df
        a df with county level deaths and cases and demographic information
    outcome
        key for the outcome to predict (the values in this column should have a list for each row)
    method
        what method to use to do forecasting: 'exponential', 'linear',
        'shared_exponential' or 'advanced_shared_model'
        ('AR' is deprecated, 'ensemble' is not handled here)
    mode
        either 'predict_future' or 'eval_mode'
        predict_future is predicting deaths on FUTURE days, so target_day=np.array([1])) means it predicts tomorrow's deaths
        eval_mode is for evaluating the performance of the classifier.
        target_day=np.array([k])) will predict the current days death count using information from k days ago.
        target_day= np.array([1,2,3,...,k]) will predict todays deaths, yesterdays deaths, deaths k-1 days ago using information from k days ago.
    target_day
        np.array([1,2,..,n]) predicts these number of days ahead (can just be np.array([3])) for example if you just want 3 days ahead)
    output_key
        key to save the output as; defaults to
        f'predicted_{outcome}_{method}_{target_day[-1]}', with
        '_demographics' appended when demographic_vars is non-empty
    demographic_vars
        demographic columns forwarded to the 'shared_exponential' model
    verbose
        forwarded to the 'shared_exponential' model

    Returns
    -------
    df
        the input dataframe with the prediction column added (modified in
        place). For method == 'ensemble' nothing is computed and None is
        returned after printing a hint.

    Raises
    ------
    AssertionError
        if mode is not 'predict_future' or 'eval_mode'
    NotImplementedError
        if method == 'AR'
    ValueError
        if method is not recognised
    """
    assert mode in ('predict_future', 'eval_mode'), 'unknown mode'

    if output_key is None:
        output_key = f'predicted_{outcome}_{method}_{target_day[-1]}'
        if len(demographic_vars) > 0:
            output_key += '_demographics'

    if method == 'AR':
        raise NotImplementedError("method 'AR' is currently deprecated")

    if method == 'exponential':
        df[output_key] = exponential_modeling.exponential_fit(
            df[outcome].values, mode=mode, target_day=target_day)
        return df

    if method == 'linear':
        df[output_key] = exponential_modeling.linear_fit(
            df[outcome].values, mode=mode, target_day=target_day)
        return df

    if method == 'shared_exponential':
        # Fit a poisson GLM with shared parameters across counties. Input to
        # the poisson GLM is demographic_vars and log(previous_days_deaths+1)
        df[output_key] = exponential_modeling.fit_and_predict_shared_exponential(
            df,
            mode,
            outcome=outcome,
            demographic_vars=demographic_vars,
            target_day=target_day,
            verbose=verbose)
        return df

    if method == 'ensemble':
        # Kept as a soft notice rather than an exception so existing callers
        # that tolerate a None result keep working.
        print('please use fit_and_predict_ensemble instead')
        return None

    if method == 'advanced_shared_model':
        if 'neighbor_deaths' not in df.columns:
            _add_neighbor_time_series(df)

        # default_factory is invoked with no arguments, so the fallback
        # (identity transform) must be a zero-argument callable.
        feat_transforms = defaultdict(lambda: [lambda x: x])
        for feat in ('deaths', 'cases', 'neighbor_deaths', 'neighbor_cases'):
            feat_transforms[feat] = [lambda x: np.log(x + 1)]
        default_values = defaultdict(int)
        aux_feats = ['cases', 'neighbor_deaths', 'neighbor_cases']

        # NOTE(review): demographic_vars is not forwarded to SharedModel here
        # (an empty list is passed) -- confirm this is intentional.
        shared_model = SharedModel(df=df,
                                   outcome=outcome,
                                   demographic_variables=[],
                                   mode=mode,
                                   target_days=target_day,
                                   feat_transforms=feat_transforms,
                                   auxiliary_time_features=aux_feats,
                                   time_series_default_values=default_values,
                                   scale=True)
        shared_model.create_dataset()
        shared_model.fit_model()
        shared_model.predict()

        df[output_key] = shared_model.predictions
        return df

    raise ValueError(f'Unknown method: {method!r}')


def _add_neighbor_time_series(df):
    """
    Add 'neighbor_deaths' and 'neighbor_cases' columns to df in place.

    For every row, the 'deaths' / 'cases' series of all rows of df whose
    countyFIPS is listed as a neighbor in the county adjacency csv are summed
    element-wise. Counties with no neighbors present in df get all-zero series
    of the same length as their own series. As a side effect, df['countyFIPS']
    is converted to zero-padded 5-character strings.
    """
    adjacency = pd.read_csv(
        oj(parentdir,
           'data/county_level/raw/county_ids/county_adjacency2010.csv'))
    for col in ('fipscounty', 'fipsneighbor'):
        adjacency[col] = adjacency[col].astype(str).str.zfill(5)
    df['countyFIPS'] = df['countyFIPS'].astype(str).str.zfill(5)

    neighbor_deaths = []
    neighbor_cases = []
    for fips in list(df['countyFIPS']):
        neighbors = list(
            adjacency.loc[adjacency['fipscounty'] == fips]['fipsneighbor'])
        neighbor_rows = df.loc[df['countyFIPS'].isin(neighbors)]
        deaths_series = list(neighbor_rows['deaths'])
        cases_series = list(neighbor_rows['cases'])

        if len(deaths_series) == 0:
            # Not in the adjacency file (or no neighbor present in df):
            # assume neighboring deaths/cases are 0.
            own_rows = df.loc[df['countyFIPS'] == fips]
            deaths_total = np.zeros(len(own_rows['deaths'].iloc[0]))
            cases_total = np.zeros(len(own_rows['cases'].iloc[0]))
        else:
            deaths_total = np.zeros(len(deaths_series[0]))
            for deaths in deaths_series:
                deaths_total += deaths
            # Size the accumulator from the cases series itself rather than
            # from the deaths series.
            cases_total = np.zeros(len(cases_series[0]))
            for cases in cases_series:
                cases_total += cases

        neighbor_deaths.append(deaths_total)
        neighbor_cases.append(cases_total)

    df['neighbor_deaths'] = neighbor_deaths
    df['neighbor_cases'] = neighbor_cases
def fit_and_predict(df,
                    outcome: str = 'deaths',
                    method: str = 'exponential',
                    mode: str = 'predict_future',
                    target_day: np.ndarray = np.array([1]),
                    output_key: str = None,
                    demographic_vars=[]):
    """
    Trains a method (method) to predict a number of days ahead (target_day)
    and writes the predictions for `outcome` into a new column of df.

    Params
    ------
    df
        a df with county level deaths and cases and demographic information
    outcome
        key for the outcome to predict (the values in this column should have a list for each row)
    method
        what method to use to do forecasting: 'exponential', 'linear' or
        'shared_exponential' ('AR' is deprecated, 'ensemble' is not handled here)
    mode
        either 'predict_future' or 'eval_mode'
        predict_future is predicting deaths on FUTURE days, so target_day=np.array([1])) means it predicts tomorrow's deaths
        eval_mode is for evaluating the performance of the classifier.
        target_day=np.array([k])) will predict the current days death count using information from k days ago.
        target_day= np.array([1,2,3,...,k]) will predict todays deaths, yesterdays deaths, deaths k-1 days ago using information from k days ago.
    target_day
        np.array([1,2,..,n]) predicts these number of days ahead (can just be np.array([3])) for example if you just want 3 days ahead)
    output_key
        key to save the output as; defaults to
        f'predicted_{outcome}_{method}_{target_day[-1]}', with
        '_demographics' appended when demographic_vars is non-empty
    demographic_vars
        demographic columns forwarded to the 'shared_exponential' model

    Returns
    -------
    df
        the input dataframe with the prediction column added (modified in
        place). For method == 'ensemble' nothing is computed and None is
        returned after printing a hint.

    Raises
    ------
    AssertionError
        if mode is not 'predict_future' or 'eval_mode'
    NotImplementedError
        if method == 'AR'
    ValueError
        if method is not recognised
    """
    assert mode in ('predict_future', 'eval_mode'), 'unknown mode'

    if output_key is None:
        output_key = f'predicted_{outcome}_{method}_{target_day[-1]}'
        if len(demographic_vars) > 0:
            output_key += '_demographics'

    if method == 'AR':
        raise NotImplementedError("method 'AR' is currently deprecated")

    if method == 'exponential':
        df[output_key] = exponential_modeling.exponential_fit(
            df[outcome].values, mode=mode, target_day=target_day)
        return df

    if method == 'linear':
        df[output_key] = exponential_modeling.linear_fit(
            df[outcome].values, mode=mode, target_day=target_day)
        return df

    if method == 'shared_exponential':
        # Fit a poisson GLM with shared parameters across counties. Input to
        # the poisson GLM is demographic_vars and log(previous_days_deaths+1)
        df[output_key] = exponential_modeling.fit_and_predict_shared_exponential(
            df,
            mode,
            outcome=outcome,
            demographic_vars=demographic_vars,
            target_day=target_day)
        return df

    if method == 'ensemble':
        # Kept as a soft notice rather than an exception so existing callers
        # that tolerate a None result keep working.
        print('please use fit_and_predict_ensemble instead')
        return None

    raise ValueError(f'Unknown method: {method!r}')
def fit_and_predict(train_df,
                    test_df,
                    outcome,
                    method,
                    mode,
                    target_day=np.array([1]),
                    demographic_vars=[]):
    """
    Trains a method (method) to predict a number of days ahead (target_day)
    and writes the predictions for `outcome` to the column
    f'predicted_{outcome}_{method}_{target_day[-1]}' of test_df
    (for 'shared_exponential' the suffix '_demographics' is appended when
    demographic_vars is non-empty).

    Input:
    train_df, test_df: dfs with county level deaths and cases
    outcome: key of the column to predict
    method: string, one of 'exponential', 'shared_exponential', 'ensemble'
            ('AR' is deprecated)
    mode: either 'predict_future' or 'eval_mode'
    predict_future is predicting deaths on FUTURE days, so target_day=np.array([1])) means it predicts tomorrow's deaths
    eval_mode is for evaluating the performance of the classifier. target_day=np.array([k])) will predict the current days death count
    using information from k days ago. target_day= np.array([1,2,3,...,k]) will predict todays deaths, yesterdays deaths, deaths k-1 days ago
    using information from k days ago.
    target_day = np.array([1,2,..,n]) predicts these number of days ahead (can just be np.array([3])) for example if you just want 3 days ahead)
    demographic_vars: demographic columns forwarded to the shared exponential model

    Output:
    test_df, modified in place, with the prediction column added

    Raises:
    AssertionError if mode is unknown, NotImplementedError for method 'AR',
    ValueError for any other unrecognised method
    """
    assert mode in ('predict_future', 'eval_mode'), 'unknown mode'

    pred_column = f'predicted_{outcome}_{method}_{target_day[-1]}'

    if method == 'AR':
        raise NotImplementedError("method 'AR' is currently deprecated")

    if method == 'exponential':
        test_df[pred_column] = exponential_modeling.exponential_fit(
            test_df[outcome].values, mode=mode, target_day=target_day)
        return test_df

    if method == 'shared_exponential':
        # Fit a poisson GLM with shared parameters across counties. Input to
        # the poisson GLM is demographic_vars and log(previous_days_deaths+1)
        shared_preds = exponential_modeling.fit_and_predict_shared_exponential(
            train_df,
            test_df,
            mode,
            outcome=outcome,
            demographic_vars=demographic_vars,
            target_day=target_day)
        if len(demographic_vars) > 0:
            pred_column += '_demographics'
        test_df[pred_column] = shared_preds
        return test_df

    if method == 'ensemble':
        shared_preds = exponential_modeling.fit_and_predict_shared_exponential(
            train_df,
            test_df,
            mode=mode,
            outcome=outcome,
            demographic_vars=demographic_vars,
            target_day=target_day)
        exp_preds = exponential_modeling.exponential_fit(
            test_df[outcome].values, mode=mode, target_day=target_day)

        # In eval mode the weights are computed on data with the last
        # target_day[-1] days held out.
        if mode == 'predict_future':
            weight_df = test_df
        else:
            weight_df = exponential_modeling.leave_t_day_out(
                test_df, target_day[-1])
        weights = pmdl_weight.compute_pmdl_weight(
            weight_df,
            methods=['exponential', 'shared_exponential'],
            outcome=outcome)
        w_exp = weights['exponential']
        w_shared = weights['shared_exponential']
        # NOTE(review): assumes the weights support element-wise `+`
        # (e.g. numpy arrays) -- confirm against compute_pmdl_weight.
        w_total = w_exp + w_shared

        # Per-county weighted average of the two models' predictions.
        blended = []
        for i in range(len(test_df)):
            exp_part = exp_preds[i] * w_exp[i] / w_total[i]
            shared_part = np.array(shared_preds[i]) * w_shared[i] / w_total[i]
            blended.append(exp_part + shared_part)
        test_df[pred_column] = blended
        return test_df

    raise ValueError(f'Unknown method: {method!r}')