Example #1
0
class ModelPipeline:
    """
    Base class for a model generator.

    fit() and predict() are no-op hooks in this base class; an inheriting class
    supplies the real implementations and should call setup_pipeline() at the
    end of its own __init__.
    If a model needs to have initial parameters started for the predictive validity,
    put that in run_init_model
    """
    def __init__(self,
                 all_data,
                 col_t,
                 col_obs,
                 col_group,
                 col_obs_compare,
                 all_cov_names,
                 fun,
                 predict_space,
                 obs_se_func=None):
        """
        Base class for a model pipeline. At minimum needs the following arguments for a
        model pipeline.

        The pipeline keeps its own sorted copy of all_data; the data frame that is
        passed in is never modified.

        Args:
            all_data: (pd.DataFrame) of *all* the data that will go into this modeling pipeline
            col_t: (str) name of the column with time
            col_group: (str) name of the column with the group in it
            col_obs: (str) the name of the column with observations for fitting the model
            col_obs_compare: (str) the name of the column that will be used for predictive validity comparison
            all_cov_names: List[str] list of name(s) of covariate(s). Not the same as the covariate specifications
                that are required by CurveModel in order of parameters. You should exclude intercept from this list.
            fun: (callable) the space to fit in, one of curvefit.functions
            predict_space: (callable) the space to do predictive validity in, one of curvefit.functions
            obs_se_func: (optional) function to get observation standard error from col_t

        Attributes:
            self.all_data: (pd.DataFrame) copy of all_data sorted by group and time, with an
                'obs_se' column added when obs_se_func is given
            self.col_obs_se: (str or None) 'obs_se' when obs_se_func is given, otherwise None
            self.groups: (list) sorted unique values of the group column
            self.pv: (curvefit.pv.PVModel) predictive validity model
            self.forecaster: (curvefit.forecaster.Forecaster) residual forecasting tool
            self.mean_predictions: (dict) dictionary of mean predictions keyed by group
            self.simulated_data: (dict) dictionary of simulated datasets keyed by group
            self.draws: (dict) dictionary of resulting draws keyed by group
        """
        self.col_t = col_t
        self.col_group = col_group
        self.col_obs = col_obs
        self.col_obs_compare = col_obs_compare
        self.all_cov_names = all_cov_names
        self.fun = fun
        self.predict_space = predict_space
        self.obs_se_func = obs_se_func

        # Work on a private copy so that adding the standard-error column and
        # sorting below never touch the caller's data frame.
        data = all_data.copy()

        if self.obs_se_func is not None:
            self.col_obs_se = 'obs_se'
            data[self.col_obs_se] = data[self.col_t].apply(self.obs_se_func)
        else:
            self.col_obs_se = None

        # these are the attributes that can't be used to initialize a
        # CurveModel but are needed to initialize the ModelPipeline
        self.pop_cols = [
            'all_data', 'all_cov_names', 'col_obs_compare', 'predict_space',
            'obs_se_func'
        ]

        self.all_data = data.sort_values([self.col_group, self.col_t])
        self.groups = sorted(self.all_data[self.col_group].unique())

        # Built later by setup_pipeline()
        self.pv = None
        self.forecaster = None

        # Filled in by create_draws()
        self.mean_predictions = None
        self.simulated_data = None
        self.draws = None
        self.draw_models = None

    def setup_pipeline(self):
        """
        Sets up the pipeline for running predictive validity and forecasting data out.
        Should be run at the end of the inheriting class' init so that the self.generate()
        gets the model settings to be run for all models.
        """
        self.pv = PVModel(data=self.all_data,
                          col_t=self.col_t,
                          col_group=self.col_group,
                          col_obs=self.col_obs,
                          col_obs_compare=self.col_obs_compare,
                          predict_space=self.predict_space,
                          model_generator=self.generate())
        self.forecaster = Forecaster()

    def run_init_model(self):
        """
        Runs the model that doesn't need to be run multiple times.
        In this base class that only means calling self.refresh().
        """
        self.refresh()

    def refresh(self):
        """
        Clear the current model results. No-op hook in the base class.
        """
        pass

    def generate(self):
        """
        Generate a copy of this class.

        Returns:
            a deep copy of this pipeline, independent of the original
        """
        return deepcopy(self)

    def fit(self, df, group=None):
        """
        Function to fit the model with a given data frame. No-op hook in the base class.
        Args:
            df: (pd.DataFrame)
            group: (str) optional group to use in whatever capacity is needed for calling this function
        """
        pass

    def predict(self, times, predict_space, predict_group):
        """
        Function to create predictions based on the model fit. No-op hook in the base class
        (returns None until overridden).
        Args:
            times: (np.array) of times to predict at
            predict_space: (callable) curvefit.functions function to predict in that space
            predict_group: which group to make predictions for
        """
        pass

    def run_predictive_validity(self, theta):
        """
        Run predictive validity for the full model.

        Args:
            theta: amount of scaling for residuals relative to prediction.
        """
        self.pv.run_pv(theta=theta)

    def fit_residuals(self, smoothed_radius, exclude_below, exclude_groups):
        """
        Fits residuals given a smoothed radius, and some models to exclude.
        Exclude below excludes models that do not have more than that many data points.
        Exclude groups excludes all models from the list of groups regardless of the data points.

        Args:
            smoothed_radius: List[int] 2-element list of amount of smoothing for the residuals
            exclude_groups: List[str] which groups to exclude from the residual analysis
            exclude_below: (int) only residuals whose num_data is strictly greater than
                exclude_below are kept for the analysis
        """
        smoothed = self.pv.get_smoothed_residuals(radius=smoothed_radius)

        # Keep rows with enough data points that are not in an excluded group.
        enough_data = smoothed.loc[smoothed['num_data'] > exclude_below].copy()
        in_excluded_group = enough_data[self.col_group].isin(exclude_groups)
        residual_data = enough_data.loc[~in_excluded_group].copy()

        self.forecaster.fit_residuals(
            residual_data=residual_data,
            mean_col='residual_mean',
            std_col='residual_std',
            residual_covariates=['far_out', 'num_data'],
            residual_model_type='linear')

    def create_draws(self,
                     num_draws,
                     num_forecast_out,
                     prediction_times,
                     theta=1,
                     std_threshold=1e-2):
        """
        Generate draws for a model pipeline, smoothing over a neighbor radius of residuals
        for far out and num data points.

        The model is first fit to all of the data to get the mean predictions. Then, for
        every draw, one simulated dataset per group is stacked together, a copy of this
        pipeline is fit to it, and that copy's predictions are stored per group.

        Args:
            num_draws: (int) the number of draws to take
            num_forecast_out: (int) how far out into the future should residual simulations be taken
            prediction_times: (np.array) which times to produce final predictions at
            std_threshold: (float) floor for standard deviation
            theta: (float) between 0 and 1, how much scaling of the residuals to do relative to the prediction mean

        Returns:
            self, with mean_predictions, simulated_data and draws populated

        Raises:
            RuntimeError: if setup_pipeline or run_predictive_validity has not been run yet
        """
        if self.pv is None:
            raise RuntimeError(
                "Need to first set up the pipeline with self.setup_pipeline.")
        if self.pv.all_residuals is None:
            raise RuntimeError(
                "Need to first run predictive validity with self.run_predictive_validity."
            )

        # Copy the pipeline before fitting; the copy is refit for every draw.
        generator = self.generate()

        self.mean_predictions = {}
        self.simulated_data = {}

        self.fit(df=self.all_data)

        for group in self.groups:
            self.simulated_data[group] = self.forecaster.simulate(
                mp=self,
                far_out=num_forecast_out,
                num_simulations=num_draws,
                group=group,
                theta=theta,
                epsilon=std_threshold)
            self.mean_predictions[group] = self.predict(
                times=prediction_times,
                predict_space=self.predict_space,
                predict_group=group)

        self.draws = {group: [] for group in self.groups}

        for i in range(num_draws):
            # Stack the i-th simulated dataset of every group into one frame.
            new_data = pd.concat(
                [self.simulated_data[group][i] for group in self.groups])

            print(f"Creating {i}th draw.", end='\r')
            generator.fit(df=new_data)

            for group in self.groups:
                self.draws[group].append(
                    generator.predict(times=prediction_times,
                                      predict_space=self.predict_space,
                                      predict_group=group))

        return self
Example #2
0
class ModelPipeline:
    """
    Base class for a model generator.

    fit() and predict() are no-op hooks in this base class; an inheriting class
    supplies the real implementations and should call setup_pipeline() at the
    end of its own __init__.
    If a model needs to have initial parameters started for the predictive validity,
    put that in run_init_model
    """
    def __init__(self,
                 all_data,
                 col_t,
                 col_obs,
                 col_group,
                 col_obs_compare,
                 all_cov_names,
                 fun,
                 predict_space,
                 obs_se_func=None):
        """
        Base class for a model pipeline. At minimum needs the following arguments for a
        model pipeline.

        The pipeline keeps its own sorted copy of all_data; the data frame that is
        passed in is never modified.

        Args:
            all_data: (pd.DataFrame) of *all* the data that will go into this modeling pipeline
            col_t: (str) name of the column with time
            col_group: (str) name of the column with the group in it
            col_obs: (str) the name of the column with observations for fitting the model
            col_obs_compare: (str) the name of the column that will be used for predictive validity comparison
            all_cov_names: List[str] list of name(s) of covariate(s). Not the same as the covariate specifications
                that are required by CurveModel in order of parameters. You should exclude intercept from this list.
            fun: (callable) the space to fit in, one of curvefit.functions
            predict_space: (callable) the space to do predictive validity in, one of curvefit.functions
            obs_se_func: (optional) function to get observation standard error from col_t

        Attributes:
            self.all_data: (pd.DataFrame) copy of all_data sorted by group and time, with an
                'obs_se' column added when obs_se_func is given
            self.col_obs_se: (str or None) 'obs_se' when obs_se_func is given, otherwise None
            self.groups: (list) sorted unique values of the group column
            self.pv: (curvefit.pv.PVModel) predictive validity model
            self.forecaster: (curvefit.forecaster.Forecaster) residual forecasting tool
            self.mean_predictions: (dict) dictionary of mean predictions keyed by group
            self.simulated_data: (dict) dictionary of simulated datasets keyed by group
            self.draws: (dict) dictionary of resulting draws keyed by group
        """
        self.col_t = col_t
        self.col_group = col_group
        self.col_obs = col_obs
        self.col_obs_compare = col_obs_compare
        self.all_cov_names = all_cov_names
        self.fun = fun
        self.predict_space = predict_space
        self.obs_se_func = obs_se_func

        # Work on a private copy so that adding the standard-error column and
        # sorting below never touch the caller's data frame.
        data = all_data.copy()

        if self.obs_se_func is not None:
            self.col_obs_se = 'obs_se'
            data[self.col_obs_se] = data[self.col_t].apply(self.obs_se_func)
        else:
            self.col_obs_se = None

        # these are the attributes that can't be used to initialize a
        # CurveModel but are needed to initialize the ModelPipeline
        self.pop_cols = [
            'all_data', 'all_cov_names', 'col_obs_compare', 'predict_space',
            'obs_se_func'
        ]

        self.all_data = data.sort_values([self.col_group, self.col_t])
        self.groups = sorted(self.all_data[self.col_group].unique())

        # Built later by setup_pipeline()
        self.pv = None
        self.forecaster = None

        # mean_predictions and draws are filled in by create_draws()
        self.mean_predictions = None
        self.simulated_data = None
        self.draws = None
        self.draw_models = None

    @staticmethod
    def _require_type(name, value, expected):
        """
        Raise a TypeError unless value is an instance of expected.

        bool is rejected explicitly: it is a subclass of int, but a flag is
        never a meaningful count or threshold here.
        """
        if isinstance(value, bool) or not isinstance(value, expected):
            raise TypeError(
                f"{name} must be of type {expected.__name__}, "
                f"got {type(value).__name__}.")

    def run(self, n_draws, prediction_times, cv_threshold, smoothed_radius,
            exclude_below, exclude_groups=None):
        """
        Runs the whole model with PV and forecasting residuals and creating draws.

        Args:
            n_draws: (int) number of draws to produce
            prediction_times: (np.array) array of times to make predictions at
            cv_threshold: (float) lower bound on the coefficient of variation
                for the residuals simulation
            smoothed_radius: List[int] residual smoothing before running the
                residual forecast -- how many neighbors to look at, e.g. [3, 3]
                would smooth over a radius of 3
            exclude_below: (int) exclude results from the predictive validity analysis
                that do not have more than this many data points -- just for going into the regression
                to predict the coefficient of variation (low numbers of data points makes this unstable)
            exclude_groups: (optional) List[str] groups to leave out of the residual fitting.
                Defaults to ['Wuhan City, Hubei'] when not given; pass an empty list to
                exclude nothing.

        Raises:
            TypeError: if n_draws, cv_threshold, smoothed_radius or exclude_below
                has the wrong type
        """
        # Explicit checks rather than assert statements, which are removed
        # when Python runs with -O.
        self._require_type('n_draws', n_draws, int)
        self._require_type('cv_threshold', cv_threshold, float)
        self._require_type('smoothed_radius', smoothed_radius, list)
        self._require_type('exclude_below', exclude_below, int)

        if exclude_groups is None:
            # Historical default: excludes Wuhan from the residual fitting.
            exclude_groups = ['Wuhan City, Hubei']

        # Setup the initial model (optional for some subclasses)
        self.run_init_model()

        # Run predictive validity with a theta = 1, means everything is in relative space
        # -- relative mean bias, relative standard deviation (coefficient of variation)
        self.run_predictive_validity(theta=1)

        # Right now only std_covariates are used.
        self.fit_residuals(smoothed_radius=smoothed_radius,
                           exclude_below=exclude_below,
                           mean_covariates=['num_data_transformed', 'far_out'],
                           std_covariates=['log_num_data_transformed'],
                           exclude_groups=exclude_groups)

        # Create draws. Access them in self.draws by location.
        self.create_draws(num_draws=n_draws,
                          std_threshold=cv_threshold,
                          prediction_times=prediction_times,
                          theta=1)

    def setup_pipeline(self):
        """
        Sets up the pipeline for running predictive validity and forecasting data out.
        Should be run at the end of the inheriting class' init so that the self.generate()
        gets the model settings to be run for all models.
        """
        self.pv = PVModel(data=self.all_data,
                          col_t=self.col_t,
                          col_group=self.col_group,
                          col_obs=self.col_obs,
                          col_obs_compare=self.col_obs_compare,
                          predict_space=self.predict_space,
                          model_generator=self.generate())
        self.forecaster = Forecaster()

    def run_init_model(self):
        """
        Runs the model that doesn't need to be run multiple times.
        In this base class that only means calling self.refresh().
        """
        self.refresh()

    def refresh(self):
        """
        Clear the current model results. No-op hook in the base class.
        """
        pass

    def generate(self):
        """
        Generate a copy of this class.

        Returns:
            a deep copy of this pipeline, independent of the original
        """
        return deepcopy(self)

    def fit(self, df, group=None):
        """
        Function to fit the model with a given data frame. No-op hook in the base class.
        Args:
            df: (pd.DataFrame)
            group: (str) optional group to use in whatever capacity is needed for calling this function
        """
        pass

    def predict(self, times, predict_space, predict_group):
        """
        Function to create predictions based on the model fit. No-op hook in the base class
        (returns None until overridden).
        Args:
            times: (np.array) of times to predict at
            predict_space: (callable) curvefit.functions function to predict in that space
            predict_group: which group to make predictions for
        """
        pass

    def run_predictive_validity(self, theta):
        """
        Run predictive validity for the full model.

        Args:
            theta: amount of scaling for residuals relative to prediction.
        """
        self.pv.run_pv(theta=theta)

    def fit_residuals(self,
                      smoothed_radius,
                      mean_covariates,
                      std_covariates,
                      exclude_below,
                      exclude_groups,
                      std_floor=1e-5):
        """
        Fits residuals given a smoothed radius, and some models to exclude.
        Exclude below excludes models that do not have more than that many data points.
        Exclude groups excludes all models from the list of groups regardless of the data points.

        Args:
            smoothed_radius: List[int] 2-element list of amount of smoothing for the residuals
            mean_covariates: List[str] which covariates to use to predict the residuals
                choices of num_data, far_out, and data_index (where data_index = far_out + num_data)
            std_covariates: List[str] which covariates to use to predict the coefficient of variation
                in the residuals
            exclude_groups: List[str] which groups to exclude from the residual analysis
            exclude_below: (int) only residuals whose num_data is strictly greater than
                exclude_below are kept for the analysis
            std_floor: (float) minimum standard deviation (or coefficient of variation given theta)
                for the regression inputs
        """
        smoothed = self.pv.get_smoothed_residuals(radius=smoothed_radius)

        # Keep rows with enough data points that are not in an excluded group.
        enough_data = smoothed.loc[smoothed['num_data'] > exclude_below].copy()
        in_excluded_group = enough_data[self.col_group].isin(exclude_groups)
        residual_data = enough_data.loc[~in_excluded_group].copy()

        # Floor the standard deviation in one vectorised step.
        residual_data['residual_std'] = residual_data['residual_std'].clip(
            lower=std_floor)

        self.forecaster.fit_residuals(residual_data=residual_data,
                                      mean_col='residual_mean',
                                      std_col='residual_std',
                                      mean_covariates=mean_covariates,
                                      std_covariates=std_covariates,
                                      residual_model_type='linear')

    def create_draws(self,
                     num_draws,
                     prediction_times,
                     theta=1,
                     std_threshold=1e-2):
        """
        Generate draws for a model pipeline, smoothing over a neighbor radius of residuals
        for far out and num data points.

        Args:
            num_draws: (int) the number of draws to take
            prediction_times: (np.array) which times to produce final predictions (draws) at
            std_threshold: (float) floor for standard deviation
            theta: (float) between 0 and 1, how much scaling of the residuals to do relative to the prediction mean

        Returns:
            self, with mean_predictions and draws populated

        Raises:
            RuntimeError: if setup_pipeline or run_predictive_validity has not been run yet
        """
        if self.pv is None:
            raise RuntimeError(
                "Need to first set up the pipeline with self.setup_pipeline.")
        if self.pv.all_residuals is None:
            raise RuntimeError(
                "Need to first run predictive validity with self.run_predictive_validity."
            )

        # Get the best fit we can
        self.fit(df=self.all_data)

        # Get the mean prediction for each group
        self.mean_predictions = {
            group: self.predict(times=prediction_times,
                                predict_space=self.predict_space,
                                predict_group=group)
            for group in self.groups
        }

        # Loop through each group, forecasting the residuals and making draws
        self.draws = {}
        for group in self.groups:
            self.draws[group] = self.forecaster.simulate(
                mp=self,
                num_simulations=num_draws,
                prediction_times=prediction_times,
                group=group,
                theta=theta,
                epsilon=std_threshold)

        return self

    def plot_draws(self, prediction_times, sharex, sharey):
        """
        Plot the draws of this pipeline by handing it to plot_uncertainty.

        Args:
            prediction_times: times passed on to plot_uncertainty
            sharex: passed on unchanged to plot_uncertainty
                (presumably an axis-sharing flag -- confirm against plot_uncertainty)
            sharey: passed on unchanged to plot_uncertainty
                (presumably an axis-sharing flag -- confirm against plot_uncertainty)
        """
        plot_uncertainty(generator=self,
                         sharex=sharex,
                         sharey=sharey,
                         prediction_times=prediction_times)