Ejemplo n.º 1
0
    def exposure_model(self, model, print_model_results=True):
        """Specify the model for g(A=1,W), which is Pr(A=1|W). The model is handed to propensity_score() and the
        predicted values for the data set are stored as the gA1 attribute

        model:
            -Independent variables to predict the exposure, written as the right-hand side of a formula.
             Example) 'var1 + var2 + var3'
        print_model_results:
            -Whether to print the fitted model results. Default is True (prints results)
        """
        # Full formula: exposure on the left-hand side, user-supplied covariates on the right
        formula = ' ~ '.join((self._exposure, model))
        self._exp_model = formula

        fitted = propensity_score(self.df, formula, mresult=print_model_results)
        self.gA1 = fitted.predict(self.df)

        # Flag recording that the exposure model has been specified
        self._fit_exposure_model = True
Ejemplo n.º 2
0
    def exposure_model(self, model, print_results=True):
        """Specify the propensity score model, i.e. the logistic regression model used to predict the exposure. The
        predicted values are saved in the 'ps' column of the data set

        model:
            -Independent variables to predict the exposure, written as the right-hand side of a formula.
             Example) 'var1 + var2 + var3'
        print_results:
            -Whether to print the fitted model results. Default is True (prints results)
        """
        # Full formula: exposure on the left-hand side, user-supplied covariates on the right
        formula = ' ~ '.join((self._exposure, model))
        self._exp_model = formula

        ps_fit = propensity_score(self.df, formula, print_results=print_results)
        self.df['ps'] = ps_fit.predict(self.df)

        # Flag recording that the exposure model has been specified
        self._fit_exposure_model = True
Ejemplo n.º 3
0
    def exposure_model(self,
                       model,
                       custom_model=None,
                       bound=False,
                       print_results=True):
        """Estimate Pr(A=1|L), referred to as g(A=1|L) in the literature. The predicted probabilities are stored as
        the g1W attribute and their complement (1 - g1W) as g0W

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
        custom_model : optional
            Custom model used in place of the default logit model. The model must have the "fit()" and "predict()"
            attributes. Both sklearn and supylearner are supported as custom models. In the background, TMLE fits
            the custom model and generates the predicted probabilities
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            inference becomes limited to the restricted population. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric truncation. A collection of
            floats can be provided for asymmetric truncation
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)
        """
        self._exp_model = ' ~ '.join((self._exposure, model))

        # Step 3) Estimation of g-model (exposure model)
        if custom_model is not None:
            # User-specified prediction model: the design matrix is built without an intercept column
            design = patsy.dmatrix(model + ' - 1', self.df)
            self.g1W = _exposure_machine_learner(
                xdata=np.asarray(design),
                ydata=np.asarray(self.df[self._exposure]),
                ml_model=custom_model,
                print_results=print_results)
        else:
            # Default model fit through propensity_score()
            fitted = propensity_score(self.df,
                                      self._exp_model,
                                      print_results=print_results)
            self.g1W = fitted.predict(self.df)

        # The complement is taken before any bounding, then each set of probabilities is bounded separately
        self.g0W = 1 - self.g1W
        if bound:
            self.g1W = self._bounding(self.g1W, bounds=bound)
            self.g0W = self._bounding(self.g0W, bounds=bound)

        # Flag recording that the exposure model has been specified
        self._fit_exposure_model = True
Ejemplo n.º 4
0
    def exposure_model(self, model, print_results=True):
        """Specify the propensity score / inverse probability weight model, i.e. the logistic regression model used
        to predict the exposure. The predicted values are saved in the 'ps' column of the data set

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. For example, 'var1 + var2 + var3'
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)
        """
        # Full formula: exposure on the left-hand side, user-supplied covariates on the right
        formula = ' ~ '.join((self._exposure, model))
        self._exp_model = formula

        ps_fit = propensity_score(self.df, formula, print_results=print_results)
        self.df['ps'] = ps_fit.predict(self.df)

        # Flag recording that the exposure model has been specified
        self._fit_exposure_model = True
Ejemplo n.º 5
0
    def exposure_model(self, model, print_results=True):
        r"""Specify the propensity score / inverse probability weight model, i.e. the logistic regression model used
        to predict the exposure. This model estimates

        .. math::

            \widehat{\Pr}(A=1|L) = logit^{-1}(\widehat{\beta_0} + \widehat{\beta} L)

        The predicted values are saved in the '_ps_' column of the data set

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. For example, 'var1 + var2 + var3'
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)
        """
        # Full formula: exposure on the left-hand side, user-supplied covariates on the right
        formula = ' ~ '.join((self._exposure, model))
        self._exp_model = formula

        # The weights stored on the instance are forwarded to the model fit
        ps_fit = propensity_score(self.df,
                                  formula,
                                  weights=self._weight_,
                                  print_results=print_results)
        self.df['_ps_'] = ps_fit.predict(self.df)

        # Flag recording that the exposure model has been specified
        self._fit_exposure_ = True
Ejemplo n.º 6
0
    def missing_model(self, model, custom_model=None, print_results=True):
        """Estimate Pr(M=1|A,L), the missing data mechanism for the outcome. The corresponding observation
        probabilities are used to update the clever covariates for estimation of Qn.

        The initial estimate of Q is still based on complete observations only. Predictions are generated twice, once
        with the exposure of every observation set to 1 (stored as m1W) and once set to 0 (stored as m0W)

        Parameters
        ----------
        model : str
            Independent variables to predict missingness of the outcome. Example) 'var1 + var2 + var3'. The
            treatment must be included for the missing data model
        custom_model : optional
            Custom model used in place of the default logit model. The model must have the "fit()" and "predict()"
            attributes. Both sklearn and supylearner are supported as custom models. In the background, TMLE fits
            the custom model and generates the predicted probabilities
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)

        Raises
        ------
        ValueError
            If the data set has not been flagged as containing missing outcome data
        """
        # Nothing to model when the outcome has no missing data
        if not self._miss_flag:
            raise ValueError(
                "No missing outcome data is present in the data set")

        # Plain substring check of the formula; only a warning, since the model can still be fit
        exposure_in_model = self._exposure in model
        if not exposure_in_model:
            warnings.warn(
                "For the specified missing outcome model, the exposure variable should be included in the "
                "model", UserWarning)

        self._miss_model = ' ~ '.join((self._missing_indicator, model))

        def _everyone_set_to(level):
            # Copy of the data set with the exposure of every observation set to `level`
            frame = self.df.copy()
            frame[self._exposure] = level
            return frame

        # Step 3b) Prediction for M if missing outcome data exists
        if custom_model is not None:
            # User-specified model: design matrices are built without an intercept column
            design_formula = model + ' - 1'
            observed = patsy.dmatrix(design_formula, self.df)
            all_exposed = patsy.dmatrix(design_formula, _everyone_set_to(1))
            none_exposed = patsy.dmatrix(design_formula, _everyone_set_to(0))

            self.m1W, self.m0W = _missing_machine_learner(
                xdata=np.array(observed),
                mdata=self.df[self._missing_indicator],
                all_a=all_exposed,
                none_a=none_exposed,
                ml_model=custom_model,
                print_results=print_results)
        else:
            # Default model fit through propensity_score()
            fitted = propensity_score(self.df,
                                      self._miss_model,
                                      print_results=print_results)
            self.m1W = fitted.predict(_everyone_set_to(1))
            self.m0W = fitted.predict(_everyone_set_to(0))

        # Flag recording that the missing data model has been specified
        self._fit_missing_model = True
Ejemplo n.º 7
0
    def exposure_model(self,
                       model,
                       custom_model=None,
                       bound=False,
                       print_results=True):
        """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature. The predicted probabilities are
        stored as the g1W attribute and their complement (1 - g1W) as g0W

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
        custom_model : optional
            Input for a custom model that is used in place of the logit model (default). The model must have the
            "fit()" function accepting the arguments 'X' and 'y', and either the "predict_proba()" or "predict()"
            attribute. Both sklearn and supylearner are supported as custom models. In the background, TMLE will fit
            the custom model to the exposure and generate the predicted probabilities
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            inference becomes limited to the restricted population. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric truncation. A collection of
            floats can be provided for asymmetric truncation
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results). For a custom model, results
            are only printed when the fitted model has a "summarize()" attribute

        Raises
        ------
        TypeError
            If fitting the custom model raises a TypeError (e.g. "fit()" does not accept 'X' and 'y')
        ValueError
            If the fitted custom model has neither the "predict_proba()" nor the "predict()" attribute
        """
        self._exp_model = self._exposure + ' ~ ' + model

        # Step 3) Estimation of g-model (exposure model)
        if custom_model is None:
            fitmodel = propensity_score(self.df,
                                        self._exp_model,
                                        print_results=print_results)
            self.g1W = fitmodel.predict(self.df)

        # User-specified prediction model
        else:
            # Design matrix is built without an intercept column
            data = patsy.dmatrix(model + ' - 1', self.df)
            try:
                # The g-model predicts the exposure, so the exposure (not the outcome) is the target variable
                fm = custom_model.fit(X=data, y=self.df[self._exposure])
            except TypeError:
                raise TypeError(
                    "Currently custom_model must have the 'fit' function with arguments 'X', 'y'. This "
                    "covers both sklearn and supylearner. If there is a predictive model you would "
                    "like to use, please open an issue at https://github.com/pzivich/zepid and I "
                    "can work on adding support")
            if print_results and hasattr(fm, 'summarize'):
                fm.summarize()
            # Prefer class probabilities when available; column 1 is taken as Pr(A=1)
            if hasattr(fm, 'predict_proba'):
                self.g1W = fm.predict_proba(data)[:, 1]
            elif hasattr(fm, 'predict'):
                self.g1W = fm.predict(data)
            else:
                raise ValueError(
                    "Currently custom_model must have 'predict' or 'predict_proba' attribute"
                )

        # The complement is taken before any bounding, then each set of probabilities is bounded separately
        self.g0W = 1 - self.g1W
        if bound:  # Bounding predicted probabilities if requested
            self.g1W = self._bounding(self.g1W, bounds=bound)
            self.g0W = self._bounding(self.g0W, bounds=bound)

        # Flag recording that the exposure model has been specified
        self._fit_exposure_model = True
Ejemplo n.º 8
0
    def exposure_model(self,
                       model,
                       custom_model=None,
                       bound=False,
                       print_results=True):
        """Estimation of g(A=1,W), which is Pr(A=1|W). The predicted probabilities are stored as the gW attribute

        model:
            -Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
        custom_model:
            -Input for a custom model. The model must already be estimated and have the "predict()" attribute to work.
             This allows the user to use any outside model they want and bring it into TMLE. For example, you can use
             any sklearn model, ensemble model (SuPyLearner), or just different statsmodels regression models than
             logistic regression. Please see online for an example
             NOTE: if a custom model is used, patsy in the background does the data filtering from the equation above.
             The equation order of variables MUST match that of the custom_model when it was fit. If not, this can lead
             to unexpected estimates
        bound:
            -Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations. Default
             is False, meaning no truncation of predicted probabilities occurs. Providing a single float assumes
             symmetric truncation. A collection of floats can be provided for asymmetric truncation
        print_results:
            -Whether to print the fitted model results. Default is True (prints results). Only used for the default
             logistic regression model; it is not referenced when a custom_model is given

        Raises an AttributeError if custom_model lacks the prediction attribute(s) named in the error message
        """
        self._exp_model = self._exposure + ' ~ ' + model

        # Base logistic regression model to generate predicted probabilities
        if custom_model is None:
            fitmodel = propensity_score(self.df,
                                        self._exp_model,
                                        print_results=print_results)
            self.gW = fitmodel.predict(self.df)
            if bound is not False:  # Bounding predicted probabilities if requested (any value other than False)
                self.gW = self._bounding(self.gW, bounds=bound)

        # User-specified prediction model (already fit by the user; only predictions are generated here)
        else:
            # Two-stage 'try': the first attempt uses the design matrix of the formula as given. Any ValueError raised
            # inside it (building the matrix, predicting, or bounding) triggers a second attempt without an intercept
            # NOTE(review): presumably the ValueError signals a column mismatch caused by the intercept -- confirm
            try:  # First attempt: formula as given
                data = patsy.dmatrix(model, self.df)
                try:
                    # NOTE(review): only predict() is used here, while the fallback below prefers predict_proba()
                    self.gW = custom_model.predict(data)
                    if bound is not False:  # Bounding predicted probabilities if requested
                        self.gW = self._bounding(self.gW, bounds=bound)
                except AttributeError:
                    raise AttributeError(
                        "custom_model does not have the 'predict()' attribute")
            except ValueError:  # Second attempt: same formula with ' - 1' appended (no intercept column)
                data = patsy.dmatrix(model + ' - 1', self.df)
                try:
                    # Class probabilities are preferred when available; column 1 is taken as Pr(A=1)
                    if hasattr(custom_model, 'predict_proba'):
                        self.gW = custom_model.predict_proba(data)[:, 1]
                    else:
                        self.gW = custom_model.predict(data)
                    if bound is not False:  # Bounding predicted probabilities if requested
                        self.gW = self._bounding(self.gW, bounds=bound)
                except AttributeError:
                    raise AttributeError(
                        "custom_model does not have the 'predict()' or 'predict_proba()' attribute"
                    )
        # Setting flag for fit() check to make sure exposure model was specified
        self._fit_exposure_model = True