def exposure_model(self, model, print_model_results=True):
    """Fit the exposure model g(A=1, W), i.e. Pr(A=1|W).

    The fitted model's predictions for every row of ``self.df`` are stored
    in ``self.gA1``.

    Parameters
    ----------
    model : str
        Independent variables used to predict the exposure.
        Example) 'var1 + var2 + var3'
    print_model_results : bool, optional
        Whether to print the fitted model results. Default is True (prints
        results)
    """
    # Full regression formula: exposure on the left, covariates on the right
    formula = ' ~ '.join([self._exposure, model])
    self._exp_model = formula

    fitted = propensity_score(self.df, formula, mresult=print_model_results)
    self.gA1 = fitted.predict(self.df)

    # Record that the exposure model has been specified
    self._fit_exposure_model = True
def exposure_model(self, model, print_results=True):
    """Specify the propensity score model.

    The exposure is regressed on the given covariates via
    ``propensity_score`` and the predicted values are written to the 'ps'
    column of ``self.df``.

    Parameters
    ----------
    model : str
        Independent variables used to predict the exposure.
        Example) 'var1 + var2 + var3'
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints
        results)
    """
    # Full regression formula: exposure on the left, covariates on the right
    formula = ' ~ '.join([self._exposure, model])
    self._exp_model = formula

    fitted = propensity_score(self.df, formula, print_results=print_results)
    self.df['ps'] = fitted.predict(self.df)

    # Record that the exposure model has been specified
    self._fit_exposure_model = True
def exposure_model(self, model, custom_model=None, bound=False, print_results=True):
    """Estimate Pr(A=1|L), written g(A=1|L) in the literature.

    Stores the predicted probability of exposure in ``self.g1W`` and its
    complement in ``self.g0W``.

    Parameters
    ----------
    model : str
        Independent variables used to predict the exposure.
        Example) 'var1 + var2 + var3'
    custom_model : optional
        Custom model used in place of the default logit model. It is handed
        to ``_exposure_machine_learner`` together with the design matrix
        (built without an intercept column) and the observed exposure.
    bound : float, list, optional
        Value(s) between 0 and 1 used to truncate the predicted
        probabilities, passed through to ``self._bounding``. Default is
        False, meaning no truncation is applied.
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints
        results)
    """
    self._exp_model = ' ~ '.join([self._exposure, model])

    if custom_model is not None:
        # User-specified learner: build the covariate matrix without an intercept
        design = patsy.dmatrix(model + ' - 1', self.df)
        pr_exposed = _exposure_machine_learner(
            xdata=np.asarray(design),
            ydata=np.asarray(self.df[self._exposure]),
            ml_model=custom_model,
            print_results=print_results,
        )
    else:
        # Default: logistic regression for the g-model
        fitted = propensity_score(self.df, self._exp_model, print_results=print_results)
        pr_exposed = fitted.predict(self.df)

    self.g1W = pr_exposed
    self.g0W = 1 - self.g1W

    # Truncate both probabilities only when a bound was requested
    if bound:
        self.g1W = self._bounding(self.g1W, bounds=bound)
        self.g0W = self._bounding(self.g0W, bounds=bound)

    self._fit_exposure_model = True
def exposure_model(self, model, print_results=True):
    """Specify the propensity score / inverse probability weight model.

    The exposure is regressed on the supplied covariates through
    ``propensity_score``; the resulting predictions are saved in the 'ps'
    column of ``self.df``.

    Parameters
    ----------
    model : str
        Independent variables used to predict the exposure. For example,
        'var1 + var2 + var3'
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints
        results)
    """
    exposure_formula = ' ~ '.join([self._exposure, model])
    self._exp_model = exposure_formula

    ps_fit = propensity_score(self.df, exposure_formula, print_results=print_results)
    predicted = ps_fit.predict(self.df)
    self.df['ps'] = predicted

    # Flag that the exposure model has been specified
    self._fit_exposure_model = True
def exposure_model(self, model, print_results=True):
    r"""Specify the propensity score / inverse probability weight model.

    The exposure is predicted from the supplied covariates via a logistic
    regression model, which estimates

    .. math::

        \widehat{\Pr}(A=1|L) = logit^{-1}(\widehat{\beta_0} + \widehat{\beta} L)

    The model is fit with ``self._weight_`` passed as the weights, and the
    predictions are written to the '_ps_' column of ``self.df``.

    Parameters
    ----------
    model : str
        Independent variables used to predict the exposure. For example,
        'var1 + var2 + var3'
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints
        results)
    """
    exposure_formula = ' ~ '.join([self._exposure, model])
    self._exp_model = exposure_formula

    ps_fit = propensity_score(
        self.df,
        exposure_formula,
        weights=self._weight_,
        print_results=print_results,
    )
    self.df['_ps_'] = ps_fit.predict(self.df)

    # Flag that the exposure model has been specified
    self._fit_exposure_ = True
def missing_model(self, model, custom_model=None, print_results=True):
    """Estimate Pr(M=1|A,L), the missing data mechanism for the outcome.

    Predicted probabilities are generated twice, once with the exposure set
    to 1 for every row and once with it set to 0, and stored in ``self.m1W``
    and ``self.m0W`` respectively.

    Parameters
    ----------
    model : str
        Independent variables used to predict the missingness indicator.
        Example) 'var1 + var2 + var3'. The treatment should be included in
        this model; a warning is issued if its name does not appear
    custom_model : optional
        Custom model used in place of the default logit model. It is handed
        to ``_missing_machine_learner`` together with the observed and
        exposure-fixed design matrices (built without an intercept column)
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints
        results)

    Raises
    ------
    ValueError
        If the data set has no missing outcome data (``self._miss_flag`` is
        falsy)
    """
    # Guard: nothing to model when no outcome is missing
    if not self._miss_flag:
        raise ValueError("No missing outcome data is present in the data set")

    # The mechanism is conditional on A, so nudge the user when the exposure is absent
    if self._exposure not in model:
        warnings.warn("For the specified missing outcome model, the exposure variable should be included in the "
                      "model", UserWarning)

    self._miss_model = self._missing_indicator + ' ~ ' + model

    def _with_exposure(value):
        # Copy of the data in which every row has the exposure fixed to `value`
        fixed = self.df.copy()
        fixed[self._exposure] = value
        return fixed

    if custom_model is None:
        # Default: logistic regression for the missingness mechanism
        fitted = propensity_score(self.df, self._miss_model, print_results=print_results)
        self.m1W = fitted.predict(_with_exposure(1))
        self.m0W = fitted.predict(_with_exposure(0))
    else:
        # User-specified learner: design matrices without an intercept column
        no_intercept = model + ' - 1'
        observed = patsy.dmatrix(no_intercept, self.df)
        all_exposed = patsy.dmatrix(no_intercept, _with_exposure(1))
        none_exposed = patsy.dmatrix(no_intercept, _with_exposure(0))
        self.m1W, self.m0W = _missing_machine_learner(
            xdata=np.array(observed),
            mdata=self.df[self._missing_indicator],
            all_a=all_exposed,
            none_a=none_exposed,
            ml_model=custom_model,
            print_results=print_results,
        )

    self._fit_missing_model = True
def exposure_model(self, model, custom_model=None, bound=False, print_results=True):
    """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature

    The predicted probability of exposure is stored in ``self.g1W`` and its
    complement in ``self.g0W``.

    Parameters
    ----------
    model : str
        Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
    custom_model : optional
        Input for a custom model that is used in place of the logit model (default). The model must have a
        ``fit(X, y)`` method that returns the fitted model, and that fitted model must have either
        ``predict_proba()`` or ``predict()``. The custom model is fit to the exposure here and then used to
        generate the predicted probabilities. If the fitted model has a ``summarize()`` method, it is called
        when `print_results` is True
    bound : float, list, optional
        Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
        Specifying this argument can improve finite sample performance for random positivity violations. However,
        inference becomes limited to the restricted population. Default is False, meaning no truncation of
        predicted probabilities occurs. Providing a single float assumes symmetric truncation. A collection of
        floats can be provided for asymmetric truncation
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints results)

    Raises
    ------
    TypeError
        If ``custom_model.fit`` does not accept the keyword arguments ``X`` and ``y``
    ValueError
        If the fitted custom model has neither ``predict_proba`` nor ``predict``
    """
    self._exp_model = self._exposure + ' ~ ' + model

    # Step 3) Estimation of g-model (exposure model)
    if custom_model is None:
        fitmodel = propensity_score(self.df, self._exp_model, print_results=print_results)
        self.g1W = fitmodel.predict(self.df)

    # User-specified prediction model
    else:
        # No intercept column: the custom learner is given the covariates only
        data = patsy.dmatrix(model + ' - 1', self.df)
        try:
            # The g-model predicts the EXPOSURE, so the exposure column (not the outcome) is the target
            fm = custom_model.fit(X=data, y=self.df[self._exposure])
        except TypeError:
            raise TypeError("Currently custom_model must have the 'fit' function with arguments 'X', 'y'. This "
                            "covers both sklearn and supylearner. If there is a predictive model you would "
                            "like to use, please open an issue at https://github.com/pzivich/zepid and I "
                            "can work on adding support")
        if print_results and hasattr(fm, 'summarize'):
            fm.summarize()

        # Prefer class probabilities when available; column 1 is taken as Pr(A=1)
        if hasattr(fm, 'predict_proba'):
            self.g1W = fm.predict_proba(data)[:, 1]
        elif hasattr(fm, 'predict'):
            self.g1W = fm.predict(data)
        else:
            raise ValueError("Currently custom_model must have 'predict' or 'predict_proba' attribute")

    self.g0W = 1 - self.g1W

    # Bounding predicted probabilities if requested
    if bound:
        self.g1W = self._bounding(self.g1W, bounds=bound)
        self.g0W = self._bounding(self.g0W, bounds=bound)

    self._fit_exposure_model = True
def exposure_model(self, model, custom_model=None, bound=False, print_results=True):
    """Estimate g(A=1, W), i.e. Pr(A=1|W), and store it in ``self.gW``.

    Parameters
    ----------
    model : str
        Independent variables used to predict the exposure.
        Example) 'var1 + var2 + var3'
    custom_model : optional
        An already-estimated model used in place of the default logistic
        regression. It is not fit here; only its ``predict()`` (or, on the
        no-intercept retry, ``predict_proba()`` when present) is called.
        NOTE: patsy builds the design matrix from `model`, so the order of
        variables in `model` must match the order used when `custom_model`
        was fit, otherwise estimates can be unexpected
    bound : float, list, optional
        Value(s) between 0 and 1 used to truncate the predicted
        probabilities, passed through to ``self._bounding``. Default is
        False, meaning no truncation is applied
    print_results : bool, optional
        Whether to print the fitted model results. Default is True (prints
        results). Only used by the default logistic regression model

    Raises
    ------
    AttributeError
        If an AttributeError occurs while generating predictions from
        `custom_model` (e.g. it lacks the required prediction method)
    """
    self._exp_model = self._exposure + ' ~ ' + model
    truncate = bound is not False

    if custom_model is None:
        # Default logistic regression model generates the predicted probabilities
        logit_fit = propensity_score(self.df, self._exp_model, print_results=print_results)
        self.gW = logit_fit.predict(self.df)
        if truncate:
            self.gW = self._bounding(self.gW, bounds=bound)

    else:
        # Two-stage attempt: first a design matrix WITH an intercept column; a
        # ValueError anywhere in that stage triggers a retry WITHOUT the intercept
        try:
            with_intercept = patsy.dmatrix(model, self.df)
            try:
                self.gW = custom_model.predict(with_intercept)
                if truncate:
                    self.gW = self._bounding(self.gW, bounds=bound)
            except AttributeError:
                raise AttributeError("custom_model does not have the 'predict()' attribute")

        except ValueError:
            without_intercept = patsy.dmatrix(model + ' - 1', self.df)
            try:
                if hasattr(custom_model, 'predict_proba'):
                    # Column 1 is taken as the probability of exposure
                    self.gW = custom_model.predict_proba(without_intercept)[:, 1]
                else:
                    self.gW = custom_model.predict(without_intercept)
                if truncate:
                    self.gW = self._bounding(self.gW, bounds=bound)
            except AttributeError:
                raise AttributeError("custom_model does not have the 'predict()' or 'predict_proba()' attribute")

    # Flag checked by fit() to confirm the exposure model was specified
    self._fit_exposure_model = True