Beispiel #1
0
    def regression_models(self,
                          model_denominator,
                          model_numerator='1',
                          print_results=True):
        """Fit the logistic regression model(s) for selection and compute the resulting weights.

        The denominator model is always fit. The numerator model is fit only when ``self.stabilized`` is true;
        otherwise the numerator is the constant 1. Predicted values are stored in ``self.sample`` under
        '__denom__' and '__numer__', the weights under '__ipsw__', and the weights are also exposed as
        ``self.Weight``.

        Parameters
        ----------
        model_denominator : str
            String listing variables for the denominator model, separated by +. For example, 'var1 + var2 + var3'
        model_numerator : str, optional
            String listing variables for the numerator model, separated by +. Default ('1') calculates the overall
            probability of selection. Any other value is only accepted when stabilized weights are requested
        print_results : bool, optional
            Whether to print the model results from the regression models. Default is True

        Raises
        ------
        ValueError
            If `model_numerator` is not '1' while ``self.stabilized`` is false
        """
        if not self.stabilized and model_numerator != '1':
            raise ValueError(
                'Argument for model_numerator is only used for stabilized=True'
            )

        formula_lhs = self.selection + ' ~ '

        # Denominator probabilities: fit on self.df, predict for self.sample
        denominator_fit = propensity_score(self.df,
                                           formula_lhs + model_denominator,
                                           print_results=print_results)
        self.sample['__denom__'] = denominator_fit.predict(self.sample)
        self._denominator_model = True

        # Numerator: a fitted stabilization factor, or the constant 1
        if self.stabilized:
            numerator_fit = propensity_score(self.df,
                                             formula_lhs + model_numerator,
                                             print_results=print_results)
            numerator = numerator_fit.predict(self.sample)
        else:
            numerator = 1
        self.sample['__numer__'] = numerator

        pr_denom = self.sample['__denom__']
        pr_numer = self.sample['__numer__']
        if self.generalize:
            # IPSW (generalizability)
            weights = pr_numer / pr_denom
        elif self.stabilized:
            # Stabilized IOSW (transportability)
            weights = ((1 - pr_denom) / pr_denom) * (pr_numer / (1 - pr_numer))
        else:
            # Unstabilized IOSW (transportability)
            weights = (1 - pr_denom) / pr_denom

        self.sample['__ipsw__'] = weights
        self.Weight = self.sample['__ipsw__']
Beispiel #2
0
    def regression_models(self,
                          model_denominator,
                          model_numerator,
                          print_results=True):
        """Fit the numerator and denominator models for '__uncensored__' and store predicted probabilities.

        Each model's predictions are written to ``self.df`` ('__numer__', '__denom__'), followed by their
        cumulative products within each ``self.idvar`` group ('__cnumer__', '__cdenom__').

        Parameters
        ----------
        model_denominator : str
            String of predictor variables for the denominator following `patsy` syntax. Any variables included in the
            numerator should be included in the denominator as well. Example 'var1 + var2 + var3 + t_start + t_squared'
        model_numerator : str
            String of predictor variables for the numerator following `patsy` syntax. In general, time is used as the
            stabilization factor. Example of argument 't_start + t_squared'
        print_results : bool, optional
            Whether to print the model results. Default is True
        """
        # Numerator first, then denominator, so any printed results keep that order
        for column, predictors in (('__numer__', model_numerator),
                                   ('__denom__', model_denominator)):
            fitted = propensity_score(self.df,
                                      '__uncensored__ ~ ' + predictors,
                                      print_results=print_results)
            self.df[column] = fitted.predict(self.df)

        # Running product of the predicted probabilities within each ID
        for source, target in (('__numer__', '__cnumer__'),
                               ('__denom__', '__cdenom__')):
            self.df[target] = self.df.groupby(self.idvar)[source].cumprod()
Beispiel #3
0
    def _monotone_variables(self, model_denominator, model_numerator, print_results):
        """Estimate probabilities under the monotone missing mechanism.

        One model is fit per variable in ``self.missing`` (variables flagged by ``self._check_uniform`` relative to
        the preceding variable are skipped) and the predicted probabilities are multiplied across variables. The
        products are stored in ``self.df['__denom__']`` and ``self.df['__numer__']``, with NaN for rows where the last
        variable in ``self.missing`` is null.

        Parameters
        ----------
        model_denominator : iterable of str
            Denominator model specifications, one per missing variable. If fewer are given than missing variables, the
            last one is repeated
        model_numerator : iterable of str
            Numerator model specifications, handled like `model_denominator`. Only fit when ``self.stabilized`` is true
        print_results : bool
            Passed to the model-fitting helper to control printing of fitted models

        Raises
        ------
        ValueError
            If more denominator or numerator models are given than there are missing variables
        """
        model_denominator = list(model_denominator)
        model_numerator = list(model_numerator)

        # Check to make sure number of models is not more than number of missing variables
        n_missing = len(self.missing)
        if n_missing < len(model_denominator) or n_missing < len(model_numerator):
            raise ValueError('More models are specified than missing variables!')

        # If less models than missing variables are specified, repeat the last model for all variables
        while n_missing > len(model_denominator):
            model_denominator.append(model_denominator[-1])
        while n_missing > len(model_numerator):
            model_numerator.append(model_numerator[-1])

        # Running products start on self.df's own index so that multiplying by the predictions (assumed to carry
        # self.df's index) lines up row-for-row even when that index is not 0..n-1
        probs_denom = pd.Series(1, index=self.df.index)
        probs_num = pd.Series(1, index=self.df.index)

        for position, (mv, model_d, model_n) in enumerate(zip(self.missing, model_denominator, model_numerator)):
            df = self.df.copy()

            if position == 0:
                # The first ("outer set") variable is always modeled on the full data
                uniform = False
            else:
                previous = self.missing[position - 1]
                # Checking to see if this variable and the previous are uniformly missing
                uniform = self._check_uniform(df, miss1=previous, miss2=mv)
                # Restricting to only those observed for the previous variable
                df = df.loc[df[previous].notnull()].copy()

            if uniform:
                continue

            df.loc[df[mv].isnull(), '_observed_indicator_'] = 0
            df.loc[df[mv].notnull(), '_observed_indicator_'] = 1
            dmodel = propensity_score(df, '_observed_indicator_ ~ ' + model_d, print_results=print_results)
            # Fit on the restricted data, but predict for every row of self.df
            probs_denom = probs_denom * dmodel.predict(self.df)

            # Only for stabilized IPMW with monotone missing data
            if self.stabilized:
                nmodel = propensity_score(df, '_observed_indicator_ ~ ' + model_n, print_results=print_results)
                probs_num = probs_num * nmodel.predict(self.df)

        # Calculating Probabilities: only rows with the last missing variable observed keep a value
        last_observed = self.df[self.missing[-1]].notnull()
        self.df['__denom__'] = np.where(last_observed, probs_denom, np.nan)
        if self.stabilized:
            self.df['__numer__'] = np.where(last_observed, probs_num, np.nan)
        else:
            self.df['__numer__'] = np.where(last_observed, 1, np.nan)
        self._denominator_model = True
Beispiel #4
0
    def _single_variable(self, model_denominator, model_numerator, print_results):
        """Estimate probabilities when only a single variable is missing.

        Adds an '_observed_indicator_' column to ``self.df`` (0 where ``self.missing`` is null, 1 otherwise), fits the
        denominator model for that indicator, and stores the predictions in '__denom__'. '__numer__' holds the
        numerator model's predictions when ``self.stabilized`` is true and 1 otherwise. Both columns are NaN for rows
        where ``self.missing`` is null.
        """
        is_present = self.df[self.missing].notnull()
        self.df.loc[~is_present, '_observed_indicator_'] = 0
        self.df.loc[is_present, '_observed_indicator_'] = 1

        formula_lhs = '_observed_indicator_ ~ '

        denominator_fit = propensity_score(self.df, formula_lhs + model_denominator, print_results=print_results)
        self.df['__denom__'] = np.where(is_present, denominator_fit.predict(self.df), np.nan)
        self._denominator_model = True

        # Numerator is a fitted probability only for stabilized weights; otherwise the constant 1
        if self.stabilized:
            numerator_fit = propensity_score(self.df, formula_lhs + model_numerator, print_results=print_results)
            numerator = numerator_fit.predict(self.df)
        else:
            numerator = 1
        self.df['__numer__'] = np.where(is_present, numerator, np.nan)
Beispiel #5
0
    def _closed_form_solver_(treat, model, df, snm_matrix, y_matrix, weights,
                             print_results):
        """Background function to calculate the closed form solution for psi.

        Fits the treatment model, forms the difference between observed and predicted treatment (multiplied by the
        `weights` column when one is given), and solves the linear system built from `snm_matrix` and `y_matrix`
        scaled row-wise by that difference.

        Returns
        -------
        The solution of the linear system as returned by ``np.linalg.solve``
        """
        # Predicted treatment from the fitted model
        fitted = propensity_score(df=df,
                                  model=model,
                                  weights=weights,
                                  print_results=print_results)
        residual = df[treat] - fitted.predict(df)
        if weights is not None:
            residual = residual * df[weights]

        # Left-hand side: D-by-D matrix from the row-scaled SNM terms
        scaled_snm = snm_matrix.mul(residual, axis=0)
        lhm = np.dot(scaled_snm.transpose(), snm_matrix)

        # Right-hand side: column sums of the row-scaled outcomes
        rha = y_matrix.mul(residual, axis=0).sum()

        # Solving matrix and array for psi values
        return np.linalg.solve(lhm, rha)
Beispiel #6
0
        def function_to_optimize(data, psi, snm_terms, y, a, pi_model,
                                 alpha_shift, weights):
            """Return the absolute shifted alpha estimates for a candidate psi, together with psi.

            Writes an 'H_psi' column into `data`, refits the model for `a` with the H_psi terms appended to
            `pi_model`, and reads the coefficients of those terms from the fitted model. Relies on `treatment` and
            `verbose_solver` from the enclosing scope.
            """
            # H(psi): outcome minus the psi-scaled sum of the SNM terms, per row
            data['H_psi'] = data[y] - data[snm_terms].mul(psi, axis='columns').sum(axis=1)

            # Creating new terms to add to model
            h_terms_list = [term.replace(treatment, 'H_psi') for term in snm_terms]
            h_terms = ''.join(' + ' + term for term in h_terms_list)

            # Estimating the necessary model
            fitted = propensity_score(df=data,
                                      model=a + ' ~ ' + pi_model + h_terms,
                                      weights=weights,
                                      print_results=False)

            # Estimated alphas with the shift
            alpha = fitted.params[h_terms_list] - alpha_shift
            if verbose_solver:
                print('Psi:  ', np.array(psi))
                print('Alpha:', np.array(alpha))

            return np.abs(np.array(alpha)), psi
Beispiel #7
0
    def exposure_model(self,
                       model,
                       custom_model=None,
                       bound=False,
                       print_results=True):
        """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature

        The predicted probabilities are stored in ``self.g1W`` and their complement in ``self.g0W``.

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
        custom_model : optional
            Input for a custom model that is used in place of the logit model (default). The model must have the
            "fit()" and "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the
            background, TMLE will fit the custom model and generate the predicted probabilities
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            truncating weights leads to additional confounding. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric truncation, where values below
            or above the threshold are set to the threshold value. Alternatively a list of floats can be provided for
            asymmetric truncation, with the first value being the lower bound and the second being the upper bound
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)
        """
        self._exp_model = self.exposure + ' ~ ' + model
        self.__mweight = model

        # Step 3) Estimation of g-model (exposure model)
        if custom_model is not None:
            # User-specified prediction model
            # TODO need to create smart warning system about confidence interval coverage when TMLE is used with
            # certain machine learning algorithms
            self._exp_model_custom = True
            design = patsy.dmatrix(model + ' - 1', self.df)
            pr_exposed = exposure_machine_learner(xdata=np.asarray(design),
                                                  ydata=np.asarray(self.df[self.exposure]),
                                                  ml_model=custom_model,
                                                  print_results=print_results)
        else:
            # Default logit model
            logit_fit = propensity_score(self.df,
                                         self._exp_model,
                                         print_results=print_results)
            pr_exposed = logit_fit.predict(self.df)

        self.g1W = pr_exposed
        self.g0W = 1 - self.g1W

        # Bounding predicted probabilities if requested
        if bound:
            self.g1W = _bounding_(self.g1W, bounds=bound)
            self.g0W = _bounding_(self.g0W, bounds=bound)

        self._fit_exposure_model = True
Beispiel #8
0
    def exposure_model(self,
                       model,
                       custom_model=None,
                       bound=False,
                       print_results=True):
        """Estimation of Pr(A=1|L), which is termed as g(A=1|L) in the literature

        The predicted probabilities are stored in ``self.g1W`` and their complement in ``self.g0W``.

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'
        custom_model : optional
            Input for a custom model that is used in place of the logit model (default). The model must have the
            "fit()" and "predict()" attributes. Both sklearn and supylearner are supported as custom models. In the
            background, TMLE will fit the custom model and generate the predicted probabilities
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            inference becomes limited to the restricted population. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric truncation. A collection of
            floats can be provided for asymmetric truncation
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)
        """
        self._exp_model = self.exposure + ' ~ ' + model

        # Step 3) Estimation of g-model (exposure model)
        if custom_model is not None:
            # User-specified prediction model
            self._exp_model_custom = True
            design = patsy.dmatrix(model + ' - 1', self.df)
            pr_exposed = exposure_machine_learner(xdata=np.asarray(design),
                                                  ydata=np.asarray(self.df[self.exposure]),
                                                  ml_model=custom_model,
                                                  print_results=print_results)
        else:
            # Default logit model
            logit_fit = propensity_score(self.df,
                                         self._exp_model,
                                         print_results=print_results)
            pr_exposed = logit_fit.predict(self.df)

        self.g1W = pr_exposed
        self.g0W = 1 - self.g1W

        # Bounding predicted probabilities if requested
        if bound:
            self.g1W = self._bounding(self.g1W, bounds=bound)
            self.g0W = self._bounding(self.g0W, bounds=bound)

        self._fit_exposure_model = True
Beispiel #9
0
    def treatment_model(self, model, print_results=True):
        r"""Specify the parametric regression model for the observed treatment conditional on the sufficient adjustment
        set. This model estimates the following component of the stochastic IPTW weights

        .. math::

            \widehat{\Pr}(A=a|L)

        The predicted values are stored in ``self._pdenom_``.

        Parameters
        ----------
        model : str
            String listing variables to predict the exposure via `patsy` syntax. For example, `'var1 + var2 + var3'`
        print_results : bool, optional
            Whether to print the model results from the regression models. Default is True
        """
        # Calculating denominator probabilities
        formula = self.treatment + ' ~ ' + model
        fitted = propensity_score(self.df,
                                  formula,
                                  weights=self.weights,
                                  print_results=print_results)
        self._pdenom_ = fitted.predict(self.df)
Beispiel #10
0
    def exposure_model(self, model, print_results=True):
        r"""Specify the propensity score / inverse probability weight model. Model used to predict the exposure via a
        logistic regression model. This model estimates

        .. math::

            \widehat{\Pr}(A=1|L) = logit^{-1}(\widehat{\beta_0} + \widehat{\beta} L)

        The predicted values are stored in the '_ps_' column of ``self.df``.

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. For example, 'var1 + var2 + var3'
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)
        """
        formula = self.exposure + ' ~ ' + model
        self._exp_model = formula

        fitted = propensity_score(self.df,
                                  formula,
                                  weights=self._weight_,
                                  print_results=print_results)
        self.df['_ps_'] = fitted.predict(self.df)
        self._fit_exposure_ = True
Beispiel #11
0
    def exposure_model(self, model, bound=False, print_results=True):
        r"""Specify the propensity score / inverse probability weight model. Model used to predict the exposure via a
        logistic regression model. This model estimates

        .. math::

            \widehat{\Pr}(A=1|L) = logit^{-1}(\widehat{\beta_0} + \widehat{\beta} L)

        The predicted values are stored in the '_g1_' column of ``self.df`` and their complement in '_g0_'.

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. For example, 'var1 + var2 + var3'
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            truncating weights leads to additional confounding. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric truncation, where values below
            or above the threshold are set to the threshold value. Alternatively a list of floats can be provided for
            asymmetric truncation, with the first value being the lower bound and the second being the upper bound
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)
        """
        self.__mweight = model
        self._exp_model = self.exposure + ' ~ ' + model

        fitted = propensity_score(self.df,
                                  self._exp_model,
                                  weights=self._weight_,
                                  print_results=print_results)
        pr_exposed = fitted.predict(self.df)
        self.df['_g1_'] = pr_exposed
        self.df['_g0_'] = 1 - pr_exposed

        # If bounds are requested, truncate both columns in turn
        if bound:
            for column in ('_g1_', '_g0_'):
                self.df[column] = _bounding_(self.df[column], bounds=bound)

        self._fit_exposure_ = True
Beispiel #12
0
    def missing_model(self, model, bound=False, print_results=True):
        r"""Estimation of Pr(M=0|A,L), which is the missing data mechanism for the outcome. Predicted probabilities are
        used to create inverse probability of censoring weights to account for informative missing data on the outcome.

        Missing weights take the following form

        .. math::
            \frac{1}{\Pr(C=0|A=a, L)}

        Weights are calculated for both A=1 and A=0. The model's predictions with the exposure set to 1 and to 0 are
        stored in the '_ipmw_a1_' and '_ipmw_a0_' columns of ``self.df``; rows where the missing indicator is not 1
        are set to NaN.

        Note
        ----
        The treatment variable should be included in the model

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'. The treatment must be
            included for the missing data model
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            truncating weights leads to additional confounding. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric truncation, where values below
            or above the threshold are set to the threshold value. Alternatively a list of floats can be provided for
            asymmetric truncation, with the first value being the lower bound and the second being the upper bound
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)

        Raises
        ------
        ValueError
            If ``self._miss_flag`` is false (no missing outcome data)
        """
        # Error if no missing outcome data
        if not self._miss_flag:
            raise ValueError(
                "No missing outcome data is present in the data set")

        # Warning (issued once) if exposure is not included in the missingness of outcome model.
        # This is a plain substring check on the model string
        if self.exposure not in model:
            warnings.warn(
                "For the specified missing outcome model, the exposure variable should be included in the "
                "model", UserWarning)

        self._miss_model = self._missing_indicator + ' ~ ' + model
        fitmodel = propensity_score(self.df,
                                    self._miss_model,
                                    print_results=print_results)

        # Predict with the exposure forced to 1, then to 0, on copies so self.df's exposure is untouched
        dfx = self.df.copy()
        dfx[self.exposure] = 1
        self.df['_ipmw_a1_'] = np.where(self.df[self._missing_indicator] == 1,
                                        fitmodel.predict(dfx), np.nan)
        dfx = self.df.copy()
        dfx[self.exposure] = 0
        self.df['_ipmw_a0_'] = np.where(self.df[self._missing_indicator] == 1,
                                        fitmodel.predict(dfx), np.nan)

        # If bounds are requested
        if bound:
            self.df['_ipmw_a1_'] = _bounding_(self.df['_ipmw_a1_'],
                                              bounds=bound)
            self.df['_ipmw_a0_'] = _bounding_(self.df['_ipmw_a0_'],
                                              bounds=bound)

        self._fit_missing_ = True
Beispiel #13
0
    def missing_model(self, model, custom_model=None, bound=False, print_results=True):
        r"""Estimation of Pr(M=0|A,L), which is the missing data mechanism for the outcome. Predicted probabilities are
        used to create inverse probability of censoring weights to account for informative missing data on the outcome.

        Missing weights take the following form

        .. math::
            \frac{1}{\Pr(C=0|A=a, L)}

        Weights are calculated for both A=1 and A=0. The predictions with the exposure set to 1 and to 0 are stored in
        the '_ipmw_a1_' and '_ipmw_a0_' columns of ``self.df``.

        Note
        ----
        The treatment variable should be included in the model

        Parameters
        ----------
        model : str
            Independent variables to predict the exposure. Example) 'var1 + var2 + var3'. The treatment must be
            included for the missing data model
        custom_model : optional
            Input for a custom model that is used in place of the logit model (default). The model must have the
            "fit()" and "predict()" attributes. SciKit-Learn style models are supported as custom models. In the
            background, AIPTW will fit the custom model and generate the predicted probabilities. When a custom model
            is given, the logit model is not fit
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            truncating weights leads to additional confounding. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric truncation, where values below
            or above the threshold are set to the threshold value. Alternatively a list of floats can be provided for
            asymmetric truncation, with the first value being the lower bound and the second being the upper bound
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)

        Raises
        ------
        ValueError
            If ``self._miss_flag`` is false (no missing outcome data)
        """
        # Error if no missing outcome data
        if not self._miss_flag:
            raise ValueError("No missing outcome data is present in the data set")

        # Warning (issued once) if exposure is not included in the missingness of outcome model.
        # This is a plain substring check on the model string
        if self.exposure not in model:
            warnings.warn("For the specified missing outcome model, the exposure variable should be included in the "
                          "model", UserWarning)

        self._miss_model = self._missing_indicator + ' ~ ' + model

        if custom_model is None:  # Logistic Regression model for predictions
            # Fit only here: the custom-model branch never uses the logit fit
            fitmodel = propensity_score(self.df, self._miss_model, print_results=print_results)

            # Predict with the exposure forced to 1, then to 0; rows whose indicator is not 1 become NaN
            dfx = self.df.copy()
            dfx[self.exposure] = 1
            m1w = np.where(self.df[self._missing_indicator] == 1, fitmodel.predict(dfx), np.nan)
            dfx = self.df.copy()
            dfx[self.exposure] = 0
            m0w = np.where(self.df[self._missing_indicator] == 1, fitmodel.predict(dfx), np.nan)
        else:  # User-Specified model
            self._miss_model_custom = True
            # Design matrices for the observed data and for the exposure forced to 1 and to 0
            data = patsy.dmatrix(model + ' - 1', self.df)
            dfx = self.df.copy()
            dfx[self.exposure] = 1
            adata = patsy.dmatrix(model + ' - 1', dfx)
            dfx = self.df.copy()
            dfx[self.exposure] = 0
            ndata = patsy.dmatrix(model + ' - 1', dfx)

            # A deep copy is passed, so the caller's model object itself is not handed to the learner
            m1w, m0w = missing_machine_learner(xdata=np.array(data),
                                               mdata=self.df[self._missing_indicator],
                                               all_a=adata, none_a=ndata,
                                               ml_model=copy.deepcopy(custom_model),
                                               print_results=print_results)

        # If bounds are requested
        if bound:
            m1w = probability_bounds(m1w, bounds=bound)
            m0w = probability_bounds(m0w, bounds=bound)

        self.df['_ipmw_a1_'] = m1w
        self.df['_ipmw_a0_'] = m0w

        self._fit_missing_ = True
Beispiel #14
0
    def sampling_model(self,
                       model_denominator,
                       model_numerator='1',
                       bound=None,
                       stabilized=True,
                       print_results=True):
        """Fit the logistic regression model(s) for selection and compute the sampling weights.

        The denominator model is always fit. The numerator model is fit only when `stabilized` is true; otherwise the
        numerator is the constant 1. Predicted values are stored in ``self.sample`` under '__denom__' and
        '__numer__', the weights under '__ipsw__', and the weights are also exposed as ``self.ipsw``.

        Parameters
        ----------
        model_denominator : str
            String listing variables for the denominator model, separated by +. For example, 'var1 + var2 + var3'
        model_numerator : str, optional
            String listing variables for the numerator model, separated by +. Default ('1') calculates the overall
            probability of selection. Any other value is only accepted when `stabilized` is True
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            inference becomes limited to the restricted population. Default is None, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric truncation. A collection of
            floats can be provided for asymmetric truncation
        stabilized : bool, optional
            Whether to generate stabilized weights. Default is True
        print_results : bool, optional
            Whether to print the model results from the regression models. Default is True

        Raises
        ------
        ValueError
            If `model_numerator` is not '1' while `stabilized` is False
        """
        if not stabilized and model_numerator != '1':
            raise ValueError(
                'Argument for model_numerator is only used for stabilized=True'
            )

        formula_lhs = self.selection + ' ~ '

        # Denominator probabilities: fit on self.df, predict for self.sample
        denominator_fit = propensity_score(self.df,
                                           formula_lhs + model_denominator,
                                           print_results=print_results)
        self.sample['__denom__'] = denominator_fit.predict(self.sample)
        self._denominator_model = True

        # Numerator: a fitted stabilization factor, or the constant 1
        if stabilized:
            numerator_fit = propensity_score(self.df,
                                             formula_lhs + model_numerator,
                                             print_results=print_results)
            numerator = numerator_fit.predict(self.sample)
        else:
            numerator = 1
        self.sample['__numer__'] = numerator

        # NOTE(review): with stabilized=False the constant numerator column is also passed through
        # probability_bounds - confirm that is intended
        if bound:
            for column in ('__denom__', '__numer__'):
                self.sample[column] = probability_bounds(self.sample[column],
                                                         bounds=bound)

        pr_denom = self.sample['__denom__']
        pr_numer = self.sample['__numer__']
        if self.generalize:
            # IPSW (generalizability)
            weights = pr_numer / pr_denom
        elif stabilized:
            # Stabilized IOSW (transportability)
            weights = ((1 - pr_denom) / pr_denom) * (pr_numer / (1 - pr_numer))
        else:
            # Unstabilized IOSW (transportability)
            weights = (1 - pr_denom) / pr_denom

        self.sample['__ipsw__'] = weights
        self.ipsw = self.sample['__ipsw__']
Beispiel #15
0
    def missing_model(self,
                      model_denominator,
                      model_numerator=None,
                      stabilized=True,
                      bound=False,
                      print_results=True):
        """Fit the missing-outcome model(s) and compute inverse probability of missingness weights.

        The missing-outcome indicator is regressed on `model_denominator` via `propensity_score`; the predicted
        probabilities form the weight denominator. When `stabilized` is truthy a second model supplies the numerator,
        otherwise the numerator is the constant 1. The resulting weights are stored in `self.ipmw` and
        `self._fit_missing_` is set to True.

        Note
        ----
        The treatment variable should be included in the model. A `UserWarning` is issued when the treatment name
        does not occur in `model_denominator`

        Parameters
        ----------
        model_denominator : str
            Variables predicting missingness of the outcome, written in `patsy` syntax. For example,
            `'var1 + var2 + var3'`. Used for the predicted probabilities of the denominator
        model_numerator : str, optional
            Variables for the numerator model. Default (None) uses the treatment variable only. If assessing effect
            modification, the modifier should be included in the numerator as well. Only used when `stabilized` is
            truthy
        stabilized : bool, optional
            Whether to use stabilized inverse probability of missingness weights. Default is True
        bound : float, list, optional
            Value(s) between 0,1 used to truncate the predicted probabilities, passed through to `_bounding_` as
            `bounds`. Default is False, meaning no truncation occurs. In this method only the denominator
            probabilities are truncated
        print_results : bool, optional
            Forwarded to `propensity_score` as `print_results` for both fitted models. Default is True

        Raises
        ------
        ValueError
            If the instance is flagged (`self._miss_flag`) as having no missing outcome data
        """
        # Nothing to model when every outcome is observed
        if not self._miss_flag:
            raise ValueError(
                "No missing outcome data is present in the data set")

        # The treatment is expected to appear among the predictors of missingness
        if self.treatment not in model_denominator:
            warnings.warn(
                "For the specified missing outcome model, the exposure variable should be included in the "
                "model", UserWarning)

        # Denominator: model of the missing-outcome indicator given the requested predictors
        self._miss_model = self._missing_indicator + ' ~ ' + model_denominator
        denominator_fit = propensity_score(self.df,
                                           self._miss_model,
                                           print_results=print_results)

        # Numerator: constant for unstabilized weights, otherwise a second (weighted) model
        if not stabilized:
            n = 1
        else:
            numerator_terms = self.treatment if model_numerator is None else model_numerator
            numerator_fit = propensity_score(self.df,
                                             self._missing_indicator + ' ~ ' + numerator_terms,
                                             weights=self._weight_,
                                             print_results=print_results)
            n = numerator_fit.predict(self.df)

        # Denominator probabilities, truncated only when bounds were requested
        d = denominator_fit.predict(self.df)
        if bound:
            d = _bounding_(d, bounds=bound)

        # Weights are defined only for rows whose indicator equals 1; all other rows get NaN
        indicator_is_one = self.df[self._missing_indicator] == 1
        self.ipmw = np.where(indicator_is_one, n / d, np.nan)
        self._fit_missing_ = True
Beispiel #16
0
    def missing_model(self,
                      model,
                      custom_model=None,
                      bound=False,
                      print_results=True):
        """Estimate Pr(M=1|A,L), the missing data mechanism for the outcome.

        Predicted probabilities are generated twice, once with the exposure column set to 1 for every row and once
        with it set to 0, and stored in `self.m1W` and `self.m0W` respectively. `self._fit_missing_model` is set to
        True on completion.

        Parameters
        ----------
        model : str
            Independent variables for the missing data model. Example) 'var1 + var2 + var3'. The exposure should be
            included; a `UserWarning` is issued when its name does not occur in `model`
        custom_model : optional
            Estimator used in place of the default logit model. It is handed to `missing_machine_learner` together
            with `patsy` design matrices built from `model` without an intercept. Per the original documentation the
            object must provide "fit()" and "predict()"
        bound : float, list, optional
            Value(s) between 0,1 used to truncate both sets of predicted probabilities, passed to `_bounding_` as
            `bounds`. Default is False, meaning no truncation occurs
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)

        Raises
        ------
        ValueError
            If the instance is flagged (`self._miss_flag`) as having no missing outcome data
        """

        def _exposure_fixed_at(level):
            # Copy of the data in which every row carries the same exposure value
            frame = self.df.copy()
            frame[self.exposure] = level
            return frame

        # Nothing to model when every outcome is observed
        if not self._miss_flag:
            raise ValueError(
                "No missing outcome data is present in the data set")

        # The exposure is expected to appear among the predictors of missingness
        if self.exposure not in model:
            warnings.warn(
                "For the specified missing outcome model, the exposure variable should be included in the "
                "model", UserWarning)

        self._miss_model = self._missing_indicator + ' ~ ' + model

        if custom_model is not None:
            # User-specified estimator
            # TODO: add a warning about confidence interval coverage with certain machine learning algorithms
            self._miss_model_custom = True
            design = model + ' - 1'  # design matrices are built without an intercept column
            observed_x = patsy.dmatrix(design, self.df)
            exposed_x = patsy.dmatrix(design, _exposure_fixed_at(1))
            unexposed_x = patsy.dmatrix(design, _exposure_fixed_at(0))

            self.m1W, self.m0W = missing_machine_learner(
                xdata=np.array(observed_x),
                mdata=self.df[self._missing_indicator],
                all_a=exposed_x,
                none_a=unexposed_x,
                ml_model=custom_model,
                print_results=print_results)
        else:
            # Default: logistic regression fit once, then predicted under each exposure level
            logit_fit = propensity_score(self.df,
                                         self._miss_model,
                                         print_results=print_results)
            self.m1W = logit_fit.predict(_exposure_fixed_at(1))
            self.m0W = logit_fit.predict(_exposure_fixed_at(0))

        # Truncate the predicted probabilities when bounds were requested
        if bound:
            self.m1W = _bounding_(self.m1W, bounds=bound)
            self.m0W = _bounding_(self.m0W, bounds=bound)

        self._fit_missing_model = True
Beispiel #17
0
    def regression_models(self,
                          model_denominator,
                          model_numerator='1',
                          print_results=True,
                          custom_model_denominator=None,
                          custom_model_numerator=None):
        """Fit the propensity score model(s) used for the weight denominator and numerator.

        The denominator model must be specified for both stabilized and unstabilized weights. The optional argument
        `model_numerator` specifies the stabilization factor for the weight numerator. Predicted probabilities are
        written to `self.df['__denom__']` and `self.df['__numer__']`; for unstabilized weights the numerator is the
        constant 1.

        Parameters
        ----------
        model_denominator : str
            String listing variables to predict the exposure via `patsy` syntax. For example, `'var1 + var2 + var3'`.
            This is for the predicted probabilities of the denominator
        model_numerator : str, optional
            Optional string listing variables to predict the exposure, separated by +. Only used to calculate the
            numerator. Default ('1') calculates the overall probability of exposure. In general this is recommended. If
            confounding variables are included in the numerator, they would later need to be adjusted for in the faux
            marginal structural argument. Additionally, used for assessment of effect measure modification. Argument is
            only used when calculating stabilized weights
        print_results : bool, optional
            Whether to print the model results from the regression models. Default is True
        custom_model_denominator : optional
            Input for a custom model that is used in place of the logit model for the denominator. The model must have
            the `fit()` and `predict()` attributes. It is passed to `exposure_machine_learner` together with a `patsy`
            design matrix built without an intercept, and `self.denominator_model` is set to the placeholder string
            `'User-specified model'`
        custom_model_numerator : optional
            Input for a custom model that is used in place of the logit model for the numerator. Same requirements as
            `custom_model_denominator`. Only used for stabilized weights; `self.numerator_model` is not set on this
            path

        Raises
        ------
        ValueError
            If the weights are unstabilized and `model_numerator` differs from `'1'`. The check runs before any model
            is fit, so a rejected call leaves the object and its data untouched

        Note
        ----
        If custom models are used, it is important that GEE is used to obtain the variance. Bootstrapped confidence
        intervals are incorrect with the usage of some machine learning models
        """
        # Validate up front: previously this check ran only after the denominator model had been fit and
        # `self.df['__denom__']` overwritten, so an invalid call did the costly work and mutated state before failing
        if not self.stabilized and model_numerator != '1':
            raise ValueError(
                'Argument for model_numerator is only used for stabilized=True'
            )

        # Calculating denominator probabilities
        self.__mdenom = model_denominator
        if custom_model_denominator is None:
            self.denominator_model = propensity_score(
                self.df,
                self.ex + ' ~ ' + model_denominator,
                weights=self._weight_,
                print_results=print_results)
            d = self.denominator_model.predict(self.df)
        else:
            # ' - 1' drops the intercept column from the design matrix handed to the custom estimator
            data = patsy.dmatrix(model_denominator + ' - 1', self.df)
            self.denominator_model = 'User-specified model'
            d = exposure_machine_learner(xdata=np.asarray(data),
                                         ydata=np.asarray(self.df[self.ex]),
                                         ml_model=custom_model_denominator,
                                         print_results=print_results)

        self.df['__denom__'] = d

        # Calculating numerator probabilities (if stabilized). Truthiness is used rather than `is True` so that
        # flags such as numpy.bool_ are honoured instead of silently producing unstabilized numerators
        if self.stabilized:
            if custom_model_numerator is None:
                self.numerator_model = propensity_score(
                    self.df,
                    self.ex + ' ~ ' + model_numerator,
                    weights=self._weight_,
                    print_results=print_results)
                n = self.numerator_model.predict(self.df)

            else:
                data = patsy.dmatrix(model_numerator + ' - 1', self.df)
                n = exposure_machine_learner(xdata=np.asarray(data),
                                             ydata=np.asarray(
                                                 self.df[self.ex]),
                                             ml_model=custom_model_numerator,
                                             print_results=print_results)

        # If unstabilized, numerator is always 1
        else:
            n = 1
        self.df['__numer__'] = n
Beispiel #18
0
    def missing_model(self, model, custom_model=None, print_results=True):
        """Estimate Pr(M=1|A,L), the missing data mechanism for the outcome.

        Two sets of predicted probabilities are produced: `self.m1W` with the exposure column set to 1 for every row
        and `self.m0W` with it set to 0. `self._fit_missing_model` is set to True on completion.

        Parameters
        ----------
        model : str
            Independent variables for the missing data model. Example) 'var1 + var2 + var3'. The exposure should be
            included; a `UserWarning` is issued when its name does not occur in `model`
        custom_model : optional
            Estimator used in place of the default logit model. It is handed to `missing_machine_learner` together
            with `patsy` design matrices built from `model` without an intercept. Per the original documentation the
            object must provide "fit()" and "predict()"
        print_results : bool, optional
            Whether to print the fitted model results. Default is True (prints results)

        Raises
        ------
        ValueError
            If the instance is flagged (`self._miss_flag`) as having no missing outcome data
        """
        # Nothing to model when every outcome is observed
        if not self._miss_flag:
            raise ValueError(
                "No missing outcome data is present in the data set")

        # The exposure is expected to appear among the predictors of missingness
        exposure_listed = self.exposure in model
        if not exposure_listed:
            warnings.warn(
                "For the specified missing outcome model, the exposure variable should be included in the "
                "model", UserWarning)

        self._miss_model = self._missing_indicator + ' ~ ' + model

        if custom_model is not None:
            # User-specified estimator: design matrices for the observed data and both exposure settings
            self._miss_model_custom = True
            no_intercept = model + ' - 1'
            observed_design = patsy.dmatrix(no_intercept, self.df)

            everyone_exposed = self.df.copy()
            everyone_exposed[self.exposure] = 1
            exposed_design = patsy.dmatrix(no_intercept, everyone_exposed)

            nobody_exposed = self.df.copy()
            nobody_exposed[self.exposure] = 0
            unexposed_design = patsy.dmatrix(no_intercept, nobody_exposed)

            predictions = missing_machine_learner(
                xdata=np.array(observed_design),
                mdata=self.df[self._missing_indicator],
                all_a=exposed_design,
                none_a=unexposed_design,
                ml_model=custom_model,
                print_results=print_results)
            self.m1W, self.m0W = predictions
        else:
            # Default: logistic regression fit once, then predicted under each exposure level
            logit_fit = propensity_score(self.df,
                                         self._miss_model,
                                         print_results=print_results)

            everyone_exposed = self.df.copy()
            everyone_exposed[self.exposure] = 1
            self.m1W = logit_fit.predict(everyone_exposed)

            nobody_exposed = self.df.copy()
            nobody_exposed[self.exposure] = 0
            self.m0W = logit_fit.predict(nobody_exposed)

        self._fit_missing_model = True
Beispiel #19
0
    def treatment_model(self,
                        model_denominator,
                        model_numerator='1',
                        stabilized=True,
                        bound=False,
                        print_results=True):
        """Fit the propensity score model(s) and compute inverse probability of treatment weights.

        The denominator model must be specified for both stabilized and unstabilized weights. The optional argument
        `model_numerator` specifies the stabilization factor for the weight numerator. Predicted probabilities are
        written to `self.df['__denom__']` and `self.df['__numer__']`, and the weights returned by
        `self._weight_calculator` are stored in `self.iptw`.

        Parameters
        ----------
        model_denominator : str
            String listing variables to predict the exposure via `patsy` syntax. For example, `'var1 + var2 + var3'`.
            This is for the predicted probabilities of the denominator
        model_numerator : str, optional
            Optional string listing variables to predict the exposure, separated by +. Only used to calculate the
            numerator. Default ('1') calculates the overall probability of exposure. In general this is recommended. If
            confounding variables are included in the numerator, they would later need to be adjusted for in the faux
            marginal structural argument. Additionally, used for assessment of effect measure modification. Argument is
            only used when calculating stabilized weights
        stabilized : bool, optional
            Whether to return stabilized or unstabilized weights. Default is stabilized weights (True)
        bound : float, list, optional
            Value between 0,1 to truncate predicted probabilities. Helps to avoid near positivity violations.
            Specifying this argument can improve finite sample performance for random positivity violations. However,
            inference becomes limited to the restricted population. Default is False, meaning no truncation of
            predicted probabilities occurs. Providing a single float assumes symmetric truncation. A collection of
            floats can be provided for asymmetric truncation
        print_results : bool, optional
            Whether to print the model results from the regression models. Default is True

        Raises
        ------
        ValueError
            If `stabilized` is falsy and `model_numerator` differs from `'1'`. The check runs before any model is
            fit, so a rejected call leaves the object and its data untouched
        """
        # Validate up front: previously this check ran only after the denominator model had been fit and
        # `self.df['__denom__']` overwritten, so an invalid call did the costly work and mutated state before failing
        if not stabilized and model_numerator != '1':
            raise ValueError(
                'Argument for model_numerator is only used for stabilized=True'
            )

        # Calculating denominator probabilities
        self.__mdenom = model_denominator
        denominator_model = propensity_score(self.df,
                                             self.treatment + ' ~ ' +
                                             model_denominator,
                                             weights=self._weight_,
                                             print_results=print_results)
        self.df['__denom__'] = denominator_model.predict(self.df)

        # Calculating numerator probabilities (if stabilized). Truthiness is used rather than `is True` so that
        # flags such as numpy.bool_ are honoured instead of silently producing unstabilized numerators
        if stabilized:
            numerator_model = propensity_score(self.df,
                                               self.treatment + ' ~ ' +
                                               model_numerator,
                                               weights=self._weight_,
                                               print_results=print_results)
            n = numerator_model.predict(self.df)
        else:
            # Unstabilized weights always use a numerator of 1
            n = 1
        self.df['__numer__'] = n

        # Bounding predicted probabilities if requested
        if bound:
            self.df['__denom__'] = _bounding_(self.df['__denom__'],
                                              bounds=bound)
            self.df['__numer__'] = _bounding_(self.df['__numer__'],
                                              bounds=bound)

        # Calculating weights
        self.iptw = self._weight_calculator(self.df,
                                            denominator='__denom__',
                                            numerator='__numer__',
                                            stabilized=stabilized)