Esempio n. 1
0
def plot_kde(df, treatment, probability,
             measure='probability', bw_method='scott', fill=True, color_e='b', color_u='r'):
    """Generates a density plot that can be used to check whether positivity may be violated qualitatively. The
    kernel density used is SciPy's Gaussian kernel. Either Scott's Rule or Silverman's Rule can be implemented.
    Alternative option to the boxplot of probabilities

    Parameters
    ------------
    df : DataFrame
        Pandas dataframe containing the variables of interest
    treatment : str
        Column name of the treatment variable. Rows equal to 1 are plotted as treated, rows equal to 0 as untreated
    probability : str
        Column name of the predicted probability of treatment. Missing values are dropped before estimation
    measure : str, optional
        Measure to plot. Options include either the probabilities or log-odds stratified by treatment received.
        Default is probabilities (measure='probability'). Log-odds can be requested via measure='logit'
    bw_method : str, optional
        Method used to estimate the bandwidth. Following SciPy, either 'scott' or 'silverman' are valid options
    fill : bool, optional
        Whether to color the area under the density curves. Default is true
    color_e : str, optional
        Color of the line/area for the treated group. Default is Blue
    color_u : str, optional
        Color of the line/area for the untreated group. Default is Red

    Returns
    ---------------
    matplotlib axes

    Raises
    ---------------
    ValueError
        If `measure` is neither 'probability' nor 'logit'
    """
    if measure not in ('probability', 'logit'):
        raise ValueError("Only plots of probabilities or log-odds are supported. Please specify either "
                         "'probability' or 'logit'")

    # Predicted probabilities stratified by the treatment actually received
    treated = df.loc[df[treatment] == 1, probability].dropna()
    untreated = df.loc[df[treatment] == 0, probability].dropna()
    if measure == 'logit':
        treated = np.log(probability_to_odds(treated))
        untreated = np.log(probability_to_odds(untreated))

    density_t = gaussian_kde(treated, bw_method=bw_method)
    density_u = gaussian_kde(untreated, bw_method=bw_method)

    if measure == 'logit':
        # Log-odds are unbounded, so the grid spans the observed range padded by one unit on each side
        x = np.linspace(np.min((np.min(treated), np.min(untreated))) - 1,
                        np.max((np.max(treated), np.max(untreated))) + 1, 10000)
    else:
        x = np.linspace(0, 1, 10000)

    ax = plt.gca()
    # Evaluate each density once; the same values feed both the shaded area and the outline
    y_t = density_t(x)
    y_u = density_u(x)
    if fill:
        ax.fill_between(x, y_t, color=color_e, alpha=0.2, label=None)
        ax.fill_between(x, y_u, color=color_u, alpha=0.2, label=None)
    ax.plot(x, y_t, color=color_e, label='Treat = 1')
    ax.plot(x, y_u, color=color_u, label='Treat = 0')
    ax.set_xlabel('Probability' if measure == 'probability' else 'Log-Odds')
    ax.set_ylabel('Density')
    ax.legend()
    return ax
Esempio n. 2
0
def plot_boxplot(df, treatment, probability, measure='probability'):
    """Generates a stratified boxplot that can be used to visually check whether positivity may be violated,
    qualitatively. Alternative option to the kernel density plot.

    Parameters
    ----------
    df : DataFrame
        Pandas dataframe containing the variables of interest
    treatment : str
        Column name of the treatment variable. Rows equal to 1 and rows equal to 0 form the two boxes
    probability : str
        Column name of the predicted probability of treatment. Missing values are dropped before plotting
    measure : str, optional
        Measure to plot. Options include either the probabilities or log-odds stratified by treatment received.
        Default is probabilities (measure='probability'). Log-odds can be requested via measure='logit'

    Returns
    -------------
    matplotlib axes

    Raises
    -------------
    ValueError
        If `measure` is neither 'probability' nor 'logit'
    """
    if measure not in ('probability', 'logit'):
        raise ValueError("Only plots of probabilities or log-odds are supported. Please specify either "
                         "'probability' or 'logit'")

    # Predicted probabilities stratified by the treatment actually received
    treated = df.loc[df[treatment] == 1, probability].dropna()
    untreated = df.loc[df[treatment] == 0, probability].dropna()
    if measure == 'logit':
        boxes = (np.log(probability_to_odds(treated)),
                 np.log(probability_to_odds(untreated)))
    else:
        boxes = (treated, untreated)

    labs = ['A = 1', 'A = 0']
    # Group means are drawn as solid black diamonds on top of each box
    meanpointprops = dict(marker='D',
                          markeredgecolor='black',
                          markerfacecolor='black')
    ax = plt.gca()
    ax.boxplot(boxes, labels=labs, meanprops=meanpointprops, showmeans=True)
    if measure == 'probability':
        ax.set_ylabel('Probability')
        ax.set_ylim([0, 1])
    else:
        ax.set_ylabel('Log-Odds')
    return ax
Esempio n. 3
0
    def plot_boxplot(self, measure='probability'):
        """Generates a stratified boxplot that can be used to visually check whether positivity may be violated,
        qualitatively. Alternative option to the kernel density plot.

        The plotted values are read from the '__denom__' column of `self.df`, split by whether the column named
        by `self.ex` equals 1 or 0. Missing values are dropped before plotting.

        Parameters
        ----------
        measure : str, optional
            Measure to plot. Options include either the probabilities or log-odds stratified by treatment received.
            Default is probabilities (measure='probability'). Log-odds can be requested via measure='logit'

        Returns
        -------------
        matplotlib axes

        Raises
        -------------
        ValueError
            If `measure` is neither 'probability' nor 'logit'
        """
        if measure not in ('probability', 'logit'):
            raise ValueError("Only plots of probabilities or log-odds are supported. Please specify either "
                             "'probability' or 'logit'")

        treated = self.df.loc[self.df[self.ex] == 1, '__denom__'].dropna()
        untreated = self.df.loc[self.df[self.ex] == 0, '__denom__'].dropna()
        if measure == 'logit':
            boxes = (np.log(probability_to_odds(treated)),
                     np.log(probability_to_odds(untreated)))
        else:
            boxes = (treated, untreated)

        labs = ['Treat = 1', 'Treat = 0']
        # Group means are drawn as solid black diamonds on top of each box
        meanpointprops = dict(marker='D',
                              markeredgecolor='black',
                              markerfacecolor='black')
        ax = plt.gca()
        ax.boxplot(boxes,
                   labels=labs,
                   meanprops=meanpointprops,
                   showmeans=True)
        if measure == 'probability':
            ax.set_ylabel('Probability')
        else:
            ax.set_ylabel('Log-Odds')
        return ax
Esempio n. 4
0
    def fit(self):
        """Estimates risk difference, risk ratio, and odds ratio based on the gAW and QAW. If a continuous outcome,
        then the average treatment effect is returned. Confidence intervals come from influence curves

        When outcome data is missing and a missing-data model was fit, the treatment probabilities are multiplied
        by the corresponding non-missingness probabilities (`m1W`, `m0W`) before the clever covariates are built.

        Returns
        -------
        `TMLE` gains `risk_difference`, `risk_ratio`, and `odds_ratio` for binary outcomes and
        `average_treatment_effect` for continuous outcomes. Each estimate is accompanied by `*_se` and `*_ci`
        attributes. `g1W_total`, `g0W_total` and `_epsilon` are also stored on the instance

        Raises
        ------
        ValueError
            If either the exposure model or the outcome model has not been fit

        Warns
        -----
        UserWarning
            If outcome data is missing but no missing-data model has been fit
        """
        if (self._fit_exposure_model is False) or (self._fit_outcome_model is False):
            raise ValueError(
                'The exposure and outcome models must be specified before the psi estimate can '
                'be generated')
        if self._miss_flag and not self._fit_missing_model:
            warnings.warn(
                "No missing data model has been specified. All missing outcome data is assumed to be "
                "missing completely at random. To relax this assumption to outcome data is missing at random, "
                "please use the `missing_model()` function", UserWarning)

        # Step 4) Calculating clever covariate (HAW)
        if self._miss_flag and self._fit_missing_model:
            self.g1W_total = self.g1W * self.m1W
            self.g0W_total = self.g0W * self.m0W
        else:
            self.g1W_total = self.g1W
            self.g0W_total = self.g0W
        H1W = self.df[self.exposure] / self.g1W_total
        H0W = -(1 - self.df[self.exposure]) / self.g0W_total
        HAW = H1W + H0W

        # Step 5) Estimating TMLE
        # Intercept-free logistic regression on the two clever covariates with the initial outcome
        # predictions entering as a fixed offset; rows with missing values are dropped by the GLM
        f = sm.families.family.Binomial()
        y = self.df[self.outcome]
        log = sm.GLM(y,
                     np.column_stack((H1W, H0W)),
                     offset=np.log(probability_to_odds(self.QAW)),
                     family=f,
                     missing='drop').fit()
        self._epsilon = log.params
        # H0W carries a negative sign, hence the subtraction when updating the untreated predictions
        Qstar1 = logistic.cdf(
            np.log(probability_to_odds(self.QA1W)) +
            self._epsilon[0] / self.g1W_total)
        Qstar0 = logistic.cdf(
            np.log(probability_to_odds(self.QA0W)) -
            self._epsilon[1] / self.g0W_total)
        Qstar = log.predict(np.column_stack((H1W, H0W)),
                            offset=np.log(probability_to_odds(self.QAW)))

        # Step 6) Calculating Psi
        if self.alpha == 0.05:  # Without this, won't match R exactly. R relies on 1.96, while I use SciPy
            zalpha = 1.96
        else:
            zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)

        n_obs = self.df.shape[0]
        # p-values are not implemented (doing my part to enforce CL over p-values)
        # delta is 1 where the missingness-indicator column equals 1 and 0 otherwise; rows with delta == 0
        # contribute to the influence curves without the outcome-residual term
        delta = np.where(self.df[self._missing_indicator] == 1, 1, 0)
        if self._continuous_outcome:
            # Calculating Average Treatment Effect
            # Predictions are mapped back through `_unit_unbound` with the stored continuous min / max
            Qstar = self._unit_unbound(Qstar,
                                       mini=self._continuous_min,
                                       maxi=self._continuous_max)
            Qstar1 = self._unit_unbound(Qstar1,
                                        mini=self._continuous_min,
                                        maxi=self._continuous_max)
            Qstar0 = self._unit_unbound(Qstar0,
                                        mini=self._continuous_min,
                                        maxi=self._continuous_max)

            self.average_treatment_effect = np.nanmean(Qstar1 - Qstar0)
            # Influence Curve for CL
            y_unbound = self._unit_unbound(self.df[self.outcome],
                                           mini=self._continuous_min,
                                           maxi=self._continuous_max)
            ic = np.where(
                delta == 1,
                HAW * (y_unbound - Qstar) + (Qstar1 - Qstar0) -
                self.average_treatment_effect,
                Qstar1 - Qstar0 - self.average_treatment_effect)
            seIC = np.sqrt(np.nanvar(ic, ddof=1) / n_obs)
            self.average_treatment_effect_se = seIC
            self.average_treatment_effect_ci = [
                self.average_treatment_effect - zalpha * seIC,
                self.average_treatment_effect + zalpha * seIC
            ]
        else:
            # Calculating Risk Difference
            self.risk_difference = np.nanmean(Qstar1 - Qstar0)
            # Influence Curve for CL
            ic = np.where(
                delta == 1,
                HAW * (y - Qstar) + (Qstar1 - Qstar0) -
                self.risk_difference, (Qstar1 - Qstar0) - self.risk_difference)
            seIC = np.sqrt(np.nanvar(ic, ddof=1) / n_obs)
            self.risk_difference_se = seIC
            self.risk_difference_ci = [
                self.risk_difference - zalpha * seIC,
                self.risk_difference + zalpha * seIC
            ]

            # Calculating Risk Ratio
            self.risk_ratio = np.nanmean(Qstar1) / np.nanmean(Qstar0)
            # Influence Curve for CL
            # NOTE(review): the delta != 1 expression below is not the delta == 1 expression with the
            # residual terms removed (no 1/mean scaling, and the two arms are added rather than
            # subtracted), unlike the risk difference and odds ratio branches -- confirm this is intended
            ic = np.where(
                delta == 1,
                (1 / np.mean(Qstar1) *
                 (H1W *
                  (y - Qstar) + Qstar1 - np.mean(Qstar1)) -
                 (1 / np.mean(Qstar0)) *
                 (-1 * H0W *
                  (y - Qstar) + Qstar0 - np.mean(Qstar0))),
                (Qstar1 - np.mean(Qstar1)) + Qstar0 - np.mean(Qstar0))

            seIC = np.sqrt(np.nanvar(ic, ddof=1) / n_obs)
            self.risk_ratio_se = seIC
            # Interval is built on the log scale and exponentiated back
            self.risk_ratio_ci = [
                np.exp(np.log(self.risk_ratio) - zalpha * seIC),
                np.exp(np.log(self.risk_ratio) + zalpha * seIC)
            ]

            # Calculating Odds Ratio
            self.odds_ratio = (np.nanmean(Qstar1) /
                               (1 - np.nanmean(Qstar1))) / (
                                   np.nanmean(Qstar0) /
                                   (1 - np.nanmean(Qstar0)))
            # Influence Curve for CL
            ic = np.where(
                delta == 1,
                ((1 / (np.nanmean(Qstar1) * (1 - np.nanmean(Qstar1))) *
                  (H1W * (y - Qstar) + Qstar1)) -
                 (1 / (np.nanmean(Qstar0) * (1 - np.nanmean(Qstar0))) *
                  (-1 * H0W * (y - Qstar) + Qstar0))),
                ((1 / (np.nanmean(Qstar1) *
                       (1 - np.nanmean(Qstar1))) * Qstar1 -
                  (1 / (np.nanmean(Qstar0) *
                        (1 - np.nanmean(Qstar0))) * Qstar0))))
            seIC = np.sqrt(np.nanvar(ic, ddof=1) / n_obs)
            self.odds_ratio_se = seIC
            # Interval is built on the log scale and exponentiated back
            self.odds_ratio_ci = [
                np.exp(np.log(self.odds_ratio) - zalpha * seIC),
                np.exp(np.log(self.odds_ratio) + zalpha * seIC)
            ]
Esempio n. 5
0
    def fit(self):
        """Runs the targeting step and derives the risk difference, risk ratio, and odds ratio from the fitted
        exposure (g) and outcome (Q) models. Confidence limits are based on the influence curve of each measure

        Returns
        -------
        TMLE gains `risk_difference`, `risk_ratio`, `odds_ratio` and the matching `*_ci` attributes, as well as
        `_epsilon` (the coefficients of the fluctuation model)

        Raises
        ------
        ValueError
            If either the exposure model or the outcome model has not been fit
        """
        models_ready = (self._fit_exposure_model is not False) and (self._fit_outcome_model is not False)
        if not models_ready:
            raise ValueError('The exposure and outcome models must be specified before the psi estimate can '
                             'be generated')

        treat = self.df[self._exposure]
        n_obs = self.df.shape[0]

        # Step 4) Clever covariates: one per treatment arm plus their sum
        clever_1 = treat / self.g1W
        clever_0 = -(1 - treat) / (self.g0W)
        clever = clever_1 + clever_0

        # Step 5) Fluctuation model: logistic regression on the clever covariates, initial predictions as offset
        design = np.column_stack((clever_1, clever_0))
        logit_q = np.log(probability_to_odds(self.QAW))
        fluctuation = sm.GLM(self.df[self._outcome],
                             design,
                             offset=logit_q,
                             family=sm.families.family.Binomial()).fit()
        self._epsilon = fluctuation.params
        q1 = logistic.cdf(np.log(probability_to_odds(self.QA1W)) + self._epsilon[0] * 1 / self.g1W)
        q0 = logistic.cdf(np.log(probability_to_odds(self.QA0W)) + self._epsilon[1] * -1 / self.g0W)
        q_obs = fluctuation.predict(design, offset=logit_q)

        # Step 6) Critical value; 1.96 is hard-coded at alpha = 0.05 so results match R exactly
        zalpha = 1.96 if self.alpha == 0.05 else norm.ppf(1 - self.alpha / 2, loc=0, scale=1)

        # p-values are deliberately not provided; only confidence limits are reported
        resid = self.df[self._outcome] - q_obs
        mean_q1 = np.mean(q1)
        mean_q0 = np.mean(q0)

        # Risk difference and its influence-curve interval
        self.risk_difference = np.mean(q1 - q0)
        ic_rd = clever * resid + (q1 - q0) - self.risk_difference
        se_rd = math.sqrt(np.var(ic_rd, ddof=1) / n_obs)
        self.risk_difference_ci = [self.risk_difference - zalpha * se_rd,
                                   self.risk_difference + zalpha * se_rd]

        # Risk ratio; the interval is formed on the log scale and exponentiated
        self.risk_ratio = mean_q1 / mean_q0
        ic_rr = (1 / mean_q1 * (clever_1 * resid + q1 - mean_q1) -
                 (1 / mean_q0) * (-1 * clever_0 * resid + q0 - mean_q0))
        se_rr = math.sqrt(np.var(ic_rr, ddof=1) / n_obs)
        log_rr = np.log(self.risk_ratio)
        self.risk_ratio_ci = [np.exp(log_rr - zalpha * se_rr),
                              np.exp(log_rr + zalpha * se_rr)]

        # Odds ratio; the interval is formed on the log scale and exponentiated
        self.odds_ratio = (mean_q1 / (1 - mean_q1)) / (mean_q0 / (1 - mean_q0))
        ic_or = ((1 / (mean_q1 * (1 - mean_q1)) * (clever_1 * resid + q1)) -
                 (1 / (mean_q0 * (1 - mean_q0)) * (-1 * clever_0 * resid + q0)))
        se_or = math.sqrt(np.var(ic_or, ddof=1) / n_obs)
        log_or = np.log(self.odds_ratio)
        self.odds_ratio_ci = [np.exp(log_or - zalpha * se_or),
                              np.exp(log_or + zalpha * se_or)]
Esempio n. 6
0
 def test_forth_and_back_conversions(self):
     """Converting odds to a probability and back again must recover the starting odds."""
     start_odds = 1.1
     recovered = probability_to_odds(odds_to_probability(start_odds))
     npt.assert_allclose(start_odds, recovered)
Esempio n. 7
0
 def test_back_and_forth_conversions(self):
     """Converting a probability to odds and back again must recover the starting probability."""
     start_pr = 0.12
     recovered = odds_to_probability(probability_to_odds(start_pr))
     npt.assert_allclose(start_pr, recovered)
Esempio n. 8
0
 def test_probability_to_odds(self):
     """A probability of one half must convert to odds of exactly one."""
     assert probability_to_odds(0.5) == 1
Esempio n. 9
0
    def fit(self):
        """Estimates psi based on the gAW and QAW. Confidence intervals come from the influence curve

        The measure that is estimated is selected by `self._psi_correspond` ('risk_difference', 'risk_ratio', or
        'odds_ratio').

        Returns
        -------
        TMLE gains `psi` and `confint` attributes, as well as `_epsilon` (the coefficients of the fluctuation
        model)

        Raises
        ------
        ValueError
            If either the exposure model or the outcome model has not been fit, or if `self._psi_correspond` is
            not one of the implemented measures
        """
        # Both nuisance models are required; the outcome-model flag must be checked as well as the exposure one
        if (self._fit_exposure_model is False) or (self._fit_outcome_model is False):
            raise ValueError(
                'The exposure and outcome models must be specified before the psi estimate can '
                'be generated')

        # Calculating clever covariates
        H1W = self.df[self._exposure] / self.gW
        H0W = (1 - self.df[self._exposure]) / (1 - self.gW)

        # Fitting logistic model with QAW offset
        # Binomial() uses the logit link by default, so no link needs to be passed explicitly
        f = sm.families.family.Binomial()
        log = sm.GLM(self.df[self._outcome],
                     np.column_stack((H1W, H0W)),
                     offset=np.log(probability_to_odds(self.QAW)),
                     family=f).fit()
        self._epsilon = log.params

        # Getting Qn*
        Qstar1 = logistic.cdf(
            np.log(probability_to_odds(self.QA1W)) + self._epsilon[0] *
            (1 / self.gW))
        Qstar0 = logistic.cdf(
            np.log(probability_to_odds(self.QA0W)) + self._epsilon[1] *
            (1 / (1 - self.gW)))
        mean_q1 = np.mean(Qstar1)
        mean_q0 = np.mean(Qstar0)

        # Estimating parameter
        if self._psi_correspond == 'risk_difference':
            self.psi = np.mean(Qstar1 - Qstar0)
        elif self._psi_correspond == 'risk_ratio':
            self.psi = mean_q1 / mean_q0
        elif self._psi_correspond == 'odds_ratio':
            self.psi = (mean_q1 / (1 - mean_q1)) / (mean_q0 / (1 - mean_q0))
        else:
            raise ValueError(
                'Specified parameter is not implemented. Please use one of the available options'
            )

        # Getting influence curve
        # The curves below use the initial predictions (QAW, QA1W, QA0W) together with the means of the
        # updated predictions
        zalpha = norm.ppf(1 - self.alpha / 2, loc=0, scale=1)
        n_obs = self.df.shape[0]
        resid = self.df[self._outcome] - self.QAW
        if self._psi_correspond == 'risk_difference':
            ic = ((H1W - H0W) * resid + self.QA1W - self.QA0W -
                  (mean_q1 - mean_q0))
            se = math.sqrt(np.var(ic, ddof=1) / n_obs)
            self.confint = [self.psi - zalpha * se,
                            self.psi + zalpha * se]
        elif self._psi_correspond == 'risk_ratio':
            ic = ((1 / mean_q1) * (H1W * resid + self.QA1W - mean_q1) -
                  (1 / mean_q0) * (H0W * resid + self.QA0W - mean_q0))
            se = math.sqrt(np.var(ic, ddof=1) / n_obs)
            # Interval is built on the log scale and exponentiated back
            self.confint = [np.exp(np.log(self.psi) - zalpha * se),
                            np.exp(np.log(self.psi) + zalpha * se)]
        else:  # 'odds_ratio'; any other value was rejected above
            # NOTE(review): QA1W / QA0W sit inside the term multiplied by the clever covariate here,
            # whereas the risk-ratio branch adds them outside of it -- confirm the parenthesization
            ic = (1 / (mean_q1 * (1 - mean_q1)) * (H1W * (resid + self.QA1W)) -
                  1 / (mean_q0 * (1 - mean_q0)) * (H0W * (resid + self.QA0W)))
            se = math.sqrt(np.var(ic, ddof=1) / n_obs)
            # Interval is built on the log scale and exponentiated back
            self.confint = [np.exp(np.log(self.psi) - zalpha * se),
                            np.exp(np.log(self.psi) + zalpha * se)]