Example #1
 def significance__one_vs_any(self) -> Series:
     """
     Return the probability that one choice is ranked higher than a randomly
     selected other choice.
     """
     data = self.make_features(naming='{{choice}}')
     sums: Series = data.sum()
     n = len(data)
     results = []
     for category in self.categories:
         rest = [c for c in self.categories if c != category]
         m_one = sums[category]
         m_rest = sums[rest].mean()
         results.append({
             'category': category,
             'p': (BetaBinomialConjugate(alpha=1, beta=1, n=n,
                                         k=m_one).posterior() >
                   BetaBinomialConjugate(alpha=1, beta=1, n=n,
                                         k=m_rest).posterior())
         })
     return DataFrame(results).set_index('category')['p']
 def significance_one_vs_any(self) -> Series:
     """
     Return the probability that a random respondent is more likely to answer
     one category than a randomly selected other category.
     """
     data = self.make_features(naming='{{choice}}')
     sums = data.sum()
     results = []
     for category in self.categories:
         rest = [c for c in self.categories if c != category]
         n_one = len(data)
         m_one = sums[category]
         n_rest = len(rest) * len(data)
         m_rest = sums[rest].sum()
         results.append({
             'category': category,
             'p': (BetaBinomialConjugate(alpha=1, beta=1, n=n_one,
                                         k=m_one).posterior() >
                   BetaBinomialConjugate(
                       alpha=1, beta=1, n=n_rest, k=m_rest).posterior())
         })
     return DataFrame(results).set_index('category')['p']
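The `>` comparison between the two posteriors above evaluates the probability that one rate parameter exceeds the other. Below is a minimal standalone sketch of that computation by Monte Carlo sampling, assuming Beta(1 + k, 1 + n - k) posteriors from the uniform Beta(1, 1) prior used in these examples; the helper name `prob_posterior_greater` is hypothetical and not part of the library.

    import numpy as np


    def prob_posterior_greater(n_1, k_1, n_2, k_2,
                               num_samples=100_000, seed=0):
        """
        Estimate P(theta_1 > theta_2) by sampling the two
        Beta(1 + k, 1 + n - k) posteriors implied by a Beta(1, 1) prior.
        """
        rng = np.random.default_rng(seed)
        samples_1 = rng.beta(1 + k_1, 1 + n_1 - k_1, num_samples)
        samples_2 = rng.beta(1 + k_2, 1 + n_2 - k_2, num_samples)
        return float((samples_1 > samples_2).mean())


    # e.g. one category chosen by 60 of 100 respondents vs. an average
    # of 45 of 100 for the remaining categories
    print(prob_posterior_greater(100, 60, 100, 45))  # close to 0.98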
Example #3
 def significance_one_vs_any(self) -> Series:
     """
     Return the probability that a random respondent is more likely to answer
     one category than a randomly selected other category.
     """
     sums = self.data.value_counts()
     sums = sums.reindex(self.categories).fillna(0).astype(int)
     results = []
     for category in self.categories:
         anys = [c for c in self.categories if c != category]
         n_one = len(self.data)
         m_one = sums[category]
         n_any = len(self.data)
         m_any = sums[anys].mean()
         results.append({
             'category': category,
             'p': (BetaBinomialConjugate(alpha=1, beta=1, n=n_one,
                                         k=m_one).posterior() >
                   BetaBinomialConjugate(alpha=1, beta=1, n=n_any,
                                         k=m_any).posterior())
         })
     return DataFrame(results).set_index('category')['p']
Example #4
 def __gt__(self, other: 'LikertQuestion') -> float:
     """
     Return the probability that the posterior estimate for the probability
     of max-rating is greater in self than other.
     """
     data_self = self.make_features()
     data_other = other.make_features()
     bb_self = BetaBinomialConjugate(alpha=1,
                                     beta=1,
                                     n=len(data_self),
                                     k=data_self.sum())
     bb_other = BetaBinomialConjugate(alpha=1,
                                      beta=1,
                                      n=len(data_other),
                                      k=data_other.sum())
     return bb_self.posterior() > bb_other.posterior()
Example #5
    def prob_superior(self, question: CategoricalQuestion,
                      attribute: SingleCategoryAttribute,
                      exp_attr_values: List[str], exp_answers: List[str],
                      ctl_attr_values: List[str],
                      ctl_answers: List[str]) -> BBProbSuperiorResult:
        """
        Calculate the probability that the rate of responses from the
        experimental group in `exp_answers` is higher than the rate of
        responses from the control group in `ctl_answers`.

        N.B. to assess the effect of respondent attributes, `exp_answers` and
        `ctl_answers` should be identical.

        :param question: The question to consider.
        :param attribute: The attribute to use.
        :param exp_attr_values: The attribute values of the experimental group.
        :param exp_answers: The answers to count in the experimental group.
        :param ctl_attr_values: The attribute values of the control group.
        :param ctl_answers: The answers to count in the control group.
        """
        # find n and k for experimental respondent and answer group
        n_exp = self.count_responses(question=question,
                                     condition_category=attribute,
                                     condition_values=exp_attr_values)
        k_exp = self.count_responses(question=question,
                                     answers=exp_answers,
                                     condition_category=attribute,
                                     condition_values=exp_attr_values)
        # find n and k for control respondent and answer group
        n_ctl = self.count_responses(question=question,
                                     condition_category=attribute,
                                     condition_values=ctl_attr_values)
        k_ctl = self.count_responses(question=question,
                                     answers=ctl_answers,
                                     condition_category=attribute,
                                     condition_values=ctl_attr_values)
        # create beta-binomial distribution for each group
        bb_exp = BetaBinomialConjugate(alpha=1, beta=1, n=n_exp, m=k_exp)
        bb_ctl = BetaBinomialConjugate(alpha=1, beta=1, n=n_ctl, m=k_ctl)
        # calculate probability of superiority of test group
        p_superior = bb_exp > bb_ctl

        return BBProbSuperiorResult(
            p_superior=p_superior,
            experimental_mean=bb_exp.posterior().mean(),
            control_mean=bb_ctl.posterior().mean())
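For two independent Beta posteriors, the probability of superiority returned by `bb_exp > bb_ctl` above can also be evaluated by numerical integration rather than by sampling. A minimal sketch under the same Beta(1, 1) prior; `prob_superiority_exact` is a hypothetical helper, not the library's implementation.

    from scipy.integrate import quad
    from scipy.stats import beta


    def prob_superiority_exact(n_exp, k_exp, n_ctl, k_ctl):
        """
        P(p_exp > p_ctl) = integral over [0, 1] of f_exp(x) * F_ctl(x) dx
        for independent Beta(1 + k, 1 + n - k) posteriors.
        """
        post_exp = beta(1 + k_exp, 1 + n_exp - k_exp)
        post_ctl = beta(1 + k_ctl, 1 + n_ctl - k_ctl)
        value, _ = quad(lambda x: post_exp.pdf(x) * post_ctl.cdf(x), 0, 1)
        return value


    # e.g. 30 of 80 experimental respondents vs. 20 of 90 control respondents
    print(prob_superiority_exact(80, 30, 90, 20))  # roughly 0.98-0.99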
    def test_infer_posteriors(self):

        b__0_3 = Beta(1 + 0, 1 + 3)
        b__1_2 = Beta(1 + 1, 1 + 2)
        b__2_1 = Beta(1 + 2, 1 + 1)
        b__3_0 = Beta(1 + 3, 1 + 0)

        expected = DataFrame(
            data=[(1, 1, 'c', 1, b__0_3), (2, 1, 'c', 1, b__1_2),
                  (1, 2, 'c', 1, b__2_1), (2, 2, 'c', 1, b__3_0),
                  (1, 1, 'd', 1, b__3_0), (2, 1, 'd', 1, b__2_1),
                  (1, 2, 'd', 1, b__1_2), (2, 2, 'd', 1, b__0_3)],
            columns=['a', 'b', 'prob_var', 'prob_val', 'Beta'])
        actual = BetaBinomialConjugate.infer_posteriors(
            data=self.binomial_data,
            prob_vars=['c', 'd'],
            cond_vars=['a', 'b'])
        for _, row in expected.iterrows():
            actual_beta = actual.loc[(actual['a'] == row['a']) &
                                     (actual['b'] == row['b']) &
                                     (actual['prob_var'] == row['prob_var']) &
                                     (actual['prob_val'] == row['prob_val']),
                                     'Beta'].iloc[0]
            self.assertTrue(row['Beta'] == actual_beta)
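The expected posteriors in the test above are just the conjugate update Beta(1 + k, 1 + n - k) applied within each (a, b) group. Here is a minimal pandas sketch of that bookkeeping on made-up data; this is not the library's `infer_posteriors` implementation.

    from pandas import DataFrame

    # toy data: 'a' and 'b' are condition variables, 'c' is a binary outcome
    data = DataFrame({
        'a': [1, 1, 1, 2, 2, 2],
        'b': [1, 1, 1, 1, 1, 1],
        'c': [0, 0, 0, 1, 0, 1],
    })
    posteriors = data.groupby(['a', 'b'])['c'].agg(['sum', 'count'])
    # Beta parameters of each group's posterior under a Beta(1, 1) prior
    posteriors['alpha'] = 1 + posteriors['sum']
    posteriors['beta'] = 1 + posteriors['count'] - posteriors['sum']
    print(posteriors[['alpha', 'beta']])
    # (a=1, b=1) -> Beta(1, 4); (a=2, b=1) -> Beta(3, 2)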
Example #7
    def significance_one_vs_one(self) -> DataFrame:
        """
        Return the probability that a random respondent is more likely to
        answer each category than each other category.
        """
        results = []
        for category_1, category_2 in product(self._categories,
                                              self._categories):
            try:
                category_1_count = self._data.value_counts()[category_1]
            except KeyError:
                category_1_count = 0
            try:
                category_2_count = self._data.value_counts()[category_2]
            except KeyError:
                category_2_count = 0
            num_responses = len(self._data.dropna())
            bb_category_1 = BetaBinomialConjugate(alpha=1,
                                                  beta=1,
                                                  n=num_responses,
                                                  k=category_1_count)
            bb_category_2 = BetaBinomialConjugate(alpha=1,
                                                  beta=1,
                                                  n=num_responses,
                                                  k=category_2_count)
            results.append({
                'category_1': category_1,
                'category_2': category_2,
                'p': bb_category_1.posterior() > bb_category_2.posterior()
            })

        results_data = DataFrame(results)
        pt = pivot_table(data=results_data,
                         index='category_1',
                         columns='category_2',
                         values='p')
        return pt
    def test_infer_posterior(self):

        expected = Beta(alpha=1 + 4, beta=1 + 6)
        actual = BetaBinomialConjugate.infer_posterior(self.series)
        self.assertEqual(expected, actual)
Example #9
    def plot_conditional_prob_densities(
            self,
            categorical: Union[DataDistributionMixin, DataCategoriesMixin],
            hdi: float = 0.95,
            width: float = 0.8,
            num_segments: int = 100,
            color: Color = 'k',
            color_min: Optional[Color] = None,
            color_mean: Optional[Color] = None,
            edge_color: Optional[Color] = None,
            axf: Optional[AxesFormatter] = None) -> AxesFormatter:
        """
        Plot conditional probability densities of the data, split by the
        categories of an Ordinal or Nominal distribution.

        :param categorical: Nominal or Ordinal distribution.
        :param hdi: Highest Density Interval width for each distribution.
        :param width: Width of each density bar.
        :param num_segments: Number of segments to plot per density.
        :param color: Color for the densest part of each distribution.
        :param color_min: Color for the sparsest part of each distribution,
                          if different to color.
        :param color_mean: Color for mean data markers.
        :param edge_color: Optional color for the edge of each density bar.
        :param axf: Optional AxesFormatter to plot on.
        """
        axf = axf or AxesFormatter()

        cats = categorical.categories
        n_cats = len(cats)
        yy_min, yy_max = inf, -inf
        # filter categorical data
        shared_ix = list(
            set(self._data.index).intersection(categorical.data.index))
        cat_data = categorical.data.loc[shared_ix]
        ratio_data = self._data.loc[shared_ix]
        for c, category in enumerate(categorical.categories):
            cat_ratio_data = ratio_data.loc[cat_data == category]
            if len(cat_ratio_data) == 0:
                continue
            # fit distribution and find limits for HDI
            cat_dist = BetaBinomialConjugate.infer_posterior(cat_ratio_data)
            # cat_dist = Beta.fit(data=cat_ratio_data)
            y_min, y_max = cat_dist.hdi(hdi)
            yy_min, yy_max = min(y_min, yy_min), max(y_max, yy_max)
            # plot density
            axf.add_v_density(x=c + 1,
                              y_to_z=cat_dist.pdf().at(
                                  linspace(y_min, y_max, num_segments + 1)),
                              color=color,
                              color_min=color_min,
                              edge_color=edge_color,
                              width=width)
            # plot descriptive statistics lines
            if color_mean is not None:
                mean = cat_ratio_data.mean()
                axf.add_line(x=[c + 0.55, c + 1.45],
                             y=[mean, mean],
                             color=color_mean)
        # labels
        axf.set_text(title=f'{hdi: .0%} HDIs of $p(' + r'p_{' + self.name +
                     r'}' + f'|{categorical.name})$',
                     x_label=categorical.name,
                     y_label=r'$p_{' + self.name + r'}$')
        # axes
        axf.set_x_lim(0, n_cats + 1)
        yy_range = yy_max - yy_min
        axf.set_y_lim(yy_min - yy_range * 0.05, yy_max + yy_range * 0.05)
        axf.y_ticks.set_locations(linspace(0, 1, 11))
        axf.x_ticks.set_locations(range(1, n_cats + 1)).set_labels(cats)

        return axf
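The `cat_dist.hdi(hdi)` call above returns the bounds of the highest density interval of each fitted posterior. A rough sketch of how such an interval can be approximated for a unimodal distribution, by scanning equal-mass intervals and keeping the narrowest one; `approx_hdi` is a hypothetical helper, not the library's method.

    import numpy as np
    from scipy.stats import beta


    def approx_hdi(dist, mass=0.95, num_points=1000):
        """
        Approximate the highest density interval of a unimodal distribution:
        scan all intervals containing `mass` probability and return the
        narrowest one.
        """
        lower_tails = np.linspace(0, 1 - mass, num_points)
        lowers = dist.ppf(lower_tails)
        uppers = dist.ppf(lower_tails + mass)
        narrowest = np.argmin(uppers - lowers)
        return lowers[narrowest], uppers[narrowest]


    # posterior for 12 successes in 40 trials under a Beta(1, 1) prior
    print(approx_hdi(beta(1 + 12, 1 + 28)))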
Example #10
    def _draw_significance_values(
            self, other: 'SingleCategoryPTMixin',
            sig_colors: Tuple[str, str],
            sig_values: Tuple[float, float],
            transpose: bool,
            ax: Axes
    ):

        counts = count_coincidences(
            data_1=self.data, data_2=other.data,
            column_1=self.name, column_2=other.name,
            column_1_order=self.category_names,
            column_2_order=other.category_names
        )
        # calculate p(X=x,Y=y) > mean(p(X=~x, Y=~y)) for each x and y
        n_total = counts.sum().sum()  # total number of coincidences
        results = []
        for cat_1, cat_2 in product(self.category_names,
                                    other.category_names):
            m_event = counts.loc[cat_2, cat_1]  # coincidences with value combo
            m_any = (
                (n_total - m_event) /  # coincidences not with value combo
                (len(self.category_names) *
                 len(other.category_names) - 1)  # number of other value combos
            )  # average of all other coincidence counts
            results.append({
                self.name: cat_1,
                other.name: cat_2,
                'p': (
                        BetaBinomialConjugate(
                            alpha=1, beta=1, n=n_total, k=m_event
                        ).posterior() >
                        BetaBinomialConjugate(
                            alpha=1, beta=1, n=n_total, k=m_any
                        ).posterior()
                )
            })
        results_data = DataFrame(results)
        # define plot offsets
        min_add = 0.05
        max_add = 0.95
        line_width = 2
        # draw significance rectangles
        for _, row in results_data.iterrows():
            color = None
            if row['p'] >= sig_values[0]:
                color = sig_colors[0]
            elif row['p'] < sig_values[1]:
                color = sig_colors[1]
            if color is None:
                continue
            if not transpose:
                x = self.category_names.index(row[self.name])
                y = other.category_names.index(row[other.name])
            else:
                y = self.category_names.index(row[self.name])
                x = other.category_names.index(row[other.name])

            ax.plot([x + min_add, x + max_add], [y + min_add, y + min_add],
                    color, linewidth=line_width)
            ax.plot([x + min_add, x + max_add], [y + max_add, y + max_add],
                    color, linewidth=line_width)
            ax.plot([x + min_add, x + min_add], [y + min_add, y + max_add],
                    color, linewidth=line_width)
            ax.plot([x + max_add, x + max_add], [y + min_add, y + max_add],
                    color, linewidth=line_width)
Example #11
    def plot_cpt(self,
                 condition: 'SingleCategoryPTMixin',
                 significance: Optional[str] = None,
                 sig_colors: Tuple[str, str] = ('#00ff00', '#ff0000'),
                 sig_values: Tuple[float, float] = (0.945, 0.055),
                 **kwargs) -> Axes:
        """
        Plot a conditional probability table of self and other.

        :param condition: Another SingleCategory to condition on.
        :param significance: One of ['prob', 'cond'].
                            'prob' gives p(X=x1|Y=y1) > p(X≠x1|Y=y1)
                            'cond' gives p(X=x1|Y=y1) > p(X=x1|Y≠y1)
        :param sig_colors: Tuple of (high, low) colors for highlighting
                           significantly high and low values of p.
        :param sig_values: Tuple of (high, low) thresholds for assessing
                           significance.
        :param kwargs: See utils.plots.plot_pt
        """
        if self.name == condition.name:
            raise ValueError('categoricals must have different names')
        if isinstance(condition, SingleCategoryPTMixin):
            condition_data = condition.data
        else:
            # assume multi category
            condition_data = condition.make_features(naming='{{choice}}')
        jpt = create_cpt(
            prob_data=self.data, cond_data=condition_data,
            prob_name=self.name, cond_name=condition.name,
            prob_order=self.category_names, cond_order=condition.category_names
        )
        if 'var_sep' not in kwargs:
            kwargs['var_sep'] = '|'
        if 'transpose' not in kwargs:
            kwargs['transpose'] = True
        ax = plot_pt(pt=jpt, **kwargs)

        # draw significance values
        if significance is not None:
            counts = count_coincidences(
                data_1=self.data, data_2=condition_data,
                column_1=self.name, column_2=condition.name,
                column_1_order=self.category_names,
                column_2_order=condition.category_names
            )
            if isinstance(condition, SingleCategoryPTMixin):
                results = []
                if significance == 'prob':
                    for cond_cat in condition.category_names:
                        n_cond = counts.loc[cond_cat].sum()
                        for prob_cat in self.category_names:
                            m_prob_cond = counts.loc[cond_cat, prob_cat]
                            m_any = (
                                (n_cond - m_prob_cond) /
                                (len(self.category_names) - 1)
                            )
                            p = (
                                BetaBinomialConjugate(
                                    alpha=1, beta=1, n=n_cond, k=m_prob_cond
                                ).posterior() > BetaBinomialConjugate(
                                    alpha=1, beta=1, n=n_cond, k=m_any
                                ).posterior()
                            )
                            results.append({
                                self.name: prob_cat,
                                condition.name: cond_cat,
                                'p': p
                            })
                elif significance == 'cond':
                    n = counts.sum().sum()
                    for prob_cat in self.category_names:
                        n_prob = counts[prob_cat].sum()
                        for cond_cat in condition.category_names:
                            n_cond = counts.loc[cond_cat].sum()
                            m_prob_cond = counts.loc[cond_cat, prob_cat]
                            m_any = n_prob - m_prob_cond
                            n_any = n - n_cond
                            p = (
                                BetaBinomialConjugate(
                                    n=n_cond, k=m_prob_cond, alpha=1, beta=1,
                                ).posterior() > BetaBinomialConjugate(
                                    n=n_any, k=m_any, alpha=1, beta=1
                                ).posterior()
                            )
                            results.append({
                                self.name: prob_cat,
                                condition.name: cond_cat,
                                'p': p
                            })
                else:
                    raise ValueError(
                        "significance must be one of ['prob', 'cond']"
                    )

                results_data = DataFrame(results)

            else:
                raise NotImplementedError(
                    'significance not implemented for MultiCategories'
                )

            min_add = 0.1
            max_add = 0.9
            line_width = 2

            for _, row in results_data.iterrows():
                color = None
                if row['p'] >= sig_values[0]:
                    color = sig_colors[0]
                elif row['p'] < sig_values[1]:
                    color = sig_colors[1]
                if color is None:
                    continue
                if not kwargs['transpose']:
                    x = self.category_names.index(row[self.name])
                    y = condition.category_names.index(row[condition.name])
                    if significance == 'prob':
                        ax.plot([x + min_add, x + min_add],
                                [y + min_add, y + max_add],
                                color, linewidth=line_width)
                        ax.plot([x + max_add, x + max_add],
                                [y + min_add, y + max_add],
                                color, linewidth=line_width)
                    elif significance == 'cond':
                        ax.plot([x + min_add, x + max_add],
                                [y + min_add, y + min_add],
                                color, linewidth=line_width)
                        ax.plot([x + min_add, x + max_add],
                                [y + max_add, y + max_add],
                                color, linewidth=line_width)
                else:
                    y = self.category_names.index(row[self.name])
                    x = condition.category_names.index(row[condition.name])
                    if significance == 'prob':
                        ax.plot([x + min_add, x + max_add],
                                [y + min_add, y + min_add],
                                color, linewidth=line_width)
                        ax.plot([x + min_add, x + max_add],
                                [y + max_add, y + max_add],
                                color, linewidth=line_width)
                    elif significance == 'cond':
                        ax.plot([x + min_add, x + min_add],
                                [y + min_add, y + max_add],
                                color, linewidth=line_width)
                        ax.plot([x + max_add, x + max_add],
                                [y + min_add, y + max_add],
                                color, linewidth=line_width)
        return ax
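The only difference between the 'prob' and 'cond' modes above is how n and k are counted for the two posteriors being compared. A minimal sketch of that bookkeeping on a toy contingency table of coincidence counts (category names are made up; the library's `count_coincidences` helper is not used).

    from itertools import product

    from pandas import DataFrame

    # rows are condition (Y) categories, columns are probability (X) categories
    counts = DataFrame(data=[[30, 10], [20, 40]],
                       index=['y1', 'y2'],
                       columns=['x1', 'x2'])
    n_total = counts.sum().sum()
    for cond_cat, prob_cat in product(counts.index, counts.columns):
        n_cond = counts.loc[cond_cat].sum()
        m_prob_cond = counts.loc[cond_cat, prob_cat]
        # 'prob' mode: compare (n_cond, m_prob_cond) against (n_cond, m_any),
        # where m_any is the average count of the other X categories given Y=y
        m_any_prob = (n_cond - m_prob_cond) / (len(counts.columns) - 1)
        # 'cond' mode: compare (n_cond, m_prob_cond) against (n_any, m_any),
        # the count of X=x among rows where Y is not y
        n_any_cond = n_total - n_cond
        m_any_cond = counts[prob_cat].sum() - m_prob_cond
        print(f'{prob_cat}|{cond_cat}: '
              f'prob -> ({n_cond}, {m_prob_cond}) vs ({n_cond}, {m_any_prob}); '
              f'cond -> ({n_cond}, {m_prob_cond}) vs ({n_any_cond}, {m_any_cond})')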