Beispiel #1
0
def create_jct(data_1: Union[DataFrame, Series], data_2: Union[DataFrame,
                                                               Series],
               count_1_name: str, count_2_name: str, count_1_order: List[str],
               count_2_order: List[str]) -> DataFrame:
    """
    Return a joint count table.

    This is a thin wrapper around `count_coincidences` that additionally
    rejects identical names for the two counted variables.

    :param data_1: The DataFrame or Series containing the data for the first
                   distribution.
    :param data_2: The DataFrame or Series containing the data for the second
                   distribution.
    :param count_1_name: The name of the first question or attribute to count.
    :param count_2_name: The name of the second question or attribute to
                         count.
    :param count_1_order: List of labels to order the first variable's values.
    :param count_2_order: List of labels to order the second variable's
                          values.
    :return: DataFrame of joint counts, exactly as returned by
             `count_coincidences`.
             NOTE(review): other callers in this file index that result as
             `.loc[<column_2 label>, <column_1 label>]`, which suggests the
             rows follow `count_2_order` and the columns `count_1_order` --
             confirm against `count_coincidences`.
    :raises ValueError: If `count_1_name` equals `count_2_name`.
    """
    if count_1_name == count_2_name:
        # Name the parameters of *this* function so the message is actionable.
        raise ValueError('count_1_name must be different to count_2_name')
    return count_coincidences(data_1=data_1,
                              data_2=data_2,
                              column_1=count_1_name,
                              column_2=count_2_name,
                              column_1_order=count_1_order,
                              column_2_order=count_2_order)
Beispiel #2
0
def create_cpt(prob_data: Union[DataFrame, Series], cond_data: Union[DataFrame,
                                                                     Series],
               prob_name: str, cond_name: str, prob_order: List[str],
               cond_order: List[str]) -> DataFrame:
    """
    Build a conditional probability table.

    Coincidence counts are obtained from `count_coincidences`, each row is
    divided by its own total, and the result is re-ordered to follow
    `prob_order` (columns) and `cond_order` (rows). Labels that are absent
    from the table are silently skipped.

    :param prob_data: The Series or DataFrame containing the data to find the
                      probability of.
    :param cond_data: The Series or DataFrame containing the data to condition
                      on.
    :param prob_name: The name of the question or attribute to find
                      probability of.
    :param cond_name: The name of the question or attribute to condition on.
    :param prob_order: List of labels to order the probability values.
    :param cond_order: List of labels to order the conditioning values.
    :return: DataFrame of conditional probabilities with Index of `condition`
             and columns of `probability`.
    :raises ValueError: If `prob_name` equals `cond_name`.
    """
    if prob_name == cond_name:
        raise ValueError('prob_name must be different to cond_name')

    counts = count_coincidences(
        data_1=prob_data, column_1=prob_name, column_1_order=prob_order,
        data_2=cond_data, column_2=cond_name, column_2_order=cond_order,
    )
    # divide every row by its own total so that each row sums to 1
    row_totals = counts.sum(axis=1)
    cp_table = counts.div(row_totals, axis=0)

    # keep only the requested labels that actually occur, in the given order
    ordered_probs = [label for label in prob_order
                     if label in cp_table.columns]
    ordered_conds = [label for label in cond_order
                     if label in cp_table.index]
    return cp_table[ordered_probs].loc[ordered_conds]
Beispiel #3
0
def create_jpt(data_1: Union[DataFrame, Series], data_2: Union[DataFrame,
                                                               Series],
               prob_1_name: str, prob_2_name: str, prob_1_order: List[str],
               prob_2_order: List[str]) -> DataFrame:
    """
    Build a joint probability table.

    Coincidence counts are obtained from `count_coincidences` and divided by
    their grand total, so that all cells of the result sum to 1.

    :param data_1: The DataFrame or Series containing the data for the first
                   distribution.
    :param data_2: The DataFrame or Series containing the data for the second
                   distribution.
    :param prob_1_name: The name of the first question or attribute to find
                        probability of.
    :param prob_2_name: The name of the second question or attribute to find
                        probability of.
    :param prob_1_order: List of labels to order the first variable's values.
    :param prob_2_order: List of labels to order the second variable's values.
    :return: DataFrame of joint probabilities, laid out like the result of
             `count_coincidences`.
             NOTE(review): other callers in this file index that result as
             `.loc[<column_2 label>, <column_1 label>]`, which suggests the
             rows follow `prob_2_order` and the columns `prob_1_order` --
             confirm against `count_coincidences`.
    :raises ValueError: If `prob_1_name` equals `prob_2_name`.
    """
    if prob_1_name == prob_2_name:
        raise ValueError('prob_1_name must be different to prob_2_name')

    joint_counts = count_coincidences(
        data_1=data_1, column_1=prob_1_name, column_1_order=prob_1_order,
        data_2=data_2, column_2=prob_2_name, column_2_order=prob_2_order,
    )
    # total number of coincidences across every cell of the table
    grand_total = joint_counts.sum().sum()
    return joint_counts / grand_total
    def test_count_coincidences_stacked_simple(self):
        """
        for cpts and jpts between a stacked and an unstacked distribution
        """
        coincidences = count_coincidences(
            data_1=self.gender_stacked,
            data_2=self.symptoms,
            column_1='gender',
            column_2='symptoms',
            column_1_order=['Female', 'Male'],
            column_2_order=['cough', 'fever', 'headache'],
        )
        # (symptom row, gender column, expected count), checked in this order
        expected_counts = (
            ('cough', 'Female', 16),
            ('fever', 'Female', 7),
            ('headache', 'Female', 14),
            ('cough', 'Male', 15),
            ('fever', 'Male', 23),
            ('headache', 'Male', 25),
        )
        for symptom, gender, expected in expected_counts:
            self.assertEqual(expected, coincidences.loc[symptom, gender])
 def test_count_coincidences_stacked_stacked(self):
     """
     for cpts and jpts between 2 stacked distributions
     """
     coincidences = count_coincidences(
         data_1=self.gender_stacked,
         data_2=self.symptoms_stacked,
         column_1='gender',
         column_2='symptoms',
         column_1_order=['Female', 'Male'],
         column_2_order=['cough', 'fever', 'headache'],
     )
     # (symptom row, gender column, expected count), checked in this order
     expected_counts = (
         ('cough', 'Female', 9),
         ('fever', 'Female', 7),
         ('headache', 'Female', 12),
         ('cough', 'Male', 10),
         ('fever', 'Male', 22),
         ('headache', 'Male', 20),
     )
     for symptom, gender, expected in expected_counts:
         self.assertEqual(expected, coincidences.loc[symptom, gender])
 def test_count_coincidences_simple_simple(self):
     """
     for cpts and jpts between 2 unstacked distributions
     """
     coincidences = count_coincidences(
         data_1=self.gender,
         data_2=self.symptoms,
         column_1='gender',
         column_2='symptoms',
         column_1_order=['Female', 'Male'],
         column_2_order=['cough', 'fever', 'headache'],
     )
     # (symptom row, gender column, expected count), checked in this order
     expected_counts = (
         ('cough', 'Female', 15),
         ('fever', 'Female', 18),
         ('headache', 'Female', 4),
         ('cough', 'Male', 12),
         ('fever', 'Male', 17),
         ('headache', 'Male', 34),
     )
     for symptom, gender, expected in expected_counts:
         self.assertEqual(expected, coincidences.loc[symptom, gender])
Beispiel #7
0
    def _draw_significance_values(
            self, other: 'SingleCategoryPTMixin',
            sig_colors: Tuple[str, str],
            sig_values: Tuple[float, float],
            transpose: bool,
            ax: Axes
    ):
        """
        Outline the cells of a plotted table whose joint count stands out.

        For every combination of a category of `self` and a category of
        `other`, the coincidence count of that combination is compared with
        the mean count of all other combinations using two
        `BetaBinomialConjugate` posteriors (uniform alpha=1, beta=1 prior).
        Cells whose comparison value is at or above `sig_values[0]` are
        outlined in `sig_colors[0]`; cells below `sig_values[1]` are outlined
        in `sig_colors[1]`; all other cells are left untouched.

        :param other: The other categorical whose data is crossed with self.
        :param sig_colors: Tuple of (high, low) colors for the outlines.
        :param sig_values: Tuple of (high, low) thresholds for the comparison
                           value.
        :param transpose: If False, self's categories are placed along x and
                          other's along y; if True, the axes are swapped.
        :param ax: The Axes to draw the outlines onto.
        """
        joint_counts = count_coincidences(
            data_1=self.data, data_2=other.data,
            column_1=self.name, column_2=other.name,
            column_1_order=self.category_names,
            column_2_order=other.category_names
        )
        # calculate p(X=x,Y=y) > mean(p(X=~x, Y=~y)) for each x and y
        total = joint_counts.sum().sum()  # total number of coincidences
        # number of value combinations other than the one being examined
        n_other_combos = (
            len(self.category_names) * len(other.category_names) - 1
        )
        records = []
        for own_cat, other_cat in product(self.category_names,
                                          other.category_names):
            # coincidences with this value combination
            k_combo = joint_counts.loc[other_cat, own_cat]
            # average of all other coincidence counts
            k_rest_mean = (total - k_combo) / n_other_combos
            combo_posterior = BetaBinomialConjugate(
                alpha=1, beta=1, n=total, k=k_combo
            ).posterior()
            rest_posterior = BetaBinomialConjugate(
                alpha=1, beta=1, n=total, k=k_rest_mean
            ).posterior()
            records.append({
                self.name: own_cat,
                other.name: other_cat,
                'p': combo_posterior > rest_posterior
            })
        frame = DataFrame(records)

        # the outline is inset slightly from the unit cell at (x, y)
        near = 0.05
        far = 0.95
        stroke = 2
        # ((x offsets), (y offsets)) for each of the four sides, in draw order
        sides = (
            ((near, far), (near, near)),  # side at y + near
            ((near, far), (far, far)),    # side at y + far
            ((near, near), (near, far)),  # side at x + near
            ((far, far), (near, far)),    # side at x + far
        )
        for _, record in frame.iterrows():
            p_value = record['p']
            color = (
                sig_colors[0] if p_value >= sig_values[0]
                else sig_colors[1] if p_value < sig_values[1]
                else None
            )
            if color is None:
                continue
            own_pos = self.category_names.index(record[self.name])
            other_pos = other.category_names.index(record[other.name])
            if transpose:
                x, y = other_pos, own_pos
            else:
                x, y = own_pos, other_pos

            for (dx_start, dx_end), (dy_start, dy_end) in sides:
                ax.plot([x + dx_start, x + dx_end],
                        [y + dy_start, y + dy_end],
                        color, linewidth=stroke)
Beispiel #8
0
    def plot_cpt(self,
                 condition: 'SingleCategoryPTMixin',
                 significance: Optional[str] = None,
                 sig_colors: Tuple[str, str] = ('#00ff00', '#ff0000'),
                 sig_values: Tuple[float, float] = (0.945, 0.055),
                 **kwargs) -> Axes:
        """
        Plot a conditional probability table of self given `condition`.

        :param condition: Another SingleCategory to condition on. Any other
                          object is assumed to be a multi category and its
                          data is taken from `make_features`.
        :param significance: One of ['prob', 'cond'], or None to draw no
                             significance markers.
                            'prob' gives p(X=x1|Y=y1) > p(X≠x1|Y=y1)
                            'cond' gives p(X=x1|Y=y1) > p(X=x1|Y≠y1)
        :param sig_colors: Tuple of (high, low) colors for highlighting
                           significance.
        :param sig_values: Tuple of (high, low) values for assessing
                           significance. Cells whose comparison value is
                           >= high are marked in the high color, cells whose
                           value is < low in the low color.
        :param kwargs: See utils.plots.plot_pt. `var_sep` defaults to '|' and
                       `transpose` defaults to True when not given.
        :return: The Axes returned by `plot_pt`, with any significance markers
                 drawn on top.
        :raises ValueError: If self and `condition` share a name, or if
                            `significance` is not None, 'prob' or 'cond'.
        :raises NotImplementedError: If `significance` is requested for a
                                     condition that is not a
                                     SingleCategoryPTMixin.
        """
        if self.name == condition.name:
            raise ValueError('categoricals must have different names')
        single_condition = isinstance(condition, SingleCategoryPTMixin)
        # Validate the significance request up front so that an unsupported
        # call fails before anything has been drawn.
        if significance is not None:
            if not single_condition:
                raise NotImplementedError(
                    'significance not implemented for MultiCategories'
                )
            if significance not in ('prob', 'cond'):
                raise ValueError(
                    "significance must be one of ['prob', 'cond']"
                )

        if single_condition:
            condition_data = condition.data
        else:
            # assume multi category
            condition_data = condition.make_features(naming='{{choice}}')
        cpt = create_cpt(
            prob_data=self.data, cond_data=condition_data,
            prob_name=self.name, cond_name=condition.name,
            prob_order=self.category_names, cond_order=condition.category_names
        )
        kwargs.setdefault('var_sep', '|')
        kwargs.setdefault('transpose', True)
        ax = plot_pt(pt=cpt, **kwargs)

        if significance is None:
            return ax

        # draw significance values
        counts = count_coincidences(
            data_1=self.data, data_2=condition_data,
            column_1=self.name, column_2=condition.name,
            column_1_order=self.category_names,
            column_2_order=condition.category_names
        )
        results = []
        if significance == 'prob':
            # within each condition, compare the count of one probability
            # category against the mean count of the other categories
            for cond_cat in condition.category_names:
                n_cond = counts.loc[cond_cat].sum()
                for prob_cat in self.category_names:
                    m_prob_cond = counts.loc[cond_cat, prob_cat]
                    m_any = (
                        (n_cond - m_prob_cond) /
                        (len(self.category_names) - 1)
                    )
                    p = (
                        BetaBinomialConjugate(
                            alpha=1, beta=1, n=n_cond, k=m_prob_cond
                        ).posterior() > BetaBinomialConjugate(
                            alpha=1, beta=1, n=n_cond, k=m_any
                        ).posterior()
                    )
                    results.append({
                        self.name: prob_cat,
                        condition.name: cond_cat,
                        'p': p
                    })
        else:
            # significance == 'cond': for each probability category, compare
            # its count under one condition against its count under all the
            # other conditions combined
            n = counts.sum().sum()
            for prob_cat in self.category_names:
                n_prob = counts[prob_cat].sum()
                for cond_cat in condition.category_names:
                    n_cond = counts.loc[cond_cat].sum()
                    m_prob_cond = counts.loc[cond_cat, prob_cat]
                    m_any = n_prob - m_prob_cond
                    n_any = n - n_cond
                    p = (
                        BetaBinomialConjugate(
                            n=n_cond, k=m_prob_cond, alpha=1, beta=1,
                        ).posterior() > BetaBinomialConjugate(
                            n=n_any, k=m_any, alpha=1, beta=1
                        ).posterior()
                    )
                    results.append({
                        self.name: prob_cat,
                        condition.name: cond_cat,
                        'p': p
                    })
        results_data = DataFrame(results)

        min_add = 0.1
        max_add = 0.9
        line_width = 2
        transposed = bool(kwargs['transpose'])
        # Markers are a pair of parallel lines on opposite sides of the cell.
        # They run along y for 'prob' on an untransposed plot and for 'cond'
        # on a transposed plot; otherwise they run along x.
        lines_along_y = (significance == 'prob') != transposed

        for _, row in results_data.iterrows():
            color = None
            if row['p'] >= sig_values[0]:
                color = sig_colors[0]
            elif row['p'] < sig_values[1]:
                color = sig_colors[1]
            if color is None:
                continue
            prob_pos = self.category_names.index(row[self.name])
            cond_pos = condition.category_names.index(row[condition.name])
            if transposed:
                x, y = cond_pos, prob_pos
            else:
                x, y = prob_pos, cond_pos
            if lines_along_y:
                for x_edge in (x + min_add, x + max_add):
                    ax.plot([x_edge, x_edge],
                            [y + min_add, y + max_add],
                            color, linewidth=line_width)
            else:
                for y_edge in (y + min_add, y + max_add):
                    ax.plot([x + min_add, x + max_add],
                            [y_edge, y_edge],
                            color, linewidth=line_width)
        return ax
Beispiel #9
0
# Build "stacked" variants of the symptoms and gender data: the same values
# re-indexed by a two-level (Respondent ID, medication) MultiIndex.
# symptoms_stacked: respondents 1-50 under 'aspirin', then 26-75 under
# 'paracetamol' (100 index entries in total).
# NOTE(review): assumes `symptoms` holds exactly 100 values -- confirm.
symptoms_stacked = Series(data=symptoms.values,
                          name='symptoms',
                          index=MultiIndex.from_tuples(
                              tuples=[(ix, 'aspirin') for ix in range(1, 51)] +
                              [(ix, 'paracetamol') for ix in range(26, 76)],
                              names=['Respondent ID', 'medication']))
# gender_stacked: respondents 11-60 under 'aspirin', then 36-85 under
# 'paracetamol' (100 index entries in total).
# NOTE(review): assumes `gender` holds exactly 100 values -- confirm.
gender_stacked = Series(data=gender.values,
                        name='gender',
                        index=MultiIndex.from_tuples(
                            tuples=[(ix, 'aspirin') for ix in range(11, 61)] +
                            [(ix, 'paracetamol') for ix in range(36, 86)],
                            names=['Respondent ID', 'medication']))

# Print the coincidence table for each pairing of plain / stacked inputs.
# Copies are passed so the module-level Series are not handed to
# count_coincidences directly.

# plain gender vs. plain symptoms
print('\nsimple-simple')
coincidences = count_coincidences(gender.copy(), symptoms.copy(), 'gender',
                                  'symptoms', ['Female', 'Male'],
                                  ['cough', 'fever', 'headache'])
print(coincidences)

# plain gender vs. stacked symptoms
print('\nsimple-stacked')
coincidences = count_coincidences(gender.copy(), symptoms_stacked.copy(),
                                  'gender', 'symptoms', ['Female', 'Male'],
                                  ['cough', 'fever', 'headache'])
print(coincidences)

# stacked gender vs. plain symptoms
print('\nstacked-simple')
coincidences = count_coincidences(gender_stacked.copy(), symptoms.copy(),
                                  'gender', 'symptoms', ['Female', 'Male'],
                                  ['cough', 'fever', 'headache'])
print(coincidences)