def test_expected_freq():
    """Check ``expected_freq`` on a 1-D input, a 3-way table and a 2x3 table."""
    # A single observation is its own expected frequency.
    assert_array_equal(expected_freq([1]), np.array([1.0]))

    # Three-way table for which every expected frequency equals one.
    three_way = np.array([
        [[2, 0], [0, 2]],
        [[0, 2], [2, 0]],
        [[1, 1], [1, 1]],
    ])
    assert_array_equal(expected_freq(three_way), np.ones_like(three_way))

    # 2x3 table compared against known expected frequencies
    # (row total * column total / grand total).
    two_way = np.array([[10, 10, 20], [20, 20, 20]])
    wanted = np.array([[12., 12., 16.], [18., 18., 24.]])
    assert_array_almost_equal(expected_freq(two_way), wanted)
Esempio n. 2
0
def test_expected_freq():
    """Exercise ``expected_freq`` with three inputs of increasing rank."""
    # One-element input: the result is that element as a float.
    single = expected_freq([1])
    assert_array_equal(single, np.array([1.0]))

    # 3x2x2 table whose expected frequencies are all ones.
    cube = np.array(
        [[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
    cube_expected = expected_freq(cube)
    assert_array_equal(cube_expected, np.ones_like(cube))

    # 2x3 table checked (up to rounding) against reference values.
    table = np.array([[10, 10, 20], [20, 20, 20]])
    reference = np.array([[12., 12., 16.], [18., 18., 24.]])
    table_expected = expected_freq(table)
    assert_array_almost_equal(table_expected, reference)
Esempio n. 3
0
def check_for_fisher(df, var1, var2):
    """Return True when the ``var1`` x ``var2`` table qualifies for Fisher's test.

    The cross-tabulation of ``df[var1]`` against ``df[var2]`` must be 2x2.
    It qualifies when any expected frequency is at most
    ``GLOBAL_EXPECTED_FREQ`` or when ``df`` has at most
    ``GLOBAL_N_FOR_FISHER`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both variables.
    var1, var2 : hashable
        Column labels of the two variables to cross-tabulate.

    Returns
    -------
    bool
        ``True`` if the table is 2x2 and one of the two conditions holds,
        ``False`` otherwise.
    """
    expected = expected_freq(pd.crosstab(df[var1], df[var2]))

    # Only 2x2 tables are considered.
    if expected.shape != (2, 2):
        return False

    has_small_cell = (expected <= GLOBAL_EXPECTED_FREQ).any(axis=None)
    return bool(has_small_cell or len(df) <= GLOBAL_N_FOR_FISHER)
Esempio n. 4
0
    def chi_square_yats(self):
        """Run a continuity-corrected power-divergence test on the observed data.

        The value of ``self.get_observed()`` (presumably the observed
        contingency table -- confirm against the class) is wrapped in a
        one-element list, which adds a leading axis of length one. Every
        observed count is then moved by 0.5 towards its expected frequency
        before the statistic is computed.

        Returns
        -------
        The result of ``power_divergence`` computed over the flattened
        tables (``axis=None``) with the default ``lambda_``.
        """
        raw = np.asarray([self.get_observed()])
        expected = expected_freq(raw)

        # Degrees of freedom of the independence hypothesis for this table.
        dof = expected.size - sum(expected.shape) + expected.ndim - 1

        # Shift each count half a unit in the direction of its expectation.
        shift = 0.5 * np.sign(expected - raw)
        corrected = raw + shift

        # power_divergence uses ``k - 1 - ddof`` degrees of freedom, so pick
        # ddof such that the test ends up with ``dof``.
        delta_dof = corrected.size - 1 - dof
        return power_divergence(corrected,
                                expected,
                                ddof=delta_dof,
                                axis=None,
                                lambda_=None)
Esempio n. 5
0
def get_expected_values(crosstab):
    """Return the expected frequencies of ``crosstab``.

    Thin wrapper that forwards ``crosstab`` to ``expected_freq`` and
    returns its result unchanged.
    """
    return expected_freq(crosstab)
def cross_chi2(index, columns):
    """Cross-tabulate two variables and report six chi-square style tests.

    Parameters
    ----------
    index, columns
        Values forwarded to ``pd.crosstab`` as its ``index`` and ``columns``
        arguments.

    Returns
    -------
    list
        Three items, each the result of ``transform_table_data_to_html``:
        the cross table with totals, the expected-frequency table, and the
        table of test statistics (value, p-value, degrees of freedom) for
        the pearson, log-likelihood, freeman-tukey, mod-log-likelihood,
        neyman and cressie-read statistics.
    """
    table_with_totals = pd.crosstab(index=index, columns=columns,
                                    margins=True)
    # The table handed to the tests must not contain the totals row/column
    # (changed on 8/25).
    table = pd.crosstab(index=index, columns=columns, margins=False)

    statistic_names = [
        "pearson", "log-likelihood", "freeman-tukey", "mod-log-likelihood",
        "neyman", "cressie-read"
    ]
    # One test per statistic, always with the continuity correction enabled.
    outcomes = [
        chi2_contingency(table, correction=True, lambda_=statistic)
        for statistic in statistic_names
    ]
    # The expected table returned by each test is not used here.
    chi_res = [[format(value, ".4f"), format(p_value, ".4f"), dof]
               for value, p_value, dof, _ in outcomes]

    # Relabel the margin row/column produced by ``margins=True``.
    row_labels = table_with_totals.index.tolist()
    row_labels[-1] = '总计'
    column_labels = table_with_totals.columns.tolist()
    column_labels[-1] = '总计'
    counts = table_with_totals.values.tolist()

    # sum_data / format_data_col are project helpers; presumably they append
    # totals and format the cells -- confirm against their definitions.
    expected_frame = sum_data(pd.DataFrame(expected_freq(table)))
    expected_rows = format_data_col(expected_frame).values.tolist()

    observed_html = transform_table_data_to_html({
        'title': "交叉表",
        'row': row_labels,
        'col': list(column_labels),
        'data': counts
    })
    expected_html = transform_table_data_to_html({
        'title': "期望频数表",
        'row': row_labels,
        'col': column_labels,
        'data': expected_rows
    })
    tests_html = transform_table_data_to_html({
        'title': "卡方检验",
        'row': list(statistic_names),
        'col': ['值', '显著性', '自由度'],
        'data': chi_res
    })
    return [observed_html, expected_html, tests_html]
Esempio n. 7
0
def test_marginal_sums(contingency_table, threshold=5):
    """ Return True if the expected marginal sums are all above 5,
    in which case the chi square test of independency is generally
    considered valid"""
    expected_frequencies = contingency.expected_freq(contingency_table.values)
    return np.all(np.greater(expected_frequencies, threshold))
Esempio n. 8
0
def chi2_independence(data, x, y, correction=True):
    """
    Chi-squared independence tests between two categorical variables.

    The test is computed for different values of :math:`\\lambda`: 1, 2/3, 0,
    -1/2, -1 and -2 (Cressie and Read, 1984).

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        The dataframe containing the occurrences for the test.
    x, y : string
        The variables names for the Chi-squared test. Must be names of columns
        in ``data``.
    correction : bool
        Whether to apply Yates' correction when the degree of freedom of the
        observed contingency table is 1 (Yates 1934).

    Returns
    -------
    expected : pd.DataFrame
        The expected contingency table of frequencies.
    observed : pd.DataFrame
        The (corrected or not) observed contingency table of frequencies.
    stats : :py:class:`pandas.DataFrame`
        The test summary, with one row per statistic and these columns:

        * ``'test'``: The statistic name
        * ``'lambda'``: The :math:`\\lambda` value used for the power\
                        divergence statistic
        * ``'chi2'``: The test statistic
        * ``'dof'``: The degrees of freedom of the test
        * ``'p'``: The p-value of the test
        * ``'cramer'``: The Cramer's V effect size
        * ``'power'``: The statistical power of the test

        When the table has zero degrees of freedom no test is run:
        ``'chi2'`` is 0, ``'p'`` is 1 and ``'cramer'`` / ``'power'`` are NaN.

    Raises
    ------
    AssertionError
        If an argument has the wrong type or ``x`` / ``y`` is not a column
        of ``data``.
    ValueError
        If the contingency table of ``x`` and ``y`` is empty.

    Notes
    -----
    From Wikipedia:

    *The chi-squared test is used to determine whether there is a significant
    difference between the expected frequencies and the observed frequencies
    in one or more categories.*

    As application examples, this test can be used to *i*) evaluate the
    quality of a categorical variable in a classification problem or to *ii*)
    check the similarity between two categorical variables. In the first
    example, a good categorical predictor and the class column should present
    high :math:`\\chi^2` and low p-value. In the second example, similar
    categorical variables should present low :math:`\\chi^2` and high p-value.

    This function is a wrapper around the
    :py:func:`scipy.stats.power_divergence` function.

    .. warning :: As a general guideline for the consistency of this test, the
        observed and the expected contingency tables should not have cells
        with frequencies lower than 5. A warning is issued when they do.

    References
    ----------
    .. [1] Cressie, N., & Read, T. R. (1984). Multinomial goodness‐of‐fit
           tests. Journal of the Royal Statistical Society: Series B
           (Methodological), 46(3), 440-464.

    .. [2] Yates, F. (1934). Contingency Tables Involving Small Numbers and the
           :math:`\\chi^2` Test. Supplement to the Journal of the Royal
           Statistical Society, 1, 217-235.

    Examples
    --------
    Let's see if gender is a good categorical predictor for the presence of
    heart disease.

    >>> import pingouin as pg
    >>> data = pg.read_dataset('chi2_independence')
    >>> data['sex'].value_counts(ascending=True)
    0     96
    1    207
    Name: sex, dtype: int64

    If gender is not a good predictor for heart disease, we should expect the
    same 96:207 ratio across the target classes.

    >>> expected, observed, stats = pg.chi2_independence(data, x='sex',
    ...                                                  y='target')
    >>> expected
    target          0           1
    sex
    0       43.722772   52.277228
    1       94.277228  112.722772

    Let's see what the data tells us.

    >>> observed
    target      0     1
    sex
    0        24.5  71.5
    1       113.5  93.5

    The proportion is lower on the class 0 and higher on the class 1. The
    tests should be sensitive to this difference.

    >>> stats.round(3)
                     test  lambda    chi2  dof    p  cramer  power
    0             pearson   1.000  22.717  1.0  0.0   0.274  0.997
    1        cressie-read   0.667  22.931  1.0  0.0   0.275  0.998
    2      log-likelihood   0.000  23.557  1.0  0.0   0.279  0.998
    3       freeman-tukey  -0.500  24.220  1.0  0.0   0.283  0.998
    4  mod-log-likelihood  -1.000  25.071  1.0  0.0   0.288  0.999
    5              neyman  -2.000  27.458  1.0  0.0   0.301  0.999

    Very low p-values indeed. The gender qualifies as a good predictor for the
    presence of heart disease on this dataset.
    """
    # Python code inspired by SciPy's chi2_contingency
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert isinstance(x, str), 'x must be a string.'
    assert isinstance(y, str), 'y must be a string.'
    assert x in data.columns and y in data.columns,\
        'columns are not in dataframe.'
    assert isinstance(correction, bool), 'correction must be a boolean.'

    observed = pd.crosstab(data[x], data[y])
    if observed.size == 0:
        raise ValueError('No data; observed has size 0.')

    expected = pd.DataFrame(expected_freq(observed),
                            index=observed.index,
                            columns=observed.columns)

    # Cell counts below 5 make the test unreliable; warn per table.
    for label, table in (('observed', observed), ('expected', expected)):
        has_low_count = (table < 5).any(axis=None)
        if has_low_count:
            warnings.warn('Low count on {} frequencies.'.format(label))

    dof = float(expected.size - sum(expected.shape) + expected.ndim - 1)

    if correction and dof == 1:
        # Yates' continuity correction: move each observed count half a unit
        # towards its expected frequency.
        observed = observed + 0.5 * np.sign(expected - observed)

    # power_divergence uses ``k - 1 - ddof`` degrees of freedom; choose ddof
    # so that the test is run with ``dof``.
    ddof = observed.size - 1 - dof
    n = data.shape[0]

    statistics = (
        ("pearson", 1.0),
        ("cressie-read", 2 / 3),
        ("log-likelihood", 0.0),
        ("freeman-tukey", -1 / 2),
        ("mod-log-likelihood", -1.0),
        ("neyman", -2.0),
    )

    rows = []
    for name, lambda_ in statistics:
        # Values reported when the table has no degrees of freedom.
        row = {
            'test': name,
            'lambda': lambda_,
            'chi2': 0.0,
            'dof': dof,
            'p': 1.0,
            'cramer': np.nan,
            'power': np.nan
        }
        if dof != 0:
            chi2, p = power_divergence(observed,
                                       expected,
                                       ddof=ddof,
                                       axis=None,
                                       lambda_=lambda_)
            dof_cramer = min(expected.shape) - 1
            cramer = np.sqrt(chi2 / (n * dof_cramer))
            power = power_chi2(dof=dof, w=cramer, n=n, alpha=0.05)
            row.update(chi2=chi2, p=p, cramer=cramer, power=power)
        rows.append(row)

    column_order = ['test', 'lambda', 'chi2', 'dof', 'p', 'cramer', 'power']
    stats = pd.DataFrame(rows)[column_order]
    return expected, observed, stats
def pointwise_mutual_information(contingency_matrix):
    """Compute the pointwise mutual information of the table's first cell.

    The value is the base-2 logarithm of the observed count in cell
    ``[0][0]`` divided by that cell's expected frequency.

    Parameters
    ----------
    contingency_matrix
        Observed counts, indexable as ``contingency_matrix[0][0]`` and
        accepted by ``expected_freq``.

    Returns
    -------
    dict
        ``{'pmi': value}``.
    """
    expected = expected_freq(contingency_matrix)
    observed_cell = contingency_matrix[0][0]
    expected_cell = expected[0][0]
    return {'pmi': np.log2(observed_cell / expected_cell)}
Esempio n. 10
0
def meets_cochran(ser):
    """Check two expected-frequency conditions for ``ser`` versus its complement.

    A two-row table is built from ``ser`` and
    ``cluster_stats2.cluster_sizes - ser``. The check passes when every
    rounded expected frequency is at least 1 and more than 80% of the
    expected frequencies are greater than 5.

    Parameters
    ----------
    ser
        Counts that can be subtracted from ``cluster_stats2.cluster_sizes``
        (presumably one count per cluster -- confirm against the caller).

    Returns
    -------
    Truthy if both conditions hold, falsy otherwise.
    """
    complement = cluster_stats2.cluster_sizes - ser
    expected = expected_freq(np.array([ser, complement]))

    all_at_least_one = (np.round(expected) >= 1).all()
    share_above_five = (expected > 5).sum() / expected.size
    mostly_above_five = share_above_five > 0.8
    return all_at_least_one and mostly_above_five
Esempio n. 11
0
def has_zero_expected(ser):
    """Tell whether any expected frequency rounds to zero.

    The expected frequencies are computed for the two-row table made of
    ``ser`` and ``cluster_stats2.cluster_sizes - ser``.

    Parameters
    ----------
    ser
        Counts that can be subtracted from ``cluster_stats2.cluster_sizes``
        (presumably one count per cluster -- confirm against the caller).

    Returns
    -------
    numpy.bool_
        True if at least one rounded expected frequency equals 0.
    """
    complement = cluster_stats2.cluster_sizes - ser
    table = np.array([ser, complement])
    rounded = np.round(expected_freq(table))
    return np.any(rounded == 0)
Esempio n. 12
0
def tabella_di_contingenza(dataframe,
                           colonna_A,
                           colonna_B,
                           ordine_A=False,
                           ordine_B=False,
                           informativo=False,
                           norm_axis=False):
    '''
    Build the contingency table (with totals) of two columns.

    dataframe: the table on which to perform the cross-tabulation
    colonna_A: string with the header of the column used for the rows
    colonna_B: string with the header of the column used for the columns
    ordine_A: list of labels giving the order of the rows; the table is
        reindexed with it, so include the 'All' margin label to keep the
        totals. False leaves the order unchanged.
    ordine_B: same as ordine_A, for the columns
    informativo: True puts, in every cell, the frequency, the expected
        frequency, the deviation (frequency - expected) and a proportion,
        formatted as text. Otherwise the plain counts are returned.
    norm_axis: which proportion to append when informativo is True:
        False -> share of the grand total, "index" -> share of the row,
        "columns" -> share of the column. With any other value the plain
        counts are returned.

    Returns the contingency table as a pandas DataFrame.
    '''
    # ``DataFrame.applymap`` is deprecated since pandas 2.1 in favour of
    # ``DataFrame.map``; use whichever this pandas version provides.
    per_cell = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap

    def in_parentheses(value):
        # Render a number the way every derived quantity is shown.
        return "( {:.2f})".format(value)

    # TODO (original author): add a table with deviations and percentages;
    # row percentages should be inserted here as well.
    crosstab = pd.crosstab(dataframe[colonna_A],
                           dataframe[colonna_B],
                           margins=True)

    if ordine_A != False:  # noqa: E712 - False is the "no ordering" sentinel
        crosstab = crosstab.reindex(ordine_A, axis=0)
    if ordine_B != False:  # noqa: E712
        crosstab = crosstab.reindex(ordine_B, axis=1)

    if informativo == True:  # noqa: E712 - same accepted values as before
        # Map ``norm_axis`` onto the ``normalize`` argument of pd.crosstab.
        if norm_axis == False:  # noqa: E712
            normalize = "all"
        elif norm_axis in ("index", "columns"):
            normalize = norm_axis
        else:
            normalize = None  # unsupported value: keep the plain counts

        if normalize is not None:
            expected = pd.DataFrame(expected_freq(crosstab),
                                    index=crosstab.index,
                                    columns=crosstab.columns)
            # Only the normalisation that is actually shown is computed.
            # NOTE(review): this table is not reordered with ordine_A /
            # ordine_B; the addition below aligns on labels -- confirm the
            # resulting row/column order when an ordering is supplied.
            proportions = per_cell(
                pd.crosstab(dataframe[colonna_A],
                            dataframe[colonna_B],
                            margins=True,
                            normalize=normalize), in_parentheses)
            crosstab = (per_cell(crosstab, str) + " " +
                        per_cell(expected, in_parentheses) + " " +
                        per_cell(crosstab - expected, in_parentheses) +
                        " " + proportions)

    return crosstab
Esempio n. 13
0
         len(controls) - len(segment[3])
     ]
 ]
 if method == 'chi':
     p = chi2_contingency(contingency_table, correction=yates)[1]
     if yates:
         method_name = 'Yates chi-squared'
     else:
         method_name = 'Chi-squared'
 elif method == 'fisher':
     p = fisher_exact(contingency_table)[1]
     method_name = 'Fisher'
 elif method == 'g':
     p = power_divergence(
         contingency_table[0] + contingency_table[1],
         f_exp=expected_freq(contingency_table).ravel(),
         ddof=2,
         lambda_='log-likelihood')[1]
     method_name = 'G-test'
 else:
     expected_frequency_table = expected_freq(contingency_table)
     num_large_cells = 0
     num_small_cells = 0
     for row in expected_frequency_table:
         for cell in row:
             if cell >= 5:
                 num_large_cells += 1
             elif cell < 1:
                 num_small_cells += 1
                 break
     if num_large_cells >= 3 and num_small_cells == 0:
Esempio n. 14
0
def contingency_table(dataframe,
                      columns_a,
                      columns_b,
                      order_a=False,
                      order_b=False,
                      informative=True,
                      norm_axis=False):
    '''
    Build the contingency table (with totals) of two columns.

    dataframe: the table on which to make the cross tabulation; it is not
        modified
    columns_a: text string with the header of the column used for the rows
    columns_b: text string with the header of the column used for the columns
    order_a: list of values giving the order of the categories of column A;
        values of the column that are not listed become missing. False
        leaves the column as it is.
    order_b: same as order_a, for column B
    informative: True puts, in every cell, the frequency, the expected
        frequency, the deviation (frequency - expected) and a proportion,
        formatted as text. Otherwise the plain counts are returned.
    norm_axis: which proportion to append when informative is True:
        False -> share of the grand total, "index" -> share of the row,
        "columns" -> share of the column. With any other value the plain
        counts are returned.

    Returns the contingency table as a pandas DataFrame.
    '''
    # ``DataFrame.applymap`` is deprecated since pandas 2.1 in favour of
    # ``DataFrame.map``; use whichever this pandas version provides.
    per_cell = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap

    def in_parentheses(value):
        # Render a number the way every derived quantity is shown.
        return "( {:.2f})".format(value)

    def as_ordered(column, order):
        # Categorical copy of one column; the caller's dataframe used to be
        # overwritten in place, which leaked the conversion to the caller.
        return pd.Series(pd.Categorical(dataframe[column], categories=order),
                         index=dataframe.index,
                         name=column)

    values_a = dataframe[columns_a]
    if order_a != False:  # noqa: E712 - False is the "no ordering" sentinel
        values_a = as_ordered(columns_a, order_a)

    values_b = dataframe[columns_b]
    if order_b != False:  # noqa: E712
        values_b = as_ordered(columns_b, order_b)

    crosstab = pd.crosstab(values_a, values_b, margins=True, dropna=False)

    if informative == True:  # noqa: E712 - same accepted values as before
        # Map ``norm_axis`` onto the ``normalize`` argument of pd.crosstab.
        if norm_axis == False:  # noqa: E712
            normalize = "all"
        elif norm_axis in ("index", "columns"):
            normalize = norm_axis
        else:
            normalize = None  # unsupported value: keep the plain counts

        if normalize is not None:
            expected = pd.DataFrame(expected_freq(crosstab),
                                    index=crosstab.index,
                                    columns=crosstab.columns)
            # Only the normalisation that is actually shown is computed.
            proportions = per_cell(
                pd.crosstab(values_a,
                            values_b,
                            margins=True,
                            normalize=normalize,
                            dropna=False), in_parentheses)
            crosstab = (per_cell(crosstab, str) + " " +
                        per_cell(expected, in_parentheses) + " " +
                        per_cell(crosstab - expected, in_parentheses) +
                        " " + proportions)

    return crosstab
Esempio n. 15
0
     #Cases without segment         Controls without segment
     [len(cases) - len(segment[2]), len(controls) - len(segment[3])]
 ]
 if method == 'chi':
     p = chi2_contingency(contingency_table, correction=yates)[1]
     if yates:
         method_name = 'Yates chi-squared'
     else:
         method_name = 'Chi-squared'
 elif method == 'fisher':
     p = fisher_exact(contingency_table)[1]
     method_name = 'Fisher'
 elif method == 'g':
     p = power_divergence(
         contingency_table[0] + contingency_table[1],
         f_exp=expected_freq(contingency_table).ravel(),
         ddof=2,
         lambda_='log-likelihood'
     )[1]
     method_name = 'G-test'
 else:
     expected_frequency_table = expected_freq(contingency_table)
     num_large_cells = 0
     num_small_cells = 0
     for row in expected_frequency_table:
         for cell in row:
             if cell >= 5:
                 num_large_cells += 1
             elif cell < 1:
                 num_small_cells += 1
                 break