def expect_assertion_error(*params):
    """Check that ``pg.chi2_independence(*params)`` raises ``AssertionError``.

    Parameters
    ----------
    *params
        Positional arguments forwarded unchanged to
        ``pg.chi2_independence``.
    """
    # pytest.raises fails the enclosing test if no AssertionError is raised.
    with pytest.raises(AssertionError):
        pg.chi2_independence(*params)
def test_chi2_independence(self):
    """Test function chi2_independence.

    The checks run in a fixed order on a single DataFrame that is mutated
    step by step, so later sections depend on the state left by earlier
    ones:

    1. Each row of the returned ``stats`` table is compared with
       ``chi2_contingency`` called with that row's ``lambda`` value.
    2. The function is called on data containing NaN values.
    3. Invalid arguments are expected to raise ``AssertionError``.
    4. An all-NaN column is expected to raise ``ValueError``.
    5. A degenerate table (both columns constant) is checked.
    6. A ``UserWarning`` is expected after one cell is changed.
    7. Hard-coded reference values (obtained from R, per the comments
       below) are checked on ``df_ind``.
    """
    # Setup
    # Fixed seed: every random draw below (including the NaN mask) is
    # reproducible only if the statement order is kept.
    np.random.seed(42)
    mean, cov = [0.5, 0.5], [(1, .6), (.6, 1)]
    x, y = np.random.multivariate_normal(mean, cov, 30).T
    data = pd.DataFrame({'x': x, 'y': y})
    # Binarize both columns in place: values > 0.5 become 1, the rest 0.
    mask_class_1 = data > 0.5
    data[mask_class_1] = 1
    data[~mask_class_1] = 0

    # Comparing results with SciPy
    _, _, stats = pg.chi2_independence(data, x='x', y='y')
    contingency_table = pd.crosstab(data['x'], data['y'])
    for i in stats.index:
        # Each row of `stats` carries its own `lambda`, which is passed on
        # to chi2_contingency so both sides compute the same statistic.
        lambda_ = stats.at[i, 'lambda']
        dof = stats.at[i, 'dof']
        chi2 = stats.at[i, 'chi2']
        p = round(stats.at[i, 'p'], 6)
        sp_chi2, sp_p, sp_dof, _ = chi2_contingency(contingency_table,
                                                    lambda_=lambda_)
        # NOTE(review): `chi2` is compared exactly against a value rounded
        # to 3 decimals, which presumably relies on chi2_independence
        # returning an already-rounded statistic -- confirm.
        assert (chi2, p, dof) == (round(sp_chi2, 3), round(sp_p, 6),
                                  sp_dof)

    # Testing resilience to NaN
    mask_nan = np.random.random(data.shape) > 0.8  # ~20% NaN values
    data[mask_nan] = np.nan
    # Only checks that the call completes; the result is discarded.
    pg.chi2_independence(data, x='x', y='y')

    # Testing validations
    def expect_assertion_error(*params):
        # Forward the arguments and require an AssertionError.
        with pytest.raises(AssertionError):
            pg.chi2_independence(*params)
    # Below, the bare names `x` and `y` are the NumPy arrays created in
    # Setup, not the column-name strings 'x' and 'y'.
    expect_assertion_error(1, 'x', 'y')  # Not a pd.DataFrame
    expect_assertion_error(data, x, 'y')  # Not a string
    expect_assertion_error(data, 'x', y)  # Not a string
    expect_assertion_error(data, 'x', 'z')  # Not a column of data

    # Testing "no data" ValueError
    data['x'] = np.nan
    with pytest.raises(ValueError):
        pg.chi2_independence(data, x='x', y='y')

    # Testing degenerated case (observed == expected)
    # Both columns are overwritten with the constant 1.
    data['x'] = 1
    data['y'] = 1
    expected, observed, stats = pg.chi2_independence(data, 'x', 'y')
    assert expected.iloc[0, 0] == observed.iloc[0, 0]
    assert stats.at[0, 'dof'] == 0
    for i in stats.index:
        chi2 = stats.at[i, 'chi2']
        p = stats.at[i, 'p']
        assert (chi2, p) == (0.0, 1.0)

    # Testing warning on low count
    # Changing a single cell of the constant data to 0 is what is expected
    # to trigger the UserWarning.
    data.iloc[0, 0] = 0
    with pytest.warns(UserWarning):
        pg.chi2_independence(data, 'x', 'y')

    # Comparing results with R
    # `df_ind` is defined outside this method (not visible here).
    # 2 x 2 contingency table (dof = 1)
    # >>> tbl = table(df$sex, df$target)
    # >>> chisq.test(tbl, correct = TRUE)
    # >>> cramersV(tbl)
    _, _, stats = pg.chi2_independence(df_ind, 'sex', 'target')
    assert stats.at[0, 'chi2'] == 22.717
    assert stats.at[0, 'dof'] == 1
    assert np.allclose(stats.at[0, 'p'], 1.877e-06)
    assert round(stats.at[0, 'cramer'], 2) == 0.27

    # 4 x 2 contingency table
    _, _, stats = pg.chi2_independence(df_ind, 'cp', 'target')
    assert stats.at[0, 'chi2'] == 81.686
    assert stats.at[0, 'dof'] == 3.
    assert stats.at[0, 'p'] < 2.2e-16
    assert round(stats.at[0, 'cramer'], 3) == 0.519
    assert np.allclose(stats.at[0, 'power'], 1.)
# Dashed grey vertical reference line at x = 0 on the fifth axis.
axes[4].axvline(0, linestyle="--", color="grey")
# Per severity group: number of rows with CD15(FITC-A) > 0 divided by the
# number of rows in that group, i.e. the fraction of CD15-positive rows.
frac = (x3["CD15(FITC-A)"] > 0).groupby(
    x3["severity_group"]).sum() / x3["severity_group"].value_counts()
# Bar plot of the positive fraction, expressed as a percentage.
sns.barplot(frac * 100, frac.index, palette=pal, ax=axes[5])
axes[5].set(xlabel="% CD15 positive")
axes[5].set_yticklabels([])
fig.tight_layout()
# NOTE(review): the file name says "CD5" while the plotted marker is CD15
# -- confirm whether the name is intentional.
fig.savefig(
    figures_dir / "panels" / "Figure2.CD5_expression_positivity.svg",
    **figkws,
)

# NOTE(review): these imports sit in the middle of the script, and
# `fisher_exact` is not used in the portion visible here.
from scipy.stats import fisher_exact
import pingouin as pg  # type: ignore

# Overall severity_group x cluster test; the result is discarded here.
pg.chi2_independence(data=x, x="severity_group", y="cluster")

# One indicator column per severity group, each tested against `cluster`.
# The results of these calls are discarded as well.
y = x[["cluster"]].join(pd.get_dummies(x["severity_group"]))
for cat in x["severity_group"].unique():
    pg.chi2_independence(data=y, x="cluster", y=cat)

# One indicator column per cluster, each tested against `severity_group`.
# For every cluster, keep the last row of the third returned object.
y = x[["severity_group"]].join(pd.get_dummies(x["cluster"]))
v = dict()
for cat in x["cluster"].unique():
    v[cat] = pg.chi2_independence(data=y, x="severity_group", y=cat)[2].iloc[-1]

# Indicator columns for both variables at once; the columns are addressed
# below through the "severity_group_" and "cluster_" name prefixes.
y = pd.get_dummies(x[["severity_group", "cluster"]])
for seve in meta["severity_group"].cat.categories:
    for clus in x["cluster"].unique():
        # Pair of indicator columns for this severity group and cluster.
        y2 = y[["severity_group_" + seve, "cluster_" + clus]]
# Chi-square test of independence between two user-selected columns of `df`,
# followed by the start of a grouped count plot of the same two columns.
if choose_analysis == "Chi-square test":
    # Offer only object-dtype columns as categorical candidates. The builtin
    # `object` is used instead of the `np.object` alias, which was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24.
    cat_vars = df.select_dtypes(include=object).columns.tolist()
    y_var1 = st.sidebar.selectbox("Choose first categorical variable:", cat_vars)
    y_var2 = st.sidebar.selectbox("Choose second categorical variable:", cat_vars)
    # NOTE(review): `beta_expander` is the pre-release name of this widget;
    # newer Streamlit versions presumably expose it as `expander` -- confirm
    # against the pinned Streamlit version before renaming.
    expand = st.sidebar.beta_expander("More options")
    yates_correction = expand.checkbox("Use Yates correction?")
    # move_counts = expand.slider("Adjust labels for counts on bar plot", 0.0, 0.5, 0.15, 0.01)

    st.header("Chi-square test of independence:")
    st.markdown("----")
    st.success("Expected and observed frequencies:")
    expected, observed, stats = pg.chi2_independence(
        df, x=y_var1, y=y_var2, correction=bool(yates_correction))
    st.subheader("Expected")
    st.write(expected)
    st.subheader("Observed")
    st.write(observed)
    st.subheader("Chi-square test results:")
    # Show only the row labelled 0 of the results table.
    st.write(stats.loc[[0]])

    st.markdown("----")
    st.success("Frequency bars are generated:")
    st.markdown("## ")
    fig = plt.figure(figsize=(12, 6))
    total = float(len(df))
    ax = sns.countplot(x=y_var1, hue=y_var2, data=df, palette="Set2")
    # Number of distinct values of the first variable, skipping NaN
    # (`x == x` is False only for NaN).
    numX = len([x for x in df[y_var1].unique() if x == x])
def _collect_success_labels(samples, label, good_seed_only, success_var):
    """Append ``label`` to ``success_var`` once for every retained sample.

    ``samples`` is walked as a four-level nested mapping
    (term -> seed cluster -> adversarial cluster -> seed sentence) whose
    leaves are lists of samples. A sample is skipped when
    ``good_seed_only == 'True'`` and ``sample[20][0] != 'not_flipped'``.
    For every other sample, ``label`` is appended to
    ``success_var[sample[19][-1]]``.
    """
    for seed_clusters in samples.values():
        for adv_clusters in seed_clusters.values():
            for seed_sentences in adv_clusters.values():
                for sentence_samples in seed_sentences.values():
                    for sample in sentence_samples:
                        if good_seed_only == 'True' and sample[20][0] != 'not_flipped':
                            continue
                        # The last element of sample[19] names the
                        # generation strategy used as the grouping key.
                        success_var[sample[19][-1]].append(label)


def test_significance(positive_samples_path, negative_samples_path, good_seed_only):
    """ Computes the significance levels / effect size of the generation
    strategy on the success of adversarial samples.

    Parameters
    ----------
    positive_samples_path : path to a JSON file; every retained sample in it
        is labelled 1.
    negative_samples_path : path to a JSON file with the same nesting; every
        retained sample in it is labelled 0.
    good_seed_only : compared against the *string* ``'True'`` (not the
        boolean); when equal, samples whose ``sample[20][0]`` is not
        ``'not_flipped'`` are ignored.

    Returns
    -------
    None. The chi^2 statistic, p-value and Cramer's V taken from the first
    row of the test results are printed to stdout.

    Raises
    ------
    KeyError if a sample names a generation strategy other than the four
    keys initialised below.
    """
    # Read-in tables
    print('Reading-in tables ...')
    with open(positive_samples_path, 'r', encoding='utf8') as psp:
        positive_samples = json.load(psp)
    with open(negative_samples_path, 'r', encoding='utf8') as nsp:
        negative_samples = json.load(nsp)

    # Store success labels per generation strategy
    success_var = {
        'insert_at_homograph': list(),
        'replace_at_homograph': list(),
        'insert_at_other': list(),
        'replace_at_other': list()
    }

    # Collect labels: 1 for positive samples, then 0 for negative samples
    print('Looking up sample provenance ...')
    _collect_success_labels(positive_samples, 1, good_seed_only, success_var)
    _collect_success_labels(negative_samples, 0, good_seed_only, success_var)

    # Construct dataframe with one row per sample: (method, label)
    print('Computing correlations ...')
    success_dict = {'method': list(), 'labels': list()}
    for method, labels in success_var.items():
        success_dict['method'] += [method] * len(labels)
        success_dict['labels'] += labels
    unrolled_success_dict = pd.DataFrame.from_dict(success_dict)

    # Perform Chi^2 test; only the statistics table is needed
    _, _, stats = chi2_independence(unrolled_success_dict, x='method', y='labels')
    first_row = stats.iloc[0]
    chi2 = first_row['chi2']
    p = first_row['p']
    # p-values that would print as 0.0000 are reported as exactly 0.0
    p = p if p > 0.00005 else 0.0
    v = first_row['cramer']

    # Report
    print('Done!')
    print('=' * 20)
    print('CHI^2 STATS:')
    if p > 0.0:
        print('{:.3f}, {:.4f}, {:.4f}'.format(chi2, p, v))
    else:
        print('{:.3f}, {:.1f}, {:.4f}'.format(chi2, p, v))
cbar_kws=dict(label="Mean intensity\n(Z-score)"), **kws, ) grid2.savefig( output_dir / f"{panel_name}.{label}.cluster_mean_intensity.clustermap.zscore.svg", **figkws, ) plt.close(grid2.fig) # Association between factors and cluster distribution for var in clin_vars[1:]: y = a.obs[[var]].join(pd.get_dummies(a.obs["cluster"])) v = dict() for cat in a.obs["cluster"].unique(): v[cat] = pg.chi2_independence(data=y, x=var, y=cat)[2].iloc[-1] res = pd.DataFrame(v).T # same order as clustermap res = res.reindex(mean.iloc[grid1.dendrogram_row.reordered_ind].index) fig, ax = plt.subplots(1, 1, figsize=(1.430, 0.08)) cramer = res["cramer"].astype(float) points = ax.scatter( cramer.index, [0] * cramer.shape[0], s=6, c=cramer, cmap="autumn_r", marker="s", edgecolors="none",