Example #1
0
def build_partial_corr(corr_df, target, covar, method='pearson', padjust='fdr_bh', pval=0.05, covar_name=None):
    """
    Builds partial correlation DataFrame from corr_df of the target survey, controlling for covar.

    Pairwise partial correlations are computed for all columns of ``corr_df``,
    their p-values are adjusted with ``padjust``, and only the pairs that are
    significant (adjusted p < ``pval``) with ``Y`` matching ``target`` and ``X``
    not matching ``target`` are kept. The raw (uncontrolled) pairwise
    correlations are then merged in for comparison.

    corr_df (pd.DataFrame): correlation frame, assuming each row is an observation
    target (str): target column pattern, can be a string prefix or suffix; it is
        matched with ``Series.str.contains`` (pandas treats it as a regular
        expression by default)
    covar (list): a list of covariates to control for
    method (str): correlation method forwarded to ``pg.pairwise_corr``
    padjust (str): multiple-comparison correction method
    pval (float): significance level, used both as ``alpha`` for the correction
        and as the threshold for keeping a pair
    covar_name (str): optional name for covariates in the display

    Returns a pandas Styler over the merged frame (columns ``X``, ``Y``,
    ``covar``, ``r_ctl``, ``p_ctl``, ``r_unctl``, ``p_unctl``, ``n``), sorted by
    ``p_ctl`` and captioned with ``method``.
    """

    partial_corr = pg.pairwise_corr(data=corr_df, covar=covar, method=method)
    _, p_adj = pg.multicomp(partial_corr['p-unc'].values, alpha=pval, method=padjust)
    partial_corr['p-corr'] = p_adj

    significant = partial_corr['p-corr'] < pval
    x_is_not_target = ~partial_corr['X'].str.contains(target)
    y_is_target = partial_corr['Y'].str.contains(target)
    # Copy the filtered rows so the column assignments below act on an
    # independent frame instead of a slice of the unfiltered result.
    partial_corr = partial_corr.loc[significant & x_is_not_target & y_is_target].copy()
    partial_corr['r_ctl'] = partial_corr['r']
    partial_corr['p_ctl'] = partial_corr['p-corr']
    if covar_name is not None:
        partial_corr['covar'] = covar_name
    elif 'covar' not in partial_corr.columns:
        # NOTE(review): pingouin appears to add a 'covar' column itself when
        # covariates are passed -- confirm for the installed version. This
        # fallback only guarantees the column selection below cannot fail.
        partial_corr['covar'] = str(covar)

    partial_corr = partial_corr[['X', 'Y', 'covar', 'r_ctl', 'p_ctl']]

    # drop the controlling covars for the raw pairwise correlation
    pairwise_corr = pg.pairwise_corr(data=corr_df.drop(covar, axis='columns'), method=method, padjust=padjust)
    pairwise_corr['r_unctl'] = pairwise_corr['r']
    pairwise_corr['p_unctl'] = pairwise_corr['p-corr']

    # Left merge: every significant controlled pair is kept even when the
    # uncontrolled table has no matching (X, Y) row.
    partial_corr = partial_corr.merge(pairwise_corr[['X', 'Y', 'r_unctl', 'p_unctl', 'n']], on=['X', 'Y'], how='left').sort_values('p_ctl')
    return partial_corr.style.set_caption(method)
Example #2
0
        # Tidy every axes of the grid: symlog y-scale starting at zero and a
        # shortened title.
        for ax in grid.axes.flat:
            ax.set(yscale="symlog")
            ax.set_ylim(bottom=0)
            # Strip the "population = " prefix from the current title.
            var = ax.get_title().replace("population = ", "")
            try:
                # Titles shaped like "<child>/<parent>" keep only the child
                # part; the first group is greedy, so the split happens at the
                # last "/" in the title.
                child, parent = re.findall(r"(.*)/(.*)", var)[0]
                ax.set_title(child)
                ax.set_ylabel("Cells / uL")
            except IndexError:
                # No "/" in the title: findall returned an empty list.
                ax.set_title(var)

        # Save the figure to `figfile`, then close it.
        grid.savefig(figfile)
        plt.close(grid.fig)


import pingouin as pg

# Attach the severity group to the measurement matrix (joined on the index)
# and drop the categories that no longer occur in it.
m = matrix.join(meta[["severity_group"]])
m["severity_group"] = m["severity_group"].cat.remove_unused_categories()

# One non-parametric pairwise comparison table per variable, i.e. per column
# except the trailing "severity_group" one, each tagged with its variable name.
per_variable = [
    pg.pairwise_ttests(
        data=m, dv=var, between="severity_group", parametric=False
    ).assign(variable=var)
    for var in m.columns[:-1]
]
res = pd.concat(per_variable)
res = res.drop(columns=["Contrast"])

# Benjamini-Hochberg adjustment across all comparisons of all variables.
adjusted = pg.multicomp(res["p-unc"].values, method="fdr_bh")
res["p-cor"] = adjusted[1]
res.to_csv("diff.absolute.csv", index=False)
Example #3
0
# Histogram of the fitted slopes (not shown unless plt.show() is enabled).
plt.hist(x=dataFrame.slope)
#plt.show()

if withImage_anova == 'yes':
    # Two-way mixed ANOVA on the slopes: 'group' is the between-subject
    # factor and 'eye' the within-subject factor.
    aov = pg.mixed_anova(dv='slope',
                         between='group',
                         within='eye',
                         subject='subject',
                         data=dataFrame)

    # Benjamini-Hochberg FDR correction (method='fdr_bh') of the first three
    # uncorrected ANOVA p-values.
    # NOTE(review): this was previously labelled "Bonferroni" although the
    # code has always requested 'fdr_bh' -- confirm which one is intended.
    pvals = [aov['p-unc'][0], aov['p-unc'][1], aov['p-unc'][2]]
    reject, pvals_corr = pg.multicomp(pvals, method='fdr_bh')
    print(reject, pvals_corr)

    for sub in subjects:

        data = cs_data.loc[cs_data.id == sub]

        y, slope, eye = getRegressionCoeff(data)
        x = data.test_num.unique().tolist()

        # Assign color for plots
        if sub[0:3] == 'ASW':
            plot_palette = plot_colors[3]
        elif sub[0:2] == 'AS' or sub[0:2] == 'AM':
            plot_palette = plot_colors[0]
        elif sub[0:2] == 'AA':
Example #4
0
def _stack_frames(frames):
    """Stack per-target result frames row-wise; empty frame if none were collected."""
    if not frames:
        return pandas.DataFrame()
    return pandas.concat(frames, ignore_index=True)


def _group_labels(data):
    """Return the distinct non-null 'Group' labels in order of first appearance."""
    return data['Group'].dropna().drop_duplicates(keep='first').values.tolist()


def _parametric_two_groups(data, targets, mean, paired):
    """Run one t-test per target between the first two groups."""
    groups = _group_labels(data)
    results = []
    for item in targets:
        df = data[data['Target Name'].eq(item)]
        group1 = df[df['Group'].eq(groups[0])][mean]
        group2 = df[df['Group'].eq(groups[1])][mean]
        # `paired` is a real boolean here; bool('False') would be True.
        t_test = ttest(group1, group2, paired=paired)
        t_test['paired'] = 'TRUE' if paired else 'FALSE'
        t_test['Target Name'] = item
        results.append(t_test)

    # reformat output table
    stats_dfs = _stack_frames(results).rename(columns={
        'cohen-d': 'effect size',
        'BF10': 'Bayes factor',
        'dof': 'DF'
    })
    cols = [
        'Target Name', 'DF', 'T', 'tail', 'paired', 'p-val',
        'effect size', 'power', 'Bayes factor'
    ]
    return stats_dfs.reindex(columns=cols), pandas.DataFrame()


def _parametric_anova(data, targets, mean, paired, one_way):
    """Run an ANOVA plus pairwise t-test post-hocs per target (3+ groups)."""
    anova_frames = []
    posthoc_frames = []
    pvals = []
    for item in targets:
        in_target = data['Target Name'].eq(item)
        if paired:
            if one_way:
                # one-way repeated measures ANOVA
                aov = pg.rm_anova(dv=mean,
                                  data=data[in_target],
                                  within='Group',
                                  subject='Sample Name',
                                  detailed=True)
                pvals.append(aov['p-unc'][0])
                # keep only the first row of the detailed table
                aov = aov.drop([1])
                aov['measures'] = ['dependent']
                aov['Target Name'] = item
            else:
                # two-way repeated measures ANOVA
                aov = pg.rm_anova(dv=mean,
                                  data=data[in_target],
                                  within=['Group1', 'Group2'],
                                  subject='Sample Name',
                                  detailed=True)
                _, pval_corr_tw = pg.multicomp(list(aov['p-unc']),
                                               alpha=0.05,
                                               method='bonf')
                aov['p-value corrected'] = pval_corr_tw
                aov['measures'] = ['dependent'] * 3
                aov['Target Name'] = [item] * 3
            # NOTE(review): the post-hoc uses within='Group' even in the
            # two-way case, where the ANOVA uses 'Group1'/'Group2' -- confirm.
            ph = pairwise_ttests(data=data[in_target],
                                 dv=mean,
                                 within='Group',
                                 subject='Sample Name',
                                 padjust='fdr_bh')
        else:
            if one_way:
                aov = pg.anova(dv=mean,
                               between='Group',
                               data=data[in_target],
                               detailed=True)
                pvals.append(aov['p-unc'][0])
                # keep only the first row of the detailed table
                aov = aov.drop([1])
                aov['measures'] = ['independent']
                aov['Target Name'] = item
                ph = pairwise_ttests(data=data[in_target],
                                     dv=mean,
                                     between='Group',
                                     padjust='fdr_bh')
            else:
                aov = pg.anova(dv=mean,
                               between=['Group1', 'Group2'],
                               data=data[in_target],
                               detailed=False)
                # drop the row labelled 3 so three rows remain per target
                aov = aov.drop([3])
                _, pval_corr_tw = pg.multicomp(list(aov['p-unc']),
                                               alpha=0.05,
                                               method='bonf')
                aov['p-value corrected'] = pval_corr_tw
                aov['measures'] = ['independent'] * 3
                aov['Target Name'] = [item] * 3
                ph = pairwise_ttests(data=data[in_target],
                                     dv=mean,
                                     between=['Group1', 'Group2'],
                                     padjust='fdr_bh')
        ph['Target Name'] = item
        ph['Test'] = 'T-Test'
        anova_frames.append(aov)
        posthoc_frames.append(ph)

    stats_dfs = _stack_frames(anova_frames)
    posthoc_dfs = _stack_frames(posthoc_frames)

    # Bonferroni correction across targets (only filled for one-way designs)
    _, pvals_corr = pg.multicomp(pvals, alpha=0.05, method='bonf')

    # reformat output tables
    stats_dfs = stats_dfs.rename(columns={
        'p-unc': 'p-value',
        'np2': 'effect size'
    })
    # one-way designs contribute one row per target, two-way designs three
    n_rows = len(targets) if one_way else len(targets) * 3
    if one_way:
        stats_dfs['p-value corrected'] = pvals_corr
    stats_dfs['distribution'] = ['parametric'] * n_rows
    stats_dfs['test'] = ['ANOVA'] * n_rows
    stats_dfs['statistic'] = ['NA'] * n_rows
    cols = [
        'Target Name', 'Source', 'DF', 'F', 'MS', 'SS', 'p-value',
        'p-value corrected', 'measures', 'distribution', 'test',
        'statistic', 'effect size'
    ]
    # reindex keeps only the listed columns (extras such as 'eps' are dropped)
    stats_dfs = stats_dfs.reindex(columns=cols)

    if one_way:
        posthoc_dfs = posthoc_dfs.drop(['Contrast', 'T'], axis=1)
    else:
        posthoc_dfs = posthoc_dfs.drop(['T'], axis=1)
    posthoc_dfs = posthoc_dfs.rename(
        columns={
            'hedges': 'effect size',
            'p-corr': 'p-value corrected',
            'p-unc': 'p-value',
            'p-adjust': 'correction method',
            'BF10': 'Bayes factor',
            'dof': 'DF'
        })
    if one_way:
        cols2 = [
            'Target Name', 'A', 'B', 'DF', 'p-value corrected',
            'p-value', 'correction method', 'Paired', 'Parametric',
            'Test', 'effect size', 'Bayes factor'
        ]
    else:
        cols2 = [
            'Target Name', 'Contrast', 'Group1', 'A', 'B', 'DF',
            'p-value corrected', 'p-value', 'correction method',
            'Paired', 'Parametric', 'Test', 'effect size',
            'Bayes factor'
        ]
    return stats_dfs, posthoc_dfs.reindex(columns=cols2)


def _nonparametric_two_groups(data, targets, mean, paired):
    """Run Wilcoxon (paired) or Mann-Whitney U (independent) per target."""
    groups = _group_labels(data)
    results = []
    for item in targets:
        df = data[data['Target Name'].eq(item)]
        group1 = df[df['Group'].eq(groups[0])][mean]
        group2 = df[df['Group'].eq(groups[1])][mean]
        if paired:
            # Wilcoxon signed-rank test for repeated measurements
            test = wilcoxon(group1, group2)
        else:
            # Mann-Whitney U test for independent groups
            test = mannwhitneyu(group1, group2)
        results.append(
            pandas.DataFrame(
                {
                    'Target Name': item,
                    'pvalue': test.pvalue,
                    'statistic': test.statistic
                },
                index=[0]))
    return _stack_frames(results), pandas.DataFrame()


def _nonparametric_multi_group(data, targets, mean, paired):
    """Run Friedman (paired) or Kruskal-Wallis (independent) per target, with post-hocs."""
    test_frames = []
    posthoc_frames = []
    pvals = []
    for item in targets:
        in_target = data['Target Name'].eq(item)
        if paired:
            # friedman test for repeated measurements
            df = pg.friedman(dv=mean,
                             within='Group',
                             subject='Sample Name',
                             data=data[in_target])
            pvals.append(df['p-unc'][0])
            df['test'] = ['Friedman Q']
            df['measures'] = ['dependent']
            df = df.rename(columns={'Q': 'statistic'})
            ph = pairwise_ttests(data=data[in_target],
                                 dv=mean,
                                 within='Group',
                                 subject='Sample Name',
                                 padjust='fdr_bh',
                                 parametric=False)
            posthoc_name = 'Wilcoxon'
        else:
            # Kruskal-Wallis H test
            df = pg.kruskal(dv=mean,
                            between='Group',
                            data=data[in_target])
            pvals.append(df['p-unc'][0])
            df['test'] = ['Kruskal-Wallis H']
            df['measures'] = ['independent']
            df = df.rename(columns={'H': 'statistic'})
            ph = pairwise_ttests(data=data[in_target],
                                 dv=mean,
                                 between='Group',
                                 padjust='fdr_bh',
                                 parametric=False)
            posthoc_name = 'Mann-Whitney U'
        df['Target Name'] = item
        df['DF'] = 'NA'
        ph['Target Name'] = item
        ph['DF'] = 'NA'
        ph['Bayes factor'] = 'NA'
        ph['Test'] = posthoc_name
        test_frames.append(df)
        posthoc_frames.append(ph)

    stats_dfs = _stack_frames(test_frames)
    posthoc_dfs = _stack_frames(posthoc_frames)

    _, pvals_corr = pg.multicomp(pvals, alpha=0.05, method='bonf')
    # reformat output tables
    stats_dfs = stats_dfs.rename(columns={
        'dof': 'DF',
        'p-unc': 'p-value'
    })
    stats_dfs['p-value corrected'] = pvals_corr
    stats_dfs['distribution'] = ['non-parametric'] * len(targets)
    stats_dfs['MS'] = ['NA'] * len(targets)
    stats_dfs['SS'] = ['NA'] * len(targets)
    stats_dfs['effect size'] = ['NA'] * len(targets)
    cols = [
        'Target Name', 'DF', 'MS', 'SS', 'p-value',
        'p-value corrected', 'measures', 'distribution', 'test',
        'statistic', 'effect size'
    ]
    stats_dfs = stats_dfs.reindex(columns=cols)

    posthoc_dfs = posthoc_dfs.drop(['Contrast'], axis=1)
    posthoc_dfs = posthoc_dfs.rename(
        columns={
            'hedges': 'effect size',
            'p-corr': 'p-value corrected',
            'p-unc': 'p-value',
            'p-adjust': 'correction method',
            'BF10': 'Bayes factor'
        })
    cols2 = [
        'Target Name', 'A', 'B', 'DF', 'p-value corrected', 'p-value',
        'correction method', 'Paired', 'Parametric', 'Test',
        'effect size', 'Bayes factor'
    ]
    return stats_dfs, posthoc_dfs.reindex(columns=cols2)


def stats(model, quantity, data, targets, tw, rm, nd):
    """Run the statistical tests for every target and return result tables.

    Parameters
    ----------
    model : str
        'absolute' analyses the 'NormMean' column (and drops 'NormQuant');
        any other value analyses 'rqMean' (and drops 'rq').
    quantity : int
        Number of groups: exactly 2 selects two-group tests, 3 or more
        selects ANOVA-type tests.
    data : pandas.DataFrame
        Intermediate table. Columns read here: 'Outliers', 'Target Name',
        'Group' (or 'Group1'/'Group2' for two-way designs), 'Sample Name'
        and the value columns named above. The caller's frame is not modified.
    targets : list
        Values of 'Target Name' to test, one test per entry.
    tw : str
        'False' for a one-way design; anything else is treated as two-way.
    rm : str
        'True' for repeated (paired/dependent) measures; anything else is
        treated as independent groups.
    nd : str
        'True' when the data are normally distributed (parametric tests);
        anything else selects the non-parametric tests.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The main test table and the post-hoc table. The post-hoc table is
        empty for two-group comparisons.

    Raises
    ------
    ValueError
        If ``quantity`` is neither 2 nor at least 3.
    """
    if not (quantity == 2 or quantity >= 3):
        raise ValueError(
            'quantity must be 2 or at least 3, got {!r}'.format(quantity))

    if model == 'absolute':
        data = data.drop(['NormQuant'], axis=1)
        data['NormMean'] = data['NormMean'].astype(float)
        mean = 'NormMean'
    else:
        data = data.drop(['rq'], axis=1)
        data['rqMean'] = data['rqMean'].astype(float)
        mean = 'rqMean'

    # prepare data from intermediate dataframe
    data = data[data['Outliers'].eq(False)]
    data = data.drop_duplicates(keep='first')

    # The flags arrive as the strings 'True'/'False'; compare explicitly
    # because bool('False') is True.
    paired = rm == 'True'
    one_way = tw == 'False'

    # t-test and anova for normally distributed data
    if nd == 'True':
        if quantity == 2:
            return _parametric_two_groups(data, targets, mean, paired)
        return _parametric_anova(data, targets, mean, paired, one_way)

    # nonparametric tests for not normally distributed data
    if quantity == 2:
        return _nonparametric_two_groups(data, targets, mean, paired)
    return _nonparametric_multi_group(data, targets, mean, paired)
Example #5
0
def check_mcnemar_significance(mcnemar_pvals, alpha=0.05, method="holm"):
    """Adjust a set of McNemar p-values for multiple comparisons.

    Parameters
    ----------
    mcnemar_pvals : array-like
        Uncorrected p-values, passed to ``pingouin.multicomp``.
    alpha : float
        Significance level used for the reject decision (default 0.05).
    method : str
        Correction method understood by ``pingouin.multicomp``
        (default "holm").

    Returns
    -------
    tuple
        ``(reject, pvals)`` as returned by ``pingouin.multicomp``: the
        per-test reject decisions and the corrected p-values.
    """
    import pingouin as pg

    reject, pvals = pg.multicomp(mcnemar_pvals, alpha=alpha, method=method)

    return reject, pvals
Example #6
0
def swarmboxenplot(
    data: DataFrame,
    x: str,
    y: tp.Union[str, Iterables],
    hue: str = None,
    swarm: bool = True,
    boxen: bool = True,
    bar: bool = False,
    ax: tp.Union[Axis, tp.Sequence[Axis]] = None,
    test: tp.Union[bool, str] = "mann-whitney",
    multiple_testing: tp.Union[bool, str] = "fdr_bh",
    test_upper_threshold: float = 0.05,
    test_lower_threshold: float = 0.01,
    plot_non_significant: bool = False,
    plot_kws: tp.Dict[str, tp.Any] = None,
    test_kws: tp.Dict[str, tp.Any] = None,
    fig_kws: tp.Dict[str, tp.Any] = None,
) -> tp.Optional[tp.Union[Figure, DataFrame, tp.Tuple[Figure, DataFrame]]]:
    """
    A categorical plot that overlays individual observations
    as a swarm plot and summary statistics about them in a boxen plot.

    In addition, this plot will test differences between observation
    groups and add lines representing a significant difference between
    them.

    Parameters
    ----------
    data: pd.DataFrame
        A dataframe with data where the rows are the observations and
        columns are the variables to group them by.
    x: str
        The categorical variable.
    y: str | list[str]
        The continuous variable to plot.
        If more than one is given (list, Series or Index), one subplot is
        drawn per `y` variable; a new figure is created unless `ax` is given.
    hue: str, optional
        An optional categorical variable to further group observations by.
    swarm: bool
        Whether to plot individual observations as a swarmplot.
    boxen: bool
        Whether to plot summary statistics as a boxenplot.
        Mutually exclusive with `bar`.
    bar: bool
        Whether to plot summary statistics as a barplot.
        Mutually exclusive with `boxen`.
    ax: matplotlib.axes.Axes | sequence of Axes, optional
        An optional axes (or, for several `y`, array/sequence of axes)
        to draw in.
    test: bool | str
        Whether to test differences between observation groups.
        If `False`, will not return a dataframe as well.
        If a string is passed, will perform test accordingly. Available tests:
            - 't-test':
            - 'mann-whitney':
            - 'kruskal':
        Default is a pairwise 'mann-whitney' test with p-value adjustment.
    multiple_testing: str | bool
        Method for multiple testing correction, or `False` to skip it.
    test_upper_threshold: float
        Upper threshold to consider p-values significant.
        Will be marked with "*".
    test_lower_threshold: float
        Secondary threshold to consider p-values highly significant.
        Will be marked with "**".
    plot_non_significant: bool
        Whether to add a "n.s." sign to p-values above `test_upper_threshold`.
    plot_kws: dict
        Additional values to pass to seaborn.boxenplot, seaborn.barplot
        or seaborn.swarmplot.
    test_kws: dict
        Additional values to pass to the test function
        (pingouin.pairwise_ttests or pingouin.kruskal).
        The passed dictionary is copied, not modified.
    fig_kws: dict
        Additional values to pass to matplotlib.pyplot.subplots when several
        `y` variables are given and `ax` is None.

    Returns
    -------
    tuple[Figure, pandas.DataFrame]:
        if `ax` is None and `test` is not False.
    pandas.DataFrame:
        if `ax` is not None and `test` is not False.
    Figure:
        if `ax` is None and `test` is False.
    None:
        if `test` is False and `ax` is not None.

    Raises
    ------
    ValueError:
        If either the `x` or `hue` column in `data` are not
        Category, string or object type, if `y` is not numeric,
        or if `test` is not a recognized test type.

    """
    for var, name in [(x, "x"), (hue, "hue")]:
        if var is not None:
            if data[var].dtype.name not in ["category", "string", "object"]:
                raise ValueError(
                    f"`{name}` variable must be categorical, string or object."
                )

    # Work on a copy: "parametric" may be set below and the caller's
    # dictionary must not be modified as a side effect.
    test_kws = dict(test_kws) if test_kws is not None else dict()
    if plot_kws is None:
        plot_kws = dict()

    data = data.sort_values([x] + ([hue] if hue is not None else []))

    if isinstance(y, (list, pd.Series, pd.Index)):
        # TODO: display only one legend for hue
        if ax is None:
            n, m = get_grid_dims(y)
            default_fig_kws = dict(nrows=n,
                                   ncols=m,
                                   figsize=(m * 4, n * 4),
                                   sharex=True,
                                   squeeze=False)
            default_fig_kws.update(fig_kws or {})
            fig, axes = plt.subplots(**default_fig_kws)
            axes = axes.flatten()
        elif isinstance(ax, matplotlib.axes.Axes):
            axes = np.asarray([ax])
        else:
            # numpy array or any other sequence of axes
            axes = np.asarray(ax).flatten()

        _stats = list()
        idx = -1
        for idx, _var in enumerate(y):
            _ax = axes[idx]
            s: DataFrame = swarmboxenplot(
                data=data,
                x=x,
                y=_var,
                hue=hue,
                swarm=swarm,
                boxen=boxen,
                bar=bar,
                ax=_ax,
                test=test,
                multiple_testing=multiple_testing,
                test_upper_threshold=test_upper_threshold,
                test_lower_threshold=test_lower_threshold,
                plot_non_significant=plot_non_significant,
                plot_kws=plot_kws,
                test_kws=test_kws,
            )
            _ax.set(title=_var + _ax.get_title(), xlabel=None, ylabel=None)
            if test is not False:
                _stats.append(s.assign(Variable=_var))
        # "close" excess subplots
        for _ax in axes[idx + 1:]:
            _ax.axis("off")
        if test is not False:
            stats = pd.concat(_stats).reset_index(drop=True)
            cols = [c for c in stats.columns if c != "Variable"]
            stats = stats.reindex(["Variable"] + cols, axis=1)

            # If there is just one test per `y` (no hue), correct p-values
            # across variables. `shape` is a tuple, so the row count has to
            # be compared, not the shape itself.
            if (stats.shape[0] == len(y)
                    and multiple_testing is not False
                    and "p-unc" in stats.columns):
                stats["p-cor"] = pg.multicomp(stats["p-unc"].values,
                                              method=multiple_testing)[1]
        if ax is None:
            return (fig, stats) if test else fig
        return stats if test else None

    if data[y].dtype.name in ["category", "string", "object"]:
        raise ValueError("`y` variable must be numeric.")

    if ax is None:
        fig, _ax = plt.subplots(1, 1, figsize=(4, 4))
    else:
        _ax = ax

    # Plot vanilla seaborn
    if boxen:
        assert not bar
        # Tmp fix for lack of support for Pandas Int64 in boxenplot:
        # (`data` is the sorted copy made above, not the caller's frame)
        if data[y].dtype.name == "Int64":
            data[y] = data[y].astype(float)
        boxen_kws = filter_kwargs_by_callable(plot_kws, sns.boxenplot)
        sns.boxenplot(data=data, x=x, y=y, hue=hue, ax=_ax, **boxen_kws)
    if bar:
        assert not boxen
        bar_kws = filter_kwargs_by_callable(plot_kws, sns.barplot)
        sns.barplot(data=data, x=x, y=y, hue=hue, ax=_ax, **bar_kws)

    if (boxen or bar) and swarm:
        _add_transparency_to_plot(_ax, kind="bar" if bar else "boxen")
    if swarm:
        swarm_kws = filter_kwargs_by_callable(plot_kws, sns.swarmplot)
        if hue is not None and "dodge" not in swarm_kws:
            swarm_kws["dodge"] = True
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            sns.swarmplot(data=data, x=x, y=y, hue=hue, ax=_ax, **swarm_kws)
    _ax.set_xticklabels(_ax.get_xticklabels(), rotation=90, ha="right")

    if test is not False:

        if test in [True, "t-test", "mann-whitney"]:
            test_function = pg.pairwise_ttests
            if test == "mann-whitney":
                test_kws["parametric"] = False
        elif test in ["kruskal"]:
            test_function = pg.kruskal
            assert hue is None, "If test is 'kruskal', 'hue' must be None."
        else:
            raise ValueError(f"Test type '{test}' not recognized.")
        #
        if not data.index.is_unique:
            print("Warning: dataframe contains a duplicated index.")

        # remove NaNs
        datat = data.dropna(subset=[x, y] + ([hue] if hue is not None else []))
        # remove categories with only one element
        group_sizes = datat.groupby(x).size()
        keep = group_sizes[group_sizes > 1].index
        # copy so the category clean-up below writes to an independent frame
        datat = datat.loc[datat[x].isin(keep), :].copy()
        if datat[x].dtype.name == "category":
            datat[x] = datat[x].cat.remove_unused_categories()
        ylim = _ax.get_ylim()  # save original axis boundaries for later
        ylength = abs(ylim[1]) + (abs(ylim[0]) if ylim[0] < 0 else 0)

        # Now calculate stats
        # # get empty dataframe in case nothing can be calculated
        stat = _get_empty_stat_results(datat, x, y, hue, add_median=True)
        # # mirror groups to account for own pingouin order
        tats = stat.rename(columns={
            "B": "A",
            "A": "B",
            "median_A": "median_B",
            "median_B": "median_A",
        })
        stat = (pd.concat([stat,
                           tats]).sort_values(["Contrast", "A",
                                               "B"]).reset_index(drop=True))
        try:
            _stat = test_function(
                data=datat,
                dv=y,
                between=x if hue is None else [x, hue],
                **test_kws,
            )
        except (AssertionError, ValueError) as e:
            # fall back to the empty result table when the test cannot run
            print(str(e))
            _stat = stat
        except KeyError:
            print("Only one category with values!")
            _stat = stat

        if test == "kruskal":
            # a single omnibus test: report its significance in the title
            p = _stat.squeeze()["p-unc"]
            symbol = ("**" if p <= test_lower_threshold else "n.s." if
                      ((p > test_upper_threshold) or pd.isnull(p)) else "*")
            _ax.set_title(symbol)
            return (fig, _stat) if ax is None else _stat

        stat = _stat.merge(
            stat[["Contrast", "A", "B", "median_A", "median_B"] +
                 ([x] if hue is not None else [])],
            how="left",
        )
        if multiple_testing is not False:
            if "p-unc" not in stat.columns:
                stat["p-unc"] = np.nan
            stat["p-cor"] = pg.multicomp(stat["p-unc"].values,
                                         method=multiple_testing)[1]
            pcol = "p-cor"
        else:
            pcol = "p-unc"

        # This ensures there is a point for each `x` class and keep the order
        # correct for below
        mm = data.groupby([x] + ([hue] if hue is not None else []))[y].median()
        if hue is None:
            order = {k: float(i) for i, k in enumerate(mm.index)}
        else:
            nhues = data[hue].drop_duplicates().dropna().shape[0]
            order = {
                k: (float(i) / nhues) - (1 / nhues) - 0.05
                for i, k in enumerate(mm.index)
            }
        _ax.scatter(list(order.values()), mm, alpha=0, color="white")

        # Plot significance bars
        # start at top of the plot and progressively decrease sig. bar downwards
        py = data[y].max()
        incr = ylength / 100  # divide yaxis in 100 steps
        for idx, row in stat.iterrows():
            p = row[pcol]
            if (pd.isnull(p) or
                (p > test_upper_threshold)) and (not plot_non_significant):
                py -= incr
                continue
            symbol = ("**" if p <= test_lower_threshold else "n.s." if
                      ((p > test_upper_threshold) or pd.isnull(p)) else "*")
            if hue is not None:
                if row[x] != "-":
                    xx = (order[(row[x], row["A"])], order[(row[x], row["B"])])
                else:
                    try:
                        # TODO: get more accurate middle of group
                        xx = (
                            order[(row["A"], stat["A"].iloc[-1])] -
                            (1 / nhues),
                            order[(row["B"], stat["B"].iloc[-1])] -
                            (1 / nhues),
                        )
                    except KeyError:
                        # These are the hue groups without contrasting on 'x'
                        continue
            else:
                xx = (order[row["A"]], order[row["B"]])

            red_fact = 0.95  # make the end position shorter
            _ax.plot(
                (xx[0], xx[1] * red_fact),
                (py, py),
                color="black",
                linewidth=1.2,
            )
            _ax.text(xx[1] * red_fact,
                     py,
                     s=symbol,
                     color="black",
                     ha="center")
            py -= incr
        # restore the axis limits saved before the bars were drawn
        _ax.set_ylim(ylim)
        return (fig, stat) if ax is None else stat
    return fig if ax is None else None
Example #7
0
    def qualOrdinalUnpaired(imgDir,
                            sheetName,
                            sheetDf,
                            sheetScale,
                            silent=False):
        """Test unpaired ordinal ratings between modalities and plot them.

        Builds a contingency table of rating value per modality (column of
        ``sheetDf``), runs a Kruskal-Wallis test across all modalities when
        there are more than two, runs a two-sided Mann-Whitney U test for
        every pair of modalities with Holm correction of the pairwise
        p-values, and hands the data, contingency table and statistics to
        ``StackedBarPlotter``, with ``<imgDir>/<sheetName>.png`` as filename.

        imgDir: directory part of the output image path
        sheetName: sheet title, also used as image file name and plot title
        sheetDf: DataFrame with one column per modality
        sheetScale: number of scale steps; values ``0 .. sheetScale - 1``
            missing from the contingency table are added with zero counts
        silent: when True, the banner with the sheet name is not printed
        """
        if not silent:
            print("######################################## ", sheetName,
                  " ########################################")
        meltedSheetDf = sheetDf.melt(var_name='Factor', value_name='Variable')
        contingencySheetDf = pd.crosstab(index=meltedSheetDf['Variable'],
                                         columns=meltedSheetDf['Factor'])
        statColumns = [
            'COMPARISON', 'TEST', 'STATISTICS', 'P-VALUE', 'EFFECT SIZE'
        ]
        # fill empty scale value
        for sheetStep in range(sheetScale):
            if sheetStep not in contingencySheetDf.index.values:
                contingencySheetDf.loc[sheetStep] = [0] * len(
                    contingencySheetDf.columns)
        contingencySheetDf.sort_index(inplace=True)

        # ALL MODALITY
        overallRows = []
        if len(contingencySheetDf.columns) > 2:
            sheetDf_long = sheetDf.melt(ignore_index=False).reset_index()
            kruskal_stats = pg.kruskal(data=sheetDf_long,
                                       dv="value",
                                       between="variable")
            # read the results by column name rather than by position
            overallRows.append({
                'COMPARISON': 'ALL',
                'TEST': "Kruskal-Wallis",
                'STATISTICS': kruskal_stats['H'].iloc[0],
                'P-VALUE': kruskal_stats['p-unc'].iloc[0],
                'EFFECT SIZE': -1
            })

        # BETWEEN MODALITY
        modality_names = sheetDf.columns.values
        pairRows = []
        for i, first in enumerate(modality_names):
            for second in modality_names[i + 1:]:
                stats_mannwhitney = pg.mwu(x=sheetDf.loc[:, first],
                                           y=sheetDf.loc[:, second],
                                           alternative='two-sided')
                pairRows.append({
                    'COMPARISON': first + '|' + second,
                    'TEST': "Mann-Whitney",
                    'STATISTICS': stats_mannwhitney['U-val'].iloc[0],
                    'P-VALUE': stats_mannwhitney['p-val'].iloc[0],
                    'EFFECT SIZE': stats_mannwhitney['RBC'].iloc[0]
                })

        # Holm correction over the pairwise p-values only; the overall
        # Kruskal-Wallis p-value is left uncorrected.
        if pairRows:
            reject, corrected = pg.multicomp(
                [row['P-VALUE'] for row in pairRows],
                alpha=0.05,
                method="holm")
            for row, pvalue in zip(pairRows, corrected):
                row['P-VALUE'] = pvalue

        # Built in one go because DataFrame.append no longer exists in
        # pandas 2.x; object dtype is what appending rows to an empty frame
        # used to produce.
        statDf = pd.DataFrame(overallRows + pairRows,
                              columns=statColumns,
                              dtype=object)

        StackedBarPlotter.StackedBarPlotter(filename=imgDir + '/' + sheetName +
                                            '.png',
                                            title=sheetName,
                                            dataDf=sheetDf,
                                            histDf=contingencySheetDf,
                                            statDf=statDf)