Ejemplo n.º 1
0
def ROME_plot(df, rvar, lev, pred, qnt=10, cost=1, margin=2):
    """
    Plot Return on Marketing Expenditures (ROME) against the cumulative
    proportion of customers

    df: A pandas dataframe or a dictionary of dataframes with keys. When a
        dictionary is passed, each key is appended to the curve label
    rvar, lev, pred, qnt, cost, margin: passed on to ROME(). pred can be a
        single column name or a list of column names (one curve per name)

    Returns the object returned by sns.lineplot

    Examples
    --------
    ROME_plot(df, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    ROME_plot(df, "buyer", "yes", ["pred_a", "pred_b"], cost=0.5, margin=6)
    dct = {"Training": df.query("training == 1"), "Test": df.query("training == 0")}
    ROME_plot(dct, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    """
    frames = ifelse(type(df) is dict, df, {"": df})
    predictors = ifelse(type(pred) is list, pred, [pred])
    several_curves = len(predictors) > 1 or len(frames.keys()) > 1
    # only color by predictor when more than one curve is drawn
    hue = ifelse(several_curves, "predictor", None)

    tables = []
    for key in frames.keys():
        for p in predictors:
            tab = ROME(frames[key], rvar, lev, p, qnt=qnt, cost=cost, margin=margin)
            label = p + ifelse(key == "", key, f" ({key})")
            tables.append(tab.assign(predictor=label))

    rome_data = pd.concat(tables)
    ax = sns.lineplot(x="cum_prop", y="ROME", data=rome_data, hue=hue, marker="o")
    ax.set(
        ylabel="Return on Marketing Expenditures (ROME)",
        xlabel="Proportion of customers",
    )
    # reference line at a ROME of zero
    ax.axhline(0, linestyle="--", linewidth=1)
    return ax
Ejemplo n.º 2
0
def lift_plot(df, rvar, lev, pred, qnt=10):
    """
    Plot cumulative lift against the cumulative proportion of customers

    df: A pandas dataframe or a dictionary of dataframes with keys. When a
        dictionary is passed, each key is appended to the curve label
    rvar, lev, pred, qnt: passed on to lift(). pred can be a single column
        name or a list of column names (one curve per name)

    Returns the object returned by sns.lineplot

    Examples
    --------
    lift_plot(df, "buyer", "yes", "pred_a")
    lift_plot(df, "buyer", "yes", ["pred_a", "pred_b"], qnt=20)
    dct = {"Training": df.query("training == 1"), "Test": df.query("training == 0")}
    lift_plot(dct, "buyer", "yes", "pred_a")
    """
    frames = ifelse(type(df) is dict, df, {"": df})
    predictors = ifelse(type(pred) is list, pred, [pred])
    several_curves = len(predictors) > 1 or len(frames.keys()) > 1
    # only color by predictor when more than one curve is drawn
    hue = ifelse(several_curves, "predictor", None)

    tables = []
    for key in frames.keys():
        for p in predictors:
            tab = lift(frames[key], rvar, lev, p, qnt=qnt)
            label = p + ifelse(key == "", key, f" ({key})")
            tables.append(tab.assign(predictor=label))

    lift_data = pd.concat(tables)
    ax = sns.lineplot(
        x="cum_prop", y="cum_lift", data=lift_data, hue=hue, marker="o"
    )
    # reference line at a lift of one
    ax.axhline(1, linestyle="--", linewidth=1)
    ax.set(ylabel="Cumulative lift", xlabel="Proportion of customers")
    return ax
Ejemplo n.º 3
0
def profit_plot(df, rvar, lev, pred, qnt=10, cost=1, margin=2):
    """
    Plot cumulative profit against the cumulative proportion of customers

    df: A pandas dataframe or a dictionary of dataframes with keys. When a
        dictionary is passed, each key is appended to the curve label
    rvar, lev, pred, qnt, cost, margin: passed on to profit() and
        confusion(). pred can be a single column name or a list of column
        names (one curve per name)

    A dashed vertical line is drawn at the last value returned by
    confusion() for every dataframe/predictor combination

    Returns the object returned by sns.lineplot

    Examples
    --------
    profit_plot(df, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    profit_plot(df, "buyer", "yes", ["pred_a", "pred_b"], cost=0.5, margin=6)
    dct = {"Training": df.query("training == 1"), "Test": df.query("training == 0")}
    profit_plot(dct, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    """
    frames = ifelse(type(df) is dict, df, {"": df})
    predictors = ifelse(type(pred) is list, pred, [pred])
    several_curves = len(predictors) > 1 or len(frames.keys()) > 1
    hue = ifelse(several_curves, "predictor", None)

    # positions for the vertical reference lines
    vlines = []
    for key in frames.keys():
        for p in predictors:
            vlines.append(
                confusion(frames[key], rvar, lev, p, cost=cost, margin=margin)[-1]
            )

    tables = []
    for key in frames.keys():
        for p in predictors:
            tab = profit(frames[key], rvar, lev, p, qnt=qnt, cost=cost, margin=margin)
            label = p + ifelse(key == "", key, f" ({key})")
            tables.append(tab.assign(predictor=label))

    profit_data = pd.concat(tables)
    ax = sns.lineplot(
        x="cum_prop", y="cum_profit", data=profit_data, hue=hue, marker="o"
    )
    ax.set(ylabel="Profit", xlabel="Proportion of customers")
    ax.axhline(1, linestyle="--", linewidth=1)
    for position in vlines:
        ax.axvline(position, linestyle="--", linewidth=1)
    return ax
Ejemplo n.º 4
0
Archivo: perf.py Proyecto: vnijs/pyrsm
def ROME_plot(df, rvar, lev, pred, qnt=10, cost=1, margin=2, marker="o", **kwargs):
    """
    Plot a ROME (Return on Marketing Expenditures) curve

    Parameters
    ----------
    df : Pandas dataframe or a dictionary of dataframes
        When a dictionary is passed, one curve is drawn per key and the key
        is appended to the curve label
    rvar : str
        Name of the response variable column in df
    lev : str
        Name of the 'success' level in rvar
    pred : str or list of str
        Name(s) of the column(s) in df with model predictions
    qnt : int
        Number of quantiles to create
    cost : int
        Cost of an action
    margin : int
        Benefit of an action if a successful outcome results from the action
    marker : str
        Marker to use for line plot
    **kwargs : Named arguments to be passed to the seaborn lineplot function

    Returns
    -------
    Seaborn object
        Plot of ROME per quantile

    Examples
    --------
    ROME_plot(df, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    ROME_plot(df, "buyer", "yes", ["pred_a", "pred_b"], cost=0.5, margin=6)
    dct = {"Training": df.query("training == 1"), "Test": df.query("training == 0")}
    ROME_plot(dct, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    """
    frames = ifelse(type(df) is dict, df, {"": df})
    predictors = ifelse(type(pred) is list, pred, [pred])
    several_curves = len(predictors) > 1 or len(frames.keys()) > 1
    # only color by predictor when more than one curve is drawn
    hue = ifelse(several_curves, "predictor", None)

    tables = []
    for key in frames.keys():
        for p in predictors:
            tab = ROME_tab(
                frames[key], rvar, lev, p, qnt=qnt, cost=cost, margin=margin
            )
            label = p + ifelse(key == "", key, f" ({key})")
            tables.append(tab.assign(predictor=label))

    rome_data = pd.concat(tables)
    ax = sns.lineplot(
        x="cum_prop", y="ROME", data=rome_data, hue=hue, marker=marker, **kwargs
    )
    ax.set(
        ylabel="Return on Marketing Expenditures (ROME)",
        xlabel="Proportion of customers",
    )
    # reference line at a ROME of zero
    ax.axhline(0, linestyle="--", linewidth=1)
    return ax
Ejemplo n.º 5
0
def coef_plot(fitted, alpha=0.05, intercept=False, incl=None, excl=None, figsize=None):
    """
    Coefficient plot

    Parameters
    ----------
    fitted : A fitted linear regression model
        Must provide `conf_int(alpha=...)` and `params`
    alpha : float
        Significance level
    intercept : bool
        Include intercept in coefficient plot (True or False)
    incl : str or list of strings
        Variables to include in the coefficient plot. All will be included by default
    excl : str or list of strings
        Variables to exclude from the coefficient plot. None are excluded by default
    figsize : tuple
        Passed to plt.figure. The default (None) leaves the size to matplotlib

    Returns
    -------
    Matplotlib object
        Axes with the coefficients and their confidence intervals
    """

    def name_mask(frame, names):
        # A name matches either exactly ("x") or as the start of a
        # categorical term ("x[..."). The anchors are built from plain text:
        # a "\b" inside a non-raw string is a backspace character, not a
        # word boundary, and would prevent exact names from ever matching.
        names = names if isinstance(names, list) else [names]
        rx = "(" + "|".join([f"^{v}$|^{v}\\[" for v in names]) + ")"
        return frame["index"].str.match(rx)

    # iloc reverses the order of the rows
    df = fitted.conf_int(alpha=alpha).reset_index().iloc[::-1]
    df["coefficient"] = fitted.params[df["index"]].dropna().values

    if not intercept:
        df = df.query('index != "Intercept"')

    if incl is not None:
        keep = name_mask(df, incl)
        if intercept:
            # a requested intercept is always shown, identified by name
            keep = keep | (df["index"] == "Intercept")
        df = df[keep]

    if excl is not None:
        drop = name_mask(df, excl)
        if intercept:
            # a requested intercept is never dropped
            drop = drop & (df["index"] != "Intercept")
        df = df[~drop]

    low, high = [100 * alpha / 2, 100 * (1 - (alpha / 2))]
    df.columns = ["index", f"{low}%", f"{high}%", "coefficient"]
    # distance from the coefficient to the lower and upper interval bounds
    err = [df["coefficient"] - df[f"{low}%"], df[f"{high}%"] - df["coefficient"]]

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot()
    ax.axvline(0, ls="dashdot")
    ax.errorbar(x="coefficient", y="index", data=df, xerr=err, fmt="none")
    ax.scatter(x="coefficient", y="index", data=df)
    ax.set(xlabel="Coefficient")
    return ax
Ejemplo n.º 6
0
Archivo: perf.py Proyecto: vnijs/pyrsm
def auc(rvar, pred, lev=1):
    """
    Calculate area under the ROC curve (AUC)

    Calculation adapted from https://stackoverflow.com/a/50202118/1974918

    Parameters
    ----------
    rvar : Pandas series, numpy vector, or list
        Vector with the response variable
    pred : Pandas series, numpy vector, or list
        Vector with model predictions
    lev : str, int, or None
        The 'success' level in rvar (default 1). Use None when rvar already
        contains booleans that should be used as-is

    Returns
    -------
    float :
        AUC metric. Values below 0.5 are reflected (1 - value), so the result
        is never smaller than 0.5. The result is not finite when rvar
        contains only one class

    Examples
    --------
    auc(dvd.buy, np.random.uniform(size=20000), "yes")
    auc(dvd.buy, rsm.ifelse(dvd.buy == "yes", 1, 0), "yes")
    """
    # Work on a plain array: positional access on a Series is label-based
    # and fails when the index does not contain 0
    success = np.asarray(rvar)
    if lev is not None or success.dtype != bool:
        success = success == lev

    failure = np.logical_not(success)
    n1 = np.sum(failure)
    n2 = np.sum(success)

    # rank-sum (Mann-Whitney U) statistic for the non-success cases
    U = np.sum(rankdata(pred)[failure]) - n1 * (n1 + 1) / 2
    wt = U / n1 / n2
    return 1 - wt if wt < 0.5 else wt
Ejemplo n.º 7
0
def or_ci(fitted, alpha=0.05, intercept=False, dec=3):
    """
    Odds ratios with confidence intervals

    Parameters
    ----------
    fitted      A fitted logistic regression model
    alpha       Significance level
    intercept   Keep the "Intercept" row in the output (True or False)
    dec         Number of decimal places; None skips rounding

    Return
    ------
    A dataframe with odds ratios ("OR"), the percentage change in odds
    ("OR%"), and the confidence interval bounds
    """
    table = pd.DataFrame(np.exp(fitted.params), columns=["OR"])
    table["OR%"] = 100 * ifelse(
        table["OR"] < 1, -(1 - table["OR"]), table["OR"] - 1
    )

    # percentile labels for the interval columns, e.g. 2.5% and 97.5%
    lower, upper = 100 * alpha / 2, 100 * (1 - (alpha / 2))
    table[[f"{lower}%", f"{upper}%"]] = np.exp(fitted.conf_int(alpha=alpha))

    if dec is not None:
        table = table.round(dec)

    table["OR%"] = [f"{value}%" for value in table["OR%"]]
    table = table.reset_index()

    if intercept is not False:
        return table
    return table.loc[table["index"] != "Intercept"]
Ejemplo n.º 8
0
def coef_ci(fitted, alpha=0.05, intercept=False, dec=3):
    """
    Regression coefficients with confidence intervals and p.values

    Parameters
    ----------
    fitted : A fitted linear regression model
    alpha : float
        Significance level
    intercept : bool
        Include intercept in the output (True or False)
    dec : int
        Number of decimal places to use in rounding. None skips rounding

    Returns
    -------
    Pandas dataframe with regression coefficients, confidence intervals,
    p.values (shown as "< .001" when below 0.001), and the output of
    sig_stars
    """

    table = pd.DataFrame({"coefficient": fitted.params})

    # percentile labels for the interval columns, e.g. 2.5% and 97.5%
    lower, upper = 100 * alpha / 2, 100 * (1 - (alpha / 2))
    table[[f"{lower}%", f"{upper}%"]] = fitted.conf_int(alpha=alpha)

    if dec is not None:
        table = table.round(dec)
        shown = fitted.pvalues.round(dec)
    else:
        shown = fitted.pvalues
    table["p.values"] = ifelse(fitted.pvalues < 0.001, "< .001", shown)

    table["  "] = sig_stars(fitted.pvalues)
    table = table.reset_index()

    if intercept is not False:
        return table
    return table.loc[table["index"] != "Intercept"]
Ejemplo n.º 9
0
def evalreg(df, rvar, pred, dec=3):
    """
    Evaluate regression models. Calculates R-squared, MSE, and MAE

    Parameters
    ----------
    df : Pandas dataframe or a dictionary of dataframes with keys to show results for
        multiple model predictions and datasets (training and test)
    rvar : str
        Name of the response variable column in df
    pred : str
        Name of the column, or list of column names, in df with model predictions
    dec : int
        Number of decimal places to use in rounding

    Returns
    -------
    Pandas dataframe with one row per dataframe/predictor combination and
    columns Type, predictor, n, r2, mse, and mae

    Examples
    --------
    evalreg(df, "price", "pred_a")
    evalreg({"Training": train, "Test": test}, "price", ["pred_a", "pred_b"])
    """

    dct = ifelse(type(df) is dict, df, {"All": df})
    pred = ifelse(type(pred) is list, pred, [pred])

    def calculate_metrics(key, dfm, pm):
        return pd.DataFrame().assign(
            Type=[key],
            predictor=[pm],
            n=[dfm.shape[0]],
            r2=[metrics.r2_score(dfm[rvar], dfm[pm])],
            mse=[metrics.mean_squared_error(dfm[rvar], dfm[pm])],
            mae=[metrics.mean_absolute_error(dfm[rvar], dfm[pm])],
        )

    rows = [calculate_metrics(key, val, p) for key, val in dct.items() for p in pred]
    if not rows:
        # pd.concat refuses an empty list; keep returning an empty frame
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated result for every added row
    result = pd.concat(rows, ignore_index=True)
    return result.round(dec)
Ejemplo n.º 10
0
def distr_plot(df, nint=25, **kwargs):
    """
    Plot histograms for numeric variables and frequency plots for categorical
    variables. Columns of type integer with less than 25 unique values will be
    treated as categorical. To change this behavior, increase or decrease the
    value of the 'nint' argument

    Plots are arranged in a grid with two columns, filled row by row. Columns
    that are neither numeric nor categorical are skipped with a printed
    message and their grid cell is left empty.

    Parameters
    ----------
    df : Pandas dataframe
    nint: int
        The number of unique values in a series of type integer below which the
        series will be treated as a categorical variable
    **kwargs : Named arguments to be passed to the pandas plotting methods
    """
    ncol = df.shape[1]
    if ncol == 0:
        # a grid with zero rows cannot be created
        return

    # squeeze=False keeps `axes` two-dimensional even for a single row of
    # plots (dataframes with one or two columns)
    fig, axes = plt.subplots(
        math.ceil(ncol / 2), 2, figsize=(10, 1.5 * ncol), squeeze=False
    )
    plt.subplots_adjust(wspace=0.25, hspace=0.3)

    for i, c in enumerate(df.columns):
        s = df[c]
        ax = axes[i // 2, i % 2]
        few_integers = pd.api.types.is_integer_dtype(s.dtype) and s.nunique() < nint
        if few_integers or isinstance(s.dtype, pd.CategoricalDtype):
            s.value_counts(sort=False).plot.bar(
                ax=ax, title=c, rot=0, color="slateblue", **kwargs
            )
        elif pd.api.types.is_numeric_dtype(s.dtype):
            s.plot.hist(ax=ax, title=c, rot=0, color="slateblue", **kwargs)
        else:
            print(f"No plot for {c} (type {s.dtype})")
Ejemplo n.º 11
0
def sim_prediction(df, vary=None, nnv=5):
    """
    Simulate data for prediction

    Every column is fixed at a single value (the mean for numeric columns,
    the most frequent value otherwise) except the columns listed in `vary`,
    which take multiple values. All combinations are generated with
    expand_grid.

    Parameters
    ----------
    df : Pandas dataframe
    vary : str, list of column names, or dictionary
        Column name(s) for which values are derived from df, or a dictionary
        with column names as keys and the values to use. The default (None)
        varies no columns
    nnv : int
        Number of values to use to simulate the effect of a numeric variable

    Returns
    -------
    The result of expand_grid for the values to use for prediction
    """

    def fix_value(s):
        if pd.api.types.is_numeric_dtype(s.dtype):
            return s.mean()
        else:
            return s.value_counts().idxmax()

    if vary is None:
        # None replaces a mutable default argument
        vary = []

    dct = {c: [fix_value(df[c])] for c in df.columns}
    dtypes = df.dtypes
    if isinstance(vary, dict):
        # user provided values and ranges
        for key, val in vary.items():
            dct[key] = val
    else:
        # auto derived values and ranges
        vary = vary if isinstance(vary, list) else [vary]
        for v in vary:
            if pd.api.types.is_numeric_dtype(df[v].dtype):
                nu = df[v].nunique()
                if nu > 2:
                    # evenly spaced values between min and max, never more
                    # points than there are unique values
                    dct[v] = np.linspace(df[v].min(), df[v].max(), min([nu, nnv]))
                else:
                    dct[v] = [df[v].min(), df[v].max()]
            else:
                dct[v] = df[v].unique()

    return expand_grid(dct, dtypes)
Ejemplo n.º 12
0
def varprop(x, na=True):
    """
    Calculate the variance for a proportion

    Parameters
    ----------
    x : List, numpy array, or pandas series
        Numeric variable with only values 0 and 1
    na : bool
        Drop missing values before calculating (True or False)

    Returns
    -------
    float
        Calculated variance for a proportion, p * (1 - p), where p is the
        mean of x. The result is nan when na is False and x has missing values

    Examples
    --------
    varprop([0, 1, 1, 1, 0, 0, 0])
    """

    # compute only the requested mean so that no work (or warnings) come from
    # the branch that is not used
    p = np.nanmean(x) if na else np.mean(x)
    return p * (1 - p)
Ejemplo n.º 13
0
def test_ifelse_true():
    """ifelse returns its second argument when the condition is true."""
    outcome = ifelse(3 > 2, "greater", "smaller")
    assert outcome == "greater", "Logical comparison in ifelse incorrect"
Ejemplo n.º 14
0
Archivo: perf.py Proyecto: vnijs/pyrsm
def evalbin(df, rvar, lev, pred, cost=1, margin=2, dec=3):
    """
    Evaluate binary classification models. Calculates TP, FP, TN, FN, contact, total,
    TPR, TNR, precision, Fscore, accuracy, profit, ROME, AUC, kappa, and profit index

    Parameters
    ----------
    df : Pandas dataframe or a dictionary of dataframes with keys to show results for
        multiple model predictions and datasets (training and test)
    rvar : str
        Name of the response variable column in df
    lev : str
        Name of the 'success' level in rvar
    pred : str
        Name of the column, or list of column names, in df with model predictions
    cost : int
        Cost of an action
    margin : int
        Benefit of an action if a successful outcome results from the action
    dec : int
        Number of decimal places to use in rounding

    Returns
    -------
    Pandas dataframe with one row per dataframe/predictor combination. The
    "index" column holds profit divided by the largest profit within the
    same Type

    Examples
    --------
    evalbin(df, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    evalbin({"Training": train, "Test": test}, "buyer", "yes", ["pred_a", "pred_b"])
    """

    dct = ifelse(type(df) is dict, df, {"All": df})
    pred = ifelse(type(pred) is list, pred, [pred])

    def calculate_metrics(key, dfm, pm):
        TP, FP, TN, FN, contact = confusion(dfm, rvar, lev, pm, cost, margin)
        total = TN + FN + FP + TP
        TPR = TP / (TP + FN)
        TNR = TN / (TN + FP)
        precision = TP / (TP + FP)
        profit = margin * TP - cost * (TP + FP)

        fpr, tpr, _ = metrics.roc_curve(dfm[rvar], dfm[pm], pos_label=lev)
        # predictions above the break-even rate are treated as positives
        # when calculating kappa
        break_even = cost / margin
        gtbe = dfm[pm] > break_even
        pos = dfm[rvar] == lev

        return pd.DataFrame().assign(
            Type=[key],
            predictor=[pm],
            TP=[TP],
            FP=[FP],
            TN=[TN],
            FN=[FN],
            total=[total],
            TPR=[TPR],
            TNR=[TNR],
            precision=[precision],
            Fscore=[2 * (precision * TPR) / (precision + TPR)],
            accuracy=[(TP + TN) / total],
            kappa=[metrics.cohen_kappa_score(pos, gtbe)],
            profit=[profit],
            index=[0],  # placeholder, filled in once all rows are available
            ROME=[profit / (cost * (TP + FP))],
            contact=[contact],
            AUC=[metrics.auc(fpr, tpr)],
        )

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated result for every added row
    result = pd.concat(
        [calculate_metrics(key, val, p) for key, val in dct.items() for p in pred],
        ignore_index=True,
    )
    result["index"] = result.groupby("Type")["profit"].transform(lambda x: x / x.max())
    return result.round(dec)
Ejemplo n.º 15
0
Archivo: perf.py Proyecto: vnijs/pyrsm
def profit_plot(
    df, rvar, lev, pred, qnt=10, cost=1, margin=2, contact=False, marker="o", **kwargs
):
    """
    Plot a profit curve

    Parameters
    ----------
    df : Pandas dataframe or a dictionary of dataframes
        When a dictionary is passed, one curve is drawn per key and the key
        is appended to the curve label
    rvar : str
        Name of the response variable column in df
    lev : str
        Name of the 'success' level in rvar
    pred : str or list of str
        Name(s) of the column(s) in df with model predictions
    qnt : int
        Number of quantiles to create
    cost : int
        Cost of an action
    margin : int
        Benefit of an action if a successful outcome results from the action
    contact : bool
        Plot a vertical line that shows the optimal contact level. Requires
        that `pred` is a series of probabilities. Values equal to 1 (100% contact)
        will not be plotted
    marker : str
        Marker to use for line plot
    **kwargs : Named arguments to be passed to the seaborn lineplot function

    Returns
    -------
    Seaborn object
        Plot of profits per quantile

    Examples
    --------
    profit_plot(df, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    profit_plot(df, "buyer", "yes", ["pred_a", "pred_b"], cost=0.5, margin=6)
    dct = {"Training": df.query("training == 1"), "Test": df.query("training == 0")}
    profit_plot(dct, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    """
    frames = ifelse(type(df) is dict, df, {"": df})
    predictors = ifelse(type(pred) is list, pred, [pred])
    several_curves = len(predictors) > 1 or len(frames.keys()) > 1
    hue = ifelse(several_curves, "predictor", None)

    tables = []
    for key in frames.keys():
        for p in predictors:
            tab = profit_tab(
                frames[key], rvar, lev, p, qnt=qnt, cost=cost, margin=margin
            )
            label = p + ifelse(key == "", key, f" ({key})")
            tables.append(tab.assign(predictor=label))

    profit_data = pd.concat(tables)
    ax = sns.lineplot(
        x="cum_prop",
        y="cum_profit",
        data=profit_data,
        hue=hue,
        marker=marker,
        **kwargs
    )
    ax.set(ylabel="Profit", xlabel="Proportion of customers")
    ax.axhline(1, linestyle="--", linewidth=1)

    if not contact:
        return ax

    # last value returned by confusion() for every curve
    levels = []
    for key in frames.keys():
        for p in predictors:
            levels.append(
                confusion(frames[key], rvar, lev, p, cost=cost, margin=margin)[-1]
            )
    for level in levels:
        # skip 100% contact
        if level < 1:
            ax.axvline(level, linestyle="--", linewidth=1)
    return ax
Ejemplo n.º 16
0
def test_ifelse_false():
    """ifelse returns its third argument when the condition is false."""
    outcome = ifelse(2 > 3, "greater", "smaller")
    assert outcome == "smaller", "Logical comparison in ifelse incorrect"
Ejemplo n.º 17
0
def test_ifelse_array():
    """ifelse works element-wise when the condition is a numpy array."""
    values = np.array([2, 3, 4])
    expected = np.array([0, 1, 1])
    outcome = ifelse(values > 2, 1, 0)
    assert all(outcome == expected), "Logical comparison of np.array in ifelse incorrect"
Ejemplo n.º 18
0
Archivo: logit.py Proyecto: vnijs/pyrsm
def or_plot(fitted,
            alpha=0.05,
            intercept=False,
            incl=None,
            excl=None,
            figsize=None):
    """
    Odds ratio plot

    Parameters
    ----------
    fitted : A fitted logistic regression model
    alpha : float
        Significance level
    intercept : bool
        Include intercept in odds-ratio plot (True or False)
    incl : str or list of strings
        Variables to include in the odds-ratio plot. All will be included by default
    excl : str or list of strings
        Variables to exclude from the odds-ratio plot. None are excluded by default
    figsize : tuple
        Passed to plt.figure

    Returns
    -------
    Matplotlib object
        Axes with odds ratios and their confidence intervals on a log scale
    """

    # or_ci is called with dec=100 so values are effectively not rounded;
    # iloc reverses the row order
    odds = or_ci(fitted, alpha=alpha, intercept=intercept, dec=100)
    odds = odds.dropna().iloc[::-1]

    if incl is not None:
        wanted = ifelse(isinstance(incl, list), incl, [incl])
        # a name matches exactly or as the start of a term such as "x[...]"
        rx = "(" + "|".join([f"^{v}$|^{v}\\[" for v in wanted]) + ")"
        keep = odds["index"].str.match(fr"{rx}")
        if intercept:
            keep[0] = True
        odds = odds[keep]

    if excl is not None:
        unwanted = ifelse(isinstance(excl, list), excl, [excl])
        rx = "(" + "|".join([f"^{v}$|^{v}\\[" for v in unwanted]) + ")"
        drop = odds["index"].str.match(fr"{rx}")
        if intercept:
            drop[0] = False
        odds = odds[~drop]

    lower, upper = 100 * alpha / 2, 100 * (1 - (alpha / 2))
    # distance from the odds ratio to the lower and upper interval bounds
    below = odds["OR"] - odds[f"{lower}%"]
    above = odds[f"{upper}%"] - odds["OR"]

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot()
    ax.axvline(1, ls="dashdot")
    ax.errorbar(x="OR", y="index", data=odds, xerr=[below, above], fmt="none")
    ax.scatter(x="OR", y="index", data=odds)
    ax.set_xscale("log")
    ax.xaxis.set_minor_formatter(ticker.NullFormatter())
    locator = ticker.LogLocator(subs=[0.1, 0.2, 0.5, 1, 2, 5, 10])
    ax.xaxis.set_major_locator(locator)
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter("{x:.1f}"))
    ax.set(xlabel="Odds-ratio")
    return ax
Ejemplo n.º 19
0
    def summary(self, output=["observed", "expected"], dec=2):
        """
        Print different output tables for a cross_tabs object

        Parameters
        ----------
        output : list of tables to show
            Options include "observed" (observed frequencies),
            "expected" (expected frequencies), "chisq" (chi-square values)
            for each cell, "dev_std" (standardized deviations from expected)
            "perc_row" (percentages conditioned by row), "perc_col"
            (percentages conditioned by column), "perc" (percentages by the
            total number of observations). The default value is ["observed", "expected"]
        dec : int
            Number of decimal places to use in rounding

        Examples
        --------
        import pyrsm as rsm
        rsm.load_data(pkg="basics", name="newspaper", dct=globals())
        ct = rsm.cross_tabs(newspaper)
        ct.summary()
        """

        output = ifelse(type(output) is list, output, [output])
        prn = f"""
Cross-tabs
Variables: {self.var1}, {self.var2}
Null hyp: there is no association between {self.var1} and {self.var2}
Alt. hyp: there is an association between {self.var1} and {self.var2}
"""
        if "observed" in output:
            prn += f"""
Observed:

{self.observed.applymap(lambda x: "{:,}".format(x))}
"""
        if "expected" in output:
            prn += f"""
Expected: (row total x column total) / total

{self.expected.round(dec).applymap(lambda x: "{:,}".format(x))}
"""
        if "chisq" in output:
            prn += f"""
Contribution to chi-squared: (o - e)^2 / e

{self.chisq.round(dec).applymap(lambda x: "{:,}".format(x))}
"""

        if "dev_std" in output:
            prn += f"""
Deviation standardized: (o - e) / sqrt(e)

{self.dev_std.round(dec).applymap(lambda x: "{:,}".format(x))}
"""

        if "perc_row" in output:
            prn += f"""
Row percentages:

{self.perc_row.transform(lambda x: (100*x).round(dec).astype(str) + "%")}
"""
        if "perc_col" in output:
            prn += f"""
Column percentages:

{self.perc_col.transform(lambda x: (100*x).round(dec).astype(str) + "%")}
"""
        if "perc_all" in output:
            prn += f"""
Percentages:

{self.perc.transform(lambda x: (100*x).round(dec).astype(str) + "%")}
"""
        prn += f"""
Chi-squared: {round(self.chisq_test[0], dec)} df({int(self.chisq_test[2])}), p.value {ifelse(self.chisq_test[1] < 0.001, "< .001", round(self.chisq_test[1], dec))}
{100 * round(self.expected_low[0] / self.expected_low[1], dec)}% of cells have expected values below 5
"""
        print(prn)
Ejemplo n.º 20
0
    def plot(self, output="perc_col", **kwargs):
        """
        Bar plots of the tables stored in a cross_tabs object

        Parameters
        ----------
        output : str or list of tables to plot
            Options include "observed" (observed frequencies),
            "expected" (expected frequencies), "chisq" (chi-square values)
            for each cell, "dev_std" (standardized deviations from expected)
            "perc_row" (percentages conditioned by row), "perc_col"
            (percentages conditioned by column), "perc" (percentages by the
            total number of observations). The default value is "perc_col"
        **kwargs : Named arguments to be passed to pandas plotting functions

        Examples
        --------
        import pyrsm as rsm
        rsm.load_data(pkg="basics", name="newspaper", dct=globals())
        ct = rsm.cross_tabs(newspaper, "Income", "Newspaper")
        ct.plot()
        """
        output = ifelse(type(output) is list, output, [output])

        def without_totals(tab):
            # transpose, then remove the "Total" column and the "Total" row
            return tab.transpose().drop(columns="Total").drop("Total", axis=0)

        def as_row_percentages(tab):
            # rescale every row so that it sums to 100
            return tab.apply(lambda x: x * 100 / sum(x), axis=1)

        # plotting options are shared by (and carried over between) the
        # requested plots; user supplied kwargs take precedence
        opts = {"rot": False}

        if "observed" in output:
            data = as_row_percentages(without_totals(self.observed))
            opts["title"] = "Observed frequencies"
            opts.update(**kwargs)
            data.plot.bar(stacked=True, **opts)

        if "expected" in output:
            data = as_row_percentages(without_totals(self.expected))
            opts["title"] = "Expected frequencies"
            opts.update(**kwargs)
            data.plot.bar(stacked=True, **opts)

        if "chisq" in output:
            data = without_totals(self.chisq)
            opts["title"] = "Contribution to chi-squared statistic"
            opts.update(**kwargs)
            data.plot.bar(**opts)

        if "dev_std" in output:
            data = self.dev_std.transpose()
            opts["title"] = "Deviation standardized"
            opts.update(**kwargs)
            _, ax = plt.subplots()
            data.plot.bar(**opts, ax=ax)
            # dashed reference lines at +/- 1.96 and +/- 1.64
            for bound in (1.96, 1.64, -1.96, -1.64):
                ax.axhline(y=bound, color="black", linestyle="--")
            ax.annotate("95%", xy=(0, 2.1), va="bottom", ha="center")
            ax.annotate("90%", xy=(0, 1.4), va="top", ha="center")

        if "perc_col" in output:
            data = without_totals(self.perc_col)
            opts["title"] = "Column percentages"
            opts.update(**kwargs)
            data.plot.bar(**opts)

        if "perc_row" in output:
            data = without_totals(self.perc_row)
            opts["title"] = "Row percentages"
            opts.update(**kwargs)
            data.plot.bar(**opts)

        if "perc" in output:
            data = without_totals(self.perc)
            opts["title"] = "Table percentages"
            opts.update(**kwargs)
            data.plot.bar(**opts)
Ejemplo n.º 21
0
Archivo: logit.py Proyecto: vnijs/pyrsm
def or_ci(fitted,
          alpha=0.05,
          intercept=False,
          importance=False,
          data=None,
          dec=3):
    """
    Odds ratios with confidence intervals

    Parameters
    ----------
    fitted : A fitted logistic regression model
    alpha : float
        Significance level
    intercept : bool
        Include intercept in output (True or False)
    importance : bool
        Calculate variable importance. Only meaningful if data
        used in estimation was standardized prior to model
        estimation
    data : Pandas dataframe
        Unstandardized data used to calculate descriptive
        statistics
    dec : int
        Number of decimal places to use in rounding

    Returns
    -------
    Pandas dataframe with odds ratios, confidence intervals, p.values, and
    the output of sig_stars. Importance and descriptive statistics columns
    are added when requested
    """

    table = pd.DataFrame(np.exp(fitted.params), columns=["OR"]).dropna()
    table["OR%"] = 100 * ifelse(
        table["OR"] < 1, -(1 - table["OR"]), table["OR"] - 1
    )

    # percentile labels for the interval columns, e.g. 2.5% and 97.5%
    lower, upper = 100 * alpha / 2, 100 * (1 - (alpha / 2))
    table[[f"{lower}%", f"{upper}%"]] = np.exp(fitted.conf_int(alpha=alpha))
    table["p.values"] = ifelse(
        fitted.pvalues < 0.001, "< .001", fitted.pvalues.round(dec)
    )
    table["  "] = sig_stars(fitted.pvalues)
    table["OR%"] = [f"{round(o, max(dec-2, 0))}%" for o in table["OR%"]]
    table = table.reset_index()

    if importance:
        # flag terms whose name contains "[T"
        table["dummy"] = table["index"].str.contains("[T", regex=False)
        # importance is the larger of OR and 1 / OR
        both_directions = pd.DataFrame().assign(OR=table["OR"], ORinv=1 / table["OR"])
        table["importance"] = both_directions.max(axis=1)

    if isinstance(data, pd.DataFrame):
        # rebuild the model matrix from `data` using the right-hand side of
        # the fitted formula and a fake response variable
        data = data.assign(__rvar__=1).copy()
        form = "__rvar__ ~ " + fitted.model.formula.split("~", 1)[1]
        exog = pd.DataFrame(smf.logit(formula=form, data=data).exog)
        weights = fitted._freq_weights
        if sum(weights) > len(weights):

            # the function names below become labels in the output
            def wmean(x):
                return weighted_mean(x, weights)

            def wstd(x):
                return weighted_sd(pd.DataFrame(x), weights)[0]

            stats = [wmean, wstd, "min", "max"]
        else:
            stats = ["mean", "std", "min", "max"]
        table = pd.concat([table, exog.apply(stats).T], axis=1)

    if intercept is False:
        table = table.loc[table["index"] != "Intercept"]

    return table.round(dec)