def ROME_plot(df, rvar, lev, pred, qnt=10, cost=1, margin=2):
    """
    Plot a ROME (Return on Marketing Expenditures) chart

    df: A pandas dataframe or a dictionary of dataframes with keys
    rvar, lev, pred, qnt, cost, margin: Passed on unchanged to `ROME` for
        every combination of dataframe and predictor. `pred` can be a
        single column name or a list of column names

    Returns the object produced by seaborn's lineplot

    Examples
    --------
    ROME_plot(df, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    ROME_plot(df, "buyer", "yes", ["pred_a", "pred_b"], cost=0.5, margin=6)
    dct = {"Training": df.query("training == 1"), "Test": df.query("training == 0")}
    ROME_plot(dct, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    """
    datasets = df if type(df) is dict else {"": df}
    predictors = pred if type(pred) is list else [pred]

    # only color by predictor when there is more than one curve to show
    several_curves = len(predictors) > 1 or len(datasets) > 1
    hue = "predictor" if several_curves else None

    tables = []
    for key in datasets.keys():
        # unnamed datasets get no suffix, named ones are shown as "pred (key)"
        suffix = key if key == "" else f" ({key})"
        for p in predictors:
            tab = ROME(datasets[key], rvar, lev, p, qnt=qnt, cost=cost, margin=margin)
            tables.append(tab.assign(predictor=p + suffix))
    plot_data = pd.concat(tables)

    fig = sns.lineplot(x="cum_prop", y="ROME", data=plot_data, hue=hue, marker="o")
    fig.set(
        ylabel="Return on Marketing Expenditures (ROME)",
        xlabel="Proportion of customers",
    )
    fig.axhline(0, linestyle="--", linewidth=1)
    return fig
def lift_plot(df, rvar, lev, pred, qnt=10):
    """
    Plot a cumulative lift chart

    df: A pandas dataframe or a dictionary of dataframes with keys
    rvar, lev, pred, qnt: Passed on unchanged to `lift` for every
        combination of dataframe and predictor. `pred` can be a single
        column name or a list of column names

    Returns the object produced by seaborn's lineplot

    Examples
    --------
    lift_plot(df, "buyer", "yes", "pred_a")
    lift_plot(df, "buyer", "yes", ["pred_a", "pred_b"], qnt=20)
    dct = {"Training": df.query("training == 1"), "Test": df.query("training == 0")}
    lift_plot(dct, "buyer", "yes", "pred_a")
    """
    datasets = df if type(df) is dict else {"": df}
    predictors = pred if type(pred) is list else [pred]

    # only color by predictor when there is more than one curve to show
    if len(predictors) > 1 or len(datasets) > 1:
        hue = "predictor"
    else:
        hue = None

    tables = []
    for key in datasets.keys():
        for p in predictors:
            # unnamed datasets get no suffix, named ones are shown as "pred (key)"
            label = p + (key if key == "" else f" ({key})")
            tab = lift(datasets[key], rvar, lev, p, qnt=qnt)
            tables.append(tab.assign(predictor=label))
    plot_data = pd.concat(tables)

    fig = sns.lineplot(x="cum_prop", y="cum_lift", data=plot_data, hue=hue, marker="o")
    # reference line at a cumulative lift of 1
    fig.axhline(1, linestyle="--", linewidth=1)
    fig.set(ylabel="Cumulative lift", xlabel="Proportion of customers")
    return fig
def profit_plot(df, rvar, lev, pred, qnt=10, cost=1, margin=2):
    """
    Plot a profit chart

    df: A pandas dataframe or a dictionary of dataframes with keys
    rvar, lev, pred, qnt, cost, margin: Passed on unchanged to `profit`
        (and, except for qnt, to `confusion`) for every combination of
        dataframe and predictor. `pred` can be a single column name or a
        list of column names

    Returns the object produced by seaborn's lineplot

    Examples
    --------
    profit_plot(df, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    profit_plot(df, "buyer", "yes", ["pred_a", "pred_b"], cost=0.5, margin=6)
    dct = {"Training": df.query("training == 1"), "Test": df.query("training == 0")}
    profit_plot(dct, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    """
    datasets = df if type(df) is dict else {"": df}
    predictors = pred if type(pred) is list else [pred]
    several_curves = len(predictors) > 1 or len(datasets) > 1
    hue = "predictor" if several_curves else None

    # the last element returned by `confusion` is used as the x-position of
    # a vertical marker line for each dataset/predictor combination
    contact_levels = []
    for key in datasets.keys():
        for p in predictors:
            outcome = confusion(datasets[key], rvar, lev, p, cost=cost, margin=margin)
            contact_levels.append(outcome[-1])

    tables = []
    for key in datasets.keys():
        # unnamed datasets get no suffix, named ones are shown as "pred (key)"
        suffix = key if key == "" else f" ({key})"
        for p in predictors:
            tab = profit(datasets[key], rvar, lev, p, qnt=qnt, cost=cost, margin=margin)
            tables.append(tab.assign(predictor=p + suffix))
    plot_data = pd.concat(tables)

    fig = sns.lineplot(x="cum_prop", y="cum_profit", data=plot_data, hue=hue, marker="o")
    fig.set(ylabel="Profit", xlabel="Proportion of customers")
    # NOTE(review): reference line is drawn at y=1; confirm this is intended
    # rather than 0 (break-even profit)
    fig.axhline(1, linestyle="--", linewidth=1)
    for level in contact_levels:
        fig.axvline(level, linestyle="--", linewidth=1)
    return fig
def ROME_plot(df, rvar, lev, pred, qnt=10, cost=1, margin=2, marker="o", **kwargs):
    """
    Plot a ROME curve

    Parameters
    ----------
    df : Pandas dataframe or a dictionary of dataframes with keys to show
        multiple curves for different models or data samples
    rvar : str
        Name of the response variable column in df
    lev : str
        Name of the 'success' level in rvar
    pred : str or list of str
        Name(s) of the column(s) in df with model predictions
    qnt : int
        Number of quantiles to create
    cost : int
        Cost of an action
    margin : int
        Benefit of an action if a successful outcome results from the action
    marker : str
        Marker to use for line plot
    **kwargs : Named arguments to be passed to the seaborn lineplot function

    Returns
    -------
    Seaborn object
        Plot of ROME per quantile

    Examples
    --------
    ROME_plot(df, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    ROME_plot(df, "buyer", "yes", ["pred_a", "pred_b"], cost=0.5, margin=6)
    dct = {"Training": df.query("training == 1"), "Test": df.query("training == 0")}
    ROME_plot(dct, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    """
    samples = df if type(df) is dict else {"": df}
    columns = pred if type(pred) is list else [pred]

    # a legend is only needed when more than one curve is drawn
    if len(columns) > 1 or len(samples) > 1:
        hue = "predictor"
    else:
        hue = None

    def labelled_table(key, p):
        # unnamed samples get no suffix, named ones are shown as "pred (key)"
        tab = ROME_tab(samples[key], rvar, lev, p, qnt=qnt, cost=cost, margin=margin)
        return tab.assign(predictor=p + (key if key == "" else f" ({key})"))

    curves = pd.concat([labelled_table(k, p) for k in samples.keys() for p in columns])

    fig = sns.lineplot(
        x="cum_prop", y="ROME", data=curves, hue=hue, marker=marker, **kwargs
    )
    fig.set(
        ylabel="Return on Marketing Expenditures (ROME)",
        xlabel="Proportion of customers",
    )
    fig.axhline(0, linestyle="--", linewidth=1)
    return fig
def coef_plot(fitted, alpha=0.05, intercept=False, incl=None, excl=None, figsize=None):
    """
    Coefficient plot

    Parameters
    ----------
    fitted : A fitted linear regression model
    alpha : float
        Significance level
    intercept : bool
        Include intercept in coefficient plot (True or False)
    incl : str or list of strings
        Variables to include in the coefficient plot. All will be included by default
    excl : str or list of strings
        Variables to exclude from the coefficient plot. None are excluded by default
    figsize : tuple
        Passed on to matplotlib's `plt.figure`

    Returns
    -------
    Matplotlib object
        Plot of coefficients with confidence intervals
    """

    def _term_pattern(variables):
        # Match a term that is exactly the variable name, or that starts with
        # the name followed by "[". The previous pattern used "\b" inside a
        # non-raw string, which is a backspace character, so plain variable
        # names could never match.
        variables = variables if isinstance(variables, list) else [variables]
        return "(" + "|".join([f"^{v}$|^{v}\\[" for v in variables]) + ")"

    # iloc to reverse order
    df = fitted.conf_int(alpha=alpha).reset_index().iloc[::-1]
    df["coefficient"] = fitted.params[df["index"]].dropna().values

    if not intercept:
        df = df.query('index != "Intercept"')

    if incl is not None:
        incl = df["index"].str.match(_term_pattern(incl))
        if intercept:
            # row label 0 is kept regardless of the include filter
            incl[0] = True
        df = df[incl]

    if excl is not None:
        excl = df["index"].str.match(_term_pattern(excl))
        if intercept:
            # row label 0 is never dropped by the exclude filter
            excl[0] = False
        df = df[~excl]

    low, high = [100 * alpha / 2, 100 * (1 - (alpha / 2))]
    df.columns = ["index", f"{low}%", f"{high}%", "coefficient"]
    # asymmetric error bars: distance from the estimate to each bound
    err = [df["coefficient"] - df[f"{low}%"], df[f"{high}%"] - df["coefficient"]]

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot()
    ax.axvline(0, ls="dashdot")
    ax.errorbar(x="coefficient", y="index", data=df, xerr=err, fmt="none")
    ax.scatter(x="coefficient", y="index", data=df)
    ax.set(xlabel="Coefficient")
    return ax
def auc(rvar, pred, lev=1):
    """
    Calculate area under the ROC curve (AUC)

    Calculation adapted from https://stackoverflow.com/a/50202118/1974918

    Parameters
    ----------
    rvar : Pandas series or numpy vector
        Vector with the response variable
    pred : Pandas series or numpy vector
        Vector with model predictions
    lev : str
        Name of the 'success' level in rvar. Use None if rvar is already
        a boolean vector

    Returns
    -------
    float :
        AUC metric. Values below 0.5 are reflected (1 - value), so the
        result is never smaller than 0.5

    Examples
    --------
    auc(dvd.buy, np.random.uniform(size=20000), "yes")
    auc(dvd.buy, rsm.ifelse(dvd.buy == "yes", 1, 0), "yes")
    """
    # Work on a plain array so that positional access does not depend on the
    # index labels of a pandas series (e.g., a filtered test sample)
    rvar = np.asarray(rvar)
    if rvar.dtype != bool or lev is not None:
        rvar = rvar == lev

    negatives = ~rvar
    n1 = np.sum(negatives)
    n2 = np.sum(rvar)
    # rank-sum (Mann-Whitney U) statistic for the non-success cases
    U = np.sum(rankdata(pred)[negatives]) - n1 * (n1 + 1) / 2
    wt = U / n1 / n2
    return 1 - wt if wt < 0.5 else wt
def or_ci(fitted, alpha=0.05, intercept=False, dec=3):
    """
    Confidence interval for Odds ratios

    Parameters
    ----------
    fitted : A fitted logistic regression model
    alpha : float
        Significance level
    intercept : bool
        Include intercept in the output (True or False)
    dec : int
        Number of decimal places. Use None to skip rounding

    Returns
    -------
    Pandas dataframe with Odds-ratios and confidence interval
    """
    odds = np.exp(fitted.params)
    df = pd.DataFrame(odds, columns=["OR"])

    # percentage change in the odds relative to an odds-ratio of 1
    below_one = df["OR"] < 1
    df["OR%"] = 100 * ifelse(below_one, -(1 - df["OR"]), df["OR"] - 1)

    # column labels for the interval bounds, e.g., "2.5%" and "97.5%"
    low = 100 * alpha / 2
    high = 100 * (1 - (alpha / 2))
    bounds = np.exp(fitted.conf_int(alpha=alpha))
    df[[f"{low}%", f"{high}%"]] = bounds

    if dec is not None:
        df = df.round(dec)

    df["OR%"] = [f"{pct}%" for pct in df["OR%"]]
    df = df.reset_index()

    if intercept is not False:
        return df
    return df.loc[df["index"] != "Intercept"]
def coef_ci(fitted, alpha=0.05, intercept=False, dec=3):
    """
    Confidence interval for coefficients from linear regression

    Parameters
    ----------
    fitted : A fitted linear regression model
    alpha : float
        Significance level
    intercept : bool
        Include intercept in the output (True or False)
    dec : int
        Number of decimal places to use in rounding. Use None to skip rounding

    Returns
    -------
    Pandas dataframe with regression coefficients and confidence intervals
    """
    df = pd.DataFrame({"coefficient": fitted.params})

    # column labels for the interval bounds, e.g., "2.5%" and "97.5%"
    low = 100 * alpha / 2
    high = 100 * (1 - (alpha / 2))
    df[[f"{low}%", f"{high}%"]] = fitted.conf_int(alpha=alpha)

    if dec is not None:
        df = df.round(dec)
        shown_pvalues = fitted.pvalues.round(dec)
    else:
        shown_pvalues = fitted.pvalues

    # very small p.values are shown as text rather than as a rounded number
    df["p.values"] = ifelse(fitted.pvalues < 0.001, "< .001", shown_pvalues)
    df[" "] = sig_stars(fitted.pvalues)
    df = df.reset_index()

    if intercept is not False:
        return df
    return df.loc[df["index"] != "Intercept"]
def evalreg(df, rvar, pred, dec=3):
    """
    Evaluate regression models. Calculates R-squared, MSE, and MAE

    Parameters
    ----------
    df : Pandas dataframe or a dictionary of dataframes with keys to show results for
        multiple model predictions and datasets (training and test)
    rvar : str
        Name of the response variable column in df
    pred : str
        Name of the column, or list of column names, in df with model predictions
    dec : int
        Number of decimal places to use in rounding

    Returns
    -------
    Pandas dataframe with one row per dataset/predictor combination and
    columns Type, predictor, n, r2, mse, and mae

    Examples
    --------
    evalreg(df, "price", "pred_a")
    evalreg({"Training": train, "Test": test}, "price", ["pred_a", "pred_b"])
    """
    dct = df if isinstance(df, dict) else {"All": df}
    pred = pred if isinstance(pred, list) else [pred]

    def calculate_metrics(key, dfm, pm):
        return pd.DataFrame(
            {
                "Type": [key],
                "predictor": [pm],
                "n": [dfm.shape[0]],
                "r2": [metrics.r2_score(dfm[rvar], dfm[pm])],
                "mse": [metrics.mean_squared_error(dfm[rvar], dfm[pm])],
                "mae": [metrics.mean_absolute_error(dfm[rvar], dfm[pm])],
            }
        )

    rows = [calculate_metrics(key, val, p) for key, val in dct.items() for p in pred]
    if not rows:
        # nothing to evaluate; pd.concat would raise on an empty list
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0, so rows are combined in a
    # single concat that also renumbers the index
    result = pd.concat(rows, ignore_index=True)
    return result.round(dec)
def distr_plot(df, nint=25, **kwargs):
    """
    Plot histograms for numeric variables and frequency plots for categorical
    variables. Columns of type integer with fewer than 25 unique values will be
    treated as categorical. To change this behavior, increase or decrease
    the value of the 'nint' argument

    Parameters
    ----------
    df : Pandas dataframe
    nint: int
        The number of unique values in a series of type integer below which the
        series will be treated as a categorical variable
    **kwargs : Named arguments to be passed to the pandas plotting methods
    """
    ncol = df.shape[1]
    # squeeze=False keeps `axes` two-dimensional even when there is only a
    # single row of plots (i.e., a dataframe with one or two columns)
    fig, axes = plt.subplots(
        math.ceil(ncol / 2), 2, figsize=(10, 1.5 * ncol), squeeze=False
    )
    plt.subplots_adjust(wspace=0.25, hspace=0.3)

    for i, c in enumerate(df.columns):
        s = df[c]
        # plots are laid out left-to-right, two per row
        ax = axes[i // 2, i % 2]
        is_few_ints = pd.api.types.is_integer_dtype(s.dtype) and s.nunique() < nint
        if is_few_ints or (
            not pd.api.types.is_numeric_dtype(s.dtype)
            and isinstance(s.dtype, pd.CategoricalDtype)
        ):
            s.value_counts(sort=False).plot.bar(
                ax=ax, title=c, rot=0, color="slateblue", **kwargs
            )
        elif pd.api.types.is_numeric_dtype(s.dtype):
            s.plot.hist(ax=ax, title=c, rot=0, color="slateblue", **kwargs)
        else:
            print(f"No plot for {c} (type {s.dtype})")
def sim_prediction(df, vary=[], nnv=5):
    """
    Simulate data for prediction

    Parameters
    ----------
    df : Pandas dataframe
    vary : List of column names or Dictionary with keys and values to use
    nnv : int
        Number of values to use to simulate the effect of a numeric variable

    Returns
    -------
    Pandas dataframe with values to use for estimation
    """

    def fix_value(s):
        # numeric columns are fixed at their mean, all others at the most
        # frequent value
        if not pd.api.types.is_numeric_dtype(s.dtype):
            return s.value_counts().idxmax()
        return s.mean()

    def auto_values(col):
        # derive the values to simulate over from the data itself
        if not pd.api.types.is_numeric_dtype(col.dtype):
            return col.unique()
        n_unique = col.nunique()
        if n_unique <= 2:
            return [col.min(), col.max()]
        return np.linspace(col.min(), col.max(), min(n_unique, nnv))

    dtypes = df.dtypes
    grid = {c: [fix_value(df[c])] for c in df.columns}

    if type(vary) is dict:
        # user provided values and ranges
        grid.update(vary)
    else:
        # auto derived values and ranges
        names = vary if type(vary) is list else [vary]
        for v in names:
            grid[v] = auto_values(df[v])

    return expand_grid(grid, dtypes)
def varprop(x, na=True):
    """
    Calculate the variance for a proportion

    Parameters
    ----------
    x : List, numpy array, or pandas series
        Numeric variable with only values 0 and 1
    na : bool
        Drop missing values before calculating (True or False)

    Returns
    -------
    float
        Calculated variance for a proportion based on a vector of 0 and 1 values

    Examples
    --------
    varprop([0, 1, 1, 1, 0, 0, 0])
    """
    # only compute the mean that was asked for; evaluating both eagerly does
    # redundant work and can emit warnings for a branch that is never used
    p = np.nanmean(x) if na else np.mean(x)
    return p * (1 - p)
def test_ifelse_true():
    """ifelse returns the second argument when a scalar condition is true"""
    outcome = ifelse(3 > 2, "greater", "smaller")
    assert outcome == "greater", "Logical comparison in ifelse incorrect"
def evalbin(df, rvar, lev, pred, cost=1, margin=2, dec=3):
    """
    Evaluate binary classification models. Calculates TP, FP, TN, FN, contact, total,
    TPR, TNR, precision, Fscore, accuracy, profit, ROME, AUC, kappa, and profit index

    Parameters
    ----------
    df : Pandas dataframe or a dictionary of dataframes with keys to show results for
        multiple model predictions and datasets (training and test)
    rvar : str
        Name of the response variable column in df
    lev : str
        Name of the 'success' level in rvar
    pred : str
        Name of the column, or list of column names, in df with model predictions
    cost : int
        Cost of an action
    margin : int
        Benefit of an action if a successful outcome results from the action
    dec : int
        Number of decimal places to use in rounding

    Returns
    -------
    Pandas dataframe with one row per dataset/predictor combination. The
    "index" column is profit divided by the highest profit within the same
    dataset ("Type")

    Examples
    --------
    evalbin(df, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    evalbin({"Training": train, "Test": test}, "buyer", "yes", ["pred_a", "pred_b"])
    """
    dct = df if isinstance(df, dict) else {"All": df}
    pred = pred if isinstance(pred, list) else [pred]

    def calculate_metrics(key, dfm, pm):
        TP, FP, TN, FN, contact = confusion(dfm, rvar, lev, pm, cost, margin)
        total = TN + FN + FP + TP
        TPR = TP / (TP + FN)
        TNR = TN / (TN + FP)
        precision = TP / (TP + FP)
        profit = margin * TP - cost * (TP + FP)

        fpr, tpr, _ = metrics.roc_curve(dfm[rvar], dfm[pm], pos_label=lev)
        # kappa compares actual successes with predictions above break-even
        break_even = cost / margin
        gtbe = dfm[pm] > break_even
        pos = dfm[rvar] == lev

        return pd.DataFrame(
            {
                "Type": [key],
                "predictor": [pm],
                "TP": [TP],
                "FP": [FP],
                "TN": [TN],
                "FN": [FN],
                "total": [total],
                "TPR": [TPR],
                "TNR": [TNR],
                "precision": [precision],
                "Fscore": [2 * (precision * TPR) / (precision + TPR)],
                "accuracy": [(TP + TN) / total],
                "kappa": [metrics.cohen_kappa_score(pos, gtbe)],
                "profit": [profit],
                # placeholder, filled in below once all rows are available
                "index": [0],
                "ROME": [profit / (cost * (TP + FP))],
                "contact": [contact],
                "AUC": [metrics.auc(fpr, tpr)],
            }
        )

    # DataFrame.append was removed in pandas 2.0, so rows are combined in a
    # single concat that also renumbers the index
    rows = [calculate_metrics(key, val, p) for key, val in dct.items() for p in pred]
    result = pd.concat(rows, ignore_index=True)
    result["index"] = result.groupby("Type")["profit"].transform(lambda x: x / x.max())
    return result.round(dec)
def profit_plot(
    df, rvar, lev, pred, qnt=10, cost=1, margin=2, contact=False, marker="o", **kwargs
):
    """
    Plot a profit curve

    Parameters
    ----------
    df : Pandas dataframe or a dictionary of dataframes with keys to show
        multiple curves for different models or data samples
    rvar : str
        Name of the response variable column in df
    lev : str
        Name of the 'success' level in rvar
    pred : str or list of str
        Name(s) of the column(s) in df with model predictions
    qnt : int
        Number of quantiles to create
    cost : int
        Cost of an action
    margin : int
        Benefit of an action if a successful outcome results from the action
    contact : bool
        Plot a vertical line that shows the optimal contact level. Requires
        that `pred` is a series of probabilities. Values equal to 1 (100% contact)
        will not be plotted
    marker : str
        Marker to use for line plot
    **kwargs : Named arguments to be passed to the seaborn lineplot function

    Returns
    -------
    Seaborn object
        Plot of profits per quantile

    Examples
    --------
    profit_plot(df, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    profit_plot(df, "buyer", "yes", ["pred_a", "pred_b"], cost=0.5, margin=6)
    dct = {"Training": df.query("training == 1"), "Test": df.query("training == 0")}
    profit_plot(dct, "buyer", "yes", "pred_a", cost=0.5, margin=6)
    """
    samples = df if type(df) is dict else {"": df}
    columns = pred if type(pred) is list else [pred]

    # a legend is only needed when more than one curve is drawn
    if len(columns) > 1 or len(samples) > 1:
        hue = "predictor"
    else:
        hue = None

    tables = []
    for key in samples.keys():
        # unnamed samples get no suffix, named ones are shown as "pred (key)"
        suffix = key if key == "" else f" ({key})"
        for p in columns:
            tab = profit_tab(samples[key], rvar, lev, p, qnt=qnt, cost=cost, margin=margin)
            tables.append(tab.assign(predictor=p + suffix))
    curves = pd.concat(tables)

    fig = sns.lineplot(
        x="cum_prop", y="cum_profit", data=curves, hue=hue, marker=marker, **kwargs
    )
    fig.set(ylabel="Profit", xlabel="Proportion of customers")
    # NOTE(review): reference line is drawn at y=1; confirm this is intended
    # rather than 0 (break-even profit)
    fig.axhline(1, linestyle="--", linewidth=1)

    if not contact:
        return fig

    # the last element returned by `confusion` is the contact level
    levels = []
    for key in samples.keys():
        for p in columns:
            outcome = confusion(samples[key], rvar, lev, p, cost=cost, margin=margin)
            levels.append(outcome[-1])

    for level in levels:
        if level < 1:
            fig.axvline(level, linestyle="--", linewidth=1)
    return fig
def test_ifelse_false():
    """ifelse returns the third argument when a scalar condition is false"""
    outcome = ifelse(2 > 3, "greater", "smaller")
    assert outcome == "smaller", "Logical comparison in ifelse incorrect"
def test_ifelse_array():
    """ifelse is applied element by element when the condition is a numpy array"""
    values = np.array([2, 3, 4])
    expected = np.array([0, 1, 1])
    outcome = ifelse(values > 2, 1, 0)
    assert all(
        outcome == expected
    ), "Logical comparison of np.array in ifelse incorrect"
def or_plot(fitted, alpha=0.05, intercept=False, incl=None, excl=None, figsize=None):
    """
    Odds ratio plot

    Parameters
    ----------
    fitted : A fitted logistic regression model
    alpha : float
        Significance level
    intercept : bool
        Include intercept in odds-ratio plot (True or False)
    incl : str or list of strings
        Variables to include in the odds-ratio plot. All will be included by default
    excl : str or list of strings
        Variables to exclude from the odds-ratio plot. None are excluded by default
    figsize : tuple
        Passed on to matplotlib's `plt.figure`

    Returns
    -------
    Matplotlib object
        Plot of Odds ratios
    """

    def matching_terms(frame, variables):
        # a term matches when it equals a variable name or starts with the
        # name followed by "["
        names = ifelse(isinstance(variables, list), variables, [variables])
        alternatives = [f"^{v}$|^{v}\\[" for v in names]
        pattern = "(" + "|".join(alternatives) + ")"
        return frame["index"].str.match(pattern)

    # rows are reversed so the first term ends up at the top of the plot
    table = or_ci(fitted, alpha=alpha, intercept=intercept, dec=100)
    df = table.dropna().iloc[::-1]

    if incl is not None:
        incl = matching_terms(df, incl)
        if intercept:
            # row label 0 is kept regardless of the include filter
            incl[0] = True
        df = df[incl]

    if excl is not None:
        excl = matching_terms(df, excl)
        if intercept:
            # row label 0 is never dropped by the exclude filter
            excl[0] = False
        df = df[~excl]

    low = 100 * alpha / 2
    high = 100 * (1 - (alpha / 2))
    # asymmetric error bars: distance from the estimate to each bound
    lower_gap = df["OR"] - df[f"{low}%"]
    upper_gap = df[f"{high}%"] - df["OR"]
    err = [lower_gap, upper_gap]

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot()
    ax.axvline(1, ls="dashdot")
    ax.errorbar(x="OR", y="index", data=df, xerr=err, fmt="none")
    ax.scatter(x="OR", y="index", data=df)

    # log scale with plain decimal tick labels
    ax.set_xscale("log")
    ax.xaxis.set_minor_formatter(ticker.NullFormatter())
    ax.xaxis.set_major_locator(ticker.LogLocator(subs=[0.1, 0.2, 0.5, 1, 2, 5, 10]))
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter("{x:.1f}"))
    ax.set(xlabel="Odds-ratio")
    return ax
def summary(self, output=["observed", "expected"], dec=2):
    """
    Print different output tables for a cross_tabs object

    Parameters
    ----------
    output : list of tables to show
        Options include "observed" (observed frequencies),
        "expected" (expected frequencies), "chisq" (chi-square values)
        for each cell, "dev_std" (standardized deviations from expected)
        "perc_row" (percentages conditioned by row), "perc_col"
        (percentages conditioned by column), "perc" (percentages by the
        total number of observations; "perc_all" is accepted as an alias).
        The default value is ["observed", "expected"]
    dec : int
        Number of decimal places to use in rounding

    Examples
    --------
    import pyrsm as rsm
    rsm.load_data(pkg="basics", name="newspaper", dct=globals())
    ct = rsm.cross_tabs(newspaper)
    ct.summary()
    """
    # the default list is never mutated; a new list is built when needed
    output = list(output) if isinstance(output, (list, tuple)) else [output]

    def _thousands(tab):
        # column-wise Series.map instead of the deprecated DataFrame.applymap
        return tab.apply(lambda col: col.map("{:,}".format))

    def _percent(tab):
        return tab.transform(lambda x: (100 * x).round(dec).astype(str) + "%")

    prn = f"""
Cross-tabs
Variables: {self.var1}, {self.var2}
Null hyp: there is no association between {self.var1} and {self.var2}
Alt. hyp: there is an association between {self.var1} and {self.var2}
"""
    if "observed" in output:
        prn += f"""
Observed:

{_thousands(self.observed)}
"""
    if "expected" in output:
        prn += f"""
Expected: (row total x column total) / total

{_thousands(self.expected.round(dec))}
"""
    if "chisq" in output:
        prn += f"""
Contribution to chi-squared: (o - e)^2 / e

{_thousands(self.chisq.round(dec))}
"""
    if "dev_std" in output:
        prn += f"""
Deviation standardized: (o - e) / sqrt(e)

{_thousands(self.dev_std.round(dec))}
"""
    if "perc_row" in output:
        prn += f"""
Row percentages:

{_percent(self.perc_row)}
"""
    if "perc_col" in output:
        prn += f"""
Column percentages:

{_percent(self.perc_col)}
"""
    # "perc" is the documented name; "perc_all" was the only name that the
    # code used to recognize, so both are accepted
    if "perc" in output or "perc_all" in output:
        prn += f"""
Percentages:

{_percent(self.perc)}
"""

    pval = self.chisq_test[1]
    pval_txt = "< .001" if pval < 0.001 else round(pval, dec)
    # second round removes floating point noise introduced by multiplying the
    # rounded share by 100 (e.g., 28.999999999999996 instead of 29.0)
    share_low = round(
        100 * round(self.expected_low[0] / self.expected_low[1], dec), dec
    )
    prn += f"""
Chi-squared: {round(self.chisq_test[0], dec)} df({int(self.chisq_test[2])}), p.value {pval_txt}
{share_low}% of cells have expected values below 5
"""
    print(prn)
def plot(self, output="perc_col", **kwargs):
    """
    Bar plots of the tables stored in a cross_tabs object

    Parameters
    ----------
    output : str or list of tables to plot
        Options include "observed" (observed frequencies),
        "expected" (expected frequencies), "chisq" (chi-square values)
        for each cell, "dev_std" (standardized deviations from expected)
        "perc_row" (percentages conditioned by row), "perc_col"
        (percentages conditioned by column), "perc" (percentages by the
        total number of observations).
        The default value is "perc_col"
    **kwargs : Named arguments to be passed to pandas plotting functions

    Examples
    --------
    import pyrsm as rsm
    rsm.load_data(pkg="basics", name="newspaper", dct=globals())
    ct = rsm.cross_tabs(newspaper, "Income", "Newspaper")
    ct.plot()
    """
    output = ifelse(type(output) is list, output, [output])
    args = {"rot": False}

    def without_totals(tab):
        # transpose, then remove the "Total" column and the "Total" row
        return tab.transpose().drop(columns="Total").drop("Total", axis=0)

    def as_row_percentages(tab):
        # rescale every row so that it sums to 100
        return tab.apply(lambda x: x * 100 / sum(x), axis=1)

    def bar_args(title):
        # the shared settings are updated in place; user supplied arguments
        # take precedence over the defaults, including the title
        args["title"] = title
        args.update(**kwargs)
        return args

    if "observed" in output:
        tab = as_row_percentages(without_totals(self.observed))
        tab.plot.bar(stacked=True, **bar_args("Observed frequencies"))

    if "expected" in output:
        tab = as_row_percentages(without_totals(self.expected))
        tab.plot.bar(stacked=True, **bar_args("Expected frequencies"))

    if "chisq" in output:
        tab = without_totals(self.chisq)
        tab.plot.bar(**bar_args("Contribution to chi-squared statistic"))

    if "dev_std" in output:
        tab = self.dev_std.transpose()
        settings = bar_args("Deviation standardized")
        _, ax = plt.subplots()
        tab.plot.bar(**settings, ax=ax)
        # reference lines at +/- 1.96 and +/- 1.64, labelled 95% and 90%
        for level in (1.96, 1.64, -1.96, -1.64):
            ax.axhline(y=level, color="black", linestyle="--")
        ax.annotate("95%", xy=(0, 2.1), va="bottom", ha="center")
        ax.annotate("90%", xy=(0, 1.4), va="top", ha="center")

    if "perc_col" in output:
        tab = without_totals(self.perc_col)
        tab.plot.bar(**bar_args("Column percentages"))

    if "perc_row" in output:
        tab = without_totals(self.perc_row)
        tab.plot.bar(**bar_args("Row percentages"))

    if "perc" in output:
        tab = without_totals(self.perc)
        tab.plot.bar(**bar_args("Table percentages"))
def or_ci(fitted, alpha=0.05, intercept=False, importance=False, data=None, dec=3):
    """
    Confidence interval for Odds ratios

    Parameters
    ----------
    fitted : A fitted logistic regression model
    alpha : float
        Significance level
    intercept : bool
        Include intercept in output (True or False)
    importance : bool
        Calculate variable importance. Only meaningful if data
        used in estimation was standardized prior to model
        estimation
    data : Pandas dataframe
        Unstandardized data used to calculate descriptive
        statistics
    dec : int
        Number of decimal places to use in rounding

    Returns
    -------
    Pandas dataframe with Odds-ratios and confidence intervals. Columns
    "dummy" and "importance" are added when `importance` is True, and
    descriptive statistics (mean, std, min, max) are added when `data`
    is a Pandas dataframe
    """
    df = pd.DataFrame(np.exp(fitted.params), columns=["OR"]).dropna()
    # percentage change in the odds relative to an odds-ratio of 1
    df["OR%"] = 100 * ifelse(df["OR"] < 1, -(1 - df["OR"]), df["OR"] - 1)

    # column labels for the interval bounds, e.g., "2.5%" and "97.5%"
    low, high = [100 * alpha / 2, 100 * (1 - (alpha / 2))]
    df[[f"{low}%", f"{high}%"]] = np.exp(fitted.conf_int(alpha=alpha))
    # very small p.values are shown as text rather than as a rounded number
    df["p.values"] = ifelse(fitted.pvalues < 0.001, "< .001", fitted.pvalues.round(dec))
    df[" "] = sig_stars(fitted.pvalues)
    # OR% is on a percentage scale so it is shown with two fewer decimals
    df["OR%"] = [f"{round(o, max(dec-2, 0))}%" for o in df["OR%"]]
    df = df.reset_index()

    if importance:
        # terms whose name contains "[T" are flagged as dummy variables
        df["dummy"] = df["index"].str.contains("[T", regex=False)
        # importance is the larger of OR and 1 / OR
        df["importance"] = (pd.DataFrame().assign(OR=df["OR"], ORinv=1 / df["OR"]).max(axis=1))

    # NOTE(review): the nesting of this block relative to `if importance:`
    # could not be determined from the collapsed source -- confirm
    if isinstance(data, pd.DataFrame):
        # A constant placeholder response is added so the right-hand side of
        # the fitted model's formula can be re-evaluated on `data`; only the
        # resulting exog matrix is used
        data = data.assign(__rvar__=1).copy()
        form = "__rvar__ ~ " + fitted.model.formula.split("~", 1)[1]
        exog = pd.DataFrame(smf.logit(formula=form, data=data).exog)
        weights = fitted._freq_weights
        # a weight total above the number of rows is taken to mean that
        # frequency weights were used, so weighted statistics are calculated
        if sum(weights) > len(weights):

            # NOTE(review): presumably the names of these helpers become the
            # labels of the added columns -- do not rename without checking
            def wmean(x):
                return weighted_mean(x, weights)

            def wstd(x):
                return weighted_sd(pd.DataFrame(x), weights)[0]

            df = pd.concat(
                [df, exog.apply([wmean, wstd, "min", "max"]).T],
                axis=1,
            )
        else:
            df = pd.concat(
                [df, exog.apply(["mean", "std", "min", "max"]).T], axis=1)

    if intercept is False:
        df = df.loc[df["index"] != "Intercept"]

    return df.round(dec)