def _profile_table(table, bins=10, check_correlation=False, correlation_threshold=0.9, correlation_overrides=None):
    """Build a profiling report for ``table`` and wrap it in a report dict.

    All keyword options are forwarded unchanged to
    ``pd_profiling.ProfileReport``; the HTML it exposes through ``.html`` is
    embedded into a ``ReportBuilder`` document.

    Returns:
        ``{'result': {'report': <ReportBuilder output>}}``.
    """
    profile_options = {
        'bins': bins,
        'check_correlation': check_correlation,
        'correlation_threshold': correlation_threshold,
        'correlation_overrides': correlation_overrides,
    }
    builder = ReportBuilder()
    profiling_report = pd_profiling.ProfileReport(table, **profile_options)
    builder.addHTML(profiling_report.html)

    return {'result': {'report': builder.get()}}
def tukeys_range_test(table, response_cols, factor_col, alpha=0.05):
    """Run ``pairwise_tukeyhsd`` for each response column and report the results.

    For every column in ``response_cols`` the values are compared across the
    groups given by ``table[factor_col]`` at significance level ``alpha``.
    Each column contributes a heading, the result table as HTML and the
    simultaneous-comparison plot to the report.

    Returns:
        ``{'result': {'report': <ReportBuilder output>}}``.
    """
    builder = ReportBuilder()
    builder.addMD("""## Tukey's range test Result""")

    groups = table[factor_col]
    for response_col in response_cols:
        tukey_result = pairwise_tukeyhsd(table[response_col], groups, alpha=alpha)
        result_table_html = tukey_result._results_table.as_html()
        tukey_result.plot_simultaneous()

        heading = """### {response_col}""".format(response_col=response_col)
        builder.addMD(heading)
        builder.addHTML(result_table_html)
        builder.addPlt(plt)
        # Clear the current figure before the next column is processed.
        plt.clf()

    report = builder.get()
    return {'result': {'report': report}}
def default(self, o):
    """Return a JSON-serialisable stand-in for an object the encoder cannot handle.

    * ``set``            -> ``{'__set__': [...]}``
    * ``numpy.ndarray``  -> ``{'__numpy__': _to_default_list(o)}``
    * anything else      -> ``{'report': ..., '__pickled__': [...]}`` where the
      report is built from ``_repr_html_()`` when the object has one, from the
      object itself via ``addPlt`` when it has ``savefig``, and from
      ``str(o)`` otherwise. ``'__pickled__'`` is ``list(pickle.dumps(o))``.
    """
    # TODO add more support types
    if isinstance(o, set):
        return {'__set__': list(o)}
    if isinstance(o, numpy.ndarray):
        return {'__numpy__': _to_default_list(o)}

    # Every remaining object gets a rendered report plus its pickled form;
    # only the way the report is rendered differs.
    if hasattr(o, '_repr_html_'):
        builder = ReportBuilder()
        builder.addHTML(o._repr_html_())
    elif hasattr(o, 'savefig'):
        builder = ReportBuilder()
        builder.addPlt(o)
    else:
        builder = ReportBuilder()
        builder.addRawTextMD(str(o))

    return {'report': builder.get(), '__pickled__': list(pickle.dumps(o))}
def _linear_regression_train(table, feature_cols, label_col, fit_intercept=True):
    """Fit a linear regression on ``table`` and build a diagnostic report.

    Two fits are performed on the same data: a scikit-learn
    ``LinearRegression`` (stored in the model for later prediction) and a
    statsmodels ``OLS`` fit (source of the summary tables and statistics).

    Args:
        table: Indexable by ``feature_cols`` (list of columns) and
            ``label_col`` (single column) -- presumably a pandas DataFrame.
        feature_cols: Column names used as regressors.
        label_col: Column name of the response.
        fit_intercept: When truthy, both fits include an intercept (a constant
            column is added for the OLS fit).

    Returns:
        ``{'model': model}`` where ``model`` holds the feature/label names,
        OLS coefficients and statistics (``r2``, ``adjusted_r2``, ``aic``,
        ``bic``, ``f_static``, ``tvalues``, ``pvalues``), the fitted
        scikit-learn estimator (``lr_model``), the rendered ``report`` and the
        three OLS summary tables (``summary0``..``summary2``).
    """
    features = table[feature_cols]
    label = table[label_col]

    # Keyword form: recent scikit-learn releases declare the constructor
    # parameters keyword-only, so a positional flag raises TypeError.
    lr_model = LinearRegression(fit_intercept=fit_intercept)
    lr_model.fit(features, label)
    predict = lr_model.predict(features)
    residual = label - predict

    if fit_intercept:
        lr_model_fit = sm.OLS(label, sm.add_constant(features)).fit()
    else:
        lr_model_fit = sm.OLS(label, features).fit()

    summary = lr_model_fit.summary()
    summary_tables = simple_tables2df_list(summary.tables)
    summary0 = summary_tables[0]
    summary1 = summary_tables[1]
    summary2 = summary_tables[2]
    html_result = summary.as_html()

    # --- Predicted vs actual scatter with a least-squares trend line ---
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)

    x = predict
    y = np.array(label)
    # Normal equations for y = intercept + slope * x. The cross term must
    # cover every observation (the previous loop stopped one element short).
    n_obs = x.size
    sum_x = np.sum(x)
    sum_xx = np.sum(x * x)
    sum_y = np.sum(y)
    sum_xy = np.sum(x * y)
    det = n_obs * sum_xx - sum_x * sum_x
    if det != 0:
        intercept = (sum_xx * sum_y - sum_x * sum_xy) / det
        slope = (n_obs * sum_xy - sum_x * sum_y) / det
        p1x = np.min(x)
        p2x = np.max(x)
        plt.plot([p1x, p2x],
                 [intercept + slope * p1x, intercept + slope * p2x],
                 'r--')
    # det == 0 means all predictions coincide; no line can be drawn.
    fig_actual_predict = plt2MD(plt)

    # --- Residual diagnostics ---
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)

    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)

    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)

    rb = ReportBuilder()
    rb.addMD(strip_margin("""
    | ## Linear Regression Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3
               )))

    model = _model_dict('linear_regression_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['coefficients'] = lr_model_fit.params
    model['r2'] = lr_model_fit.rsquared
    model['adjusted_r2'] = lr_model_fit.rsquared_adj
    model['aic'] = lr_model_fit.aic
    model['bic'] = lr_model_fit.bic
    model['f_static'] = lr_model_fit.fvalue
    model['tvalues'] = lr_model_fit.tvalues
    model['pvalues'] = lr_model_fit.pvalues
    model['lr_model'] = lr_model
    model['report'] = rb.get()
    model['summary0'] = summary0
    model['summary1'] = summary1
    model['summary2'] = summary2

    return {'model': model}
def _glm_train(table, feature_cols, label_col, family="Gaussian", link="ident", fit_intercept=True): features = table[feature_cols] label = table[label_col] if label_col in feature_cols: raise Exception("%s is duplicated." % label_col) if family == "Gaussian": sm_family = sm.families.Gaussian() elif family == "inv_Gaussian": sm_family = sm.families.InverseGaussian() elif family == "binomial": sm_family = sm.families.Binomial() elif family == "Poisson": sm_family = sm.families.Poisson() elif family == "neg_binomial": sm_family = sm.families.NegativeBinomial() elif family == "gamma": sm_family = sm.families.Gamma() elif family == "Tweedie": sm_family = sm.families.Tweedie() if link == "ident": sm_link = sm.families.links.identity elif link == "log": sm_link = sm.families.links.log elif link == "logit": sm_link = sm.families.links.logit elif link == "probit": sm_link = sm.families.links.probit elif link == "cloglog": sm_link = sm.families.links.cLogLog elif link == "pow": sm_link = sm.families.links.Power elif link == "nbinom": sm_link = sm.families.links.binom if fit_intercept == True: glm_model = sm.GLM(label, sm.add_constant(features), family=sm_family, link=sm_link).fit() else: glm_model = sm.GLM(label, features, family=sm_family, link=sm_link).fit() summary = glm_model.summary().as_html() rb = ReportBuilder() rb.addMD(strip_margin(""" | ## GLM Result | ### Summary | """)) rb.addHTML(summary) model = _model_dict('glm_model') model['features'] = feature_cols model['label'] = label_col model['family'] = family model['link'] = link model['coefficients'] = glm_model.params model['aic'] = glm_model.aic model['bic'] = glm_model.bic model['tvalues'] = glm_model.tvalues model['pvalues'] = glm_model.pvalues model['fit_intercept'] = fit_intercept model['glm_model'] = glm_model model['report'] = rb.get() return {'model' : model}
return new_dict else: return item return super(DefaultEncoder, self).encode(hint_tuples(obj)) def default(self, o): <<<<<<< HEAD # TODO add more support types if isinstance(o, set): return {'__set__': list(o)} elif isinstance(o, numpy.ndarray): return {'__numpy__': _to_default_list(o)} elif hasattr(o, '_repr_html_'): rb = ReportBuilder() rb.addHTML(o._repr_html_()) return {'report': rb.get(), '__pickled__': list(pickle.dumps(o))} elif hasattr(o, 'savefig'): rb = ReportBuilder() rb.addPlt(o) return {'report': rb.get(), '__pickled__': list(pickle.dumps(o))} else: rb = ReportBuilder() rb.addRawTextMD(str(o)) return {'report': rb.get(), '__pickled__': list(pickle.dumps(o))} ======= if isinstance(o, set): return {'__set__': list(o)} elif isinstance(o, numpy.ndarray): return {'__numpy__': o.tolist()} # TODO add more support types