def run(self, dfx, ref_num=0):
    """Run a one-sample t-test on each numeric column of ``dfx``.

    Args:
        dfx: Input table. ``ParseDFtypes`` splits its columns into numeric
            and non-numeric ones; only the numeric ones are tested.
        ref_num: Reference value handed to ``ttest_1samp`` as its second
            argument.

    Returns:
        dict: ``'result'`` is a DataFrame with one row per numeric column
        and the columns ``'t-值'`` and ``'p-值'`` (the p-value rendered as a
        five-decimal string), or an empty DataFrame when no numeric column
        exists. ``'msg'`` carries optional ``'error'`` / ``'warning'`` texts.
    """
    msg = {}
    x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

    # Guard clause: without numeric columns there is nothing to test.
    if x_numer_cols == []:
        logging.error(
            'All input dfx are no numeric columns, Please check your input dfx data!'
        )
        msg['error'] = '输入的dfx所有的列都不是数值型数据,请检查输入数据'
        return {'result': pd.DataFrame(), 'msg': msg}

    if x_cate_cols != []:
        logging.warning(
            'input dfx has non-numeric columns: %s, will ignore these columns!'
            % x_cate_cols)
        msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略!' % x_cate_cols

    labels = ['t-值', 'p-值']
    rows = []
    for col in x_numer_cols:
        statistic, p_value = ttest_1samp(dfx[col], ref_num)
        # One single-row frame per tested column, indexed by the column name.
        rows.append(
            pd.DataFrame([statistic, p_value], index=labels, columns=[col]).T)

    table = pd.concat(rows)
    table['p-值'] = table['p-值'].apply(lambda value: '{:.5f}'.format(value))
    return {'result': table.round(5), 'msg': msg}
def run(self, df):
    """Build a descriptive-statistics table for the numeric columns of ``df``.

    Args:
        df: Input table. ``ParseDFtypes`` splits its columns into numeric and
            non-numeric ones; only the numeric ones are described.

    Returns:
        dict: ``'result'`` is a DataFrame with one row per numeric column
        holding the ``describe()`` statistics plus skewness, kurtosis,
        variance and mean absolute deviation (columns relabelled in
        Chinese), or an empty DataFrame when no numeric column exists.
        ``'msg'`` carries optional ``'error'`` / ``'warning'`` texts.
    """
    # `not cols` assumes ParseDFtypes returns plain lists, as the original
    # `== []` comparisons did.
    numer_cols, cate_cols = ParseDFtypes(df)
    msg = {}

    if not numer_cols:
        logging.error(
            'All input DataFrame are non-numeric columns, Please check your input data!'
        )
        msg['error'] = '输入的所有的列都不是数值型数据,请检查输入数据df!'
        # Return early so the 12-label relabelling below is never applied to
        # an empty frame.
        return {'result': pd.DataFrame(), 'msg': msg}

    if cate_cols:
        logging.warning(
            'Input DataFrame has non-numeric columns, such as: %s will be ignored!'
            % cate_cols)
        msg['warning'] = '输入的数据包含非数值型数据, 比如列: %s 将会被忽略!' % cate_cols

    dfn = df[numer_cols]
    desb = dfn.describe().T
    desb['skew'] = dfn.skew()
    desb['kurt'] = dfn.kurt()
    desb['var'] = dfn.var()
    # DataFrame.mad() was removed in pandas 2.0; this is the same mean
    # absolute deviation around the column mean (NaNs skipped).
    desb['mad'] = (dfn - dfn.mean()).abs().mean()

    # NOTE(review): the second label carries a stray leading '、'. It is kept
    # unchanged because callers may already look the column up by that name.
    desb.columns = [
        '样本量', '、平均值', '标准差', '最小值', '1/4分位数', '中位数',
        '3/4分位数', '最大值', '偏度', '峰度', '方差', '平均绝对误差'
    ]
    return {'result': desb, 'msg': msg}
def run(self, df, by, method='count'):
    """Group ``df`` by ``by`` and aggregate it with ``method``.

    Args:
        df: Input table.
        by: Grouping key(s) passed straight to ``DataFrame.groupby``; either
            a single column name or a collection of column names.
        method: ``'count'`` returns the group sizes; any other value is
            handed to ``DataFrameGroupBy.agg``.

    Returns:
        dict: ``'result'`` is the aggregated DataFrame (rounded to five
        decimals for non-count methods), or an empty DataFrame on error.
        ``'msg'`` carries an optional ``'error'`` text.
    """
    # `not cols` assumes ParseDFtypes returns plain lists, as the original
    # `== []` comparison did.
    numer_cols, cate_cols = ParseDFtypes(df)
    msg = {}

    if not numer_cols:
        logging.error(
            'All input DataFrame are no numeric columns, Please check your input data!'
        )
        msg['error'] = '输入的所有的列都不是数值型数据,请检查输入数据df!'
        return {'result': pd.DataFrame(), 'msg': msg}

    if method == 'count':
        result = df.groupby(by).size().to_frame(name='样本量')
        return {'result': result, 'msg': msg}

    # A bare column name is a valid groupby key, but set('abc') would split
    # it into single characters and make the subset check meaningless.
    by_cols = {by} if isinstance(by, str) else set(by)
    if set(numer_cols).issubset(by_cols):
        logging.error(
            'by columns contains all numeric columns, no numeric data to calculate!'
        )
        msg['error'] = '参数by占用了所有数值型的列, 所以没有额外的数值型列来计算 %s 的结果!' % method
        return {'result': pd.DataFrame(), 'msg': msg}

    result = df.groupby(by).agg(method).round(5)
    return {'result': result, 'msg': msg}
def run(self, dfx):
    """Apply the Kolmogorov-Smirnov and Shapiro-Wilk tests to each numeric column.

    Args:
        dfx: Input table. ``ParseDFtypes`` splits its columns into numeric
            and non-numeric ones; only the numeric ones are tested, after
            dropping missing values.

    Returns:
        dict: ``'result'`` is a DataFrame with one row per numeric column
        holding the sample size, both test statistics and both p-values
        (p-values rendered as five-decimal strings), or an empty DataFrame
        when no numeric column exists. ``'msg'`` carries optional
        ``'error'`` / ``'warning'`` texts.
    """
    msg = {}
    x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

    if x_numer_cols == []:
        logging.error(
            'All input dfx are no numeric columns, Please check your input dfx data!'
        )
        msg['error'] = '输入的dfx所有的列都不是数值型数据,请检查输入数据'
        return {'result': pd.DataFrame(), 'msg': msg}

    if x_cate_cols != []:
        logging.warning(
            'input dfx has non-numeric columns: %s, will ignore these columns!'
            % x_cate_cols)
        msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略!' % x_cate_cols

    ks_p_label = 'KS检验:p值'
    sw_p_label = 'Shapro-Wilk检验:p-值'
    labels = ['样本量', 'KS检验:统计量', ks_p_label, 'Shapro-Wilk检验:统计量', sw_p_label]

    frames = []
    for col in x_numer_cols:
        sample = dfx[col].dropna()
        size = len(sample)
        # kstest is called with the distribution name 'norm' and no extra
        # distribution parameters.
        ks_stat, ks_p = kstest(sample, 'norm')
        sw_stat, sw_p = shapiro(sample)
        values = [size, ks_stat, ks_p, sw_stat, sw_p]
        frames.append(pd.DataFrame(values, index=labels, columns=[col]).T)

    table = pd.concat(frames)
    for p_label in (ks_p_label, sw_p_label):
        table[p_label] = table[p_label].apply(lambda value: '{:.5f}'.format(value))
    return {'result': table.round(5), 'msg': msg}
def run(self, df, bins=10):
    """Build a frequency / percentage table for every column of ``df``.

    Numeric columns (as reported by ``ParseDFtypes``) are first cut into
    ``bins`` intervals via ``value_counts(bins=...)``; all other columns are
    counted by distinct value.

    Args:
        df: Input table.
        bins: Number of intervals used for numeric columns. Also stored on
            the instance as ``self.bins``.

    Returns:
        dict: ``'result'`` is a DataFrame indexed by (column, group) with the
        columns ``'频数'`` and ``'占比(%)'``; it is an empty DataFrame when
        ``df`` has no columns. ``'msg'`` carries optional ``'warning'`` /
        ``'error'`` texts.
    """
    self.bins = bins
    msg = {}
    numeric_cols, category_cols = ParseDFtypes(df)

    if numeric_cols:
        logging.warning(
            'column %s is not category type, force convert to category by bins of %d'
            % (numeric_cols, self.bins))
        msg['warning'] = '列 %s 不是定类(category)数据, 将强制通过bins:%d为转化为定类型数据' % (
            numeric_cols, self.bins)

    res = []
    for col in df.columns:
        if col in numeric_cols:
            dfc = df[col].value_counts(bins=self.bins).to_frame(name='frequency')
            # Interval labels are turned into plain strings.
            dfc.index = dfc.index.map(str)
        else:
            dfc = df[col].value_counts().to_frame(name='frequency')

        # Both branches now report a percentage. Previously numeric columns
        # stored a 0-1 fraction under the '占比(%)' header while categorical
        # columns stored fraction * 100.
        total = dfc['frequency'].sum()
        dfc['percentage'] = (dfc['frequency'] / total).round(5) * 100

        index = pd.MultiIndex.from_product([[col], dfc.index],
                                           names=['分析项', '分析组'])
        res.append(dfc.set_index(index))

    if not res:
        # pd.concat([]) raises; a frame without columns has nothing to count.
        msg['error'] = '输入的数据没有任何列,请检查输入数据df!'
        return {'result': pd.DataFrame(), 'msg': msg}

    df_res = pd.concat(res)
    df_res.columns = ['频数', '占比(%)']
    return {'result': df_res, 'msg': msg}
def run(self, df, x, y, extra_args={'alpha': 0.001}):
    """Fit an L2-regularised OLS model as a two-class classifier and report it.

    The target column is encoded as 0, 1, ... in order of first appearance,
    a regularised OLS model is fitted on the numeric predictors plus an
    intercept, and fitted values >= 0.5 are mapped to the second class.

    Args:
        df: Input table.
        x: Column names of the predictors.
        y: Sequence whose first element names the target column. The
            original note on this method says the target may only have two
            unique values.
        extra_args: Options dict; ``'alpha'`` is the regularisation strength.
            A missing or falsy value falls back to 0.001.

    Returns:
        tuple: ``(report, extra)`` where ``report`` is a dict with
        ``'tables'`` (model summary, confusion matrix plus metrics, ROC
        points), ``'conf'`` (``self.get_info()``) and ``'msg'``; ``extra`` is
        a one-element list holding the actual-vs-predicted table.
    """
    # NOTE: the dict default is shared between calls; it is only read here,
    # never mutated.
    alpha = extra_args.get('alpha')
    if not alpha:
        alpha = 0.001

    msg = {}
    dfx = df[x].reset_index(drop=True)
    dfx = sm.add_constant(dfx, prepend=True)
    dfx = dfx.rename(columns={'const': '截距'})
    numeric_cols, category_cols = ParseDFtypes(dfx)
    target = y[0]
    tsy = df[target].reset_index(drop=True)

    # Label <-> integer code lookup, codes assigned in order of appearance.
    lst = list(tsy.unique())
    myd = {label: code for code, label in enumerate(lst)}
    myd_reverse = {code: label for code, label in enumerate(lst)}

    # Build and fit the model.
    model = smf.OLS(tsy.map(myd), dfx[numeric_cols])
    res = model.fit_regularized(
        maxiter=1000,
        alpha=alpha,  # regularisation strength
        L1_wt=0)  # L1_wt: 0 selects the L2 penalty, 1 the L1 penalty

    # Predicted classes.
    prediction_probs = res.predict()
    prediction_bins = pd.Series(
        [1 if prob >= 0.5 else 0 for prob in prediction_probs],
        name='predicted_bins')
    tsy_predict = prediction_bins.map(myd_reverse)
    tsy_predict.name = '预测的' + tsy.name
    df_predict_result = pd.concat([tsy, tsy_predict], axis=1)

    # Confusion matrix. `labels=lst` pins the row/column order to the order
    # used for the index; without it sklearn sorts the labels, which
    # disagrees with first-appearance order whenever the data is not sorted.
    df_confusion_matrix = pd.DataFrame(
        confusion_matrix(tsy, tsy_predict, labels=lst),
        index=lst,
        columns=lst)

    # Per-class report; the same `labels=lst` keeps row i aligned with
    # myd_reverse[i].
    df_report = pd.DataFrame(
        list(precision_recall_fscore_support(tsy, tsy_predict, labels=lst)),
        index=['精确度', '召回率', 'F1-值', '样本个数']).T.round(5)
    df_report.index = df_report.index.map(myd_reverse)

    # ROC curve.
    fpr, tpr, _ = roc_curve(tsy.map(myd), prediction_probs)
    roc_auc = auc(fpr, tpr)
    logging.info("Area under the ROC curve : %f" % roc_auc)
    df_roc = pd.DataFrame({'假阳性率': fpr, '真阳性率': tpr})

    # Model description parsed back from the HTML summary tables.
    tables = res.summary().tables
    df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]
    dfinfo1 = df_list[1].fillna('Variables').set_index(0)
    dfinfo1 = dfinfo1.T.set_index('Variables').T
    dfinfo1.index.name = '项'
    dfinfo1.columns.name = '参数类型'
    dfinfo1.columns = [
        '回归系数', '标准误差', 'Z值', 'p值', '95%CI(下限)', '95%CI(上限)'
    ]
    dfinfo1['or值'] = np.exp(res.params)
    df_description = dfinfo1

    # Summary row: sample counts are summed, the three metrics are averaged
    # over the classes. DataFrame.append was removed in pandas 2.0, and the
    # chained `df[col].loc[row] = ...` form does not reliably write back, so
    # pd.concat and a single .loc assignment are used. Precision used to be
    # left as a plain sum.
    n_classes = len(df_report)
    metric_cols = ['精确度', '召回率', 'F1-值']
    df_report = pd.concat(
        [df_report, df_report.sum().to_frame(name='总和/平均').T])
    df_report.loc['总和/平均', metric_cols] = (
        df_report.loc['总和/平均', metric_cols] / n_classes)
    df_report = df_report.T
    df_report['name'] = ['模型效果', '模型效果', '模型效果', '样本量']

    df_confusion_matrix = pd.concat([
        df_confusion_matrix,
        df_confusion_matrix.sum().to_frame(name='总和/平均').T
    ])
    df_confusion_matrix = df_confusion_matrix.T
    df_confusion_matrix['name'] = '混淆矩阵'
    df_confusion_matrix = pd.concat(
        [df_confusion_matrix,
         df_report]).reset_index().set_index(['name', 'index'])
    df_confusion_matrix = df_confusion_matrix.T
    df_confusion_matrix.columns.names = [None, None]

    df_predict_result = df_predict_result.round(5)
    df_confusion_matrix = df_confusion_matrix.round(5)
    df_roc = df_roc.round(5)
    df_description = df_description.round(5)

    return {
        'tables': [
            {
                'table_info': 'Ringe回归分析结果汇总',
                'table_json': df_description.to_json(),
                'table_html': df_description.to_html(),
                'chart': ['line', 'bar']
            },
            {
                'table_info': 'Ringe回归预测效果汇总:',
                'table_json': df_confusion_matrix.T.reset_index().to_json(),
                'table_html': df_confusion_matrix.to_html(),
                'chart': []
            },
            {
                'table_info': "ROC曲线(曲线下面积:%0.3f)" % roc_auc,
                'table_json': df_roc.to_json(),
                'table_html': df_roc.to_html(),
                'chart': ['scatter']
            },
        ],
        'conf': self.get_info(),
        'msg': msg
    }, [{
        'table_df': df_predict_result,
        'label': '实际值与预测值'
    }]
def run(self, dfx, dfy):
    """Run a paired t-test between positionally matched columns of two tables.

    Args:
        dfx: First table; every column must be numeric.
        dfy: Second table with the same number of rows and of numeric
            columns as ``dfx``.

    Returns:
        dict: ``'result'`` is a DataFrame with one row per column pair
        holding ``mean±std`` of both sides, the mean difference, the t
        statistic and the p-value (five-decimal string), or an empty
        DataFrame on any validation error. ``'msg'`` carries an optional
        ``'error'`` text.
    """
    msg = {}
    xl = len(dfx)
    yl = len(dfy)
    if xl != yl:
        logging.error(
            'the length of input X:%s is not equal the length of Y: %s ! ' %
            (xl, yl))
        msg['error'] = '输入的dfx的长度为:%s 不等于输入的dfy的长度: %s ' % (xl, yl)
        return {'result': pd.DataFrame(), 'msg': msg}

    # `if cols` / `not cols` assume ParseDFtypes returns plain lists, as the
    # original `!= []` comparisons did.
    x_numer_cols, x_cate_cols = ParseDFtypes(dfx)
    y_numer_cols, y_cate_cols = ParseDFtypes(dfy)

    if x_cate_cols or y_cate_cols:
        logging.error(
            'input x or y has non-numeric data, please check your input data')
        # The condition is "some column is non-numeric"; the old text claimed
        # that all columns were non-numeric.
        msg['error'] = '输入的dfx或者dfy包含非数值型的列,请检查输入数据'
        return {'result': pd.DataFrame(), 'msg': msg}

    if len(x_numer_cols) != len(y_numer_cols):
        logging.error(
            'the number of columns for input X:%s is not equal Y: %s ! ' %
            (x_numer_cols, y_numer_cols))
        msg['error'] = '输入的dfx的可用的列为:%s ,这和输入的dfy可用的列: %s 在列数数量上不相等 ' % (
            x_numer_cols, y_numer_cols)
        return {'result': pd.DataFrame(), 'msg': msg}

    if not x_numer_cols:
        # Neither table has any column; pd.concat([]) would raise below.
        logging.error(
            'input x and y have no numeric columns, please check your input data'
        )
        msg['error'] = '输入的dfx和dfy没有可用的数值型列,请检查输入数据'
        return {'result': pd.DataFrame(), 'msg': msg}

    columns = [
        '配对1(平均值±标准差)', '配对2(平均值±标准差)', '差值(配对1-配对2)', 't-值',
        'p-值'
    ]
    rr = []
    for i, j in zip(x_numer_cols, y_numer_cols):
        idx = '%s-配对-%s' % (i, j)
        F, p = ttest_rel(dfx[i], dfy[j])
        m1, s1 = dfx[i].mean(), dfx[i].std()
        m2, s2 = dfy[j].mean(), dfy[j].std()
        r1 = '%s±%s' % (round(m1, 3), round(s1, 3))
        r2 = '%s±%s' % (round(m2, 3), round(s2, 3))
        # Round here: the row mixes strings and floats, so the frame is
        # object-dtype and DataFrame.round() would leave the values as-is.
        row = [r1, r2, round(m1 - m2, 5), round(F, 5), p]
        rr.append(pd.DataFrame(row, index=columns, columns=[idx]).T)

    res = pd.concat(rr)
    res['p-值'] = res['p-值'].apply(lambda value: '{:.5f}'.format(value))
    return {'result': res, 'msg': msg}
def run(self, dfx, tsy):
    """Run a one-way ANOVA of each numeric column of ``dfx`` against ``tsy``.

    Args:
        dfx: Table of candidate dependent variables; only the columns
            ``ParseDFtypes`` reports as numeric are analysed.
        tsy: Grouping variable. Must pass ``isSeries`` and ``isCategory``
            and have as many rows as ``dfx``.

    Returns:
        dict: ``'result'`` is a DataFrame with one row per numeric column
        holding ``mean±std`` per group followed by the first row of the
        ANOVA table (p-value as a five-decimal string), or an empty
        DataFrame on any validation error. ``'msg'`` carries optional
        ``'error'`` / ``'warning'`` texts.
    """
    # Both inputs get a fresh 0..n-1 index so the later join lines up by row.
    tsy = tsy.reset_index(drop=True)
    dfx = dfx.reset_index(drop=True)
    msg = {}

    xl, yl = len(dfx), len(tsy)
    if xl != yl:
        logging.error(
            'the length of input X:%s is not equal the length of Y: %s ! ' %
            (xl, yl))
        msg['error'] = '输入的dfx的长度为:%s 不等于输入的tsy的长度: %s ' % (xl, yl)
        return {'result': pd.DataFrame(), 'msg': msg}

    if not (isSeries(tsy) and isCategory(tsy)):
        logging.error(
            'input tsy is not a pandas Series or not a category data!')
        msg['error'] = '输入的tsy不是定类型数据或者Series类型'
        return {'result': pd.DataFrame(), 'msg': msg}

    x_numer_cols, x_cate_cols = ParseDFtypes(dfx)
    if x_numer_cols == []:
        logging.error(
            'All input dfx are no numeric columns, Please check your input dfx data!'
        )
        msg['error'] = '输入的dfx所有的列都不是数值型数据,请检查输入数据'
        return {'result': pd.DataFrame(), 'msg': msg}

    if x_cate_cols != []:
        logging.warning(
            'input dfx has non-numeric columns: %s, will ignore these columns!'
            % x_cate_cols)
        msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略!' % x_cate_cols

    name = tsy.name
    dfu = dfx[x_numer_cols].join(tsy)
    grouped = dfu.groupby(name)
    m = grouped.mean().T
    s = grouped.std().T

    def mean_pm_std(ts):
        # Pair each group mean with the matching standard deviation.
        cells = [
            '%s±%s' % (round(ts.loc[i], 2), round(s[ts.name].loc[i], 2))
            for i in ts.index
        ]
        return pd.Series(cells, index=ts.index)

    m1 = m.apply(mean_pm_std)

    anova_labels = ['自由度', '平方和', '均方和', 'F-值', 'p-值']
    rs = []
    for col in x_numer_cols:
        fitted = ols('%s ~ %s' % (col, name), dfu).fit()
        anovat = anova_lm(fitted)
        anovat.columns = anova_labels
        # Only the first row of the ANOVA table is kept.
        rs.append(anovat.iloc[0].to_frame(name=col).T)

    res = m1.join(pd.concat(rs))
    res['p-值'] = res['p-值'].apply(lambda value: '{:.5f}'.format(value))
    return {'result': res.round(5), 'msg': msg}
def run(self, dfx, tsy):
    """Run Levene's test on each numeric column of ``dfx`` grouped by ``tsy``.

    Args:
        dfx: Table of candidate variables; only the columns ``ParseDFtypes``
            reports as numeric are analysed.
        tsy: Grouping variable. Must pass ``isSeries`` and ``isCategory``
            and have as many rows as ``dfx``.

    Returns:
        dict: ``'result'`` is a DataFrame with one row per numeric column
        holding ``mean±std`` per group followed by the test statistic and
        p-value, or an empty DataFrame on any validation error. ``'msg'``
        carries optional ``'error'`` / ``'warning'`` texts.
    """
    # Both inputs get a fresh 0..n-1 index so the later join lines up by row.
    tsy = tsy.reset_index(drop=True)
    dfx = dfx.reset_index(drop=True)
    msg = {}

    xl, yl = len(dfx), len(tsy)
    if xl != yl:
        logging.error(
            'the length of input X:%s is not equal the length of Y: %s ! ' %
            (xl, yl))
        msg['error'] = 'the length of input X:%s is not equal the length of Y: %s ! ' % (
            xl, yl)
        return {'result': pd.DataFrame(), 'msg': msg}

    if not (isSeries(tsy) and isCategory(tsy)):
        logging.error(
            'input tsy is not a pandas Series or not a category data!')
        msg['error'] = 'input tsy is not a pandas Series or not a category data!'
        return {'result': pd.DataFrame(), 'msg': msg}

    x_numer_cols, x_cate_cols = ParseDFtypes(dfx)
    if x_numer_cols == []:
        logging.error(
            'All input dfx are no numeric columns, Please check your input dfx data!'
        )
        msg['error'] = 'All input dfx are no numeric columns, Please check your input dfx data!'
        return {'result': pd.DataFrame(), 'msg': msg}

    if x_cate_cols != []:
        logging.warning(
            'input dfx has non-numeric columns: %s, will ignore these columns!'
            % x_cate_cols)
        msg['warning'] = 'input dfx has non-numeric columns: %s, will ignore these columns!' % x_cate_cols

    name = tsy.name
    dfu = dfx[x_numer_cols].join(tsy)
    grouped = dfu.groupby(name)
    m = grouped.mean().T
    s = grouped.std().T

    def mean_pm_std(ts):
        # Pair each group mean with the matching standard deviation.
        cells = [
            '%s±%s' % (round(ts.loc[i], 2), round(s[ts.name].loc[i], 2))
            for i in ts.index
        ]
        return pd.Series(cells, index=ts.index)

    m1 = m.apply(mean_pm_std)

    labels = ['F-值', 'p-值']
    categories = tsy.unique()
    rs = []
    for col in x_numer_cols:
        # One sample per category value, in order of first appearance.
        samples = [dfu[dfu[name] == c][col] for c in categories]
        F, p = levene(*samples)
        rs.append(pd.DataFrame([F, p], index=labels, columns=[col]).T)

    res = m1.join(pd.concat(rs))
    return {'result': res, 'msg': msg}
def run(self, df, x, y, extra_args={'method': 'pearson', 'crosstab': False}):
    """Correlate every column in ``x`` with every column in ``y``.

    Args:
        df: Input table.
        x: Column names of the first variable set.
        y: Column names of the second variable set.
        extra_args: Options dict. ``'method'`` is resolved through
            ``get_corr_func``; ``'crosstab'`` switches the output from a long
            list of pairs to a pivot table.

    Returns:
        tuple: ``(report, extra)`` where ``report`` is a dict with
        ``'tables'`` (the correlation table as JSON and HTML), ``'conf'``
        (``self.get_info()``) and ``'msg'``; ``extra`` is a one-element list
        holding the same table as a DataFrame. The table is empty when the
        method is unknown or, for ``'pearson'``, when either side has no
        numeric column.
    """
    # NOTE: the dict default is shared between calls; it is only read here,
    # never mutated.
    method = extra_args.get('method')
    crosstab = extra_args.get('crosstab')
    m = get_corr_func(method)
    msg = {}
    dfx = df[x]
    dfy = df[y]

    def correlate(func, x_cols, y_cols):
        # Apply `func` to every (x column, y column) pair and tabulate it.
        rows = []
        for xc in x_cols:
            for yc in y_cols:
                c, p = func(dfx[xc], dfy[yc])
                rows.append([xc, yc, c, p])
        table = pd.DataFrame(rows, columns=['x-列', 'y-列', '相关系数', 'p-值'])
        if crosstab:
            table = table.pivot_table(index='x-列', columns='y-列')
        return table

    if not m:
        logging.error(
            "unknow method, only 'pearson','kendall'、'spearman' are supported!"
        )
        dfres = pd.DataFrame()
        msg['error'] = "未知的方法, (meathd参数)只支持 'pearson','kendall'、'spearman' 这三种!"
    elif method == 'pearson':
        # `not cols` assumes ParseDFtypes returns plain lists, as the
        # original `== []` comparisons did.
        x_numer_cols, x_cate_cols = ParseDFtypes(dfx)
        y_numer_cols, y_cate_cols = ParseDFtypes(dfy)
        if not x_numer_cols or not y_numer_cols:
            logging.error(
                'All input DataFrame are no numeric columns, Please check your input data!'
            )
            msg['error'] = '输入的所有的列都不是数值型数据,请检查输入数据'
            dfres = pd.DataFrame()
        else:
            # The original condition tested x_cate_cols twice, so ignored
            # non-numeric columns on the y side were never reported.
            if x_cate_cols or y_cate_cols:
                logging.warning(
                    'input DataFrame has no numeric data columns, will be ignored!'
                )
                msg['warning'] = '输入的数据包含非数值型数据, 将会被忽略!'
            dfres = correlate(pearsonr, x_numer_cols, y_numer_cols)
    else:
        # Other methods run on every selected column.
        dfres = correlate(m, dfx.columns, dfy.columns)

    dfres = dfres.round(5)
    return {
        'tables': [{
            'table_json': dfres.T.reset_index().to_json(),
            'table_html': dfres.to_html(),
            'table_info': '生成的字段之间的相关系数和p-值表',
            'chart': ['heatmap', 'line', 'bar']
        }],
        'conf': self.get_info(),
        'msg': msg
    }, [{
        'table_df': dfres,
        'label': '生成的字段之间的相关系数和p-值表'
    }]
def run(self, dfx, tsy):
    """Run an independent two-sample t-test per numeric column of ``dfx``.

    Args:
        dfx: Table of candidate variables; only the columns ``ParseDFtypes``
            reports as numeric are analysed.
        tsy: Grouping variable. Must pass ``isSeries`` and ``isCategory``
            and contain exactly two distinct values.

    Returns:
        dict: ``'result'`` is a DataFrame with one row per numeric column
        holding ``mean±std`` per group followed by the t statistic and
        p-value (rounded to five decimals), or an empty DataFrame on any
        validation error. ``'msg'`` carries optional ``'error'`` /
        ``'warning'`` texts.
    """
    # Both inputs get a fresh 0..n-1 index so the later join lines up by row.
    tsy = tsy.reset_index(drop=True)
    dfx = dfx.reset_index(drop=True)
    msg = {}

    if not (isSeries(tsy) and isCategory(tsy)):
        logging.error('input tsy is not a pandas Series or not a category data!')
        msg['error'] = '输入的tsy不是定类型数据或者Series类型'
        return {'result': pd.DataFrame(), 'msg': msg}

    if len(tsy.unique()) != 2:
        msg['error'] = '输入的tsy不能被分成2组,请确保值tsy中的数unique后元素个数为2,目前的元素为%s' % tsy.unique()
        return {'result': pd.DataFrame(), 'msg': msg}

    x_numer_cols, x_cate_cols = ParseDFtypes(dfx)
    if x_numer_cols == []:
        logging.error('All input dfx are no numeric columns, Please check your input dfx data!')
        msg['error'] = 'dfx输入的每列都不是数值型数据,请检查输入数据'
        return {'result': pd.DataFrame(), 'msg': msg}

    if x_cate_cols != []:
        logging.warning('input dfx has non-numeric columns: %s, will ignore these columns!' % x_cate_cols)
        msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略!' % x_cate_cols

    name = tsy.name
    dfu = dfx[x_numer_cols].join(tsy)
    grouped = dfu.groupby(name)
    m = grouped.mean().T
    s = grouped.std().T

    def mean_pm_std(ts):
        # Pair each group mean with the matching standard deviation.
        cells = [
            '%s±%s' % (round(ts.loc[i], 2), round(s[ts.name].loc[i], 2))
            for i in ts.index
        ]
        return pd.Series(cells, index=ts.index)

    m1 = m.apply(mean_pm_std)

    # Exactly two distinct values were verified above.
    c1, c2 = tsy.unique()
    labels = ['t-值', 'p-值']
    rs = []
    for col in x_numer_cols:
        d1 = dfu[dfu[name] == c1][col]
        d2 = dfu[dfu[name] == c2][col]
        F, p = ttest_ind(d1, d2)
        rs.append(pd.DataFrame([F, p], index=labels, columns=[col]).T)

    res = m1.join(pd.concat(rs))
    return {'result': res.round(5), 'msg': msg}
def run(self, dfx, dfy):
    """Fit an OLS regression of the first column of ``dfy`` on ``dfx``.

    Args:
        dfx: Predictor table; only the columns ``ParseDFtypes`` reports as
            numeric are used, and an intercept is added.
        dfy: Response table with the same number of rows as ``dfx``. Only
            its first column is used.

    Returns:
        dict: On a row-count mismatch only ``'result'`` (empty DataFrame)
        and ``'msg'`` are returned. Otherwise the dict holds ``'result'``
        (coefficient table with VIF and model-level statistics), ``'msg'``,
        ``'model'`` (the fitted results object) and ``'predicted_result'``
        (response next to the fitted values). When either side has no
        numeric column, ``'result'`` and ``'predicted_result'`` are empty
        DataFrames and ``'model'`` is ``None``.
    """
    dfy = dfy.reset_index(drop=True)
    dfx = dfx.reset_index(drop=True)
    msg = {}

    xl, yl = len(dfx), len(dfy)
    if xl != yl:
        logging.error('the length of input X:%s is not equal the length of Y: %s ! ' % (xl, yl))
        msg['error'] = '输入的dfx的长度为:%s 不等于输入的dfy的长度: %s ! ' % (xl, yl)
        return {'result': pd.DataFrame(), 'msg': msg}

    if len(dfy.columns) != 1:
        logging.warning('input DataFrame dfy has more than one columns, but only the first colum will be used!')
        msg['warning'] = '输入的dfy不只有一列数据,但是只有第一列会被使用'
        _dfy = dfy[[dfy.columns[0]]]
    else:
        _dfy = dfy

    # `not cols` assumes ParseDFtypes returns plain lists, as the original
    # `== []` comparisons did.
    x_numer_cols, x_cate_cols = ParseDFtypes(dfx)
    y_numer_cols, y_cate_cols = ParseDFtypes(_dfy)
    if not x_numer_cols or not y_numer_cols:
        logging.error('All input DataFrame are no numeric columns, Please check your input data!')
        msg['error'] = '输入的所有的列都不是数值型数据,请检查输入数据'
        # This path used to reach the final return with the model and the
        # prediction table never assigned; return explicit placeholders.
        return {
            'result': pd.DataFrame(),
            'msg': msg,
            'model': None,
            'predicted_result': pd.DataFrame()
        }

    _dfx = dfx[x_numer_cols]
    X = sm.add_constant(_dfx, prepend=True)
    y = _dfy
    f = smf.OLS(y, X).fit()
    y_pre = f.predict(X)
    df_predicted = pd.DataFrame(y_pre, index=y.index, columns=['预测值'])
    df_predicted = y.join(df_predicted)

    # The summary tables are parsed back from their HTML rendering.
    tables = f.summary().tables
    df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]

    def parse_table02(m_inf):
        # Summary tables 0 and 2 hold two (label, value) column pairs side
        # by side; stack them into one label-indexed row.
        left = m_inf[[0, 1]].copy()
        left.columns = ['items', 'values']
        right = m_inf[[2, 3]].copy()
        right.columns = ['items', 'values']
        # DataFrame.append was removed in pandas 2.0.
        stacked = pd.concat([left, right]).dropna().set_index('items')
        return stacked.T

    dfinfo0 = parse_table02(df_list[0])
    dfinfo2 = parse_table02(df_list[2])
    dfinfo1 = df_list[1].fillna('Variables').set_index(0)
    dfinfo1 = dfinfo1.T.set_index('Variables').T

    # Copy before adding columns so the writes do not target a slice.
    dfmain = dfinfo1[dfinfo1.columns[:4]].copy()
    dfad = dfinfo0[['R-squared:', 'Adj. R-squared:', 'F-statistic:']].join(
        dfinfo2[['Durbin-Watson:', 'Jarque-Bera (JB):', 'Omnibus:']])

    variables = f.model.exog
    dfmain['VIF'] = [
        variance_inflation_factor(variables, i)
        for i in range(variables.shape[1])
    ]
    # Model-level statistics are repeated on every coefficient row.
    for col in dfad.columns:
        dfmain[col] = dfad[col].iloc[0]

    dfmain.columns = [
        '系数', '标准偏差', 't值', 'P值', 'VIF值', 'R平方:', '调整后R平方', 'F值:',
        'Durbin-Watson检验:', 'Jarque-Bera (JB检验):', 'Omnibus检验'
    ]
    dfmain.index.name = '变量'
    dfmain = dfmain.rename(index={'const': '常数项'})
    dfmain = dfmain.round(5)
    dfmain['P值'] = dfmain['P值'].apply(lambda value: '{:.5f}'.format(value))

    return {
        'result': dfmain,
        'msg': msg,
        'model': f,
        'predicted_result': df_predicted
    }
def run(self, df, x, y, *args):
    """Fit a multinomial logit model and report coefficients, fit and ROC.

    Args:
        df: Input table.
        x: Column names of the predictors.
        y: Sequence whose first element names the target column.
        *args: Accepted but unused.

    Returns:
        tuple: ``(report, extra)`` where ``report`` is a dict with
        ``'tables'`` (model summary, confusion matrix plus metrics, and one
        ROC table per class), ``'conf'`` (``self.get_info()``) and ``'msg'``;
        ``extra`` is a one-element list holding the actual-vs-predicted
        table.

    NOTE(review): the original note on this method said the target may only
    have two unique values; nothing in the code enforces that.
    """
    msg = {}
    dfx = df[x].reset_index(drop=True)
    dfx = sm.add_constant(dfx, prepend=True)
    dfx = dfx.rename(columns={'const': '截距'})
    numeric_cols, category_cols = ParseDFtypes(dfx)
    target = y[0]
    tsy = df[target].reset_index(drop=True)
    types = sorted(tsy.unique())

    # Build and fit the model.
    model = sm.MNLogit(tsy, dfx[numeric_cols])
    res = model.fit()

    # Predicted class = the class with the highest predicted probability.
    prediction_probs = res.predict()
    tsy_predict = pd.DataFrame(prediction_probs).apply(
        lambda row: types[row.idxmax()], axis=1)
    tsy_predict.name = '预测的' + tsy.name
    df_predict_result = pd.concat([tsy, tsy_predict], axis=1)
    df_dumps = pd.get_dummies(tsy)[types]
    df_prediction_probs = pd.DataFrame(prediction_probs, columns=types)

    # Per-class report. sklearn returns (precision, recall, fscore, support)
    # in that order; the first two labels used to be swapped. `labels=types`
    # pins the row order to the column labels applied here.
    df_report = pd.DataFrame(
        list(precision_recall_fscore_support(tsy, tsy_predict, labels=types)),
        index=['精确度', '召回率', 'F1-值', '样本个数'],
        columns=types).T.round(5)

    # Confusion matrix.
    df_confusion_matrix = pd.DataFrame(res.pred_table(),
                                       index=types,
                                       columns=types)

    # One ROC table per class.
    roc_res_dict = {}
    for cls in types:
        fpr, tpr, _ = roc_curve(df_dumps[cls], df_prediction_probs[cls])
        # AUC is taken on the raw arrays, before they are wrapped in frames.
        roc_auc = auc(fpr, tpr)
        desc = "(曲线下面积:%0.3f)" % roc_auc
        key = '%s_%s' % (cls, desc)
        fpr_frame = pd.DataFrame(fpr, columns=['假阳性率'])
        tpr_frame = pd.DataFrame(tpr, columns=['真阳性率'])
        roc_res_dict[key] = fpr_frame.join(tpr_frame).T.reset_index()

    # Model description parsed back from the HTML summary tables.
    tables = res.summary().tables
    df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]
    dfinfo1 = df_list[1].fillna('Variables').set_index(0)

    # Odds ratios: one block per equation, each preceded by a header row so
    # the stacked result lines up row-for-row with the summary table.
    t = []
    for i in res.params.columns:
        odd = np.exp(res.params[[i]]).round(5)
        odd.columns = ['or值']
        odd = odd.T.reset_index().T
        t.append(odd)
    dft = pd.concat(t)
    dft.index = dfinfo1.index
    dft.columns = [7]
    df_res = dfinfo1.reset_index().join(dft.reset_index(drop=True))
    df_res = df_res.set_index(0)

    # Rows that are not predictor names are the per-equation header rows;
    # overwrite them with the Chinese column captions.
    change_lst = list(set(dfinfo1.index) - set(dfx.columns))
    for i in change_lst:
        df_res.loc[i] = [
            '回归系数', '标准误差', 'Z值', 'p值', '95%CI(下限)', '95%CI(上限)',
            'or值'
        ]

    # Summary row: sample counts are summed, the three metrics are averaged
    # over the classes. DataFrame.append was removed in pandas 2.0, and the
    # chained `df[col].loc[row] = ...` form does not reliably write back.
    # The divisor used to be a hard-coded 2 and one metric was left as a sum.
    n_classes = len(types)
    metric_cols = ['精确度', '召回率', 'F1-值']
    df_report = pd.concat(
        [df_report, df_report.sum().to_frame(name='总和/平均').T])
    df_report.loc['总和/平均', metric_cols] = (
        df_report.loc['总和/平均', metric_cols] / n_classes)
    df_report = df_report.T
    df_report['name'] = ['模型效果', '模型效果', '模型效果', '样本量']

    df_confusion_matrix = pd.concat([
        df_confusion_matrix,
        df_confusion_matrix.sum().to_frame(name='总和/平均').T
    ])
    df_confusion_matrix = df_confusion_matrix.T
    df_confusion_matrix['name'] = '混淆矩阵'
    df_confusion_matrix = pd.concat(
        [df_confusion_matrix,
         df_report]).reset_index().set_index(['name', 'index'])
    df_confusion_matrix = df_confusion_matrix.T
    df_confusion_matrix.columns.names = [None, None]

    df_predict_result = df_predict_result.round(5)
    df_confusion_matrix = df_confusion_matrix.round(5)
    df_description = df_res.round(5)

    # A distinct local name keeps the `df` parameter from being shadowed.
    tt = [{
        'table_info': key,
        'table_json': roc_table.to_json(),
        'table_html': roc_table.to_html(),
        'chart': ['scatter']
    } for key, roc_table in roc_res_dict.items()]

    return {
        'tables': [{
            'table_info': '多元Logit回归分析结果汇总',
            'table_json': df_description.reset_index().to_json(),
            'table_html': df_description.to_html(),
            'chart': ['line', 'bar']
        }, {
            'table_info': '多元Logit回归预测效果汇总:',
            'table_json': df_confusion_matrix.T.reset_index().to_json(),
            'table_html': df_confusion_matrix.to_html(),
            'chart': []
        }] + tt,
        'conf': self.get_info(),
        'msg': msg
    }, [{
        'table_df': df_predict_result,
        'label': '实际值与预测值'
    }]