Python ParseDFtypesの例、MedLearn.utils.pandastool.ParseDFtypes Pythonの例

コード例 #1

0

ファイルを表示

ファイル: common_i_TTest1Samp.py プロジェクト: w940853815/medical-learn

    def run(self, dfx, ref_num=0):
        """Run a one-sample t-test on every numeric column of ``dfx``.

        Columns are split into numeric / non-numeric by ``ParseDFtypes``.
        Each numeric column is tested against ``ref_num`` with
        ``ttest_1samp``.

        Returns a dict with two keys:
          * ``'result'`` - one row per numeric column holding the t statistic
            and the p-value (formatted as a string with five decimals), or an
            empty DataFrame when ``dfx`` has no numeric column.
          * ``'msg'`` - dict that may contain an ``'error'`` or ``'warning'``
            entry.
        """
        msg = {}
        numeric_names, other_names = ParseDFtypes(dfx)

        # Guard clause: nothing can be tested without numeric columns.
        if numeric_names == []:
            logging.error(
                'All input dfx are no numeric columns, Please check your input dfx data!'
            )
            msg['error'] = '输入的dfx所有的列都不是数值型数据，请检查输入数据'
            return {'result': pd.DataFrame(), 'msg': msg}

        # Non-numeric columns are skipped, but the caller is told about it.
        if other_names != []:
            logging.warning(
                'input dfx has non-numeric columns: %s, will ignore these columns!'
                % other_names)
            msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略！' % other_names

        stat_labels = ['t-值', 'p-值']
        frames = []
        for col in numeric_names:
            t_stat, p_value = ttest_1samp(dfx[col], ref_num)
            one_row = pd.DataFrame([t_stat, p_value],
                                   index=stat_labels,
                                   columns=[col]).T
            frames.append(one_row)

        table = pd.concat(frames)

        # p-values are rendered as fixed-point text before the final rounding.
        table['p-值'] = table['p-值'].apply(
            lambda value: '{:.5f}'.format(value))

        return {'result': table.round(5), 'msg': msg}

コード例 #2

0

ファイルを表示

ファイル: common_c_DescripStat.py プロジェクト: w940853815/medical-learn

    def run(self, df):
        """Compute descriptive statistics for the numeric columns of ``df``.

        Columns are classified by ``ParseDFtypes``; non-numeric columns are
        ignored (with a warning). For the numeric columns the output of
        ``DataFrame.describe`` is extended with skewness, kurtosis, variance
        and the mean absolute deviation, and the columns are relabelled.

        Returns a dict with ``'result'`` (the statistics table, or an empty
        DataFrame when there is no numeric column) and ``'msg'`` (dict that
        may contain ``'error'`` or ``'warning'``).
        """
        numer_cols, cate_cols = ParseDFtypes(df)
        msg = {}

        if numer_cols == []:
            logging.error('All input DataFrame are non-numeric columns, Please check your input data!')
            msg['error'] = '输入的所有的列都不是数值型数据，请检查输入数据df！'
            return {'result': pd.DataFrame(), 'msg': msg}

        if cate_cols != []:
            logging.warning('Input DataFrame has non-numeric columns, such as: %s will be ignored!' % cate_cols)
            msg['warning'] = '输入的数据包含非数值型数据, 比如列: %s 将会被忽略!' % cate_cols

        dfn = df[numer_cols]
        result = dfn.describe().T

        result['skew'] = dfn.skew()
        result['kurt'] = dfn.kurt()
        result['var'] = dfn.var()
        # DataFrame.mad() was deprecated in pandas 1.5 and removed in 2.0, so
        # the mean absolute deviation around the mean is computed explicitly.
        result['mad'] = (dfn - dfn.mean()).abs().mean()

        # NOTE(review): the leading '、' in '、平均值' looks like a typo, but the
        # label is kept unchanged because callers may look the column up by name.
        result.columns = ['样本量', '、平均值', '标准差', '最小值', '1/4分位数',
                          '中位数', '3/4分位数', '最大值', '偏度', '峰度', '方差', '平均绝对误差']

        return {'result': result, 'msg': msg}

コード例 #3

0

ファイルを表示

ファイル: common_d_GroupByStat.py プロジェクト: w940853815/medical-learn

    def run(self, df, by, method='count'):
        """Group ``df`` by ``by`` and aggregate it with ``method``.

        Parameters:
            df: input DataFrame.
            by: column label or list of labels handed to ``DataFrame.groupby``.
            method: ``'count'`` returns the group sizes; any other value is
                passed to ``GroupBy.agg``.

        Returns a dict with ``'result'`` (the aggregated table, or an empty
        DataFrame on error) and ``'msg'`` (dict that may contain ``'error'``).
        """
        numer_cols, cate_cols = ParseDFtypes(df)

        msg = {}

        if numer_cols == []:
            logging.error(
                'All input DataFrame are no numeric columns, Please check your input data!'
            )
            msg['error'] = '输入的所有的列都不是数值型数据，请检查输入数据df！'
            return {'result': pd.DataFrame(), 'msg': msg}

        if method == 'count':
            result = df.groupby(by).size().to_frame(name='样本量')
            return {'result': result, 'msg': msg}

        # A single column name given as a string must be treated as one key;
        # set('age') would otherwise split it into the characters a, g, e and
        # the "no numeric column left" check below would never trigger.
        by_cols = [by] if isinstance(by, str) else by

        if set(numer_cols).issubset(set(by_cols)):
            logging.error(
                'by columns contains all numeric columns, no numeric data to calculate!'
            )
            msg['error'] = '参数by占用了所有数值型的列， 所以没有额外的数值型列来计算 %s 的结果！' % method
            result = pd.DataFrame()
        else:
            result = df.groupby(by).agg(method).round(5)

        return {'result': result, 'msg': msg}

コード例 #4

0

ファイルを表示

    def run(self, dfx):
        """Apply two normality tests to every numeric column of ``dfx``.

        For each numeric column reported by ``ParseDFtypes`` the missing
        values are dropped, then ``kstest`` (against ``'norm'``) and
        ``shapiro`` are evaluated on the remaining values.

        Returns a dict with ``'result'`` (one row per numeric column with the
        sample size, both statistics and both p-values, the p-values being
        formatted as five-decimal strings; an empty DataFrame when there is
        no numeric column) and ``'msg'`` (dict that may contain ``'error'``
        or ``'warning'``).
        """
        msg = {}

        numeric_names, other_names = ParseDFtypes(dfx)

        # Guard clause: without numeric columns there is nothing to test.
        if numeric_names == []:
            logging.error('All input dfx are no numeric columns, Please check your input dfx data!')
            msg['error'] = '输入的dfx所有的列都不是数值型数据，请检查输入数据'
            return {'result': pd.DataFrame(), 'msg': msg}

        if other_names != []:
            logging.warning('input dfx has non-numeric columns: %s, will ignore these columns!' % other_names)
            msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略！' % other_names

        labels = ['样本量', 'KS检验：统计量', 'KS检验：p值', 'Shapro-Wilk检验：统计量', 'Shapro-Wilk检验：p-值']

        rows = []
        for col in numeric_names:
            values = dfx[col].dropna()
            sample_size = len(values)

            ks_stat, ks_p = kstest(values, 'norm')
            sw_stat, sw_p = shapiro(values)

            row = pd.DataFrame([sample_size, ks_stat, ks_p, sw_stat, sw_p],
                               index=labels,
                               columns=[col]).T
            rows.append(row)

        table = pd.concat(rows)

        # Both p-value columns are rendered as fixed-point text.
        for p_label in ('KS检验：p值', 'Shapro-Wilk检验：p-值'):
            table[p_label] = table[p_label].apply(lambda value: '{:.5f}'.format(value))

        return {'result': table.round(5), 'msg': msg}

コード例 #5

0

ファイルを表示

ファイル: common_a_CountFreq.py プロジェクト: w940853815/medical-learn

    def run(self, df, bins=10):
        """Count value frequencies and percentages for every column of ``df``.

        Columns that ``ParseDFtypes`` reports as numeric are first bucketed
        into ``bins`` intervals (``value_counts(bins=...)``); all other
        columns are counted per distinct value.

        Parameters:
            df: input DataFrame.
            bins: number of intervals used for numeric columns; also stored
                on ``self.bins``.

        Returns a dict with ``'result'`` (a table indexed by
        (column, group) with the frequency and the percentage) and ``'msg'``
        (dict that may contain ``'warning'``).
        """
        self.bins = bins

        msg = {}

        numeric_cols, category_cols = ParseDFtypes(df)
        if numeric_cols != []:
            logging.warning(
                'column %s is not category type, force convert to category by bins of %d'
                % (numeric_cols, self.bins))
            msg['warning'] = '列 %s 不是定类（category）数据, 将强制通过bins:%d为转化为定类型数据' % (
                numeric_cols, self.bins)

        res = []
        for col in df.columns:
            is_numeric = col in numeric_cols
            if is_numeric:
                vc = df[col].value_counts(bins=self.bins)
            else:
                vc = df[col].value_counts()

            dfc = vc.to_frame(name='frequency')
            if is_numeric:
                # Interval labels are turned into plain strings.
                dfc.index = dfc.index.map(str)

            total = dfc['frequency'].sum()
            # Both branches now report a rounded percentage; previously the
            # numeric branch emitted a raw fraction under the '占比(%)' header.
            dfc['percentage'] = (dfc['frequency'] / total).round(5) * 100

            index = pd.MultiIndex.from_product([[col], dfc.index],
                                               names=['分析项', '分析组'])
            res.append(dfc.set_index(index))

        df_res = pd.concat(res)
        df_res.columns = ['频数', '占比(%)']

        return {'result': df_res, 'msg': msg}

コード例 #6

0

ファイルを表示

ファイル: advance_j_岭回归分析.py プロジェクト: shenwanxiang/BioStat

    def run(self, df, x, y, extra_args={'alpha': 0.001}):
        """Fit an L2-regularised (ridge) linear model and report its quality.

        Parameters:
            df: DataFrame holding predictors and target.
            x: column labels used as predictors.
            y: sequence whose first element names the target column. The
                0.5 decision threshold used below only makes sense when that
                column has exactly two distinct values.
            extra_args: dict; ``'alpha'`` is the regularisation strength. A
                missing or falsy value falls back to 0.001. The dict is only
                read, never modified.

        Returns a 2-tuple: a dict with ``'tables'`` (coefficient summary,
        confusion matrix plus metrics, ROC curve), ``'conf'`` and ``'msg'``;
        and a list with one entry holding the actual-vs-predicted DataFrame.
        """
        alpha = extra_args.get('alpha')
        if not alpha:
            alpha = 0.001

        msg = {}

        dfx = df[x].reset_index(drop=True)
        dfx = sm.add_constant(dfx, prepend=True)
        dfx = dfx.rename(columns={'const': '截距'})
        numeric_cols, _ = ParseDFtypes(dfx)

        target = y[0]
        tsy = df[target].reset_index(drop=True)

        # Encode the class labels as integers, in order of first appearance.
        lst = list(tsy.unique())
        myd = {label: code for code, label in enumerate(lst)}
        myd_reverse = {code: label for code, label in enumerate(lst)}

        # build init model
        model = smf.OLS(tsy.map(myd), dfx[numeric_cols])
        res = model.fit_regularized(
            maxiter=1000,
            alpha=alpha,  # regularisation coefficient
            L1_wt=0)  # L1_wt: 0 selects the L2 penalty, 1 the L1 penalty

        # predict result
        prediction_probs = res.predict()
        prediction_bins = pd.Series(
            [1 if prob >= 0.5 else 0 for prob in prediction_probs],
            name='predicted_bins')
        tsy_predict = prediction_bins.map(myd_reverse)
        tsy_predict.name = '预测的' + tsy.name
        df_predict_result = pd.concat([tsy, tsy_predict], axis=1)

        # confusion matrix
        # sklearn orders classes by sorted label unless `labels` is given;
        # passing `lst` keeps rows/columns aligned with the first-appearance
        # order used for the index/column labels.
        df_confusion_matrix = pd.DataFrame(
            confusion_matrix(tsy, tsy_predict, labels=lst),
            index=lst,
            columns=lst)

        # report: rows come back in the order of `labels`, so position k
        # corresponds to lst[k], which is what myd_reverse maps to.
        df_report = pd.DataFrame(list(
            precision_recall_fscore_support(tsy, tsy_predict, labels=lst)),
                                 index=['精确度', '召回率', 'F1-值',
                                        '样本个数']).T.round(5)
        df_report.index = df_report.index.map(myd_reverse)

        # roc
        fpr, tpr, _ = roc_curve(tsy.map(myd), prediction_probs)
        roc_auc = auc(fpr, tpr)
        logging.info("Area under the ROC curve : %f" % roc_auc)
        positions = np.arange(len(tpr))  # index for df
        df_roc = pd.DataFrame({
            '假阳性率': pd.Series(fpr, index=positions),
            '真阳性率': pd.Series(tpr, index=positions)
        })

        # model description
        # NOTE(review): this relies on the regularised results object exposing
        # summary(); confirm against the statsmodels version in use.
        tables = res.summary().tables
        df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]
        dfinfo1 = df_list[1].fillna('Variables').set_index(0)
        dfinfo1 = dfinfo1.T.set_index('Variables').T
        dfinfo1.index.name = '项'
        dfinfo1.columns.name = '参数类型'
        dfinfo1.columns = [
            '回归系数', '标准误差', 'Z值', 'p值', '95%CI(下限)', '95%CI(上限)'
        ]
        dfinfo1['or值'] = np.exp(res.params)
        df_description = dfinfo1

        # Summary row: support is summed, the three metrics are averaged over
        # the classes (precision used to be left as a plain sum).
        n_classes = len(df_report)
        total_row = df_report.sum().to_frame(name='总和/平均').T
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        df_report = pd.concat([df_report, total_row])
        for metric in ('精确度', '召回率', 'F1-值'):
            df_report.loc['总和/平均', metric] = (
                df_report.loc['总和/平均', metric] / n_classes)
        df_report = df_report.T
        df_report['name'] = ['模型效果', '模型效果', '模型效果', '样本量']

        cm_total = df_confusion_matrix.sum().to_frame(name='总和/平均').T
        df_confusion_matrix = pd.concat([df_confusion_matrix, cm_total]).T
        df_confusion_matrix['name'] = '混淆矩阵'
        df_confusion_matrix = pd.concat(
            [df_confusion_matrix,
             df_report]).reset_index().set_index(['name', 'index'])
        df_confusion_matrix = df_confusion_matrix.T
        df_confusion_matrix.columns.names = [None, None]

        df_predict_result = df_predict_result.round(5)
        df_confusion_matrix = df_confusion_matrix.round(5)
        df_roc = df_roc.round(5)
        df_description = df_description.round(5)

        return {
            'tables': [
                {
                    'table_info': 'Ringe回归分析结果汇总',
                    'table_json': df_description.to_json(),
                    'table_html': df_description.to_html(),
                    'chart': ['line', 'bar']
                },
                {
                    'table_info': 'Ringe回归预测效果汇总:',
                    'table_json':
                    df_confusion_matrix.T.reset_index().to_json(),
                    'table_html': df_confusion_matrix.to_html(),
                    'chart': []
                },
                {
                    'table_info': "ROC曲线（曲线下面积:%0.3f）" % roc_auc,
                    'table_json': df_roc.to_json(),
                    'table_html': df_roc.to_html(),
                    'chart': ['scatter']
                },
            ],
            'conf':
            self.get_info(),
            'msg':
            msg
        }, [{
            'table_df': df_predict_result,
            'label': '实际值与预测值'
        }]

コード例 #7

0

ファイルを表示

    def run(self, dfx, dfy):
        """Run paired t-tests between the columns of ``dfx`` and ``dfy``.

        Columns are paired positionally (first numeric column of ``dfx``
        with the first of ``dfy`` and so on) and compared with
        ``ttest_rel``.

        Returns a dict with ``'result'`` (one row per pair with
        "mean±std" of both sides, the mean difference, the t statistic and
        the p-value as a five-decimal string; an empty DataFrame on error)
        and ``'msg'`` (dict that may contain ``'error'``).
        """
        msg = {}

        xl = len(dfx)
        yl = len(dfy)
        if xl != yl:
            logging.error(
                'the length of input X:%s is not equal the length of Y: %s ! '
                % (xl, yl))
            msg['error'] = '输入的dfx的长度为:%s 不等于输入的dfy的长度: %s  ' % (xl, yl)
            return {'result': pd.DataFrame(), 'msg': msg}

        x_numer_cols, x_cate_cols = ParseDFtypes(dfx)
        y_numer_cols, y_cate_cols = ParseDFtypes(dfy)

        if (x_cate_cols != []) or (y_cate_cols != []):
            logging.error(
                'input x or y has non-numeric data, please check your input data'
            )
            # The check fires as soon as ANY column is non-numeric, so the
            # message says "contains non-numeric columns" (it used to claim
            # that all columns were non-numeric).
            msg['error'] = '输入的dfx或者dfy包含非数值型的列，请检查输入数据'
            return {'result': pd.DataFrame(), 'msg': msg}

        if len(x_numer_cols) != len(y_numer_cols):
            logging.error(
                'the number of columns for input X:%s is not equal Y: %s ! ' %
                (x_numer_cols, y_numer_cols))
            msg['error'] = '输入的dfx的可用的列为:%s ，这和输入的dfy可用的列: %s 在列数数量上不相等 ' % (
                x_numer_cols, y_numer_cols)
            return {'result': pd.DataFrame(), 'msg': msg}

        columns = [
            '配对1(平均值±标准差)', '配对2(平均值±标准差)', '差值(配对1-配对2)', 't-值', 'p-值'
        ]

        rr = []
        for i, j in zip(x_numer_cols, y_numer_cols):
            idx = '%s-配对-%s' % (i, j)
            t_stat, p = ttest_rel(dfx[i], dfy[j])

            m1 = dfx[i].mean()
            s1 = dfx[i].std()
            m2 = dfy[j].mean()
            s2 = dfy[j].std()

            r1 = '%s±%s' % (round(m1, 3), round(s1, 3))
            r2 = '%s±%s' % (round(m2, 3), round(s2, 3))

            # The row mixes strings and floats, so the frame has object dtype
            # and DataFrame.round() would leave it untouched; round the
            # numeric scalars explicitly instead.
            diff = round(m1 - m2, 5)
            t_stat = round(t_stat, 5)

            dfr = pd.DataFrame([r1, r2, diff, t_stat, p],
                               index=columns,
                               columns=[idx]).T
            rr.append(dfr)

        res = pd.concat(rr)

        res['p-值'] = res['p-值'].apply(lambda x: '{:.5f}'.format(x))

        return {'result': res, 'msg': msg}

コード例 #8

0

ファイルを表示

ファイル: common_g_OnewayAnova.py プロジェクト: w940853815/medical-learn

    def run(self, dfx, tsy):
        """One-way ANOVA of every numeric column of ``dfx`` grouped by ``tsy``.

        Both inputs get a fresh positional index first. ``tsy`` must pass
        ``isSeries`` and ``isCategory``; ``dfx`` must have the same length and
        at least one numeric column (per ``ParseDFtypes``).

        Returns a dict with ``'result'`` (per column: "mean±std" for every
        group joined with the first row of the ``anova_lm`` table, p-value as
        a five-decimal string; an empty DataFrame on error) and ``'msg'``
        (dict that may contain ``'error'`` or ``'warning'``).
        """
        tsy = tsy.reset_index(drop=True)
        dfx = dfx.reset_index(drop=True)

        msg = {}

        n_x, n_y = len(dfx), len(tsy)
        if n_x != n_y:
            logging.error(
                'the length of input X:%s is not equal the length of Y: %s ! '
                % (n_x, n_y))
            msg['error'] = '输入的dfx的长度为:%s 不等于输入的tsy的长度: %s  ' % (n_x, n_y)
            return {'result': pd.DataFrame(), 'msg': msg}

        if not (isSeries(tsy) and isCategory(tsy)):
            logging.error(
                'input tsy is not a pandas Series or not a category data!')
            msg['error'] = '输入的tsy不是定类型数据或者Series类型'
            return {'result': pd.DataFrame(), 'msg': msg}

        numeric_names, other_names = ParseDFtypes(dfx)

        if numeric_names == []:
            logging.error(
                'All input dfx are no numeric columns, Please check your input dfx data!'
            )
            msg['error'] = '输入的dfx所有的列都不是数值型数据，请检查输入数据'
            return {'result': pd.DataFrame(), 'msg': msg}

        if other_names != []:
            logging.warning(
                'input dfx has non-numeric columns: %s, will ignore these columns!'
                % other_names)
            msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略！' % other_names

        group_name = tsy.name

        combined = dfx[numeric_names].join(tsy)
        group_means = combined.groupby(group_name).mean().T
        group_stds = combined.groupby(group_name).std().T

        def mean_pm_std(mean_col):
            # Pair each mean with the std of the same (variable, group) cell.
            std_col = group_stds[mean_col.name]
            texts = [
                '%s±%s' % (round(mean_col.loc[key], 2),
                           round(std_col.loc[key], 2))
                for key in mean_col.index
            ]
            return pd.Series(texts, index=mean_col.index)

        summary = group_means.apply(mean_pm_std)

        anova_rows = []
        for col in numeric_names:
            fitted = ols('%s ~ %s' % (col, group_name), combined).fit()
            anova_table = anova_lm(fitted)
            anova_table.columns = ['自由度', '平方和', '均方和', 'F-值', 'p-值']
            # Only the first row of the ANOVA table is reported.
            anova_rows.append(anova_table.iloc[0].to_frame(name=col).T)

        table = summary.join(pd.concat(anova_rows))
        table['p-值'] = table['p-值'].apply(
            lambda value: '{:.5f}'.format(value))

        return {'result': table.round(5), 'msg': msg}

コード例 #9

0

ファイルを表示

    def run(self, dfx, tsy):
        """Levene test of every numeric column of ``dfx`` across ``tsy`` groups.

        Both inputs get a fresh positional index first. ``tsy`` must pass
        ``isSeries`` and ``isCategory``; ``dfx`` must have the same length and
        at least one numeric column (per ``ParseDFtypes``).

        Returns a dict with ``'result'`` (per column: "mean±std" for every
        group joined with the ``levene`` statistic and p-value; an empty
        DataFrame on error) and ``'msg'`` (dict that may contain ``'error'``
        or ``'warning'``).
        """
        tsy = tsy.reset_index(drop=True)
        dfx = dfx.reset_index(drop=True)

        msg = {}

        n_x, n_y = len(dfx), len(tsy)
        if n_x != n_y:
            logging.error(
                'the length of input X:%s is not equal the length of Y: %s ! '
                % (n_x, n_y))
            msg['error'] = 'the length of input X:%s is not equal the length of Y: %s ! ' % (
                n_x, n_y)
            return {'result': pd.DataFrame(), 'msg': msg}

        if not (isSeries(tsy) and isCategory(tsy)):
            logging.error(
                'input tsy is not a pandas Series or not a category data!')
            msg['error'] = 'input tsy is not a pandas Series or not a category data!'
            return {'result': pd.DataFrame(), 'msg': msg}

        numeric_names, other_names = ParseDFtypes(dfx)

        if numeric_names == []:
            logging.error(
                'All input dfx are no numeric columns, Please check your input dfx data!'
            )
            msg['error'] = 'All input dfx are no numeric columns, Please check your input dfx data!'
            return {'result': pd.DataFrame(), 'msg': msg}

        if other_names != []:
            logging.warning(
                'input dfx has non-numeric columns: %s, will ignore these columns!'
                % other_names)
            msg['warning'] = 'input dfx has non-numeric columns: %s, will ignore these columns!' % other_names

        group_name = tsy.name

        combined = dfx[numeric_names].join(tsy)
        group_means = combined.groupby(group_name).mean().T
        group_stds = combined.groupby(group_name).std().T

        def mean_pm_std(mean_col):
            # Pair each mean with the std of the same (variable, group) cell.
            std_col = group_stds[mean_col.name]
            texts = [
                '%s±%s' % (round(mean_col.loc[key], 2),
                           round(std_col.loc[key], 2))
                for key in mean_col.index
            ]
            return pd.Series(texts, index=mean_col.index)

        summary = group_means.apply(mean_pm_std)

        stat_labels = ['F-值', 'p-值']
        test_rows = []
        for col in numeric_names:
            # One sample per distinct group value, in order of appearance.
            samples = [
                combined[combined[group_name] == level][col]
                for level in tsy.unique()
            ]
            statistic, p_value = levene(*samples)
            test_rows.append(
                pd.DataFrame([statistic, p_value],
                             index=stat_labels,
                             columns=[col]).T)

        table = summary.join(pd.concat(test_rows))

        return {'result': table, 'msg': msg}

コード例 #10

0

ファイルを表示

ファイル: common_e_CorrStat.py プロジェクト: w940853815/medical-learn

    def run(self,
            df,
            x,
            y,
            extra_args={
                'method': 'pearson',
                'crosstab': False
            }):
        """Correlate every ``x`` column with every ``y`` column of ``df``.

        Parameters:
            df: input DataFrame.
            x, y: column labels selecting the two sides of the comparison.
            extra_args: dict with ``'method'`` (looked up through
                ``get_corr_func``) and ``'crosstab'`` (when truthy the long
                result is pivoted into an x-by-y table). The dict is only
                read, never modified.

        For ``'pearson'`` only numeric columns (per ``ParseDFtypes``) are
        used; other methods are applied to all selected columns.

        Returns a 2-tuple: a dict with ``'tables'``, ``'conf'`` and ``'msg'``;
        and a list with one entry holding the result DataFrame.
        """
        method = extra_args.get('method')
        crosstab = extra_args.get('crosstab')

        m = get_corr_func(method)
        msg = {}

        dfx = df[x]
        dfy = df[y]

        dfres = pd.DataFrame()
        corr_func = None  # stays None when an error prevents any computation
        x_cols = y_cols = ()

        if not m:
            logging.error(
                "unknow method, only 'pearson','kendall'、'spearman' are supported!"
            )
            msg['error'] = "未知的方法, （meathd参数）只支持 'pearson','kendall'、'spearman' 这三种!"

        elif method == 'pearson':
            x_numer_cols, x_cate_cols = ParseDFtypes(dfx)
            y_numer_cols, y_cate_cols = ParseDFtypes(dfy)

            if (x_numer_cols == []) or (y_numer_cols == []):
                logging.error(
                    'All input DataFrame are no numeric columns, Please check your input data!'
                )
                msg['error'] = '输入的所有的列都不是数值型数据，请检查输入数据'
            else:
                # Warn when EITHER side has ignored columns; the original
                # tested x_cate_cols twice and never looked at y_cate_cols.
                if (x_cate_cols != []) or (y_cate_cols != []):
                    logging.warning(
                        'input DataFrame has no numeric data columns, will be ignored!'
                    )
                    msg['warning'] = '输入的数据包含非数值型数据, 将会被忽略!'
                corr_func = pearsonr
                x_cols, y_cols = x_numer_cols, y_numer_cols

        else:
            corr_func = m
            x_cols, y_cols = dfx.columns, dfy.columns

        if corr_func is not None:
            res = []
            for xc in x_cols:
                for yc in y_cols:
                    c, p = corr_func(dfx[xc], dfy[yc])
                    res.append([xc, yc, c, p])
            dfres = pd.DataFrame(res, columns=['x-列', 'y-列', '相关系数', 'p-值'])

            if crosstab:
                dfres = dfres.pivot_table(index='x-列', columns='y-列')

        dfres = dfres.round(5)

        return {
            'tables': [{
                'table_json': dfres.T.reset_index().to_json(),
                'table_html': dfres.to_html(),
                'table_info': '生成的字段之间的相关系数和p-值表',
                'chart': ['heatmap', 'line', 'bar']
            }],
            'conf':
            self.get_info(),
            'msg':
            msg
        }, [{
            'table_df': dfres,
            'label': '生成的字段之间的相关系数和p-值表'
        }]

コード例 #11

0

ファイルを表示

ファイル: common_h_TTestInd.py プロジェクト: w940853815/medical-learn

    def run(self, dfx, tsy):
        """Independent two-sample t-test of every numeric column of ``dfx``.

        ``tsy`` is the grouping Series; it must pass ``isSeries`` and
        ``isCategory`` and contain exactly two distinct values. Both inputs
        get a fresh positional index first.

        Returns a dict with ``'result'`` (per column: "mean±std" for both
        groups joined with the ``ttest_ind`` statistic and p-value, rounded
        to five decimals; an empty DataFrame on error) and ``'msg'`` (dict
        that may contain ``'error'`` or ``'warning'``).
        """
        tsy = tsy.reset_index(drop=True)
        dfx = dfx.reset_index(drop=True)

        msg = {}

        if not isSeries(tsy) or not isCategory(tsy):
            logging.error('input tsy is not a pandas Series or not a category data!')
            msg['error'] = '输入的tsy不是定类型数据或者Series类型'
            return {'result': pd.DataFrame(), 'msg': msg}

        # Computed once; the original re-evaluated tsy.unique() twice per column.
        groups = tsy.unique()

        if len(groups) != 2:
            # Logged like every other error path of this method.
            logging.error('input tsy can not be split into exactly 2 groups!')
            msg['error'] = '输入的tsy不能被分成2组，请确保值tsy中的数unique后元素个数为2，目前的元素为%s' % (groups,)
            return {'result': pd.DataFrame(), 'msg': msg}

        x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

        if x_numer_cols == []:
            logging.error('All input dfx are no numeric columns, Please check your input dfx data!')
            msg['error'] = 'dfx输入的每列都不是数值型数据，请检查输入数据'
            return {'result': pd.DataFrame(), 'msg': msg}

        if x_cate_cols != []:
            logging.warning('input dfx has non-numeric columns: %s, will ignore these columns!' % x_cate_cols)
            msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略！' % x_cate_cols

        name = tsy.name

        dfu = dfx[x_numer_cols].join(tsy)
        m = dfu.groupby(name).mean().T
        s = dfu.groupby(name).std().T

        def change(ts):
            # Render "mean±std" for every variable of one group column.
            v = [
                '%s±%s' % (round(ts.loc[i], 2), round(s[ts.name].loc[i], 2))
                for i in ts.index
            ]
            return pd.Series(v, index=ts.index)

        m1 = m.apply(change)

        c1, c2 = groups[0], groups[1]
        in_first = dfu[name] == c1
        in_second = dfu[name] == c2

        columns = ['t-值', 'p-值']
        rs = []
        for i in x_numer_cols:
            t_stat, p = ttest_ind(dfu[in_first][i], dfu[in_second][i])
            rs.append(pd.DataFrame([t_stat, p], index=columns, columns=[i]).T)

        res = m1.join(pd.concat(rs))

        return {'result': res.round(5), 'msg': msg}

コード例 #12

0

ファイルを表示

    def run(self, dfx, dfy):
        """Fit an ordinary-least-squares regression of ``dfy`` on ``dfx``.

        Only the numeric columns of ``dfx`` (per ``ParseDFtypes``) are used as
        predictors, plus a constant. Only the first column of ``dfy`` is used
        as the response.

        Returns a dict. On a length mismatch it holds only ``'result'`` (empty
        DataFrame) and ``'msg'``. Otherwise it holds ``'result'`` (coefficient
        table enriched with VIF and model-level statistics), ``'msg'``,
        ``'model'`` (the fitted results object) and ``'predicted_result'``
        (response next to the predicted values). When no numeric column is
        available, ``'result'`` and ``'predicted_result'`` are empty
        DataFrames and ``'model'`` is ``None``.
        """
        dfy = dfy.reset_index(drop=True)
        dfx = dfx.reset_index(drop=True)

        msg = {}

        xl = len(dfx)
        yl = len(dfy)
        if xl != yl:
            logging.error('the length of input X:%s is not equal the length of Y: %s ! ' % (xl, yl))
            msg['error'] = '输入的dfx的长度为:%s 不等于输入的dfy的长度: %s ! ' % (xl, yl)
            return {'result': pd.DataFrame(), 'msg': msg}

        if len(dfy.columns) != 1:
            logging.warning('input DataFrame dfy has more than one columns, but only the first colum will be used!')
            msg['warning'] = '输入的dfy不只有一列数据，但是只有第一列会被使用'
            _dfy = dfy[[dfy.columns[0]]]
        else:
            _dfy = dfy

        x_numer_cols, x_cate_cols = ParseDFtypes(dfx)
        y_numer_cols, y_cate_cols = ParseDFtypes(_dfy)

        if (x_numer_cols == []) or (y_numer_cols == []):
            logging.error('All input DataFrame are no numeric columns, Please check your input data!')
            msg['error'] = '输入的所有的列都不是数值型数据，请检查输入数据'
            # The original left `f` and `df_predicted` unbound on this path,
            # so the shared return statement raised UnboundLocalError.
            return {'result': pd.DataFrame(), 'msg': msg,
                    'model': None, 'predicted_result': pd.DataFrame()}

        _dfx = dfx[x_numer_cols]

        X = sm.add_constant(_dfx, prepend=True)
        y = _dfy

        f = smf.OLS(y, X).fit()

        y_pre = f.predict(X)

        df_predicted = pd.DataFrame(y_pre, index=y.index, columns=['预测值'])
        df_predicted = y.join(df_predicted)

        # The statsmodels summary tables are round-tripped through HTML to get
        # them as DataFrames.
        tables = f.summary().tables
        df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]

        def parse_table02(m_inf):
            # Tables 0 and 2 hold two (item, value) column pairs side by side;
            # stack them into a single one-row frame keyed by item.
            df1 = m_inf[[0, 1]].copy()
            df1.columns = ['items', 'values']
            df2 = m_inf[[2, 3]].copy()
            df2.columns = ['items', 'values']
            # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
            dfinfo = pd.concat([df1, df2]).dropna().set_index('items')
            return dfinfo.T

        dfinfo0 = parse_table02(df_list[0])
        dfinfo2 = parse_table02(df_list[2])

        dfinfo1 = df_list[1].fillna('Variables').set_index(0)
        dfinfo1 = dfinfo1.T.set_index('Variables').T

        # Copy so the column assignments below do not write into a slice.
        dfmain = dfinfo1[dfinfo1.columns[:4]].copy()

        dfad = dfinfo0[['R-squared:',
                        'Adj. R-squared:',
                        'F-statistic:']].join(dfinfo2[['Durbin-Watson:',
                                                       'Jarque-Bera (JB):',
                                                       'Omnibus:']])

        variables = f.model.exog
        dfmain['VIF'] = [variance_inflation_factor(variables, i) for i in range(variables.shape[1])]
        # Model-level statistics are repeated on every coefficient row.
        for i in dfad.columns:
            dfmain[i] = dfad[i].iloc[0]

        dfmain.columns = ['系数', '标准偏差', 't值', 'P值',
                          'VIF值', 'R平方:', '调整后R平方',
                          'F值:', 'Durbin-Watson检验:',
                          'Jarque-Bera (JB检验):', 'Omnibus检验']

        dfmain.index.name = '变量'
        dfmain = dfmain.rename(index={'const': '常数项'})
        dfmain = dfmain.round(5)
        dfmain['P值'] = dfmain['P值'].apply(lambda x: '{:.5f}'.format(x))

        return {'result': dfmain, 'msg': msg, 'model': f, 'predicted_result': df_predicted}

コード例 #13

0

ファイルを表示

    def run(self, df, x, y, *args):
        """Fit a multinomial logit model and report its quality.

        Parameters:
            df: DataFrame holding predictors and target.
            x: column labels used as predictors (a constant is added).
            y: sequence whose first element names the target column.
            *args: accepted but not used.

        Returns a 2-tuple: a dict with ``'tables'`` (coefficient summary,
        confusion matrix plus metrics, and one ROC table per class),
        ``'conf'`` and ``'msg'``; and a list with one entry holding the
        actual-vs-predicted DataFrame.
        """
        msg = {}

        dfx = df[x].reset_index(drop=True)
        dfx = sm.add_constant(dfx, prepend=True)
        dfx = dfx.rename(columns={'const': '截距'})

        numeric_cols, _ = ParseDFtypes(dfx)

        target = y[0]
        tsy = df[target].reset_index(drop=True)

        # Sorted class labels; used to name the probability columns.
        types = sorted(tsy.unique())

        # build init model
        model = sm.MNLogit(tsy, dfx[numeric_cols])
        res = model.fit()

        # predict result: pick the class with the highest probability per row
        prediction_probs = res.predict()
        tsy_predict = pd.DataFrame(prediction_probs).apply(
            lambda row: types[row.idxmax()], axis=1)
        tsy_predict.name = '预测的' + tsy.name
        df_predict_result = pd.concat([tsy, tsy_predict], axis=1)
        df_dumps = pd.get_dummies(tsy)[types]
        df_prediction_probs = pd.DataFrame(prediction_probs, columns=types)

        # report
        # precision_recall_fscore_support returns (precision, recall, fscore,
        # support) in that order; the original labels had precision and
        # recall swapped.
        df_report = pd.DataFrame(list(
            precision_recall_fscore_support(tsy, tsy_predict, labels=types)),
                                 index=['精确度', '召回率', 'F1-值', '样本个数'],
                                 columns=types).T.round(5)

        # confusion matrix
        df_confusion_matrix = pd.DataFrame(res.pred_table(),
                                           index=types,
                                           columns=types)

        # roc: one-vs-rest curve for every class
        roc_res_dict = {}
        for label in types:
            fpr, tpr, _ = roc_curve(df_dumps[label],
                                    df_prediction_probs[label])
            roc_auc = auc(fpr, tpr)
            fpr_df = pd.DataFrame(fpr, columns=['假阳性率'])
            tpr_df = pd.DataFrame(tpr, columns=['真阳性率'])

            desc = "（曲线下面积:%0.3f）" % roc_auc
            key = '%s_%s' % (label, desc)
            roc_res_dict[key] = fpr_df.join(tpr_df).T.reset_index()

        # model description
        tables = res.summary().tables
        df_list = [pd.read_html(StringIO(t.as_html()))[0] for t in tables]
        dfinfo1 = df_list[1].fillna('Variables').set_index(0)
        t = []
        for i in res.params.columns:
            odd = np.exp(res.params[[i]]).round(5)
            odd.columns = ['or值']
            odd = odd.T.reset_index().T
            t.append(odd)
        dft = pd.concat(t)
        dft.index = dfinfo1.index
        dft.columns = [7]
        df_res = dfinfo1.reset_index().join(dft.reset_index(drop=True))
        df_res = df_res.set_index(0)
        # Rows whose label is not a predictor name get the column captions
        # written into them.
        change_lst = list(set(dfinfo1.index) - set(dfx.columns))
        for i in change_lst:
            df_res.loc[i] = [
                '回归系数', '标准误差', 'Z值', 'p值', '95%CI(下限)', '95%CI(上限)', 'or值'
            ]

        # Summary row: support is summed, the three metrics are averaged over
        # the classes (the original divided two of them by a hard-coded 2).
        n_classes = len(df_report)
        total_row = df_report.sum().to_frame(name='总和/平均').T
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        df_report = pd.concat([df_report, total_row])
        for metric in ('精确度', '召回率', 'F1-值'):
            df_report.loc['总和/平均', metric] = (
                df_report.loc['总和/平均', metric] / n_classes)
        df_report = df_report.T
        df_report['name'] = ['模型效果', '模型效果', '模型效果', '样本量']

        cm_total = df_confusion_matrix.sum().to_frame(name='总和/平均').T
        df_confusion_matrix = pd.concat([df_confusion_matrix, cm_total]).T
        df_confusion_matrix['name'] = '混淆矩阵'
        df_confusion_matrix = pd.concat(
            [df_confusion_matrix,
             df_report]).reset_index().set_index(['name', 'index'])
        df_confusion_matrix = df_confusion_matrix.T
        df_confusion_matrix.columns.names = [None, None]

        df_predict_result = df_predict_result.round(5)
        df_confusion_matrix = df_confusion_matrix.round(5)

        df_description = df_res.round(5)

        # One scatter table per class ROC curve (local name no longer shadows
        # the `df` parameter).
        tt = [{
            'table_info': key,
            'table_json': roc_df.to_json(),
            'table_html': roc_df.to_html(),
            'chart': ['scatter']
        } for key, roc_df in roc_res_dict.items()]

        return {
            'tables':
            [{
                'table_info': '多元Logit回归分析结果汇总',
                'table_json': df_description.reset_index().to_json(),
                'table_html': df_description.to_html(),
                'chart': ['line', 'bar']
            }, {
                'table_info': '多元Logit回归预测效果汇总:',
                'table_json': df_confusion_matrix.T.reset_index().to_json(),
                'table_html': df_confusion_matrix.to_html(),
                'chart': []
            }] + tt,
            'conf':
            self.get_info(),
            'msg':
            msg
        }, [{
            'table_df': df_predict_result,
            'label': '实际值与预测值'
        }]