コード例 #1
0
ファイル: Data_analysis.py プロジェクト: zhaiwc/Analyse_Tool
    def g2g_anaysis(self, data, label_col, columnslist=None, method='mean'):
        '''
        Compare the first two groups found in ``label_col``.

        The two groups are the first two distinct values of
        ``data[label_col]`` in order of first appearance; any further
        groups are ignored.

        Parameters:
            data: DataFrame holding the factors and the label column.
            label_col: name of the column that defines the groups.
            columnslist: accepted for interface compatibility; not used here.
            method: forwarded unchanged to ``self.g2g_diff``.

        Returns:
            Whatever ``self.g2g_diff`` returns for the two groups. The last
            five entries of its index are also passed to
            ``Data_plot.plot_describe`` as the columns to plot.
        '''
        # unique() yields a positional array, so [0] and [1] really are the
        # first and second distinct labels regardless of the row index
        # (indexing the drop_duplicates() Series looked labels up by index).
        labels = data[label_col].unique()
        first_label, second_label = labels[0], labels[1]
        print(first_label, 'vs', second_label, 'method:', method)

        group1 = data[data[label_col] == first_label].drop(label_col, axis=1)
        group2 = data[data[label_col] == second_label].drop(label_col, axis=1)

        res = self.g2g_diff(group1, group2, method=method)

        # Plot only the last five entries of the result index.
        col = res.index[-5:]
        Data_plot.plot_describe(data, label_col=label_col, columnslist=col)
        return res
コード例 #2
0
 def plot_cum_std(self, data, n=30):
     '''
     Plot the cumulative explained-variance ratio of a PCA fitted on data.

     Parameters:
         data: samples handed to ``PCA.fit``.
         n: number of principal components to keep.
     '''
     decomposition = PCA(n_components=n)
     decomposition.fit(data)
     # Running total of the per-component explained-variance ratios.
     cumulative = pd.DataFrame(
         decomposition.explained_variance_ratio_.cumsum(),
         columns=['cum_var'])
     chart = Data_plot.plot_line(cumulative)
     chart.title('cumsum explained_variance_ratio')
     chart.show()
コード例 #3
0
ファイル: Data_analysis.py プロジェクト: zhaiwc/Analyse_Tool
def spc_analysis(data, p1=None, p2=None, method='3sigma'):
    '''
    SPC control-limit analysis.

    For every column selected by ``data.describe()``, count the samples
    lying outside the control limits, then plot the column with the most
    violations through ``Data_plot.plot_spc``.

    Parameters:
        data: DataFrame of factors.
        p1, p2: accepted for interface compatibility; not used here.
        method: ``'3sigma'`` uses mean +/- 3 * std as limits;
            ``'tukey'`` uses Q3 + 3 * IQR and Q1 - 3 * IQR.

    Returns:
        DataFrame indexed by column name with a single ``'cnt'`` column,
        sorted ascending by the violation count.

    Raises:
        ValueError: if ``method`` is neither ``'3sigma'`` nor ``'tukey'``.
    '''
    if method not in ('3sigma', 'tukey'):
        raise ValueError(
            "method must be '3sigma' or 'tukey', got {!r}".format(method))

    # Keep only the columns that describe() reports on.
    num_col = data.describe().columns
    data = data.loc[:, num_col]

    res_dict = {}
    for col in data.columns:
        temp = data.loc[:, col]
        if method == '3sigma':
            mean_data = temp.mean()
            std_data = temp.std()
            upper = mean_data + 3 * std_data
            lower = mean_data - 3 * std_data
        else:
            perc25 = np.percentile(temp, 25)
            perc75 = np.percentile(temp, 75)
            upper = perc75 + 3 * (perc75 - perc25)
            lower = perc25 - 3 * (perc75 - perc25)
        # Number of samples outside the control limits.
        res_dict[col] = len(temp[(temp > upper) | (temp < lower)])

    res = pd.DataFrame(res_dict, index=['cnt']).T.sort_values('cnt')
    # After the ascending sort the last row is the worst column; plot it.
    plt = Data_plot.plot_spc(data.loc[:, [res.index[-1]]], method=method)
    plt.show()
    return res
コード例 #4
0
    def plot_score_scatter(self, data, label_col=None, is_plot=True):
        '''
        Transform ``data`` with ``self.dr_model`` and return the first two
        components as a DataFrame with columns ``pca1`` and ``pca2``.

        When ``is_plot`` is true the two components are also drawn as a
        scatter plot with dashed horizontal and vertical axis lines.
        '''
        scores = self.dr_model.transform(data)
        first_component = scores[:, 0]
        second_component = scores[:, 1]
        plot_data = pd.DataFrame(scores[:, 0:2],
                                 index=data.index,
                                 columns=['pca1', 'pca2'])

        # Half-length of each axis line: the larger of 3 * std and the maximum.
        x_reach = max(first_component.std() * 3, first_component.max())
        y_reach = max(second_component.std() * 3, second_component.max())
        horizontal_axis = pd.DataFrame([[x_reach, 0], [-x_reach, 0]])
        vertical_axis = pd.DataFrame([[0, -y_reach], [0, y_reach]])

        if not is_plot:
            return plot_data

        canvas = Data_plot.plot_scatter(plot_data,
                                        label_col=label_col,
                                        issns=False)
        canvas = Data_plot.plot_line(horizontal_axis, c=['b--'])
        canvas = Data_plot.plot_line(vertical_axis, c=['b--'])
        canvas.xlabel('pca1')
        canvas.ylabel('pca2')
        canvas.title('pca1 vs pca2 scatter')
        canvas.show()
        return plot_data
コード例 #5
0
ファイル: Data_analysis.py プロジェクト: zhaiwc/Analyse_Tool
def search_machine(x, y, mapdict=None, wrongcode=0, method=1):
    '''
    Locate the machines on which defective samples concentrate.

    Only ``method == 1`` (entropy based) is handled here; any other value
    falls through and returns ``None``.

    Parameters:
        x: DataFrame with one column per process step, holding machine ids.
        y: result codes aligned with ``x``.
        mapdict: optional ``{step: {machine: display_name}}`` mapping used to
            rename machines in the result.
        wrongcode: value of ``y`` that marks a defective sample.
        method: analysis method selector.

    Returns:
        DataFrame with columns ``step``, ``mechine`` and ``pct`` (defect
        rate), sorted by ``pct`` descending and restricted to ``pct > 0``.
    '''
    if method != 1:
        return None

    # Rank the steps by entropy, computed on the defective samples only.
    defect_rows = y[y == wrongcode].index
    _, entlist, _ = entropy_analysis(x.loc[defect_rows, :])

    # Take at least five steps, more when many share the minimum entropy.
    lowest_entropy = min(entlist.iloc[:, 0])
    tied_count = len(entlist[entlist == lowest_entropy].dropna())
    chose = list(entlist.index)[:max(tied_count, 5)]

    # Defect rate of every machine within each chosen step.
    res_tot = pd.DataFrame()
    for step in chose:
        temp_data = pd.concat([x.loc[:, [step]], y], axis=1)
        for mechine in set(x.loc[:, step]):
            on_machine = temp_data.loc[:, step] == mechine
            is_defect = temp_data.iloc[:, 1] == wrongcode
            pct = len(temp_data[on_machine & is_defect]) / len(
                temp_data[on_machine])
            row = pd.DataFrame([step, mechine, pct],
                               index=['step', 'mechine', 'pct']).T
            res_tot = pd.concat([res_tot, row])

    res_tot = res_tot.sort_values(['pct'], ascending=False)
    res_tot = res_tot[res_tot.iloc[:, 2] > 0]

    if mapdict is not None:
        # Replace each machine id with its mapped name for the row's step.
        for i in range(len(res_tot)):
            step_name = res_tot.iloc[i, 0]
            machine_id = res_tot.iloc[i, 1]
            res_tot.iloc[i, 1] = mapdict[step_name][machine_id]

    chart = Data_plot.plot_bar_analysis(res_tot.set_index('mechine'),
                                        ['pct'])
    chart.title(' NG rate')
    chart.show()
    return res_tot
コード例 #6
0
ファイル: Data_analysis.py プロジェクト: zhaiwc/Analyse_Tool
    def check_outlier_iforest(self, df, isplot=True):
        '''
        Fit an ``IsolationForest`` on ``df`` and return its predictions.

        Returns a DataFrame indexed like ``df`` with one column ``outlier``
        holding the raw ``predict`` output. When ``isplot`` is true the
        samples are also reduced to two components and drawn as a scatter
        plot coloured by that column.
        '''
        forest = IsolationForest()
        forest.fit(df)
        res = pd.DataFrame(forest.predict(df),
                           index=df.index,
                           columns=['outlier'])
        if not isplot:
            return res

        # Scatter plot of the data distribution in the reduced space.
        reducer = Data_feature_reduction.Feature_Reduction(2)
        reducer.fit(df)
        reduced = reducer.transform(df)
        if reduced.shape[1] > 2:
            reduced = reduced.iloc[:, :2]

        chart = Data_plot.plot_scatter(pd.concat([res, reduced], axis=1),
                                       label_col='outlier')
        chart.show()
        return res
コード例 #7
0
    def get_vip(self,stack_method = 'avg',isplot = True):
        '''
        Fuse the variable-importance scores of all trained models.

        For every name in ``self.listModelName`` the importances of its
        sub-models (those whose ``get_vip`` returns something other than
        ``None``) are averaged; the per-model averages are then combined.

        Parameters:
            stack_method: ``'avg'`` takes the plain mean over models;
                ``'weight'`` takes a weighted sum where model ``i`` gets the
                weight ``self.mse_list[i] / sum(self.mse_list)``.
            isplot: when true, draw the result as a bar chart.

        Returns:
            DataFrame indexed by factor with a single column
            ``'variable importance'``, sorted ascending.

        Raises:
            ValueError: if ``stack_method`` is neither ``'avg'`` nor
                ``'weight'``.
        '''
        if stack_method not in ('avg', 'weight'):
            raise ValueError(
                "stack_method must be 'avg' or 'weight', got {!r}".format(
                    stack_method))

        per_model = []
        idx = []
        for i, model_name in enumerate(self.listModelName):
            sub_model_res = []
            for sub_model in self.train_model[model_name]:
                vip = sub_model.get_vip(isplot = False)
                if vip is not None:
                    sub_model_res.append(vip)

            # Fuse the sub-models of this model by averaging.
            if sub_model_res:
                idx.append(i)
                per_model.append(
                    pd.concat(sub_model_res, axis = 1).mean(axis = 1))

        # Fuse the different models; one column per contributing model.
        stacked = pd.concat(per_model, axis = 1)
        if stack_method == 'avg':
            fused = stacked.mean(axis = 1)
        else:
            # One weight per contributing model (column of ``stacked``).
            # Note the weights are normalised by the sum over ALL models, so
            # they do not add up to 1 when some model was skipped.
            mse = np.asarray(self.mse_list, dtype = float)
            weight = mse[idx] / mse.sum()
            fused = pd.Series(np.dot(stacked.values, weight),
                              index = stacked.index)

        res = pd.DataFrame(fused.values,index = fused.index,columns = ['variable importance']).sort_values('variable importance')

        # Bar chart of the fused importances.
        if isplot:
            plt = Data_plot.plot_bar_analysis(res)
            plt.title('variable importance')
            plt.show()

        return res
コード例 #8
0
 def get_vip(self,stack_method = 'weight',isplot = True):
     '''
     Combine the variable importances of the models in ``self.train_model``.

     Models whose ``get_vip`` returns ``None`` are skipped.

     ``'avg'`` returns the row-wise mean as an ascending-sorted Series.
     ``'weight'`` multiplies the importances by the matching entries of
     ``self.stack.coef_`` and returns an ascending-sorted DataFrame with a
     single ``'variable importance'`` column.
     '''
     collected, idx = [], []
     for position, key in enumerate(self.train_model):
         importance = self.train_model[key].get_vip(isplot = False)
         if importance is None:
             continue
         collected.append(importance)
         idx.append(position)

     # Fuse the results of the different models; one column per model.
     temp = pd.concat(collected, axis = 1)
     res = collected
     if stack_method == 'weight':
         weighted = np.dot(temp.values, self.stack.coef_[idx])
         res = pd.DataFrame(weighted, index = temp.index,
                            columns = ['variable importance'])
         res = res.sort_values('variable importance')
     elif stack_method == 'avg':
         res = temp.mean(axis = 1).sort_values()

     # Bar chart of the fused importances.
     if isplot:
         chart = Data_plot.plot_bar_analysis(res)
         chart.title('variable importance')
         chart.show()

     return res
コード例 #9
0
ファイル: Data_analysis.py プロジェクト: zhaiwc/Analyse_Tool
def _column_entropy(values, is_discrete):
    '''
    Return the entropy of a single column.

    Discrete columns go to ``discrete.entropy`` as a 1-D array; continuous
    ones are reshaped to ``(n, 1)`` and go to ``continuous.entropy`` with
    ``method='gaussian'``.
    '''
    column = np.array(values)
    if is_discrete:
        return discrete.entropy(column)
    return continuous.entropy(column.reshape([len(column), 1]),
                              method='gaussian')


def entropy_analysis(data,
                     columnslist=None,
                     label_col=None,
                     threshold=0.8,
                     mix_method='max',
                     isdrop=False):
    '''
    Compute the entropy of every column, optionally per label group.

    Parameters:
        data: DataFrame to analyse.
        columnslist: columns to analyse; defaults to all columns of
            ``data``. The caller's list is not modified.
        label_col: if given, entropies are computed separately for every
            distinct value of this column and then combined.
        threshold: columns whose absolute (combined) entropy is below this
            value are reported in ``drop_col``.
        mix_method: ``'max'``, ``'min'`` or ``'mean'``; how the per-label
            entropies are combined. Only used when ``label_col`` is given.
        isdrop: when true, ``drop_col`` is removed from the returned data.

    Returns:
        ``(data, df, drop_col)``. Without ``label_col``, ``df`` holds the
        sorted entropies of all columns. With ``label_col``, ``df`` is the
        frame of the last label processed (kept for backward
        compatibility), while ``drop_col`` is based on the combined values.

    Raises:
        ValueError: if ``label_col`` is given and ``mix_method`` is not one
            of ``'max'``, ``'min'``, ``'mean'``.
    '''
    # Work on a copy so the caller's list is never mutated.
    if columnslist is None:
        columnslist = list(data.columns)
    else:
        columnslist = list(columnslist)
    if label_col is not None and label_col in columnslist:
        columnslist.remove(label_col)
    print('开始对数据进行熵值分析......')

    # Whether a column is discrete is decided once, on the full column.
    is_discrete = {
        col_x: __check_discrete(data[col_x], check_num=20)
        for col_x in columnslist
    }

    if label_col is None:
        entropy_list = [
            _column_entropy(data[col_x], is_discrete[col_x])
            for col_x in columnslist
        ]

        df = pd.DataFrame(entropy_list,
                          index=columnslist,
                          columns=['entropy with ' + str(columnslist[-1])])
        df = df.sort_values(df.columns[0])
        plt = Data_plot.plot_bar_analysis(df, df.columns)
        plt.title('entropy')
        plt.show()

        drop_col = list(df.loc[abs(df.iloc[:, 0]) < threshold].index)

    else:
        if mix_method not in ('max', 'min', 'mean'):
            raise ValueError(
                "mix_method must be 'max', 'min' or 'mean', got {!r}".format(
                    mix_method))

        df_total = pd.DataFrame()
        for lab in set(data[label_col]):
            ths_data = data.loc[data[label_col] == lab]
            entropy_list = []
            for col_x in columnslist:
                # Entropy of this label's rows only; a column that cannot
                # be evaluated contributes NaN instead of aborting the run.
                try:
                    ent = _column_entropy(ths_data[col_x],
                                          is_discrete[col_x])
                except Exception:
                    ent = np.nan
                entropy_list.append(ent)

            df = pd.DataFrame(entropy_list,
                              index=columnslist,
                              columns=[
                                  'entropy by ' + str(lab) + ' ' +
                                  str(columnslist[-1])
                              ])
            df_total = pd.concat([df_total, df], axis=1)

        plt = Data_plot.plot_bar_analysis(df_total, df_total.columns)
        plt.show()

        # NOTE: the column is named 'max_label_corr_*' for every mix_method;
        # kept as-is because callers may select the column by name.
        mix_corr_df = pd.DataFrame(
            getattr(df_total, mix_method)(axis=1),
            columns=['max_label_corr_' + str(columnslist[-1])])
        mix_corr_df = mix_corr_df.sort_values(mix_corr_df.columns[0])

        plt = Data_plot.plot_bar_analysis(mix_corr_df, mix_corr_df.columns)
        plt.title(mix_method + ' entropy')
        plt.show()

        drop_col = list(
            mix_corr_df.loc[abs(mix_corr_df.iloc[:, 0]) < threshold].index)

    if isdrop:
        data = data.drop(list(drop_col), axis=1)

    return data, df, drop_col
コード例 #10
0
ファイル: Data_analysis.py プロジェクト: zhaiwc/Analyse_Tool
def _granger_scores(frame, columnslist, k, m):
    '''
    Return the Granger causality of every column in ``columnslist[:-1]``
    towards ``columnslist[-1]``, computed on the rows of ``frame``.
    '''
    target = np.array(frame[columnslist[-1]])
    target = target.reshape([len(target), 1])
    scores = []
    for col_x in columnslist[:-1]:
        source = np.array(frame[col_x])
        source = source.reshape([len(source), 1])
        calculator = CausalCalculator.CausalCalculator(source, target)
        scores.append(calculator.calcGrangerCausality(k, m))
    return scores


def granger_causal_analysis(data,
                            columnslist=None,
                            label_col=None,
                            threshold=0.8,
                            mix_method='max',
                            k=1,
                            m=1,
                            isdrop=False):
    '''
    Granger causality of every factor towards the last column.

    Parameters:
        data: DataFrame to analyse.
        columnslist: columns to use, the last one being the target;
            defaults to all columns of ``data``. The caller's list is not
            modified.
        label_col: if given, the analysis is run separately for every
            distinct value of this column and the results are combined.
            The label column itself is excluded from the analysis.
        threshold: factors whose absolute (combined) score is below this
            value are reported in ``drop_col``.
        mix_method: ``'max'``, ``'min'`` or ``'mean'``; how per-label
            scores are combined. Only used when ``label_col`` is given.
        k, m: forwarded to ``calcGrangerCausality``.
        isdrop: when true, ``drop_col`` is removed from the returned data.

    Returns:
        ``(data, corr_df, drop_col)``. With ``label_col``, ``corr_df`` is
        the frame of the last label processed (kept for backward
        compatibility), while ``drop_col`` is based on the combined scores.

    Raises:
        ValueError: if ``label_col`` is given and ``mix_method`` is not one
            of ``'max'``, ``'min'``, ``'mean'``.
    '''
    # Work on a copy so the caller's list is never mutated.
    if columnslist is None:
        columnslist = list(data.columns)
    else:
        columnslist = list(columnslist)
    # The label only partitions the rows; it is neither a factor nor the
    # target (same convention as the other analysis functions).
    if label_col is not None and label_col in columnslist:
        columnslist.remove(label_col)

    target_name = str(columnslist[-1])

    if label_col is None:
        g_c = _granger_scores(data, columnslist, k, m)

        corr_df = pd.DataFrame(
            g_c,
            index=columnslist[:-1],
            columns=['granger_causal with ' + target_name])
        corr_df = corr_df.sort_values(corr_df.columns[0])

        plt = Data_plot.plot_bar_analysis(corr_df,
                                          corr_df.columns,
                                          threshold=[threshold, -threshold])

        plt.ylim([-1.1, 1.1])
        plt.title('granger_causal with ' + target_name)
        plt.show()
        drop_col = list(corr_df.loc[abs(corr_df.iloc[:, 0]) < threshold].index)

    else:
        if mix_method not in ('max', 'min', 'mean'):
            raise ValueError(
                "mix_method must be 'max', 'min' or 'mean', got {!r}".format(
                    mix_method))

        corr_df_total = pd.DataFrame()
        for lab in set(data[label_col]):
            ths_data = data.loc[data[label_col] == lab]
            g_c = _granger_scores(ths_data[columnslist], columnslist, k, m)

            corr_df = pd.DataFrame(g_c,
                                   index=columnslist[:-1],
                                   columns=[
                                       'granger_causal ' + str(lab) + ' ' +
                                       target_name
                                   ])
            corr_df_total = pd.concat([corr_df_total, corr_df], axis=1)

        plt = Data_plot.plot_bar_analysis(corr_df_total,
                                          corr_df_total.columns,
                                          threshold=[threshold, -threshold])
        plt.show()

        # NOTE: the column is named 'max_label_corr_*' for every mix_method;
        # kept as-is because callers may select the column by name.
        mix_corr_df = pd.DataFrame(
            getattr(corr_df_total, mix_method)(axis=1),
            columns=['max_label_corr_' + target_name])

        plt = Data_plot.plot_bar_analysis(mix_corr_df,
                                          mix_corr_df.columns,
                                          threshold=[threshold, -threshold])
        plt.ylim([-1.1, 1.1])
        plt.title(mix_method + ' granger_causal ' + target_name)
        plt.show()
        drop_col = list(
            mix_corr_df.loc[abs(mix_corr_df.iloc[:, 0]) < threshold].index)

    if isdrop:
        data = data.drop(list(drop_col), axis=1)

    return data, corr_df, drop_col
コード例 #11
0
ファイル: Data_analysis.py プロジェクト: zhaiwc/Analyse_Tool
def _distance_corr(frame, columnslist):
    '''
    Return the distance correlation of every column in ``columnslist[:-1]``
    with ``columnslist[-1]`` on the rows of ``frame``.

    A column for which ``dcor.dcor`` fails contributes NaN.
    '''
    target = np.array(frame[columnslist[-1]])
    scores = []
    for col_x in columnslist[:-1]:
        try:
            scores.append(dcor.dcor(np.array(frame[col_x]), target))
        except Exception:
            scores.append(np.nan)
    return scores


def nonlinear_corr_analysis(data,
                            y=None,
                            columnslist=None,
                            label_col=None,
                            threshold=0.8,
                            method='spearman',
                            mix_method='max',
                            isdrop=True):
    '''
    Non-linear correlation (rank or distance) of every factor with the
    last column.

    Parameters:
        data: DataFrame of factors (and the target when ``y`` is None).
        y: optional target; it is concatenated to ``data`` as the last
            column(s).
        columnslist: columns to use; defaults to all columns. When ``y`` is
            given its column names are appended so the target stays last.
            The caller's list is not modified.
        label_col: if given, correlations are computed per distinct label
            value and then combined with ``mix_method``.
        threshold: factors whose absolute correlation is below this value
            (NaN counts as 0) are reported in ``drop_col``.
        method: ``'spearman'``, ``'kendall'`` or ``'distance'``.
        mix_method: ``'max'``, ``'min'`` or ``'mean'``; only used when
            ``label_col`` is given.
        isdrop: when true, ``drop_col`` (and the ``y`` column, if ``y`` was
            given) are removed from the returned data.

    Returns:
        ``(data, corr_df, drop_col)`` where ``corr_df`` is the sorted
        single-column frame of (combined) correlations.

    Raises:
        ValueError: for an unknown ``method``, or an unknown ``mix_method``
            when ``label_col`` is given.
    '''
    if method not in ('spearman', 'kendall', 'distance'):
        raise ValueError(
            "method must be 'spearman', 'kendall' or 'distance', "
            "got {!r}".format(method))

    y_columns = []
    if y is not None:
        # pd.DataFrame(y) lets y be either a Series or a DataFrame.
        y_columns = list(pd.DataFrame(y).columns)
        data = pd.concat([data, y], axis=1)
    if columnslist is None:
        columnslist = list(data.columns)
    else:
        # Copy the caller's list and append the target name(s) themselves,
        # not the Index object.
        columnslist = list(columnslist) + [
            name for name in y_columns if name not in columnslist
        ]
    if label_col is not None and label_col in columnslist:
        columnslist.remove(label_col)

    target_name = str(columnslist[-1])
    print('-----  非线性相关分析  -----')
    if label_col is None:
        if method == 'distance':
            corr_df = pd.DataFrame(
                _distance_corr(data, columnslist),
                index=columnslist[:-1],
                columns=['distance correlation with ' + target_name])
        else:
            corr_df = data[columnslist].corr(method=method).iloc[:-1, [-1]]
            corr_df = corr_df.rename(
                columns={columnslist[-1]: 'corr_' + target_name})

        corr_df = corr_df.replace(np.nan, 0)
        corr_df = corr_df.sort_values(corr_df.columns[0])

        plt = Data_plot.plot_bar_analysis(corr_df,
                                          corr_df.columns,
                                          threshold=[threshold, -threshold])
        plt.title('correlation with ' + target_name)
        plt.show()

    else:
        if mix_method not in ('max', 'min', 'mean'):
            raise ValueError(
                "mix_method must be 'max', 'min' or 'mean', got {!r}".format(
                    mix_method))

        corr_df_total = pd.DataFrame()
        for lab in set(data[label_col]):
            ths_data = data.loc[data[label_col] == lab]
            ths_data = ths_data[columnslist]
            label_column_name = 'corr_' + str(lab) + '_' + target_name

            if method == 'distance':
                # Computed on this label's rows only.
                corr_df = pd.DataFrame(_distance_corr(ths_data, columnslist),
                                       index=columnslist[:-1],
                                       columns=[label_column_name])
            else:
                corr_df = ths_data.corr(method=method).iloc[:-1, [-1]]
                corr_df = corr_df.rename(
                    columns={columnslist[-1]: label_column_name})

            corr_df_total = pd.concat([corr_df_total, corr_df], axis=1)

        # Combine the per-label correlations into one column.
        mix_corr_df = pd.DataFrame(
            getattr(corr_df_total, mix_method)(axis=1),
            columns=[mix_method + '_label_corr_' + target_name])
        corr_df = mix_corr_df.sort_values(mix_corr_df.columns[0])

        plt = Data_plot.plot_bar_analysis(corr_df,
                                          corr_df.columns,
                                          threshold=[threshold, -threshold])
        plt.title(mix_method + ' correlation with ' + target_name)
        plt.show()

    # NaN correlations are treated as 0, i.e. below any positive threshold.
    abscorr = abs(corr_df.iloc[:, 0]).fillna(0)
    drop_col = list(corr_df.loc[abscorr < threshold].index)

    print('非线性相关分析(阈值:{}结果):原数据:{}列,剔除数据{}列,筛选出:{}列。'.format(
        threshold, corr_df.shape[0], len(drop_col),
        corr_df.shape[0] - len(drop_col)))
    if isdrop:
        data = data.drop(list(drop_col), axis=1)
        if y is not None:
            data = data.drop(pd.DataFrame(y).columns[0], axis=1)
            data = data.reindex(index=y.index)
    return data, corr_df, drop_col
コード例 #12
0
ファイル: Data_analysis.py プロジェクト: zhaiwc/Analyse_Tool
def linear_corr_analysis(data,
                         y=None,
                         columnslist=None,
                         label_col=None,
                         threshold=0.8,
                         mix_method='mean',
                         isdrop=True):
    '''
    Pearson correlation of every factor with the last column.

    By convention the leading columns are the factors x and the last
    column is the target y. With ``label_col`` the correlations are
    computed separately for every label group and then combined.

    Parameters:
        data: DataFrame of factors (and the target when ``y`` is None).
        y: optional target; it is concatenated to ``data`` as the last
            column(s).
        columnslist: columns to use; defaults to all columns. When ``y`` is
            given its column names are appended so the target stays last.
            The caller's list is not modified.
        label_col: optional grouping column.
        threshold: factors whose absolute correlation is below this value
            (NaN counts as 0) are reported in ``drop_col``.
        mix_method: ``'max'``, ``'min'`` or ``'mean'``; only used when
            ``label_col`` is given.
        isdrop: when true, ``drop_col`` (and the ``y`` column, if ``y`` was
            given) are removed from the returned data.

    Returns:
        ``(data, corr_df, drop_col)`` where ``corr_df`` is the sorted
        single-column frame of (combined) correlations.

    Raises:
        ValueError: if ``label_col`` is given and ``mix_method`` is not one
            of ``'max'``, ``'min'``, ``'mean'``.
    '''
    y_columns = []
    if y is not None:
        # pd.DataFrame(y) lets y be either a Series or a DataFrame.
        y_columns = list(pd.DataFrame(y).columns)
        data = pd.concat([data, y], axis=1)
    if columnslist is None:
        columnslist = list(data.columns)
    else:
        # Copy the caller's list and append the target name(s) themselves,
        # not the Index object.
        columnslist = list(columnslist) + [
            name for name in y_columns if name not in columnslist
        ]

    if label_col is not None and label_col in columnslist:
        columnslist.remove(label_col)

    target_name = str(columnslist[-1])
    print('----- 线性相关分析  -----')
    if label_col is None:
        data = data[columnslist]
        # Last column of the correlation matrix, without the target itself.
        corr_df = data.corr(method='pearson').iloc[:-1, [-1]]
        corr_df = corr_df.rename(
            columns={columnslist[-1]: 'corr_' + target_name})
        corr_df = corr_df.sort_values(corr_df.columns[0])
        title = 'correlation with ' + target_name

    else:
        if mix_method not in ('max', 'min', 'mean'):
            raise ValueError(
                "mix_method must be 'max', 'min' or 'mean', got {!r}".format(
                    mix_method))

        corr_df_total = pd.DataFrame()
        for key, group in data.groupby(label_col):
            # Correlation with the target inside this group.
            group_corr = group[columnslist].corr(
                method='pearson').iloc[:-1, [-1]]
            group_corr = group_corr.rename(
                columns={
                    columnslist[-1]: 'corr_' + str(key) + '_' + target_name
                })
            corr_df_total = pd.concat([corr_df_total, group_corr], axis=1)

        # Combine the per-group correlations into one column.
        mix_corr_df = pd.DataFrame(
            getattr(corr_df_total, mix_method)(axis=1),
            columns=[mix_method + '_label_corr_' + target_name])
        corr_df = mix_corr_df.sort_values(mix_corr_df.columns[0])
        title = mix_method + ' correlation with ' + target_name

    plt = Data_plot.plot_bar_analysis(corr_df,
                                      corr_df.columns,
                                      threshold=[threshold, -threshold])
    plt.title(title)
    plt.show()

    # NaN correlations are treated as 0 in both branches, so such factors
    # are dropped for any positive threshold.
    abscorr = abs(corr_df.iloc[:, 0]).fillna(0)
    drop_col = list(corr_df.loc[abscorr < threshold].index)

    print('线性相关分析(阈值:{})结果:原数据:{}列,剔除数据{}列,筛选出:{}列。'.format(
        threshold, corr_df.shape[0], len(drop_col),
        corr_df.shape[0] - len(drop_col)))
    if isdrop:
        data = data.drop(list(drop_col), axis=1)
        if y is not None:
            data = data.drop(pd.DataFrame(y).columns[0], axis=1)
            data = data.reindex(index=y.index)
    return data, corr_df, drop_col
コード例 #13
0
def _score_split(reg_input, x_input, y_true, pred_name, report, is_plot,
                 y_change):
    '''
    Predict one data split, optionally plot it, print and return (mse, r2).

    ``report`` is the format string used for the printed summary; it
    receives the mse and the r2 score.
    '''
    if y_change is None:
        y_pred = reg_input.predict(x_input)
    else:
        # Undo the target transformation on prediction and truth alike so
        # the metrics are computed on the original scale.
        y_pred = y_change.change_back(
            pd.DataFrame(reg_input.predict(x_input),
                         columns=[pred_name],
                         index=y_true.index))
        y_true = y_change.change_back(y_true)

    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    y_pred = pd.DataFrame(y_pred, columns=[pred_name], index=y_true.index)

    if is_plot:
        # Truth (scatter) against prediction (dashed line) in sample order.
        plt = Data_plot.plot_scatter(y_true)
        plt = Data_plot.plot_line(y_pred, c=['r--'])
        plt.show()

        # Truth against prediction, with a diagonal spanning the range of
        # the first (truth) column.
        paired = pd.concat([y_true, y_pred], axis=1)
        plt = Data_plot.plot_scatter(paired, issns=False)
        # .iloc[0] selects the first column positionally; plain [0] on the
        # label-indexed Series relies on a deprecated fallback.
        highest = paired.max().iloc[0]
        lowest = paired.min().iloc[0]
        line_data = np.array([[highest, highest], [lowest, lowest]])
        plt = Data_plot.plot_line(
            pd.DataFrame(line_data, columns=['y_true', 'y_pred']))
        plt.show()

    print(report.format(mse, r2))
    return mse, r2


def reg_score(reg_input,train_x,train_y,valid_x,valid_y,label = None,is_plot = True,
              y_change = None,**kw):
    '''
    Evaluate a regression model on the training and validation sets.

    Both sets are predicted, optionally plotted, and their mse and r2 are
    printed.

    Parameters:
        reg_input: fitted regression model exposing ``predict``.
        train_x, train_y, valid_x, valid_y: training and validation data.
        label: optional column (or columns) dropped from ``train_x`` and
            ``valid_x`` before predicting.
        is_plot: whether to draw the comparison charts.
        y_change: optional object whose ``change_back`` maps targets and
            predictions back to the original scale.
        **kw: accepted for interface compatibility; not used here.

    Returns:
        ``(valid_mse, valid_r2)`` of the validation set.
    '''
    if label is not None:
        train_x_input = train_x.drop(label,axis = 1)
        valid_x_input = valid_x.drop(label,axis = 1)
    else:
        train_x_input = train_x
        valid_x_input = valid_x

    _score_split(reg_input, train_x_input, train_y, 'train_pred_y',
                 '训练集:mse = {} , r2 = {}', is_plot, y_change)
    valid_mse, valid_r2 = _score_split(reg_input, valid_x_input, valid_y,
                                       'valid_pred_y',
                                       '验证集:mse = {} , r2 = {}', is_plot,
                                       y_change)
    return valid_mse,valid_r2