コード例 #1
0
ファイル: Data_analysis.py プロジェクト: zhaiwc/Analyse_Tool
def reduce_dim_cluster(x, n_cluster=2, dim=2):
    '''
    Reduce x to 2 or 3 dimensions, cluster the reduced data and plot it.

    Parameters:
    x: DataFrame of samples; its index is reused for the returned labels.
    n_cluster: number of clusters passed to KMeans (n_clusters).
    dim: number of reduced dimensions to keep; only 2 and 3 are handled.

    Returns:
    A one-column DataFrame ('clu_label') indexed like x, or None when dim
    is neither 2 nor 3.

    Side effects:
    Draws a scatter plot through Data_plot.plot_scatter and shows it.
    '''
    if dim not in (2, 3):
        return None

    # Dimensionality reduction (Feature_Reduction.transform is expected to
    # return a DataFrame, since .iloc is used on its result).
    reducer = Data_feature_reduction.Feature_Reduction(dim)
    reducer.fit(x)
    # Keep `dim` columns: slicing a fixed 0:2 left only two columns, which
    # made the three-name DataFrame construction below fail for dim == 3.
    x_dr = reducer.transform(x).iloc[:, 0:dim]

    # Clustering: fit and predict on the same plain array so the estimator
    # sees one consistent input type in both calls.
    features = x_dr.values
    clu = KMeans(n_clusters=n_cluster).fit(features)
    clu_label = clu.predict(features)

    axis_names = ['x1', 'x2', 'x3'][:dim]
    x_dr = pd.DataFrame(features, columns=axis_names, index=x_dr.index)
    # The labels must share the coordinates' index; with a default
    # RangeIndex, concat(axis=1) aligns on index labels and produces
    # NaN-padded rows whenever x has a non-default index.
    label_frame = pd.DataFrame(clu_label, columns=['label'], index=x_dr.index)
    plot_data = pd.concat([x_dr, label_frame], axis=1)

    canvas = Data_plot.plot_scatter(plot_data, label_col='label')
    canvas.show()

    return pd.DataFrame(clu_label, columns=['clu_label'], index=x.index)
コード例 #2
0
    def plot_loading_scatter(self, data, label_col=None, is_plot=True):
        '''
        Build (and optionally plot) the first two component columns of the
        fitted model.

        Parameters:
        data: DataFrame whose columns label the rows of the result.
        label_col: forwarded to Data_plot.plot_scatter as label_col.
        is_plot: when True, draw the scatter plus two dashed axis lines.

        Returns:
        DataFrame with columns 'comp1' and 'comp2' indexed by data.columns,
        or None when self.method is not one of the supported methods.
        '''
        supported_methods = ('pca', 'fa', 'spca', 'tsvd', 'ipca')
        if self.method not in supported_methods:
            return None

        # components_ is transposed so each row matches one column of data.
        loadings = self.dr_model.components_.T[:, 0:2]
        plot_data = pd.DataFrame(loadings,
                                 columns=['comp1', 'comp2'],
                                 index=data.columns)

        # Half-lengths of the x / y axis lines: the larger of three standard
        # deviations and the maximum value of each component.
        first_comp = loadings[:, 0]
        second_comp = loadings[:, 1]
        half_x = max(first_comp.std() * 3, first_comp.max())
        half_y = max(second_comp.std() * 3, second_comp.max())
        x_mat = pd.DataFrame([[half_x, 0], [-half_x, 0]])
        y_mat = pd.DataFrame([[0, -half_y], [0, half_y]])

        if is_plot:
            Data_plot.plot_scatter(plot_data,
                                   label_col=label_col,
                                   issns=False)
            Data_plot.plot_line(x_mat, c=['b--'])
            # Only the object returned by the last call is decorated and shown.
            canvas = Data_plot.plot_line(y_mat, c=['b--'])
            canvas.xlabel('pca1')
            canvas.ylabel('pca2')
            canvas.title('pca1 vs pca2 loadings')
            canvas.show()

        return plot_data
コード例 #3
0
ファイル: Data_analysis.py プロジェクト: zhaiwc/Analyse_Tool
    def check_outlier_iforest(self, df, isplot=True):
        '''
        Flag outliers in df with an IsolationForest and optionally plot them.

        Parameters:
        df: DataFrame of samples; its index is reused for the result.
        isplot: when True, reduce df to two dimensions and show a scatter
            plot coloured by the 'outlier' column.

        Returns:
        A one-column DataFrame ('outlier') holding the values returned by
        IsolationForest.predict (presumably -1 for outliers and 1 for
        inliers if this is scikit-learn's estimator -- confirm).
        '''
        detector = IsolationForest()
        detector.fit(df)
        flags = detector.predict(df)
        res = pd.DataFrame(flags, columns=['outlier'], index=df.index)

        if not isplot:
            return res

        # Scatter plot of the data distribution on two reduced dimensions.
        reducer = Data_feature_reduction.Feature_Reduction(2)
        reducer.fit(df)
        projected = reducer.transform(df)
        # Only the first two reduced columns are plotted.
        if projected.shape[1] > 2:
            projected = projected.iloc[:, :2]

        scatter_input = pd.concat([res, projected], axis=1)
        canvas = Data_plot.plot_scatter(scatter_input, label_col='outlier')
        canvas.show()

        return res
コード例 #4
0
    def plot_score_scatter(self, data, label_col=None, is_plot=True):
        '''
        Transform data with the fitted model and build (and optionally plot)
        its first two transformed columns.

        Parameters:
        data: DataFrame passed to self.dr_model.transform; its index is
            reused for the result.
        label_col: forwarded to Data_plot.plot_scatter as label_col.
        is_plot: when True, draw the scatter plus two dashed axis lines.

        Returns:
        DataFrame with columns 'pca1' and 'pca2' indexed by data.index.
        '''
        # transform() must return a 2-D array-like supporting [:, i] slicing.
        scores = self.dr_model.transform(data)
        plot_data = pd.DataFrame(scores[:, 0:2],
                                 columns=['pca1', 'pca2'],
                                 index=data.index)

        # Half-lengths of the x / y axis lines: the larger of three standard
        # deviations and the maximum value of each score column.
        first_score = scores[:, 0]
        second_score = scores[:, 1]
        half_x = max(first_score.std() * 3, first_score.max())
        half_y = max(second_score.std() * 3, second_score.max())
        x_mat = pd.DataFrame([[half_x, 0], [-half_x, 0]])
        y_mat = pd.DataFrame([[0, -half_y], [0, half_y]])

        if not is_plot:
            return plot_data

        Data_plot.plot_scatter(plot_data,
                               label_col=label_col,
                               issns=False)
        Data_plot.plot_line(x_mat, c=['b--'])
        # Only the object returned by the last call is decorated and shown.
        canvas = Data_plot.plot_line(y_mat, c=['b--'])
        canvas.xlabel('pca1')
        canvas.ylabel('pca2')
        canvas.title('pca1 vs pca2 scatter')
        canvas.show()
        return plot_data
コード例 #5
0
def _reg_score_split(reg_input, x_input, y_true, pred_col, is_plot, y_change):
    '''
    Score one data split (train or validation) and optionally plot it.

    Parameters:
    reg_input: fitted regression model exposing predict().
    x_input: features of the split, already stripped of any label column.
    y_true: true targets of the split; must expose .index.
    pred_col: column name given to the prediction DataFrame.
    is_plot: when True, draw the two diagnostic charts.
    y_change: None, or an object whose change_back() is applied to both
        the predictions and the true targets before scoring.

    Returns:
    (mse, r2) for the split.
    '''
    raw_pred = reg_input.predict(x_input)
    if y_change is None:
        pred_y = raw_pred
    else:
        pred_y = y_change.change_back(
            pd.DataFrame(raw_pred, columns=[pred_col], index=y_true.index))
        y_true = y_change.change_back(y_true)

    mse = mean_squared_error(y_true, pred_y)
    r2 = r2_score(y_true, pred_y)
    pred_y = pd.DataFrame(pred_y, columns=[pred_col], index=y_true.index)

    if is_plot:
        # Chart 1: true y as scatter and predicted y as a dashed red line,
        # both in the original sample order.
        canvas = Data_plot.plot_scatter(y_true)
        canvas = Data_plot.plot_line(pred_y, c=['r--'])
        canvas.show()

        # Chart 2: true vs. predicted scatter with a reference line running
        # from (min, min) to (max, max) of the first (true-y) column.
        paired = pd.concat([y_true, pred_y], axis=1)
        canvas = Data_plot.plot_scatter(paired, issns=False)
        # .iloc[0] selects the first column's extreme by position; plain
        # [0] on a label-indexed Series is deprecated in pandas.
        upper = paired.max().iloc[0]
        lower = paired.min().iloc[0]
        line_data = np.array([[upper, upper], [lower, lower]])
        canvas = Data_plot.plot_line(
            pd.DataFrame(line_data, columns=['y_true', 'y_pred']))
        canvas.show()

    return mse, r2


def reg_score(reg_input,train_x,train_y,valid_x,valid_y,label = None,is_plot = True,
              y_change = None,**kw):
    '''
    Evaluate a regression model on a training and a validation split.

    For each split the model predicts y, mse and r2 are computed and
    printed, and (optionally) two charts are drawn: true vs. predicted in
    sample order, and a true/predicted scatter with a reference line.

    Parameters:
    reg_input: fitted regression model exposing predict().
    train_x, train_y, valid_x, valid_y: training / validation x and y.
    label: optional column label(s) dropped from train_x and valid_x
        before predicting.
    is_plot: whether to draw the charts.
    y_change: None, or an object whose change_back() is applied to the
        predictions and true targets before scoring.
    **kw: accepted for call compatibility; not used.

    Returns:
    (valid_mse, valid_r2) -- the training metrics are only printed.
    '''
    if label is not None:
        train_x_input = train_x.drop(label, axis=1)
        valid_x_input = valid_x.drop(label, axis=1)
    else:
        train_x_input = train_x
        valid_x_input = valid_x

    train_mse, train_r2 = _reg_score_split(
        reg_input, train_x_input, train_y, 'train_pred_y', is_plot, y_change)
    print('训练集:mse = {} , r2 = {}'.format(train_mse,train_r2))

    valid_mse, valid_r2 = _reg_score_split(
        reg_input, valid_x_input, valid_y, 'valid_pred_y', is_plot, y_change)
    print('验证集:mse = {} , r2 = {}'.format(valid_mse,valid_r2))
    return valid_mse,valid_r2