def g2g_anaysis(self, data, label_col, columnslist=None, method='mean'):
    '''
    Compare the first two groups found in ``label_col`` against each other.

    The two groups are the first two distinct values of ``data[label_col]``
    in order of appearance; any further groups are ignored.

    Parameters:
        data: DataFrame holding the label column and the feature columns.
        label_col: name of the column whose values define the groups.
        columnslist: accepted for interface compatibility; not used here.
        method: forwarded unchanged to ``self.g2g_diff``.

    Returns:
        Whatever ``self.g2g_diff(group1, group2, method=method)`` returns.

    Raises:
        ValueError: if ``label_col`` holds fewer than two distinct values.
    '''
    groupindex = data[label_col].drop_duplicates()
    if len(groupindex) < 2:
        raise ValueError(
            'label column {!r} must contain at least two distinct values'
            .format(label_col))

    # drop_duplicates() keeps the original row labels, so the distinct values
    # have to be read by position; ``groupindex[1]`` would look up the row
    # *labelled* 1 instead of the second distinct value.
    first_label = groupindex.iloc[0]
    second_label = groupindex.iloc[1]
    print(first_label, 'vs', second_label, 'method:', method)

    group1 = data[data[label_col] == first_label].drop(label_col, axis=1)
    group2 = data[data[label_col] == second_label].drop(label_col, axis=1)
    res = self.g2g_diff(group1, group2, method=method)

    # Plot the columns named by the last five entries of the result index.
    col = res.index[-5:]
    Data_plot.plot_describe(data, label_col=label_col, columnslist=col)
    return res
def plot_cum_std(self, data, n=30):
    '''
    Plot the cumulative explained-variance ratio of a PCA fitted on ``data``.

    Parameters:
        data: samples passed to ``PCA.fit``.
        n: number of principal components to fit (``n_components``).

    Returns:
        None. The curve is drawn through ``Data_plot.plot_line`` and shown.
    '''
    reducer = PCA(n_components=n)
    reducer.fit(data)

    # Running total of the per-component explained-variance ratios.
    cumulative_ratio = reducer.explained_variance_ratio_.cumsum()
    curve = pd.DataFrame(cumulative_ratio, columns=['cum_var'])

    # NOTE(review): plot_line presumably returns matplotlib.pyplot - confirm.
    canvas = Data_plot.plot_line(curve)
    canvas.title('cumsum explained_variance_ratio')
    canvas.show()
def spc_analysis(data, p1=None, p2=None, method='3sigma'):
    '''
    SPC (statistical process control) analysis.

    For every column reported by ``data.describe()``, count the samples that
    fall outside the control limits, then plot the column with the highest
    count through ``Data_plot.plot_spc``.

    Control limits:
        '3sigma': mean +/- 3 * standard deviation.
        'tukey':  [Q1 - 3 * IQR, Q3 + 3 * IQR], with the quartiles taken
                  from ``np.percentile``.

    Parameters:
        data: DataFrame of samples.
        p1, p2: accepted for interface compatibility; not used here.
        method: '3sigma' or 'tukey'.

    Returns:
        DataFrame indexed by column name with a single column 'cnt',
        sorted ascending by 'cnt'.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    '''
    if method not in ('3sigma', 'tukey'):
        raise ValueError(
            "method must be '3sigma' or 'tukey', got {!r}".format(method))

    # Keep only the columns that describe() reports on.
    num_col = data.describe().columns
    data = data.loc[:, num_col]

    res_dict = {}
    for col in data.columns:
        temp = data.loc[:, col]
        if method == '3sigma':
            mean_data = temp.mean()
            std_data = temp.std()
            upper = mean_data + 3 * std_data
            lower = mean_data - 3 * std_data
        else:
            perc25 = np.percentile(temp, 25)
            perc75 = np.percentile(temp, 75)
            upper = perc75 + 3 * (perc75 - perc25)
            lower = perc25 - 3 * (perc75 - perc25)
        # Number of samples beyond either control limit.
        res_dict[col] = int(((temp > upper) | (temp < lower)).sum())

    res = pd.DataFrame(res_dict, index=['cnt']).T.sort_values('cnt')

    # After the ascending sort the last row is the column with most violations.
    canvas = Data_plot.plot_spc(data.loc[:, [res.index[-1]]], method=method)
    canvas.show()
    return res
def plot_score_scatter(self, data, label_col=None, is_plot=True):
    '''
    Project ``data`` with ``self.dr_model`` and return its first two scores.

    Parameters:
        data: DataFrame handed to ``self.dr_model.transform``.
        label_col: forwarded to ``Data_plot.plot_scatter`` when plotting.
        is_plot: when true, draw the score scatter with dashed axis lines.

    Returns:
        DataFrame indexed like ``data`` with columns 'pca1' and 'pca2'.
    '''
    scores = self.dr_model.transform(data)
    plot_data = pd.DataFrame(scores[:, 0:2],
                             index=data.index,
                             columns=['pca1', 'pca2'])
    if not is_plot:
        return plot_data

    # Axis half-lengths: the larger of 3 standard deviations and the maximum.
    first, second = scores[:, 0], scores[:, 1]
    max_x = max(first.std() * 3, first.max())
    max_y = max(second.std() * 3, second.max())
    x_mat = pd.DataFrame([[max_x, 0], [-max_x, 0]])
    y_mat = pd.DataFrame([[0, -max_y], [0, max_y]])

    canvas = Data_plot.plot_scatter(plot_data, label_col=label_col,
                                    issns=False)
    canvas = Data_plot.plot_line(x_mat, c=['b--'])
    canvas = Data_plot.plot_line(y_mat, c=['b--'])
    canvas.xlabel('pca1')
    canvas.ylabel('pca2')
    canvas.title('pca1 vs pca2 scatter')
    canvas.show()
    return plot_data
def search_machine(x, y, mapdict=None, wrongcode=0, method=1):
    '''
    Locate machines on which defective samples concentrate.

    method 1 (entropy method): compute the entropy of every column of ``x``
    over the defective samples (``y == wrongcode``) via
    ``entropy_analysis``, take the leading entries of the entropy table it
    returns (at least five, or all those tied at the minimum entropy if
    there are more), and compute the defect rate of every machine in those
    steps.

    method 2 (key-factor method) is documented but not implemented.

    Parameters:
        x: DataFrame, one column per process step, values are machine ids.
        y: per-sample result codes aligned with ``x`` by index.
           NOTE(review): presumably a Series - confirm against callers.
        mapdict: optional ``{step: {machine_id: display_name}}`` used to
            replace machine ids in the result.
        wrongcode: value of ``y`` that marks a defective sample.
        method: 1 (entropy method); 2 is reserved.

    Returns:
        DataFrame with columns 'step', 'mechine', 'pct', sorted by 'pct'
        descending and limited to machines with a defect rate above zero.

    Raises:
        NotImplementedError: for ``method == 2``.
        ValueError: for any other unsupported ``method``.
    '''
    if method == 2:
        raise NotImplementedError(
            'method 2 (key-factor method) is not implemented')
    if method != 1:
        raise ValueError('method must be 1 or 2, got {!r}'.format(method))

    # Entropy of every step, computed on the defective samples only.
    idx = y[y == wrongcode].index
    ent_x = x.loc[idx, :]
    ent_x, entlist, col = entropy_analysis(ent_x)

    # Take every step tied at the minimum entropy, but never fewer than five.
    min_entropy = entlist.iloc[:, 0].min()
    len_x = int((entlist.iloc[:, 0] == min_entropy).sum())
    chose = list(entlist.index)[:max(len_x, 5)]

    # Defect rate of each machine within each chosen step.
    rows = []
    for step in chose:
        temp_data = pd.concat([x.loc[:, [step]], y], axis=1)
        is_ng = temp_data.iloc[:, 1] == wrongcode
        # Missing machine ids never compare equal to anything, so they are
        # left out instead of producing a zero denominator.
        for mechine in pd.unique(x.loc[:, step].dropna()):
            on_machine = temp_data.loc[:, step] == mechine
            pct = (on_machine & is_ng).sum() / on_machine.sum()
            rows.append((step, mechine, pct))

    res_tot = pd.DataFrame(rows, columns=['step', 'mechine', 'pct'])
    res_tot = res_tot.sort_values(['pct'], ascending=False)
    res_tot = res_tot[res_tot['pct'] > 0].copy()

    if mapdict is not None:
        # Replace the whole column at once so its dtype can change freely.
        res_tot['mechine'] = [
            mapdict[step][mechine]
            for step, mechine in zip(res_tot['step'], res_tot['mechine'])
        ]

    canvas = Data_plot.plot_bar_analysis(res_tot.set_index('mechine'), ['pct'])
    canvas.title(' NG rate')
    canvas.show()
    return res_tot
def check_outlier_iforest(self, df, isplot=True):
    '''
    Flag outliers in ``df`` with an ``IsolationForest`` fitted on ``df``.

    Parameters:
        df: DataFrame of samples; it is used both to fit and to predict.
        isplot: when true, scatter-plot the samples in a two-component
            projection, coloured by the outlier flag.

    Returns:
        DataFrame indexed like ``df`` with one column 'outlier' holding the
        values returned by ``IsolationForest.predict``.
    '''
    detector = IsolationForest()
    detector.fit(df)
    res = pd.DataFrame(detector.predict(df),
                       index=df.index,
                       columns=['outlier'])
    if not isplot:
        return res

    # Reduce to two components so the samples can be drawn as a scatter.
    reducer = Data_feature_reduction.Feature_Reduction(2)
    reducer.fit(df)
    projected = reducer.transform(df)
    if projected.shape[1] > 2:
        projected = projected.iloc[:, :2]

    canvas = Data_plot.plot_scatter(pd.concat([res, projected], axis=1),
                                    label_col='outlier')
    canvas.show()
    return res
def get_vip(self, stack_method='avg', isplot=True):
    '''
    Fuse the variable importance reported by all trained models.

    For each model name in ``self.listModelName`` the importances of its
    sub-models (``self.train_model[name]``) are averaged; models whose
    sub-models all return ``None`` are skipped. The per-model results are
    then fused:

        'avg':    plain mean across models.
        'weight': weighted sum, where the weight of a model is its entry
                  in ``self.mse_list`` divided by the sum of the whole
                  ``self.mse_list``.

    Parameters:
        stack_method: 'avg' or 'weight'.
        isplot: when true, draw the result with
            ``Data_plot.plot_bar_analysis``.

    Returns:
        DataFrame indexed by factor name with one column
        'variable importance', sorted ascending.

    Raises:
        ValueError: if ``stack_method`` is unknown or no model provides a
            variable importance.
    '''
    if stack_method not in ('avg', 'weight'):
        raise ValueError(
            "stack_method must be 'avg' or 'weight', got {!r}"
            .format(stack_method))

    per_model = []
    idx = []  # positions (in listModelName) of the models that contributed
    for i, model_name in enumerate(self.listModelName):
        sub_model_res = []
        for sub_model in self.train_model[model_name]:
            vip = sub_model.get_vip(isplot=False)
            if vip is not None:
                sub_model_res.append(vip)
        # Fuse the sub-models of one model by averaging.
        if sub_model_res:
            idx.append(i)
            per_model.append(pd.concat(sub_model_res, axis=1).mean(axis=1))

    if not per_model:
        raise ValueError('no model provides a variable importance')

    # Fuse across models; rows are factors, columns are models.
    merged = pd.concat(per_model, axis=1)
    if stack_method == 'avg':
        scores = merged.mean(axis=1).values
    else:
        mse = np.asarray(self.mse_list, dtype=float)
        # One weight per contributing model, as a column vector.
        weight = mse[idx].reshape(-1, 1) / mse.sum()
        scores = np.dot(merged.values, weight).ravel()

    res = pd.DataFrame(scores,
                       index=merged.index,
                       columns=['variable importance'])
    res = res.sort_values('variable importance')

    if isplot:
        canvas = Data_plot.plot_bar_analysis(res)
        canvas.title('variable importance')
        canvas.show()
    return res
def get_vip(self, stack_method='weight', isplot=True):
    '''
    Fuse the variable importance of the base models of a stacked model.

    Every model in ``self.train_model`` is asked for its importance via
    ``get_vip(isplot=False)``; models returning ``None`` are skipped. The
    results are fused:

        'avg':    plain mean across base models.
        'weight': weighted sum using the entries of ``self.stack.coef_``
                  that belong to the contributing base models.

    Parameters:
        stack_method: 'avg' or 'weight'.
        isplot: when true, draw the result with
            ``Data_plot.plot_bar_analysis``.

    Returns:
        DataFrame indexed by factor name with one column
        'variable importance', sorted ascending (same shape for both
        fusion methods).

    Raises:
        ValueError: if ``stack_method`` is unknown, or (from ``pd.concat``)
            if no base model provides a variable importance.
    '''
    if stack_method not in ('avg', 'weight'):
        raise ValueError(
            "stack_method must be 'avg' or 'weight', got {!r}"
            .format(stack_method))

    collected = []
    idx = []  # positions of the base models that contributed
    for i, key in enumerate(self.train_model):
        vip = self.train_model[key].get_vip(isplot=False)
        if vip is not None:
            collected.append(vip)
            idx.append(i)

    # Rows are factors, columns are base models.
    temp = pd.concat(collected, axis=1)
    if stack_method == 'avg':
        scores = temp.mean(axis=1).values
    else:
        # NOTE(review): assumes coef_ holds one coefficient per base model
        # in train_model order - confirm against the stacking code.
        scores = np.dot(temp.values, np.asarray(self.stack.coef_)[idx])

    res = pd.DataFrame(scores,
                       index=temp.index,
                       columns=['variable importance'])
    res = res.sort_values('variable importance')

    if isplot:
        canvas = Data_plot.plot_bar_analysis(res)
        canvas.title('variable importance')
        canvas.show()
    return res
def entropy_analysis(data,
                     columnslist=None,
                     label_col=None,
                     threshold=0.8,
                     mix_method='max',
                     isdrop=False):
    '''
    Compute the entropy of each column, optionally per label group.

    A column is treated as discrete when
    ``__check_discrete(data[col], check_num=20)`` is truthy and is then
    scored with ``discrete.entropy``; otherwise it is scored with
    ``continuous.entropy(..., method='gaussian')``.

    Without ``label_col`` the entropy is computed on the whole data. With
    ``label_col`` it is computed separately on the rows of every label value
    (a failing computation yields NaN) and the per-label values are combined
    with ``mix_method``.

    Parameters:
        data: DataFrame to analyse.
        columnslist: columns to analyse; defaults to all columns. The
            caller's list is not modified.
        label_col: optional grouping column; it is excluded from the
            analysed columns.
        threshold: columns whose absolute entropy is below this value are
            reported in ``drop_col``.
        mix_method: 'max', 'min' or 'mean'; only used with ``label_col``.
        isdrop: when true, the reported columns are dropped from ``data``.

    Returns:
        (data, df, drop_col): the (possibly reduced) data, the sorted
        entropy table (the combined table when ``label_col`` is given) and
        the list of columns below ``threshold``.

    Raises:
        ValueError: if ``label_col`` is given and ``mix_method`` is unknown.
    '''
    if label_col is not None and mix_method not in ('max', 'min', 'mean'):
        raise ValueError(
            "mix_method must be 'max', 'min' or 'mean', got {!r}"
            .format(mix_method))

    # Work on a copy so the caller's list is left untouched.
    columnslist = (list(data.columns)
                   if columnslist is None else list(columnslist))
    if label_col is not None and label_col in columnslist:
        columnslist.remove(label_col)
    print('开始对数据进行熵值分析......')

    # Discreteness is a property of the whole column, so judge it once.
    is_discrete = {}
    for col_x in columnslist:
        is_discrete[col_x] = __check_discrete(data[col_x], check_num=20)

    def _entropy(frame, col_x):
        # Entropy of one column of ``frame`` using the matching estimator.
        values = np.array(frame[col_x])
        if is_discrete[col_x]:
            return discrete.entropy(values)
        values = values.reshape([len(values), 1])
        return continuous.entropy(values, method='gaussian')

    if label_col is None:
        entropy_list = [_entropy(data, col_x) for col_x in columnslist]
        df = pd.DataFrame(entropy_list,
                          index=columnslist,
                          columns=['entropy with ' + str(columnslist[-1])])
        df = df.sort_values(df.columns[0])
        canvas = Data_plot.plot_bar_analysis(df, df.columns)
        canvas.title('entropy')
        canvas.show()
    else:
        per_label = []
        for lab in pd.unique(data[label_col].dropna()):
            # Entropy is computed on the rows of this label only.
            ths_data = data.loc[data[label_col] == lab, columnslist]
            entropy_list = []
            for col_x in columnslist:
                try:
                    ent = _entropy(ths_data, col_x)
                except Exception:
                    # Best effort: a failing column is recorded as missing.
                    ent = np.nan
                entropy_list.append(ent)
            per_label.append(
                pd.DataFrame(entropy_list,
                             index=columnslist,
                             columns=[
                                 'entropy by ' + str(lab) + ' ' +
                                 str(columnslist[-1])
                             ]))
        df_total = (pd.concat(per_label, axis=1)
                    if per_label else pd.DataFrame())
        canvas = Data_plot.plot_bar_analysis(df_total, df_total.columns)
        canvas.show()

        # Combine the per-label entropies into one value per column.
        combined = getattr(df_total, mix_method)(axis=1)
        df = combined.to_frame(
            mix_method + '_label_corr_' + str(columnslist[-1]))
        df = df.sort_values(df.columns[0])
        canvas = Data_plot.plot_bar_analysis(df, df.columns)
        canvas.title(mix_method + ' entropy')
        canvas.show()

    drop_col = list(df.loc[abs(df.iloc[:, 0]) < threshold].index)
    if isdrop:
        data = data.drop(list(drop_col), axis=1)
    return data, df, drop_col
def granger_causal_analysis(data,
                            columnslist=None,
                            label_col=None,
                            threshold=0.8,
                            mix_method='max',
                            k=1,
                            m=1,
                            isdrop=False):
    '''
    Granger causality of every feature column towards the target column.

    The last entry of ``columnslist`` is the target; every other entry is a
    feature. Each score comes from
    ``CausalCalculator.CausalCalculator(x, y).calcGrangerCausality(k, m)``.

    Without ``label_col`` the scores are computed on the whole data. With
    ``label_col`` they are computed per label value and combined with
    ``mix_method``.

    Parameters:
        data: DataFrame to analyse.
        columnslist: columns to use, target last; defaults to all columns.
            The caller's list is not modified.
        label_col: optional grouping column; it is excluded from the
            analysed columns.
        threshold: features whose absolute score is below this value are
            reported in ``drop_col``; also drawn as reference lines.
        mix_method: 'max', 'min' or 'mean'; only used with ``label_col``.
        k, m: forwarded to ``calcGrangerCausality``.
        isdrop: when true, the reported features are dropped from ``data``.

    Returns:
        (data, corr_df, drop_col): the (possibly reduced) data, the score
        table (the combined table when ``label_col`` is given) and the list
        of features below ``threshold``.

    Raises:
        ValueError: if ``label_col`` is given and ``mix_method`` is unknown.
    '''
    if label_col is not None and mix_method not in ('max', 'min', 'mean'):
        raise ValueError(
            "mix_method must be 'max', 'min' or 'mean', got {!r}"
            .format(mix_method))

    # Work on a copy so the caller's list is left untouched.
    columnslist = (list(data.columns)
                   if columnslist is None else list(columnslist))
    # The grouping column is neither a feature nor the target.
    if label_col is not None and label_col in columnslist:
        columnslist.remove(label_col)
    target = columnslist[-1]
    features = columnslist[:-1]

    def _granger_scores(frame):
        # Score every feature of ``frame`` against the target column.
        ths_data_y = np.array(frame[target])
        ths_data_y = ths_data_y.reshape([len(ths_data_y), 1])
        scores = []
        for col_x in features:
            ths_data_x = np.array(frame[col_x])
            ths_data_x = ths_data_x.reshape([len(ths_data_x), 1])
            c = CausalCalculator.CausalCalculator(ths_data_x, ths_data_y)
            scores.append(c.calcGrangerCausality(k, m))
        return scores

    if label_col is None:
        corr_df = pd.DataFrame(
            _granger_scores(data),
            index=features,
            columns=['granger_causal with ' + str(target)])
        corr_df = corr_df.sort_values(corr_df.columns[0])
        canvas = Data_plot.plot_bar_analysis(corr_df,
                                             corr_df.columns,
                                             threshold=[threshold, -threshold])
        canvas.ylim([-1.1, 1.1])
        canvas.title('granger_causal with ' + str(target))
        canvas.show()
    else:
        per_label = []
        for lab in pd.unique(data[label_col].dropna()):
            ths_data = data.loc[data[label_col] == lab, columnslist]
            per_label.append(
                pd.DataFrame(_granger_scores(ths_data),
                             index=features,
                             columns=[
                                 'granger_causal ' + str(lab) + ' ' +
                                 str(target)
                             ]))
        corr_df_total = (pd.concat(per_label, axis=1)
                         if per_label else pd.DataFrame())
        canvas = Data_plot.plot_bar_analysis(corr_df_total,
                                             corr_df_total.columns,
                                             threshold=[threshold, -threshold])
        canvas.show()

        # Combine the per-label scores into one value per feature.
        combined = getattr(corr_df_total, mix_method)(axis=1)
        corr_df = combined.to_frame(
            mix_method + '_label_corr_' + str(target))
        canvas = Data_plot.plot_bar_analysis(corr_df,
                                             corr_df.columns,
                                             threshold=[threshold, -threshold])
        canvas.ylim([-1.1, 1.1])
        canvas.title(mix_method + ' granger_causal ' + str(target))
        canvas.show()

    drop_col = list(corr_df.loc[abs(corr_df.iloc[:, 0]) < threshold].index)
    if isdrop:
        data = data.drop(list(drop_col), axis=1)
    return data, corr_df, drop_col
def nonlinear_corr_analysis(data,
                            y=None,
                            columnslist=None,
                            label_col=None,
                            threshold=0.8,
                            method='spearman',
                            mix_method='max',
                            isdrop=True):
    '''
    Non-linear correlation of every feature with the target column.

    Supported measures: rank correlation ('spearman', 'kendall', through
    ``DataFrame.corr``) and distance correlation ('distance', through
    ``dcor.dcor``; a failing computation is recorded as NaN). The last entry
    of the column list is the target.

    Without ``label_col`` the correlation is computed on the whole data.
    With ``label_col`` it is computed per label value and combined with
    ``mix_method``.

    Parameters:
        data: DataFrame of features (and the target, when ``y`` is None).
        y: optional target (Series or DataFrame). It is appended to
            ``data`` for the analysis and removed again before returning.
        columnslist: columns to use; defaults to all columns. When ``y`` is
            given its column(s) are added at the end. The caller's list is
            not modified.
        label_col: optional grouping column; excluded from the analysis.
        threshold: features whose absolute correlation (NaN counted as 0)
            is below this value are reported in ``drop_col``.
        method: 'spearman', 'kendall' or 'distance'.
        mix_method: 'max', 'min' or 'mean'; only used with ``label_col``.
        isdrop: when true, the reported features are dropped from ``data``.

    Returns:
        (data, corr_df, drop_col): the (possibly reduced) data, the sorted
        correlation table and the list of features below ``threshold``.

    Raises:
        ValueError: if ``method`` is unknown, or if ``label_col`` is given
            and ``mix_method`` is unknown.
    '''
    if method not in ('spearman', 'kendall', 'distance'):
        raise ValueError(
            "method must be 'spearman', 'kendall' or 'distance', got {!r}"
            .format(method))
    if label_col is not None and mix_method not in ('max', 'min', 'mean'):
        raise ValueError(
            "mix_method must be 'max', 'min' or 'mean', got {!r}"
            .format(mix_method))

    y_cols = []
    if y is not None:
        data = pd.concat([data, y], axis=1)
        y_cols = list(pd.DataFrame(y).columns)

    if columnslist is None:
        columnslist = list(data.columns)
    else:
        # Copy, then put the target column name(s) at the end of the list.
        columnslist = list(columnslist)
        columnslist.extend(c for c in y_cols if c not in columnslist)
    if label_col is not None and label_col in columnslist:
        columnslist.remove(label_col)
    target = columnslist[-1]
    features = columnslist[:-1]

    def _correlate(frame, column_name):
        # One-column table of feature-vs-target correlations for ``frame``.
        if method == 'distance':
            ths_data_y = np.array(frame[target])
            d_corr = []
            for col_x in features:
                try:
                    d_corr.append(
                        dcor.dcor(np.array(frame[col_x]), ths_data_y))
                except Exception:
                    # Best effort: a failing column is recorded as missing.
                    d_corr.append(np.nan)
            return pd.DataFrame(d_corr, index=features, columns=[column_name])
        table = frame.corr(method=method).iloc[:-1, [-1]]
        return table.rename(columns={target: column_name})

    print('----- 非线性相关分析 -----')
    if label_col is None:
        ths_data = data[columnslist]
        if method == 'distance':
            corr_df = _correlate(
                ths_data, 'distance correlation with ' + str(target))
            corr_df = corr_df.replace(np.nan, 0)
        else:
            corr_df = _correlate(ths_data, 'corr_' + str(target))
        corr_df = corr_df.sort_values(corr_df.columns[0])
        canvas = Data_plot.plot_bar_analysis(corr_df,
                                             corr_df.columns,
                                             threshold=[threshold, -threshold])
        canvas.title('correlation with ' + str(target))
        canvas.show()
    else:
        per_label = []
        for lab in pd.unique(data[label_col].dropna()):
            # Correlations are computed on the rows of this label only.
            ths_data = data.loc[data[label_col] == lab, columnslist]
            per_label.append(
                _correlate(ths_data,
                           'corr_' + str(lab) + '_' + str(target)))
        corr_df_total = (pd.concat(per_label, axis=1)
                         if per_label else pd.DataFrame())

        # Combine the per-label correlations into one value per feature.
        combined = getattr(corr_df_total, mix_method)(axis=1)
        corr_df = combined.to_frame(
            mix_method + '_label_corr_' + str(target))
        corr_df = corr_df.sort_values(corr_df.columns[0])
        canvas = Data_plot.plot_bar_analysis(corr_df,
                                             corr_df.columns,
                                             threshold=[threshold, -threshold])
        canvas.title(mix_method + ' correlation with ' + str(target))
        canvas.show()

    # Missing correlations count as 0, i.e. below any positive threshold.
    abscorr = abs(corr_df.iloc[:, 0]).fillna(0)
    drop_col = list(corr_df.loc[abscorr < threshold].index)
    print('非线性相关分析(阈值:{}结果):原数据:{}列,剔除数据{}列,筛选出:{}列。'.format(
        threshold, corr_df.shape[0], len(drop_col),
        corr_df.shape[0] - len(drop_col)))

    if isdrop:
        data = data.drop(list(drop_col), axis=1)
    if y is not None:
        # Undo the concat done at the top and restore y's row order.
        data = data.drop(y_cols[0], axis=1)
        data = data.reindex(index=y.index)
    return data, corr_df, drop_col
def linear_corr_analysis(data,
                         y=None,
                         columnslist=None,
                         label_col=None,
                         threshold=0.8,
                         mix_method='mean',
                         isdrop=True):
    '''
    Pearson correlation of every feature with the target column.

    The last entry of the column list is the target; every other entry is a
    feature. Without ``label_col`` the correlation is computed on the whole
    data. With ``label_col`` it is computed for every group of
    ``data.groupby(label_col)`` and combined with ``mix_method``.

    Parameters:
        data: DataFrame of features (and the target, when ``y`` is None).
        y: optional target (Series or DataFrame). It is appended to
            ``data`` for the analysis and removed again before returning.
        columnslist: columns to use; defaults to all columns. When ``y`` is
            given its column(s) are added at the end. The caller's list is
            not modified.
        label_col: optional grouping column; excluded from the analysis.
        threshold: features whose absolute correlation is below this value
            are reported in ``drop_col``.
        mix_method: 'max', 'min' or 'mean'; only used with ``label_col``.
        isdrop: when true, the reported features are dropped from ``data``.

    Returns:
        (data, corr_df, drop_col): the (possibly reduced) data, the sorted
        correlation table and the list of features below ``threshold``.
        Without ``label_col`` the returned data is restricted to the
        analysed columns.

    Raises:
        ValueError: if ``label_col`` is given and ``mix_method`` is unknown.
    '''
    if label_col is not None and mix_method not in ('max', 'min', 'mean'):
        raise ValueError(
            "mix_method must be 'max', 'min' or 'mean', got {!r}"
            .format(mix_method))

    y_cols = []
    if y is not None:
        data = pd.concat([data, y], axis=1)
        y_cols = list(pd.DataFrame(y).columns)

    if columnslist is None:
        columnslist = list(data.columns)
    else:
        # Copy, then put the target column name(s) at the end of the list.
        columnslist = list(columnslist)
        columnslist.extend(c for c in y_cols if c not in columnslist)
    if label_col is not None and label_col in columnslist:
        columnslist.remove(label_col)
    target = columnslist[-1]

    print('----- 线性相关分析 -----')
    if label_col is None:
        data = data[columnslist]
        # Last column of the correlation matrix, without the target's own row.
        corr_df = data.corr(method='pearson').iloc[:-1, [-1]]
        corr_df = corr_df.rename(columns={target: 'corr_' + str(target)})
        corr_df = corr_df.sort_values(corr_df.columns[0])
        canvas = Data_plot.plot_bar_analysis(corr_df,
                                             corr_df.columns,
                                             threshold=[threshold, -threshold])
        canvas.title('correlation with ' + str(target))
        canvas.show()
        # Missing correlations count as 0.
        abscorr = abs(corr_df.iloc[:, 0]).fillna(0)
        drop_col = list(corr_df.loc[abscorr < threshold].index)
    else:
        per_group = []
        for key, group in data.groupby(label_col):
            # Correlation matrix of this group only.
            group_corr = group[columnslist].corr(
                method='pearson').iloc[:-1, [-1]]
            per_group.append(
                group_corr.rename(columns={
                    target: 'corr_' + str(key) + '_' + str(target)
                }))
        corr_df_total = (pd.concat(per_group, axis=1)
                         if per_group else pd.DataFrame())

        # Combine the per-group correlations into one value per feature.
        combined = getattr(corr_df_total, mix_method)(axis=1)
        corr_df = combined.to_frame(
            mix_method + '_label_corr_' + str(target))
        corr_df = corr_df.sort_values(corr_df.columns[0])
        canvas = Data_plot.plot_bar_analysis(corr_df,
                                             corr_df.columns,
                                             threshold=[threshold, -threshold])
        canvas.title(mix_method + ' correlation with ' + str(target))
        canvas.show()
        drop_col = list(
            corr_df.loc[abs(corr_df.iloc[:, 0]) < threshold].index)

    print('线性相关分析(阈值:{})结果:原数据:{}列,剔除数据{}列,筛选出:{}列。'.format(
        threshold, corr_df.shape[0], len(drop_col),
        corr_df.shape[0] - len(drop_col)))

    if isdrop:
        data = data.drop(list(drop_col), axis=1)
    if y is not None:
        # Undo the concat done at the top and restore y's row order.
        data = data.drop(y_cols[0], axis=1)
        data = data.reindex(index=y.index)
    return data, corr_df, drop_col
def reg_score(reg_input, train_x, train_y, valid_x, valid_y, label=None,
              is_plot=True, y_change=None, **kw):
    '''
    Evaluate a fitted regression model on the training and validation sets.

    For each set the model predicts y, MSE and R2 are computed and printed,
    and (optionally) two figures are drawn: true vs. predicted y in sample
    order, and a true-vs-predicted scatter with a diagonal reference line.

    Parameters:
        reg_input: fitted regressor exposing ``predict``.
        train_x, train_y: training features and target.
        valid_x, valid_y: validation features and target.
        label: optional column (or columns) dropped from both feature sets
            before predicting.
        is_plot: when true, draw the figures through ``Data_plot``.
        y_change: optional object exposing ``change_back``; when given,
            both the predictions and the true y are passed through it
            before scoring.
        **kw: accepted for interface compatibility; not used here.

    Returns:
        (valid_mse, valid_r2) for the validation set.
    '''

    def _evaluate(x, y_true, pred_col, report):
        # Score one data set; returns (mse, r2).
        x_input = x.drop(label, axis=1) if label is not None else x
        pred_y = reg_input.predict(x_input)
        if y_change is not None:
            pred_y = y_change.change_back(
                pd.DataFrame(pred_y, columns=[pred_col], index=y_true.index))
            y_true = y_change.change_back(y_true)
        mse = mean_squared_error(y_true, pred_y)
        r2 = r2_score(y_true, pred_y)
        pred_y = pd.DataFrame(pred_y, columns=[pred_col], index=y_true.index)

        if is_plot:
            # True vs. predicted y in the original sample order.
            canvas = Data_plot.plot_scatter(y_true)
            canvas = Data_plot.plot_line(pred_y, c=['r--'])
            canvas.show()

            # True-vs-predicted scatter with a diagonal reference line that
            # spans the range of the first (true y) column.
            both = pd.concat([y_true, pred_y], axis=1)
            canvas = Data_plot.plot_scatter(both, issns=False)
            upper = both.max().iloc[0]
            lower = both.min().iloc[0]
            line_data = np.array([[upper, upper], [lower, lower]])
            canvas = Data_plot.plot_line(
                pd.DataFrame(line_data, columns=['y_true', 'y_pred']))
            canvas.show()

        print(report.format(mse, r2))
        return mse, r2

    _evaluate(train_x, train_y, 'train_pred_y', '训练集:mse = {} , r2 = {}')
    valid_mse, valid_r2 = _evaluate(valid_x, valid_y, 'valid_pred_y',
                                    '验证集:mse = {} , r2 = {}')
    return valid_mse, valid_r2