def detect_label_outliers(y, encoder=None):
    """Flag label values lying more than three standard deviations from the mean.

    Nothing is removed here: the function only builds a boolean mask that the
    caller can use to filter samples.

    Args:
        y: Array of label values. It must support ``reshape`` (after the
            optional ``encoder.inverse_transform`` step); it is flattened to
            one dimension before the statistics are computed.
        encoder: Optional encoder. When given, ``encoder.inverse_transform``
            is applied to ``y`` first so that the statistics are computed on
            the decoded values rather than on the encoded ones.

    Returns:
        A boolean array with one entry per element of ``y``; ``True`` marks
        an outlier.
    """
    # TODO per-class outlier removal also

    # Explicit None check: an encoder object that happens to be falsy
    # (e.g. one defining ``__len__``) must still be applied.
    if encoder is not None:
        y = encoder.inverse_transform(y)

    # Fill missing values with the column median, then rescale with
    # RobustScaler (median/IQR based with scikit-learn's defaults) so the
    # threshold below is applied to scaled values.
    # NOTE(review): ``Imputer`` was removed from ``sklearn.preprocessing`` in
    # scikit-learn 0.22 (successor: ``sklearn.impute.SimpleImputer``) --
    # confirm the pinned scikit-learn version before upgrading.
    filled = Imputer(strategy="median").fit_transform(y.reshape([-1, 1]))
    y_std = RobustScaler().fit_transform(filled).reshape([-1])

    # Compute the statistics once instead of once per comparison.
    center = y_std.mean()
    spread = 3 * y_std.std()
    mask = (y_std < center - spread) | (y_std > center + spread)

    # Lazy %-style arguments: the message is only built if INFO is enabled.
    logger.info("Filtering %s/%s as outliers.", mask.sum(), len(mask))
    return mask
# Tail of the normalization-mode dispatch on `c` (2: min-max scaling, 0: leave X unchanged, 3: RobustScaler), followed by diagnostic prints of the scaled data; the opening `if` branch is not visible in this fragment.
elif c==2:#linear (min-max) normalization X_scale=minmax_scale(X) elif c==0:#no normalization X_scale=X elif c==3:#robust normalization from sklearn.preprocessing import RobustScaler X_scale=RobustScaler().fit_transform(X) print 'the standar result of X is:',X_scale ##check X_scale; normally the mean is 0 and the variance is 1 #1. print 'mean=',X_scale.mean() print 'std=',X_scale.std() #2. print 'min=',X_scale.min() print 'max=',X_scale.max() csv_file1.close() ##for ease of understanding and simpler notation X=X_scale ##statistics after normalization ##get the statistics of X statistics(X) ##frequency distribution histogram #drawHist(X,'AOD','Frequency','the Frequency of standar AOD') ##cumulative frequency plot #drawCumulativeHist(X,'AOD','Frequency','Curve cumulative of standar AOD')