def main():
    """Train an XGBoost classifier on the preprocessed PNY FFT dataset,
    render one of its trees to PNG, and show the feature-importance plot.

    :return: None
    """
    # Function-scope import: `Imputer` was deprecated in scikit-learn 0.20
    # and removed in 0.22; `SimpleImputer` is the supported replacement.
    from sklearn.impute import SimpleImputer

    data = LoadFile(
        p=r'F:\ProximityDetection\Stacking\dataset_PNY\PNY_fft_cl_1.pickle')
    # Column-wise mean imputation of NaNs (SimpleImputer always works
    # per-column, matching the old Imputer's axis=0 behavior).
    imp = SimpleImputer(missing_values=np.nan, strategy='mean', copy=True)
    dataset_sim = imp.fit_transform(data)
    XGBoost = multi_XGBoost(max_depth=2, learning_rate=1e-2, n_estimators=300,
                            objective='binary:logistic', nthread=4, gamma=0.1,
                            min_child_weight=1, subsample=1, reg_lambda=2,
                            scale_pos_weight=1.)
    training_main(model=XGBoost, dataset_sim=dataset_sim)
    # Visualize tree #2 of the boosted ensemble as a PNG via graphviz.
    digraph = xgb.to_graphviz(XGBoost, num_trees=2)
    digraph.format = 'png'
    digraph.view('./ProximityDetection_xgb')
    xgb.plot_importance(XGBoost)
    plt.show()
def data_stepone(p_dataset_ori, proportion):
    '''
    Dataset generation step 1: split the data into training/test folds.
    :param p_dataset_ori: string, absolute path of the raw pickled dataset
    :param proportion: int, number of folds (10/5-fold cross validation)
    :return: generator of (train, test), shape=((-1, 25/20+1), (-1, 25/20+1))
    '''
    dataset_ori = LoadFile(p=p_dataset_ori)
    fold_size = dataset_ori.shape[0] // proportion
    # Hold out one fold as the test set per iteration; everything before
    # and after it is stacked into the training set.
    for start in range(0, dataset_ori.shape[0], fold_size):
        stop = start + fold_size
        test = dataset_ori[start:stop, :]
        train = np.vstack((dataset_ori[:start, :], dataset_ori[stop:, :]))
        yield train, test
def data_operation(p):
    '''
    Dataset construction: collapse the trailing one-hot label columns into a
    single class-index column and persist the result.
    :param p: path of the input pickle
    :return: None
    '''
    data_fft = LoadFile(p)
    # The last four columns form a one-hot label; argmax over them yields
    # the integer class index.
    one_hot = data_fft[:, -4:]
    class_index = np.argmax(one_hot, axis=1)
    # print(Counter(class_index))
    data_fft = np.hstack((data_fft[:, :-4], class_index[:, np.newaxis]))
    print(data_fft.shape)
    SaveFile(data=data_fft, savepickle_p=
    r'F:\ProximityDetection\Stacking\dataset_PNY\PNY_fft_cl_1.pickle')
def data_stepone_1(p_dataset_ori, proportion, is_shuffle):
    '''
    Cross validation: split the data into training/validation folds.
    :param p_dataset_ori: string, absolute path of the raw pickled dataset
    :param proportion: int, number of folds (10/5-fold cross validation)
    :param is_shuffle: True/False, whether to shuffle before splitting
    :return: generator of (train_data, cv_data) fold pairs
    '''
    dataset_ori = LoadFile(p=p_dataset_ori)
    # KFold raises ValueError when random_state is set while shuffle=False
    # (scikit-learn >= 0.24), so only pass the seed when shuffling.
    kf = model_selection.KFold(n_splits=proportion, shuffle=is_shuffle,
                               random_state=32 if is_shuffle else None)
    for train_data_index, cv_data_index in kf.split(dataset_ori):
        # Materialize the fold rows from their index arrays.
        train_data, cv_data = dataset_ori[train_data_index], dataset_ori[
            cv_data_index]
        # print(np.isnan(train_data).any(), np.isnan(cv_data).any())
        yield train_data, cv_data
def data_make():
    '''
    Build a class-balanced classification dataset: min-max normalize the
    features, bucket rows into 4 label ranges, sample 1900 rows per bucket,
    attach a one-hot class label, shuffle, and save.
    :return: None
    '''
    rng = np.random.RandomState(0)
    # Build the balanced fft classification data.
    data_PNY = LoadFile(
        p=r'F:\ProximityDetection\Stacking\dataset_PNY\PNY_data_train.pickle')
    # print(data_PNY.shape)
    features = data_PNY[:, :-1]
    # Per-column min-max normalization.
    col_min = np.min(features, axis=0)
    col_max = np.max(features, axis=0)
    features = (features - col_min) / (col_max - col_min)
    # Recombine normalized features with the raw label column; columns are
    # labeled 1..n so the label column's label equals the column count.
    frame = pd.DataFrame(
        np.hstack((features, data_PNY[:, -1][:, np.newaxis])),
        columns=[i for i in range(1, features.shape[-1] + 2)])
    label_col = frame.shape[-1]
    divided = [(0, 10), (10, 20), (20, 100), (100, 300)]
    num_per_group = 1900
    collected = []
    for idx, (low, high) in enumerate(divided):
        # Select rows whose label falls in (low, high].
        mask = (frame[label_col] > low) & (frame[label_col] <= high)
        group = np.array(frame.loc[mask, :])
        # print(group.shape)
        rng.shuffle(group)
        group = group[:num_per_group, :]
        one_hot = np.zeros(shape=[num_per_group, 4], dtype=np.float32)
        one_hot[:, idx] = 1
        print(np.sum(one_hot, axis=0))
        # Replace the scalar label with the one-hot class label.
        collected.append(np.hstack((group[:, :-1], one_hot)))
    PNY_data_classifier = np.vstack(collected)
    rng.shuffle(PNY_data_classifier)
    SaveFile(data=PNY_data_classifier, savepickle_p=
    r'F:\ProximityDetection\Stacking\dataset_PNY\PNY_norm_cl.pickle')
    print(PNY_data_classifier.shape)
if __name__ == '__main__':
    # Smoke-test the 5-fold splitter on a synthetic dataset:
    # 10000 rows, 6 random features + 1 label column.
    rng = np.random.RandomState(0)
    dataset_2 = rng.randint(0, 10, size=(10000, 7))
    SaveFile(data=dataset_2,
             savepickle_p=r'F:\ProximityDetection\Stacking\test_data_2.pickle')
    p_dataset_ori = r'F:\ProximityDetection\Stacking\test_data_2.pickle'
    # NOTE: the original code also did `dataset_ori = LoadFile(...)` here,
    # but the result was never used (data_stepone reloads from the path
    # itself), so the dead load has been removed.
    # step 1: iterate over the train/test folds and report their shapes.
    for train, test in data_stepone(p_dataset_ori=p_dataset_ori, proportion=5):
        print(train.shape)
        print(test.shape)
        # for feature, label in data_steptwo(train_data=train, batch_size=500):
        #     print(feature, label)
        # break