Example #1
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
# LoadFile, SaveFile, multi_XGBoost and training_main are project-local helpers
# (defined elsewhere in this project).


def main():
    data = LoadFile(
        p=r'F:\ProximityDetection\Stacking\dataset_PNY\PNY_fft_cl_1.pickle')
    # sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
    # SimpleImputer is the current equivalent (it imputes column-wise by default)
    imp = SimpleImputer(missing_values=np.nan,
                        strategy='mean',
                        copy=True)
    dataset_sim = imp.fit_transform(data)
    XGBoost = multi_XGBoost(max_depth=2,
                            learning_rate=1e-2,
                            n_estimators=300,
                            objective='binary:logistic',
                            nthread=4,
                            gamma=0.1,
                            min_child_weight=1,
                            subsample=1,
                            reg_lambda=2,
                            scale_pos_weight=1.)
    training_main(model=XGBoost, dataset_sim=dataset_sim)
    digraph = xgb.to_graphviz(XGBoost, num_trees=2)
    digraph.format = 'png'
    digraph.view('./ProximityDetection_xgb')
    xgb.plot_importance(XGBoost)
    plt.show()
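For reference, here is a self-contained sketch of the same xgboost visualisation calls on a plain xgboost.XGBClassifier trained on synthetic data (multi_XGBoost and training_main above are project-local wrappers, so this only illustrates the library API, not the project's model):

def _xgb_visualisation_demo():
    rng = np.random.RandomState(0)
    X = rng.rand(200, 10)
    y = (X[:, 0] + X[:, 1] > 1.0).astype(int)   # synthetic binary labels
    clf = xgb.XGBClassifier(max_depth=2, learning_rate=1e-2,
                            n_estimators=50, objective='binary:logistic')
    clf.fit(X, y)
    # num_trees is the 0-based index of the tree to draw
    digraph = xgb.to_graphviz(clf, num_trees=2)
    digraph.format = 'png'
    digraph.view('./xgb_demo_tree')   # writes ./xgb_demo_tree.png and opens it
    xgb.plot_importance(clf)          # bar chart of per-feature split counts
    plt.show()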
def data_stepone(p_dataset_ori, proportion):
    '''
    Dataset generation, step 1: split the data into training/test folds
    :param p_dataset_ori: string, absolute path of the raw data pickle
    :param proportion: int, number of folds (10- or 5-fold cross-validation)
    :return: training set, test set, shape=((-1, 25/20+1), (-1, 25/20+1))
    '''
    dataset_ori = LoadFile(p=p_dataset_ori)
    batch_size = dataset_ori.shape[0] // proportion
    for i in range(0, dataset_ori.shape[0], batch_size):  # one fold as the test set, the rest concatenated as the training set
        train = np.vstack(
            (dataset_ori[:i, :], dataset_ori[i + batch_size:, :]))  # only the last 20 density features are used
        test = dataset_ori[i:i + batch_size, :]
        yield train, test
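A minimal sketch of what the generator above yields, with a small inline array standing in for the pickled dataset (demo values only):

def _stepone_demo():
    data = np.arange(50).reshape(10, 5)   # 10 samples, 5 columns
    proportion = 5                        # 5 folds -> batch_size = 2
    batch_size = data.shape[0] // proportion
    for i in range(0, data.shape[0], batch_size):
        train = np.vstack((data[:i, :], data[i + batch_size:, :]))
        test = data[i:i + batch_size, :]
        print(train.shape, test.shape)    # (8, 5) (2, 5) on every fold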
Example #3
import numpy as np
import pandas as pd
from sklearn import model_selection
# LoadFile, SaveFile and transform are project-local helpers
# (defined elsewhere in this project).
def data_operation(p):
    '''
    Dataset construction
    :param p: path of the input data
    :return: None
    '''
    data_fft = LoadFile(p)
    # split the one-hot label block off the features, collapse it to class
    # indices, and recombine it with the features
    label = data_fft[:, -4:]
    label_one = np.argmax(label, axis=1)
    # print(Counter(label_one))
    data_fft = np.hstack((data_fft[:, :-4], label_one[:, np.newaxis]))
    print(data_fft.shape)
    SaveFile(data=data_fft,
             savepickle_p=
             r'F:\ProximityDetection\Stacking\dataset_PNY\PNY_fft_cl_1.pickle')
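The core transformation in data_operation is collapsing the trailing one-hot label block into a single class-index column; a small sketch with made-up values:

def _argmax_label_demo():
    # two feature columns followed by a 4-way one-hot label block
    data_fft = np.array([[0.1, 0.2, 0., 1., 0., 0.],
                         [0.3, 0.4, 0., 0., 0., 1.]])
    label_one = np.argmax(data_fft[:, -4:], axis=1)            # -> [1 3]
    combined = np.hstack((data_fft[:, :-4], label_one[:, np.newaxis]))
    print(combined)   # [[0.1 0.2 1. ] [0.3 0.4 3. ]]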
def data_stepone_1(p_dataset_ori, proportion, is_shuffle):
    '''
    Cross-validation: split the data into training/test sets by fold
    :param p_dataset_ori: string, absolute path of the raw data pickle
    :param proportion: int, number of folds (10- or 5-fold cross-validation)
    :param is_shuffle: True/False, whether to shuffle before splitting
    :return: the resulting training and cross-validation sets
    '''
    dataset_ori = LoadFile(p=p_dataset_ori)
    # k-fold splitter used to generate training and cross-validation data
    # (recent scikit-learn only accepts random_state when shuffle=True)
    kf = model_selection.KFold(n_splits=proportion,
                               shuffle=is_shuffle,
                               random_state=32 if is_shuffle else None)
    for train_data_index, cv_data_index in kf.split(dataset_ori):
        # look up the rows for the corresponding indices
        train_data, cv_data = dataset_ori[train_data_index], dataset_ori[
            cv_data_index]
        # print(np.isnan(train_data).any(), np.isnan(cv_data).any())
        yield train_data, cv_data
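The same split can be exercised directly on a synthetic array (sketch; random_state only takes effect when shuffle=True):

def _kfold_demo():
    data = np.arange(40).reshape(10, 4)
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32)
    for train_idx, cv_idx in kf.split(data):
        train_data, cv_data = data[train_idx], data[cv_idx]
        print(train_data.shape, cv_data.shape)   # (8, 4) (2, 4) on every fold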
def data_make():
    '''
    Build class-balanced classification data
    :return: None
    '''
    rng = np.random.RandomState(0)
    # build evenly-sized fft class data
    data_PNY = LoadFile(
        p=r'F:\ProximityDetection\Stacking\dataset_PNY\PNY_data_train.pickle')
    # print(data_PNY.shape)
    PNY_features = data_PNY[:, :-1]
    # min-max normalisation
    PNY_features = (PNY_features - np.min(PNY_features, axis=0)) / \
                       (np.max(PNY_features, axis=0) - np.min(PNY_features, axis=0))
    # combine the normalised features with the label column
    PNY_data = np.hstack((PNY_features, data_PNY[:, -1][:, np.newaxis]))
    PNY_data = pd.DataFrame(
        PNY_data, columns=[i for i in range(1, PNY_data.shape[-1] + 1)])
    divided = [(0, 10), (10, 20), (20, 100), (100, 300)]
    num_per_group = 1900
    indexx = 0
    PNY_data_classifier = np.zeros(shape=[1])
    for i, j in divided:
        per_group = PNY_data.loc[PNY_data[PNY_data.shape[-1]] > i, :]
        per_group = per_group.loc[per_group[PNY_data.shape[-1]] <= j, :]
        per_group = np.array(per_group)
        # print(per_group.shape)
        rng.shuffle(per_group)
        per_group = per_group[:num_per_group, :]
        one_hot_label = np.zeros(shape=[num_per_group, 4], dtype=np.float32)
        one_hot_label[:, indexx] = 1
        print(np.sum(one_hot_label, axis=0))
        per_group = np.hstack((per_group[:, :-1], one_hot_label))
        PNY_data_classifier = np.vstack((PNY_data_classifier, per_group)) if PNY_data_classifier.any() else \
            per_group
        indexx += 1

    rng.shuffle(PNY_data_classifier)
    SaveFile(data=PNY_data_classifier,
             savepickle_p=
             r'F:\ProximityDetection\Stacking\dataset_PNY\PNY_norm_cl.pickle')
    print(PNY_data_classifier.shape)
#     # extract the feature-normalised fft data (with labels)
#     PNY_fft_norm = LoadFile(p=r'F:\ProximityDetection\Stacking\dataset_PNY\PNY_fft_norm.pickle')
#     # extract the labels
#     fft_label = PNY_fft_norm[:, -1]
#     # generate class labels
#     fft_class = transform(fft_label)
#     # assemble the new dataset
#     PNY_fft_norm_c = np.hstack((PNY_fft_norm[:, :-1], fft_class[:, np.newaxis]))
#     # SaveFile(data=PNY_fft_norm_c, savepickle_p=r'F:\ProximityDetection\Stacking\dataset_PNY\PNY_fft_norm_c.pickle')
#     print(Counter(fft_class))
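data_make combines column-wise min-max scaling with range-based bucketing and one-hot relabelling; a compact sketch of both steps on synthetic data (all names and values here are hypothetical):

def _data_make_demo():
    rng = np.random.RandomState(0)
    features = rng.rand(200, 3)                        # 3 feature columns
    target = rng.uniform(0., 300., size=(200, 1))      # continuous target
    # column-wise min-max scaling of the features, as in data_make
    scaled = (features - features.min(axis=0)) / \
             (features.max(axis=0) - features.min(axis=0))
    data = np.hstack((scaled, target))
    # bucket rows by target range and attach a one-hot class label per bucket
    divided = [(0, 10), (10, 20), (20, 100), (100, 300)]
    groups = []
    for idx, (lo, hi) in enumerate(divided):
        per_group = data[(data[:, -1] > lo) & (data[:, -1] <= hi)]
        one_hot = np.zeros((per_group.shape[0], 4), dtype=np.float32)
        one_hot[:, idx] = 1
        groups.append(np.hstack((per_group[:, :-1], one_hot)))
    balanced = np.vstack(groups)
    rng.shuffle(balanced)
    print(balanced.shape)   # (200, 7): 3 features + 4 one-hot label columns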

if __name__ == '__main__':
    # build a dataset of 10000 rows in total, 5 folds
    rng = np.random.RandomState(0)
    # dataset = rng.randint(0, 10, size= (10000, 21))
    # SaveFile(data= dataset, savepickle_p= r'F:\ProximityDetection\Stacking\test_data.pickle')
    dataset_2 = rng.randint(0, 10, size=(10000, 7))
    SaveFile(data=dataset_2,
             savepickle_p=r'F:\ProximityDetection\Stacking\test_data_2.pickle')
    # load the data
    # p_dataset_ori = r'F:\ProximityDetection\Stacking\test_data.pickle'
    p_dataset_ori = r'F:\ProximityDetection\Stacking\test_data_2.pickle'
    dataset_ori = LoadFile(p=p_dataset_ori)
    # step 1
    for train, test in data_stepone(p_dataset_ori=p_dataset_ori, proportion=5):
        print(train.shape)
        print(test.shape)
        # for feature, label in data_steptwo(train_data= train, batch_size= 500):
        #     print(feature, label)
        # break