Beispiel #1
0
def pure_feature_classification():
    """Train and evaluate LightGBM on the plain (non-embedded) features.

    The arrays come from ``get_pure_feature(False)``. Only the slice at index
    ``subSum - 1`` of the second axis is kept: every feature column for the
    inputs, column ``0`` for the labels. Both slices are wrapped in
    DataFrames and passed to ``lgb_train_model_with_split`` together with the
    constant ``2011`` (presumably a random seed -- TODO confirm).

    Returns:
        Whatever ``lgb_train_model_with_split`` returns; it is also printed
        through ``print_res``.
    """
    features, labels, _, _ = get_pure_feature(False)
    last = subSum - 1
    last_features = features[:, last, :]
    last_labels = labels[:, last, 0]
    scores = lgb_train_model_with_split(
        pd.DataFrame(last_features),
        pd.DataFrame(last_labels),
        2011,
    )
    print_res(scores)
    return scores
def GCN_classfication():
    """Classify on plain features concatenated with stored GCN embeddings.

    Embeddings are read with ``get_input_data`` from the directory
    ``GCNPathName + "%d_%d" % (embSize, testNum)``; plain features come from
    ``get_pure_feature(False)``. For both, only index ``subSum - 1`` of the
    second axis is used. The two feature matrices are joined column-wise
    (plain features first) and fed to ``lgb_train_model_with_split`` with the
    constant ``2011`` (presumably a random seed -- TODO confirm).

    NOTE(review): this file defines ``GCN_classfication`` a second time
    further down; when the module is loaded the later definition replaces
    this one.

    Returns:
        The value returned by ``lgb_train_model_with_split``. It is also
        printed through ``print_res``.
    """
    trainX, trainY, _, _ = get_input_data(
        GCNPathName + "%d_%d" % (embSize, testNum), "/fGCNembedding", True,
        False)
    ftrainX, _, _, _ = get_pure_feature(False)

    last = subSum - 1
    # Plain features first, embedding columns appended after them.
    inputX = np.concatenate((ftrainX[:, last, :], trainX[:, last, :]), axis=1)
    trainY_1D = trainY[:, last, 0]

    result = lgb_train_model_with_split(
        pd.DataFrame(inputX), pd.DataFrame(trainY_1D), 2011)
    print_res(result)
    # Hand the scores back so callers can use them, like the sibling
    # classification helpers do.
    return result
def GCN_classfication():
    """Average LightGBM scores over five GCN-embedding runs.

    Each run loads the feature table and graph for index ``subSum - 1``,
    publishes ``train_x``, ``train_y`` and ``scipy_adj_matrix`` as module
    globals, computes a GCN embedding with seed ``7 + run``, pickles it, reads
    it back through ``get_input_data`` and scores it with
    ``lgb_train_model_with_split``.

    Returns:
        A list of four numbers: the element-wise mean of the first four
        entries of the per-run results.
    """
    global scipy_adj_matrix, train_x, train_y

    n_runs = 5
    base_seed = 7
    totals = [0, 0, 0, 0]
    label_cols = ['label']
    table_columns = [
        'label',
        'AF1', 'AF2', 'AF3', 'AF4',
        'AF5', 'AF6', 'AF7', 'AF8',
    ]

    for run in range(n_runs):
        # The range holds exactly one index (subSum - 1); it is kept as a
        # loop so the slice variables below are bound the same way.
        for idx in range(subSum - 1, subSum):
            print(idx)
            feature_file = "/tolFeature_%d.pkl" % idx
            print(tolFeaturePathName + feature_file)
            raw_table = load_pickle(tolFeaturePathName, feature_file)
            tolFeature_df = pd.DataFrame(raw_table, columns=table_columns)

            graph_file = "/G_%d.pkl" % idx
            sp_muldG = load_pickle(muldigPathName, graph_file)
            # NOTE(review): the path printed here is built from mulgPathName,
            # while the graph above is loaded from muldigPathName.
            print(mulgPathName + graph_file)

            input_cols = [
                col for col in tolFeature_df.columns if col not in label_cols
            ]
            train_x = dcopy(tolFeature_df[input_cols])
            train_y = dcopy(tolFeature_df[label_cols])
            label_total = train_y.sum()
            pos_cnt = int(label_total)
            neg_cnt = int(len(train_y) - label_total)

            scipy_adj_matrix = get_scipy_adj_matrix(sp_muldG)
            print('pos node cnts:', pos_cnt)
            print('neg node cnts:', neg_cnt, 'pos/all ratio:',
                  pos_cnt / (pos_cnt + neg_cnt))

            emb_size = 8
            fGCNembedding = get_GCN_embedding(
                epoch=6,
                lr=0.0035,
                weight_decay=1e-6,
                esize=emb_size,
                random_seed=base_seed + run,
            )
            print("finish calculate embedding data!")

            out_dir = GCNPathName + "%d" % testNum
            save_pickle(fGCNembedding, out_dir,
                        "/fGCNembedding_%d.pkl" % idx)
            trainX, trainY, _, _ = get_input_data(
                out_dir, "/fGCNembedding", True, False, True)
            last = subSum - 1
            trainX_2D = trainX[:, last, :]
            trainY_1D = trainY[:, last, 0]

        lgb_res = lgb_train_model_with_split(
            pd.DataFrame(trainX_2D), pd.DataFrame(trainY_1D), 2011)
        print_res(lgb_res)
        # Accumulate the first four scores of this run.
        for slot, running in enumerate(totals):
            totals[slot] = running + lgb_res[slot]

    averaged = [total / n_runs for total in totals]
    gc.collect()
    return averaged
def LSTM_classification(modelFileName):
    """Classify on embeddings produced by a saved RNN model.

    The model is loaded with ``load_RNNmodel(modelFileName)``. The plain
    features from ``get_pure_feature(False)`` are passed through
    ``get_autoEncoder_Embedding_Layer``, and the embedding slice at index
    ``subSum - 1`` of the second axis is scored with
    ``lgb_train_model_with_split`` (constant ``2011``, presumably a random
    seed -- TODO confirm).

    Args:
        modelFileName: Forwarded unchanged to ``load_RNNmodel``.

    Returns:
        The value returned by ``lgb_train_model_with_split``. It is also
        printed through ``print_res``.
    """
    # The data is not split into train and test sets here.
    new_model = load_RNNmodel(modelFileName)
    trainX, trainY, _, _ = get_pure_feature(False)
    print(trainX.shape, trainY.shape)

    # trainX is also passed in the "test" position because nothing is held
    # out; the second returned embedding is therefore not used.
    trainX_emb, _ = get_autoEncoder_Embedding_Layer(trainX, trainX, new_model)

    last = subSum - 1
    trainX_2D = trainX_emb[:, last, :]
    trainY_1D = trainY[:, last, 0]
    result = lgb_train_model_with_split(
        pd.DataFrame(trainX_2D), pd.DataFrame(trainY_1D), 2011)
    print_res(result)
    # Hand the scores back so callers can use them, like the sibling
    # classification helpers do.
    return result
def GCN_LSTM_classification(modelFileName):
    """Classify on plain features joined with RNN-encoded GCN embeddings.

    Stored GCN embeddings are read with ``get_input_data`` from
    ``GCNPathName + "%d_%d" % (embSize, testNum)`` and passed through
    ``get_autoEncoder_Embedding_Layer`` using the model loaded by
    ``load_RNNmodel(modelFileName)``. For index ``subSum - 1`` the plain
    features from ``get_pure_feature(False)`` and the encoded embeddings are
    concatenated column-wise and scored with ``lgb_train_model_with_split``.

    Args:
        modelFileName: Forwarded unchanged to ``load_RNNmodel``.

    Returns:
        The value returned by ``lgb_train_model_with_split`` for the last
        index processed, or ``None`` if no index was processed. Each result
        is also printed through ``print_res``.
    """
    new_model = load_RNNmodel(modelFileName)

    trainX, trainY, _, _ = get_input_data(
        GCNPathName + "%d_%d" % (embSize, testNum), "/fGCNembedding", True,
        False)
    ftrainX, _, _, _ = get_pure_feature(False)
    print(trainX.shape, trainY.shape)

    # trainX is also passed in the "test" position; only the first returned
    # embedding is used.
    trainX_emb, _ = get_autoEncoder_Embedding_Layer(trainX, trainX, new_model)
    print(trainX_emb.shape)

    result = None
    for i in range(subSum - 1, subSum):
        # Plain features first, encoded embedding columns appended after.
        inputX = np.concatenate((ftrainX[:, i, :], trainX_emb[:, i, :]),
                                axis=1)
        print(inputX.shape)
        # Labels are always read at subSum - 1, which equals i for the single
        # index this range yields.
        trainY_1D = trainY[:, subSum - 1, 0]
        result = lgb_train_model_with_split(
            pd.DataFrame(inputX), pd.DataFrame(trainY_1D), 2011)
        print_res(result)
    # Hand the scores back so callers can use them, like the sibling
    # classification helpers do.
    return result
                                     'AF5', 'AF6', 'AF7', 'AF8'
                                 ])
    sp_muldG = load_pickle(muldigPathName, "/G_%d.pkl" % idx)
    #print(mulgPathName+"/G_%d.pkl" % idx)
    y_cols_name = ['label']
    x_cols_name = [x for x in tolFeature_df.columns if x not in y_cols_name]
    global train_x, train_y
    train_x = dcopy(tolFeature_df[x_cols_name])
    train_y = dcopy(tolFeature_df[y_cols_name])
    pos_cnt, neg_cnt = int(train_y.sum()), int(len(train_y) - train_y.sum())


# Driver statements: these sit at module level, so they run whenever the file
# is executed or imported.
print("start!")
# get_trainx_trainy is not defined in the visible part of this file;
# presumably it populates the train_x / train_y globals -- TODO confirm.
get_trainx_trainy()
# GCN_classfication is defined twice above; the later definition is the one
# bound to the name when this call runs.
gcn_res = GCN_classfication()
print_res(gcn_res)


# Deepwalk_classification
def Deepwalk_classification():
    global train_x, train_y
    dw_res, rcnt = [0, 0, 0, 0], 5
    for i in tqdm(range((rcnt))):
        node_feas, labels = dcopy(train_x.values), dcopy(train_y.values)
        embe_feas = read_embeds(DeepwalkPathName_new + '/embeds_dw_%d.dat' % i)
        np_fea_lab = np.hstack((node_feas, embe_feas, labels))
        columns_name = ['f%02d' % i
                        for i in range(np_fea_lab.shape[1] - 1)] + ['label']
        df_fea_lab = pd.DataFrame(data=np_fea_lab,
                                  columns=columns_name,
                                  dtype=float)