def dataSplit(self, df, label_col, test_size, Standardization=False, Normalization=False, Feature_selection=False):
    """Encode the label column as integer ids and split the data.

    The distinct values of ``df[label_col]`` are numbered in the order
    returned by ``value_counts()`` and the column is overwritten with
    those ids. The mapping is printed to stdout.

    Note: ``df`` is modified -- its label column holds the integer ids
    after this call.

    Args:
        df: DataFrame containing the features and the label column.
        label_col: Name of the label column.
        test_size: Forwarded to the splitter ``sp`` as ``test_size``.
        Standardization: If true, call ``self.dataStandardization(X)``.
        Normalization: If true, call ``self.dataNormalization(X)``.
        Feature_selection: If true, call ``self.featureSelection(X, y)``.

    Returns:
        Tuple ``(X, X_train, X_test, y_train, y_test)`` where ``X`` is the
        full feature frame (``df`` without the label column).
    """
    # Map each class value to an integer id.
    classes = df[label_col].value_counts().index.tolist()
    class2id_map = {name: idx for idx, name in enumerate(classes)}

    # Assign the encoded column back explicitly. A chained
    # ``df[col].replace(..., inplace=True)`` operates on a temporary
    # selection and is not guaranteed to update ``df`` (pandas warns about
    # it and Copy-on-Write makes it a no-op). ``map`` looks each value up
    # exactly once, so labels that happen to equal an id are not remapped.
    df[label_col] = df[label_col].map(class2id_map)

    print('map dict as follow:')  # mapping dictionary
    print('===========')
    for k, v in class2id_map.items():
        print('|', k, ':', v, '|')
    print('==========')

    X = df.drop(label_col, axis=1)
    y = df[label_col]

    # NOTE(review): the return values of the three hooks below are
    # discarded, so they only have an effect if they modify X in place --
    # confirm against their implementations.
    if Standardization:
        self.dataStandardization(X)

    if Normalization:
        self.dataNormalization(X)

    if Feature_selection:
        self.featureSelection(X, y)

    # Fixed seed keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = sp(X, y, test_size=test_size, random_state=2019)
    return X, X_train, X_test, y_train, y_test
Beispiel #2
0
def main():
    """Load a small data sample, split it, and print the training part.

    Calls ``loading_data_svd_test(15, 938)``, holds out 25% of the rows via
    ``sp`` (presumably scikit-learn's ``train_test_split`` -- confirm
    against the file's imports), extracts the ``user_id`` / ``movie_id`` /
    ``rating`` columns of both parts as arrays, and prints the shape and
    contents of the training frame.
    """
    small_data, title = loading_data_svd_test(15, 938)
    # Split the dataset
    train_data, test_data = sp(small_data, test_size=0.25)
    # Create two user-item matrices for training and testing data.
    # ``DataFrame.as_matrix`` was removed in pandas 1.0; selecting the
    # columns and taking ``.values`` is the equivalent that works on both
    # old and current pandas.
    rating_columns = ['user_id', 'movie_id', 'rating']
    train_data_matrix = train_data[rating_columns].values
    test_data_matrix = test_data[rating_columns].values
    print(np.shape(train_data))
    print(train_data)
Beispiel #3
0
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split as sp
from sklearn import metrics
from procee_data import get_data
from sklearn.externals import joblib
# Base learners. probability=True is needed so SVC exposes predict_proba.
svc = SVC(probability=True)
gnb = GNB()
knn = KNN()
rfc = RFC(random_state=1)  # created but not part of `model` below
model = [svc, gnb, knn]

# Load the data
X_train, y_train, X_test, y_test = get_data()

# Train the models: split the training data into two halves. The base
# learners are fitted on d1; their predictions on d2 become the features
# the final logistic regression is fitted on.
X_train_d1, X_train_d2, y_train_d1, y_train_d2 = sp(
    X_train, y_train, test_size=0.5, random_state=1)

# One column per base learner.
n_models = len(model)
X_train_d2_blending = np.zeros((X_train_d2.shape[0], n_models))
X_test_blending = np.zeros((X_test.shape[0], n_models))
# X_train_d1=X_train_d1.todense()
for j, clf in enumerate(model):
    print(clf)
    clf.fit(X_train_d1, y_train_d1)

    # Column 1 of predict_proba is stored for both the d2 half and the
    # test set.
    y_test_value = clf.predict_proba(X_train_d2)[:, 1]
    X_train_d2_blending[:, j] = y_test_value
    test_proba = clf.predict_proba(X_test)[:, 1]
    X_test_blending[:, j] = test_proba

    # Report this learner's AUC on the test set.
    auc = metrics.roc_auc_score(y_test, X_test_blending[:, j])
    print('测试集AUC是: {:.4}'.format(auc))
    print('------------------------------------')

# Second-level model fitted on the base learners' d2 predictions.
lr = LR()
lr.fit(X_train_d2_blending, y_train_d2)