def dataSplit(self, df, label_col, test_size, Standardization=False, Normalization=False, Feature_selection=False, random_state=2019):
    '''
    Encode the label column as integer ids and split the data into train/test sets.

    The distinct values of ``df[label_col]`` are mapped to consecutive integer
    ids in ``value_counts()`` order (most frequent class gets id 0). The
    encoded labels are written back into ``df`` itself, so the caller's
    DataFrame is modified. The mapping is printed to stdout.

    Args:
        df: DataFrame holding the feature columns and the label column.
        label_col: Name of the label column in ``df``.
        test_size: Forwarded to ``sp`` as ``test_size``.
        Standardization: If true, call ``self.dataStandardization(X)``.
        Normalization: If true, call ``self.dataNormalization(X)``.
        Feature_selection: If true, call ``self.featureSelection(X, y)``.
        random_state: Forwarded to ``sp`` as ``random_state``; defaults to
            2019, the value that used to be hard-coded.

    Returns:
        Tuple ``(X, X_train, X_test, y_train, y_test)`` where ``X`` is ``df``
        without the label column.
    '''
    # Map each string class to an integer id
    classes = df[label_col].value_counts().index.tolist()
    class2id_map = {cls: idx for idx, cls in enumerate(classes)}
    # Assign the result back explicitly: a chained
    # ``df[label_col].replace(..., inplace=True)`` operates on a temporary
    # object under pandas Copy-on-Write and leaves ``df`` unchanged.
    df[label_col] = df[label_col].replace(class2id_map)

    print('map dict as follow:')  # the mapping dictionary
    print('===========')
    for k, v in class2id_map.items():
        print('|', k, ':', v, '|')
    print('==========')

    X = df.drop(label_col, axis=1)
    y = df[label_col]

    # NOTE(review): the return values of the three helpers below are ignored,
    # so they only have an effect if they modify X in place — confirm against
    # their implementations.
    if Standardization:
        self.dataStandardization(X)
    if Normalization:
        self.dataNormalization(X)
    if Feature_selection:
        self.featureSelection(X, y)

    X_train, X_test, y_train, y_test = sp(X, y, test_size=test_size, random_state=random_state)
    return X, X_train, X_test, y_train, y_test
def main():
    """Load the sample ratings data, split it into train/test parts and print the training part.

    The data comes from ``loading_data_svd_test(15, 938)``; the meaning of the
    two numeric arguments is defined by that helper and is not visible here.
    """
    small_data, _title = loading_data_svd_test(15, 938)

    # Split the dataset: 25% is held out for testing
    train_data, test_data = sp(small_data, test_size=0.25)

    # Extract the (user_id, movie_id, rating) columns of each split as NumPy
    # arrays. ``DataFrame.as_matrix`` was removed in pandas 1.0; selecting the
    # columns and taking ``.values`` gives the same array on old and new pandas.
    # NOTE(review): neither array is used further in this function as shown.
    columns = ['user_id', 'movie_id', 'rating']
    train_data_matrix = train_data[columns].values
    test_data_matrix = test_data[columns].values

    print(np.shape(train_data))
    print(train_data)
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split as sp
from sklearn import metrics
from procee_data import get_data

# ``sklearn.externals.joblib`` was deprecated in scikit-learn 0.21 and removed
# in 0.23; prefer the standalone package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Base (first-level) estimators for blending. SVC needs probability=True so
# that predict_proba is available.
svc = SVC(probability=True)
gnb = GNB()
knn = KNN()
# NOTE(review): rfc is constructed but not included in `model` below —
# confirm whether that is intentional.
rfc = RFC(random_state=1)
model = [svc, gnb, knn]

# Load the data
X_train, y_train, X_test, y_test = get_data()

# Train the models: split the training set in half. The base models are fitted
# on d1; their predictions on d2 become the features of the second-level model.
X_train_d1, X_train_d2, y_train_d1, y_train_d2 = sp(
    X_train, y_train, test_size=0.5, random_state=1)

# One column per base model.
X_train_d2_blending = np.zeros((X_train_d2.shape[0], len(model)))
X_test_blending = np.zeros((X_test.shape[0], len(model)))

# NOTE(review): a disabled `X_train_d1 = X_train_d1.todense()` call sat here —
# presumably for sparse inputs; confirm whether it is still needed.
for j, clf in enumerate(model):
    print(clf)
    clf.fit(X_train_d1, y_train_d1)
    # Column 1 of predict_proba is used as the score; this assumes a binary
    # problem where column 1 is the positive class — TODO confirm.
    y_test_value = clf.predict_proba(X_train_d2)[:, 1]
    X_train_d2_blending[:, j] = y_test_value
    X_test_blending[:, j] = clf.predict_proba(X_test)[:, 1]
    # Report this base model's AUC on the test set.
    print('测试集AUC是: {:.4}'.format(
        metrics.roc_auc_score(y_test, X_test_blending[:, j])))
    print('------------------------------------')

# Second-level model: logistic regression fitted on the base models' d2 scores.
lr = LR()
lr.fit(X_train_d2_blending, y_train_d2)