def kFoldCV(trainX, trainY, modelIndex, k=10, logFlag=True):
    """Run k-fold cross-validation and return the per-fold SMAPE scores.

    ``splitData`` partitions the data into folds. Each fold in turn is held
    out for evaluation while a fresh model from ``buildTrainModel`` is fitted
    on the remaining folds. The mean and standard deviation of the scores
    are printed.

    Args:
        trainX: Feature data, forwarded to ``splitData``.
        trainY: Labels, forwarded to ``splitData``.
        modelIndex: Model selector forwarded to ``buildTrainModel``.
        k: Number of folds to iterate over.
        logFlag: When true, ``np.expm1`` is applied to both the predictions
            and the held-out labels before scoring (i.e. the labels are
            presumed to have been ``log1p``-transformed by the caller).

    Returns:
        A list with one SMAPE score per fold.
    """
    datas, labels = splitData(trainX, trainY, k)
    res = []
    for i in range(k):
        # Hold out fold i; stack the remaining folds into the training set.
        # np.vstack / np.hstack allocate new arrays, so the folds produced by
        # splitData are never modified and no defensive deep copy is needed.
        testArr = datas[i]
        trainArr = np.vstack(
            tuple(fold for j, fold in enumerate(datas) if j != i)
        )

        testLabel = labels[i]
        trainLabel = np.hstack(
            tuple(fold for j, fold in enumerate(labels) if j != i)
        )

        # Fit on the training folds and predict the held-out fold.
        rf = buildTrainModel(modelIndex=modelIndex)
        rf.fit(trainArr, trainLabel)
        pred = rf.predict(testArr)

        if logFlag:
            # Labels were log-transformed: map back to the original scale
            # before computing the score.
            pred = np.expm1(pred)
            testLabel = np.expm1(testLabel)
        res.append(SMAPE(testLabel, pred))

    print("score mean:", np.mean(res))
    print("score std:", np.std(res))
    return res
def crossValidation(trainX, trainY, modelIndex, cvRate=0.96, cvEpoch=20):
    """Evaluate a model with repeated shuffled hold-out splits and sklearn CV.

    For each of ``cvEpoch`` rounds the data is shuffled, the first
    ``cvRate`` fraction is used for fitting and the remainder for scoring
    with SMAPE (predictions are rounded to the nearest integer first).
    Afterwards a 10-fold ``cross_val_score`` with
    ``neg_mean_absolute_error`` is run on the full data.

    Args:
        trainX: Feature data; must support ``.shape`` after shuffling.
        trainY: Labels aligned with ``trainX``.
        modelIndex: Model selector forwarded to ``buildTrainModel``.
        cvRate: Fraction of rows used for fitting in each round.
        cvEpoch: Number of shuffled hold-out rounds.

    Returns:
        A tuple ``(scores, skscores)``: the finite SMAPE scores from the
        hold-out rounds and the array returned by ``cross_val_score``.
    """
    scores = []
    rf = None
    for _ in range(cvEpoch):
        X, Y = shuffle(trainX, trainY)  # shuffle rows before splitting
        offset = int(X.shape[0] * cvRate)
        X_train, y_train = X[:offset], Y[:offset]
        X_test, y_test = X[offset:], Y[offset:]

        rf = buildTrainModel(modelIndex=modelIndex)
        rf.fit(X_train, y_train)
        # Round predictions to the nearest integer before scoring.
        pred = np.rint(rf.predict(X_test))
        scores.append(SMAPE(y_test, pred))

    # Drop non-finite scores (nan, inf and -inf) so they do not poison the
    # summary statistics.
    scores = [x for x in scores if np.isfinite(x)]
    print("score mean:", np.mean(scores))
    print("score std:", np.std(scores))

    # With cvEpoch == 0 the loop never binds a model; build one so the
    # sklearn cross-validation below still has an estimator to work with.
    if rf is None:
        rf = buildTrainModel(modelIndex=modelIndex)
    skscores = cross_val_score(
        rf, trainX, trainY, cv=10, scoring="neg_mean_absolute_error"
    )
    print("sklearn cv mean:", skscores.mean())
    print("sklearn cv std:", skscores.std())
    return scores, skscores
def train(features, trainPath, index, saveName):
    """Fit a model on a CSV training set, cross-validate it and persist it.

    NOTE(review): a second ``train`` with a different signature appears later
    in this file; if both live at module level the later one shadows this
    definition -- confirm which one callers are meant to use.

    Args:
        features: Column names selected from the CSV as model inputs.
        trainPath: Path of the training CSV; it must contain the feature
            columns and a ``travel_time`` column.
        index: Model selector forwarded to ``buildTrainModel`` and
            ``kFoldCV``. Values 1 and 2 print ``feature_importances_``;
            value 3 plots importances through ``xgb.plot_importance``.
        saveName: File name appended to ``"../model/"`` for the dumped model.

    Returns:
        The model fitted on the full training set.
    """
    frame = pd.read_csv(trainPath, dtype={'link_ID': str})
    print("original dataset columns:", frame.columns)
    print("original data num is:", len(frame))

    trainX = frame[features]
    # The target is modelled in log1p space.
    trainY = np.log1p(frame['travel_time'])
    print("trainx shape", trainX.values.shape)
    print("trainY shape", trainY.values.shape)

    model = buildTrainModel(modelIndex=index)
    model.fit(trainX, trainY)

    cvScores = kFoldCV(trainX, trainY, index, k=5)
    print("cross validation scores:", cvScores)

    if index in (1, 2):
        print("feature score ", pd.DataFrame(model.feature_importances_))
    if index == 3:
        xgb.plot_importance(model)
        plt.savefig("../model/importance3.jpg")

    joblib.dump(model, "../model/" + saveName)
    return model
def gridSearch(trainx, trainy, modelIndex):
    """Grid-search hyper-parameters for a model and return the best estimator.

    A 10-fold ``GridSearchCV`` is run over a fixed grid of
    ``learning_rate``, ``n_estimators``, ``subsample``, ``max_depth`` and
    ``max_features``. The winning value of each grid parameter is printed.

    NOTE(review): the grid is fixed regardless of ``modelIndex``; the model
    returned by ``buildTrainModel`` must accept all five parameters --
    confirm for every index this is called with.

    Args:
        trainx: Feature data passed to ``GridSearchCV.fit``.
        trainy: Labels passed to ``GridSearchCV.fit``.
        modelIndex: Model selector forwarded to ``buildTrainModel``.

    Returns:
        ``grid_search.best_estimator_``, the estimator configured with the
        best parameter combination found. (Previously nothing was returned,
        so ``rf = gridSearch(...)`` yielded ``None``.)
    """
    parameters = {
        'learning_rate': [0.01, 0.05, 0.1],
        'n_estimators': [150, 200, 250],
        'subsample': [0.5, 0.7, 0.9, 1.0],
        'max_depth': [6, 8, 10],
        'max_features': ['sqrt', None],
    }
    rf = buildTrainModel(modelIndex=modelIndex)
    grid_search = GridSearchCV(rf, parameters, verbose=2, cv=10)
    grid_search.fit(trainx, trainy)

    best_estimator = grid_search.best_estimator_
    best_parameters = best_estimator.get_params()
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return best_estimator
def train(features, trainPath, testPath, index, saveName, onehotFeature):
    """Fit a model on one-hot encoded training data, cross-validate, persist.

    The training and test CSVs are concatenated so that one-hot encoding
    sees every category value from both files, then only rows with
    ``month < 6`` are kept as model inputs.

    NOTE(review): labels are taken from the unfiltered training frame, so
    this presumes ``month < 6`` selects exactly the training rows, in their
    original order -- confirm against the data.

    Args:
        features: Column names selected as model inputs; must include
            ``month`` and every column listed in ``onehotFeature``.
        trainPath: Path of the training CSV (needs a ``travel_time`` column).
        testPath: Path of the test CSV, used only to complete the set of
            category values for one-hot encoding.
        index: Model selector forwarded to ``buildTrainModel`` and
            ``kFoldCV``. Values 1 and 2 print ``feature_importances_``;
            value 3 plots importances through ``xgb.plot_importance``.
        saveName: File name appended to ``"../model/"`` for the dumped model.
        onehotFeature: Column names to replace with one-hot indicator columns.

    Returns:
        The model fitted on the filtered, encoded training set.
    """
    trainDF = pd.read_csv(trainPath, dtype={'link_ID': str})
    print("original dataset columns:", trainDF.columns)
    testDF = pd.read_csv(testPath, dtype={'link_ID': str})
    print(len(trainDF))
    totalDF = pd.concat([trainDF, testDF], axis=0)
    print(len(totalDF))

    trainX = totalDF[features]
    for catgoryFeature in onehotFeature:
        Onehot = pd.get_dummies(
            trainX[catgoryFeature], prefix=catgoryFeature, sparse=True
        )
        trainX = pd.concat([trainX, Onehot], axis=1)
    # Filter before dropping the raw categorical columns, so "month" is
    # still available even when it is itself one-hot encoded.
    trainX = trainX[trainX["month"] < 6]
    trainX = trainX.drop(onehotFeature, axis=1)
    print(trainX.columns)

    # Log-transform the label.
    trainY = trainDF['travel_time']
    trainY = np.log1p(trainY)
    print("trainx shape", trainX.values.shape)
    print("trainY shape", trainY.values.shape)

    rf = buildTrainModel(modelIndex=index)
    rf.fit(trainX, trainY)

    # Use this function's own ``index`` parameter; the previous code passed
    # ``modelIndex``, which is not defined in this scope.
    scores = kFoldCV(trainX, trainY, index, k=5)
    print("cross validation scores:", scores)

    if index in (1, 2):
        print("feature score ", pd.DataFrame(rf.feature_importances_))
    if index == 3:
        xgb.plot_importance(rf)
        plt.savefig("../model/importance3.jpg")

    saveSuffix = "../model/"
    joblib.dump(rf, saveSuffix + saveName)
    return rf