def train_job(train_idx, test_idx, t_size, rf_mode, m_rules, r_seed, X, y, feas, n_samples):
    """Fit a RuleFit model on one fold's training split and score its test split.

    Test rows whose index is >= ``n_samples`` are treated as synthetic
    (fake) records and dropped before evaluation.

    Returns:
        tuple: (true labels of the real test rows, predicted labels).
    """
    worker = current_process()
    print('process counter:', worker._identity[0], 'pid:', os.getpid())

    # Build the estimator and fit it on this fold's training rows.
    rf = RuleFit(tree_size=t_size, rfmode=rf_mode,
                 max_rules=m_rules, random_state=r_seed)
    print(
        "\nTree generator:{0}, \n\nMax rules:{1}, Tree size:{2}, Random state:{3}"
        .format(rf.tree_generator, rf.max_rules, rf.tree_size, rf.random_state))
    rf.fit(X[train_idx], y[train_idx], feas)

    # Keep only real records: indices >= n_samples refer to fake data.
    real_test_index = test_idx[test_idx < n_samples]
    batch_test_y = y[real_test_index]
    batch_test_x = X[real_test_index]
    batch_test_size = len(real_test_index)

    # Score the fold's real test rows.
    y_pred = rf.predict(batch_test_x)
    accTest = accuracy_score(batch_test_y, y_pred)
    print("\nTest Accuracy:", "{:.6f}".format(accTest),
          "Test Size:", batch_test_size)
    print(
        "\n========================================================================="
    )
    return batch_test_y, y_pred
# Generate the k-fold train/test index splits.
cv_index_set = rs.split(y)
k_fold_step = 1  # current fold number
# Accumulators for the selected test labels and their predictions.
# FIX: `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the
# builtin `int` is the documented replacement and yields the same dtype.
test_cache = pred_cache = np.array([], dtype=int)
# Iterate the k-fold cross-validation.
for train_index, test_index in cv_index_set:
    print("\nFold:", k_fold_step)
    # Build the estimator and fit it on this fold's training rows.
    rf = RuleFit(tree_size=args.treesize,
                 rfmode=args.rfmode,
                 max_rules=args.maxrules,
                 random_state=args.randomseed)
    rf.fit(X[train_index], y[train_index], features)
    # Evaluate on the held-out rows.
    y_pred = rf.predict(X[test_index])
    accTest = accuracy_score(y[test_index], y_pred)
    print("\nFold:", k_fold_step, "Test Accuracy:",
          "{:.6f}".format(accTest), "Test Size:", test_index.size)
    # Stash this fold's ground truth and predictions for overall metrics.
    test_cache = np.concatenate((test_cache, y[test_index]))
    pred_cache = np.concatenate((pred_cache, y_pred))
    print(
        "\n========================================================================="
    )
    # Advance the fold counter.
    k_fold_step += 1

# Finally, dump the fitted RuleFit model's parameters.
print("\n=== Model parameters ===")
# NOTE(review): this chunk opens mid-call -- the constructor prefix
# (apparently a RuleFit wrapping a gradient-boosting tree generator, given
# the `presort`/`n_iter_no_change` keywords and the trailing `tree_size=3`)
# lies outside this view; the first lines are the tail of its keyword args.
max_depth=100, max_features=None, max_leaf_nodes=15,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=500,
n_iter_no_change=None, presort='auto', random_state=572,
subsample=0.46436099318265595, tol=0.0001,
validation_fraction=0.1, verbose=0, warm_start=False),
tree_size=3)

# Fit on the training split, predict the test split, extract the rules
# (`rgb`, `x_train`, `y_train`, `x_test` are defined outside this view).
rgb.fit(x_train, y_train)
y_pred = rgb.predict(x_test)
rules = rgb.get_rules()


def scaled_absolute_error(y_test, y_pred):
    """Return the mean residual scaled by the mean deviation of *y_test*
    from its median, rounded to 4 decimals.

    NOTE(review): despite the name, no absolute value is taken -- ``e1`` is
    the mean *signed* residual and ``e2`` the mean signed deviation from the
    median (which can be near zero, inflating the ratio). Confirm whether
    ``np.abs`` was intended on both terms.
    """
    e1 = np.mean(y_test - y_pred)
    e2 = np.mean(y_test - np.median(y_test))
    return np.round(e1 / e2, 4)


scaled_absolute_error(y_test, y_pred)
# Load the Boston housing table; the CSV's first column is the row index.
boston_data = pd.read_csv("boston.csv", index_col=0)
y = boston_data.medv.values
X = boston_data.drop("medv", axis=1)
features = X.columns
X = X.values

typ = 'regressor'  # choose 'regressor' or 'classifier'

if typ == 'regressor':
    # Regression: RuleFit on top of a random-forest rule generator.
    rf = RuleFit(rfmode='regress', tree_generator=RandomForestRegressor())
    rf.fit(X, y, feature_names=features)
    y_pred = rf.predict(X)
    # In-sample root-mean-squared error.
    residual = y_pred - y
    insample_rmse = np.sqrt(np.sum(residual ** 2) / len(y))
elif typ == 'classifier':
    # Binarize the target at 21: below -> -1, at or above -> +1.
    y_class = y.copy()
    y_class[y_class < 21] = -1
    y_class[y_class >= 21] = +1
    N = X.shape[0]
    rf = RuleFit(rfmode='classify', tree_generator=RandomForestClassifier())
    rf.fit(X, y_class, feature_names=features)
    y_pred = rf.predict(X)
    y_proba = rf.predict_proba(X)
    # In-sample accuracy (fraction of matching labels).
    insample_acc = sum(y_pred == y_class) / len(y_class)

# Extract the fitted rule ensemble.
rules = rf.get_rules()
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 25 23:54:35 2018

@author: Melanie
"""
import numpy as np
import pandas as pd
from rulefit import RuleFit

# Load the data table; the CSV's first column is the row index.
boston_data = pd.read_csv("prism_numeric.csv", index_col=0)

y = boston_data.medv.values
X = boston_data.drop("medv", axis=1)
features = X.columns
# FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
# 1.0; `.values` is the documented equivalent.
X = X.values

# Fit RuleFit with default settings and pull out the learned rules.
rf = RuleFit()
rf.fit(X, y, feature_names=features)
rf.predict(X)
rules = rf.get_rules()
# Keep only rules with a non-zero coefficient, most-supported first.
rules = rules[rules.coef != 0].sort_values("support", ascending=False)
print(rules)
# Hold out the first 200 rows as the test split; train on the rest.
train = data[200:, :]
test = data[:200, :]
train_target = target[200:]
test_target = target[:200]

from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(n_estimators=500, max_depth=10,
                               learning_rate=0.01)

# RuleFit driven by the gradient-boosting rule generator above.
relu_fit = RuleFit()
relu_fit.max_iter = 4000
relu_fit.tree_generator = gb
relu_fit.fit(train, train_target, feature_names=feature_name)
f = relu_fit.predict(test)
ff = relu_fit.predict(train)
rule = relu_fit.get_rules()

# Count test points predicted within 10% relative error.
# NOTE: divides by test_target[i]; a zero target would raise ZeroDivisionError.
truth = sum(1 for i in range(test_target.shape[0])
            if abs(test_target[i] - f[i]) / test_target[i] < 0.1)
print("truth: ", truth / test_target.shape[0])
#print(rule)

# Export the rule table to Excel.
# FIX: ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0;
# using the writer as a context manager saves and closes the workbook.
ruleset = pd.DataFrame(data=rule)
with pd.ExcelWriter('./rules.xlsx') as writer:
    ruleset.to_excel(writer)
# Run k-fold cross-validation over the resampled index splits.
for train_index, test_index in resampled_index_set:
    print("\nFold:", k_fold_step)
    # Fit a fresh estimator on this fold's (resampled) training rows.
    rf = RuleFit(tree_size=args.treesize,
                 rfmode=args.rfmode,
                 max_rules=args.maxrules,
                 random_state=args.randomseed)
    rf.fit(x_resampled[train_index], y_resampled[train_index], features)
    # Evaluate only real records: indices >= X.shape[0] point at fake
    # (oversampled) data and are dropped before scoring.
    real_test_index = test_index[test_index < X.shape[0]]
    batch_test_y = y_resampled[real_test_index]
    batch_test_x = x_resampled[real_test_index]
    batch_test_size = len(real_test_index)
    y_pred = rf.predict(batch_test_x)
    accTest = accuracy_score(batch_test_y, y_pred)
    print("\nFold:", k_fold_step, "Test Accuracy:",
          "{:.6f}".format(accTest), "Test Size:", batch_test_size)
    # Stash this fold's ground truth and predictions for overall metrics.
    test_cache = np.concatenate((test_cache, batch_test_y))
    pred_cache = np.concatenate((pred_cache, y_pred))
    print(
        "\n========================================================================="
    )
    # Advance the fold counter.
    k_fold_step += 1

# Finally, dump the fitted RuleFit model's parameters.
print("\n=== Model parameters ===")