def histGradientModel(X_train, Y_train):
    # Use Hist Gradient Boosting Classifier; on scikit-learn < 1.0 the
    # experimental flag must be imported before the estimator itself.
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingClassifier

    histgrad = HistGradientBoostingClassifier()
    histgrad.fit(X_train, Y_train)
    train_score = histgrad.score(X_train, Y_train)
    print('\nHist Gradient Boosting Training Score:', train_score)
    return histgrad, train_score
def hist_gradient_boosting_classifier(x_train, y_train, x_test, y_test):
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
    from sklearn.ensemble import HistGradientBoostingClassifier

    ensem = HistGradientBoostingClassifier()
    ensem.fit(x_train, y_train)
    value = ensem.score(x_test, y_test)
    return "{0:.2f}".format(value)
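# A minimal usage sketch for the helper above; the synthetic dataset from
# make_classification and the 80/20 split are assumptions for illustration,
# not part of the original snippet.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(hist_gradient_boosting_classifier(x_train, y_train, x_test, y_test))  # test accuracy as a two-decimal string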
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score


def clasificar_HistGradientBoostingClassifier(X, y, df, trainInputs, trainOutputs,
                                              testInputs, testOutputs, graphname):
    print("\n[" + str(graphname) + "]")
    # 10-fold cross-validation on the full dataset.
    clf = HistGradientBoostingClassifier()
    scores = cross_val_score(clf, X, y, cv=10)
    print("\tCV mean = %.2f%%" % (scores.mean() * 100))
    # Refit on the training split and report train/test accuracy (CCR).
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%" % (precisionTrain * 100, precisionTest * 100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import f1_score


def k_fold_cross_val(Xs, y_var, k=10):
    # Compare a decision tree, a random forest and hist gradient boosting with
    # k-fold cross-validation. Assumes Xs and y_var are pandas objects sharing
    # a default RangeIndex, and that the labels are binary.
    clf = tree.DecisionTreeClassifier()
    clf_forest = RandomForestClassifier(n_estimators=10)
    clf_boost = HistGradientBoostingClassifier()
    num_folds = k
    N = Xs.shape[0]
    test_size = int(N / num_folds)
    # Shuffle the row positions and carve them into num_folds disjoint test sets.
    test_idxs = np.random.permutation(N)[:num_folds * test_size].reshape(num_folds, test_size)
    total_score = np.asarray([0., 0., 0.])
    total_F1_score = np.asarray([0., 0., 0.])
    for i in range(num_folds):
        print("Iteration " + str(i) + ":")
        test_i = Xs.index.isin(test_idxs[i])
        df_train, df_test = Xs[~test_i], Xs[test_i]
        y_train, y_test = y_var[~test_i], y_var[test_i]
        clf = clf.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_b = clf.score(df_test.to_numpy(), y_test.to_numpy().ravel())
        clf_forest = clf_forest.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_f = clf_forest.score(df_test.to_numpy(), y_test.to_numpy().ravel())
        clf_boost = clf_boost.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_h = clf_boost.score(df_test.to_numpy(), y_test.to_numpy().ravel())
        y_hat = clf.predict(df_test.to_numpy())
        f1_b = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (tree):", f1_b)
        y_hat = clf_forest.predict(df_test.to_numpy())
        f1_f = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (forest):", f1_f)
        y_hat = clf_boost.predict(df_test.to_numpy())
        f1_boost = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (boost):", f1_boost)
        print("Prediction scores for (tree,forest,boost):", score_b, score_f, score_h)
        total_score += np.asarray([score_b, score_f, score_h])
        total_F1_score += np.asarray([f1_b, f1_f, f1_boost])
    print("Avg. accuracy scores for (tree,forest,boost):", total_score / num_folds)
    print("Avg. F1 scores for (tree,forest,boost):", total_F1_score / num_folds)
    return clf, clf_forest, clf_boost
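# A minimal usage sketch for k_fold_cross_val; the pandas DataFrame built from
# make_classification below is an assumption for illustration only.
import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
Xs = pd.DataFrame(X)     # default RangeIndex, as the fold masking requires
y_var = pd.DataFrame(y)  # binary labels, as f1_score(average='binary') requires
tree_clf, forest_clf, boost_clf = k_fold_cross_val(Xs, y_var, k=5)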
import numpy as np
import pytest
from sklearn.ensemble import HistGradientBoostingClassifier


def test_missing_values_trivial():
    # sanity check for missing values support. With only one feature and
    # y == isnan(X), the gbdt is supposed to reach perfect accuracy on the
    # training set.
    n_samples = 100
    n_features = 1

    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    mask = rng.binomial(1, 0.5, size=X.shape).astype(bool)
    X[mask] = np.nan
    y = mask.ravel()

    gb = HistGradientBoostingClassifier()
    gb.fit(X, y)

    assert gb.score(X, y) == pytest.approx(1)
import pickle
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

# HistGradientBoostingClassifier accepts no n_estimators/criterion/presort
# options: max_iter plays the role of n_estimators and the loss defaults to
# log-loss. max_features (per-split feature subsampling) requires
# scikit-learn >= 1.4.
clf = HistGradientBoostingClassifier(learning_rate=lr,
                                     max_iter=n_estimators,
                                     max_depth=max_depth,
                                     max_features=max_features,
                                     warm_start=warm_start,
                                     random_state=None,
                                     verbose=0)
clf.fit(X_train, y_train)
print("n_estimators : ", n_estimators)
print("learning rate :", lr)
print("Accuracy score (training): {0:.3f}".format(clf.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(clf.score(X_test, y_test)))
print("\n")
test_score = clf.score(X_test, y_test)
filename = ('LGBM' + str(n_estimators) + str(lr) + str(max_depth) +
            str(max_features) + str(warm_start) + str(test_score) + "%" + '.sav')
# Persist any model that clears 93% validation accuracy.
if test_score > 0.93:
    pickle.dump(clf, open(filename, 'wb'))
y_predict = clf.predict(X_test)
score = accuracy_score(y_test, y_predict)
print("n_estimators = ", n_estimators)
print("max_features = ", max_features)
print("warm_start = ", warm_start)
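# The parameter names in the snippet above (n_estimators, criterion=
# 'friedman_mse', presort, loss='deviance') belong to the non-histogram
# GradientBoostingClassifier; a sketch of the equivalent call with that
# estimator, for comparison (presort was removed in scikit-learn 0.24 and
# loss='deviance' was later renamed to 'log_loss'):
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(learning_rate=lr,
                                 n_estimators=n_estimators,
                                 criterion='friedman_mse',
                                 max_depth=max_depth,
                                 max_features=max_features,
                                 warm_start=warm_start,
                                 random_state=None,
                                 verbose=0)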
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split


def HGB():
    actions = [
        'change_lane', 'pull_over', 'slow', 'stop', 'straight', 'turn_left',
        'turn_right', 'wait_to_turn_left'
    ]
    # Load the training samples: 25 (x, y, conf) keypoint triples per row,
    # with the action label in the last column.
    data_points = []
    with open('/Users/zephyryau/Documents/study/INF552/Project/data.csv', 'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                data_points.append(sample)
    data_points_xycl = np.array(data_points)
    data_points_xyc = data_points_xycl[:, :-1]
    y = data_points_xycl[:, -1]

    # centralize datapoints and normalize
    data_points_xy_cent = []
    for row in data_points_xyc:
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf
        data_points_xy_cent.append(new_row)

    # Load the query sample and preprocess it the same way.
    result_point = []
    with open('/Users/zephyryau/Documents/study/INF552/Project/input_picture/result.csv', 'r') as fd:
        for row in fd:
            row_list = row[:-1].split(',')
            sample = [float(i) for i in row_list[:-1]]
            sample.append(actions.index(row_list[-1]))
            if len(sample) == 76:
                result_point.append(sample)
    result_point_xycl = np.array(result_point)
    result_point_xyc = result_point_xycl[:, :-1]
    result_point_y = result_point_xycl[:, -1]
    result_point_xy_cent = []
    for row in result_point_xyc:
        avg_x = row[3]
        avg_y = row[4]
        head_length = ((row[0] - row[3])**2 + (row[1] - row[4])**2)**0.5
        shoulder_length = ((row[3] - row[6])**2 + (row[4] - row[7])**2)**0.5
        new_row = []
        for i in range(16):  # first 16 points
            new_row.append((row[3 * i] - avg_x) / shoulder_length)
            new_row.append((row[3 * i + 1] - avg_y) / head_length)
            new_row.append(row[3 * i + 2])  # conf
        result_point_xy_cent.append(new_row)

    '''sum = 0
    gesture_results = []
    for i in range(100):
        data_points_xy_train, data_points_xy_test, y_train, y_test = train_test_split(data_points_xy_cent, y, test_size=0.3)
        clf = MLPClassifier(hidden_layer_sizes=(512,))
        clf.fit(data_points_xy_train, y_train)
        gesture_results.append(clf.predict([result_point_xy_cent[0]])[0])
        score = clf.score(data_points_xy_test, y_test)
        #print(score)
        sum += score'''

    X_train, X_test, y_train, y_test = train_test_split(data_points_xy_cent, y, test_size=0.4)
    scaler = preprocessing.StandardScaler().fit(X_train)
    #print(scaler.mean_)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    r_X_scaled = scaler.transform(result_point_xy_cent)

    # Average the test score over 10 random splits.
    sum = 0
    clf = HistGradientBoostingClassifier()
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(data_points_xy_cent, y, test_size=0.4)
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        clf.fit(X_train_scaled, y_train)
        #print(clf.n_iter_, end=" ")
        score_train = clf.score(X_train_scaled, y_train)
        #print(score_train, end=" ")
        score_test = clf.score(X_test_scaled, y_test)
        sum += score_test
        #print(score_test)
    tf = (clf.predict([r_X_scaled[0]])[0] == result_point_y[0])
    return clf.predict([r_X_scaled[0]])[0], sum / 10, tf
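# A brief usage note for HGB(): it returns the predicted action index for the
# query sample, the test accuracy averaged over the 10 random splits, and
# whether the prediction matched the query sample's recorded label.
predicted_action, avg_test_score, is_correct = HGB()
print(predicted_action, avg_test_score, is_correct)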
Therefore, no preprocessing of missing values is needed.
- With 10,000+ samples it is much faster than conventional gradient boosting.
'''
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier

hgb = HistGradientBoostingClassifier(random_state=42)
sc = cross_validate(hgb, train_input, train_target, return_train_score=True, n_jobs=-1)
print(np.mean(sc['train_score']), np.mean(sc['test_score']))  # 0.9321723946453317 0.8801241948619236

hgb.fit(train_input, train_target)
print(hgb.score(test_input, test_target))  # 0.8723076923076923 -> about 87% accuracy

'''
XGBoost => pip install xgboost
LGBM (LightGBM) => pip install lightgbm
'''
from xgboost import XGBClassifier

xgb = XGBClassifier(tree_method='hist', random_state=42)
sc = cross_validate(xgb, train_input, train_target, return_train_score=True, n_jobs=-1)
print('@' * 50)
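# The docstring above also mentions LightGBM, but only XGBoost is exercised;
# a minimal sketch of the parallel LightGBM run via its scikit-learn wrapper,
# mirroring the cross_validate pattern above (not part of the original code):
from lightgbm import LGBMClassifier

lgb = LGBMClassifier(random_state=42)
sc = cross_validate(lgb, train_input, train_target, return_train_score=True, n_jobs=-1)
print(np.mean(sc['train_score']), np.mean(sc['test_score']))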
best_clf = None
for alpha in np.arange(0.2, 1.01, 0.2):
    for beta in np.arange(0.2, 1.01, 0.2):
        all_sample_weights = getWeights(data, alpha * wPep_t, beta * wXL_t)
        # scoring is only consulted when early stopping is enabled, which is
        # commented out here.
        clf = HistGradientBoostingClassifier(scoring="f1",
                                             monotonic_cst=monotonic_cst,
                                             tol=1e-7,
                                             random_state=42,
                                             validation_fraction=None)  #, early_stopping=True)
        clf.fit(X, y, sample_weight=all_sample_weights)
        print("alpha,beta: " + str(alpha) + "\t" + str(beta))
        print("Accuracy on all data (sample-weighted): {:.2f}".format(
            clf.score(X, y, sample_weight=all_sample_weights)))
        p = clf.predict_proba(data.loc[:, data.columns != 'Label'])[:, 1]  # prob for class=1 (target)
        p = pd.DataFrame({'p-value': p})
        data.reset_index(drop=True, inplace=True)
        p.reset_index(drop=True, inplace=True)
        data2 = pd.concat([data, p], axis=1)
        data2 = calcQ(data2, scoreColName="p-value")
        data2["Rank"] = 1
        # store best fit
        nXLauc, XLauc = evalXL(data2, plot=False, maxQ=0.1)
        print("pAUC(peptides), pAUC(XLs): " + str(nXLauc) + "\t" + str(XLauc))
        print("sum(pAUC): " + str(nXLauc + XLauc))
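# best_clf above is initialized but never updated; a minimal sketch of the
# bookkeeping that would complete the grid search, assuming the sum of pAUCs
# is the selection criterion (best_pAUC is a hypothetical tracker variable,
# not the original author's stated rule):

best_pAUC = float('-inf')  # alongside best_clf = None, before the loops

# ...then, at the end of the inner loop body:
if nXLauc + XLauc > best_pAUC:
    best_pAUC = nXLauc + XLauc
    best_clf = clf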
print(result.importances_mean)  # [0.08876275 0.23438522 0.08027708]

# n_repeats (default 5): how many times each feature is shuffled when
# measuring its importance; 10 repeats are used here.
result = permutation_importance(hgb, test_input, test_target,
                                n_repeats=10, random_state=42, n_jobs=-1)
print(result.importances)
print(result.importances_std)
print(result.importances_mean)  # [0.05969231 0.20238462 0.049     ]

hgb.score(test_input, test_target)  # 0.8723076923076923

#####
# XGBoost
###
import xgboost
from xgboost import XGBClassifier

xgb = XGBClassifier(tree_method='hist', random_state=42)
scores = cross_validate(xgb, train_input, train_target, return_train_score=True, n_jobs=-1)
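# The cross_validate call above leaves its result unread; following the same
# pattern as the earlier HistGradientBoosting run, the mean scores can be
# printed like this:
print(np.mean(scores['train_score']), np.mean(scores['test_score']))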