def deserialize_random_forest(model_dict):
    """Rebuild a ``RandomForestClassifier`` from its dictionary representation.

    Args:
        model_dict: Mapping describing the forest. Keys read here:

            * ``params`` -- keyword arguments passed to the constructor.
            * ``estimators_`` -- iterable of serialized trees; each entry is
              handed to ``deserialize_decision_tree``.
            * ``classes_``, ``n_features_``, ``n_outputs_``, ``n_classes_``
              -- fitted attributes copied onto the model.
            * ``max_depth``, ``min_samples_split``, ``min_samples_leaf``,
              ``min_weight_fraction_leaf``, ``max_features``,
              ``max_leaf_nodes``, ``min_impurity_decrease``,
              ``min_impurity_split`` -- hyperparameters copied onto the model.
            * ``oob_score_``, ``oob_decision_function_`` -- optional; copied
              only when present.

    Returns:
        The reconstructed classifier with the attributes above restored.

    Raises:
        KeyError: If any non-optional key is missing from ``model_dict``.
    """
    model = RandomForestClassifier(**model_dict['params'])

    model.estimators_ = np.array([
        deserialize_decision_tree(tree_dict)
        for tree_dict in model_dict['estimators_']
    ])

    model.classes_ = np.array(model_dict['classes_'])
    model.n_features_ = model_dict['n_features_']
    model.n_outputs_ = model_dict['n_outputs_']

    # Applied after construction, so these values override any same-named
    # entries that were supplied through ``params``.
    hyperparameter_keys = (
        'max_depth',
        'min_samples_split',
        'min_samples_leaf',
        'min_weight_fraction_leaf',
        'max_features',
        'max_leaf_nodes',
        'min_impurity_decrease',
        'min_impurity_split',
    )
    for key in hyperparameter_keys:
        setattr(model, key, model_dict[key])

    # Out-of-bag results are optional in the serialized form.
    if 'oob_score_' in model_dict:
        model.oob_score_ = model_dict['oob_score_']
    if 'oob_decision_function_' in model_dict:
        oob_decision = model_dict['oob_decision_function_']
        # A plain list is turned back into an array, matching how classes_
        # and n_classes_ are restored, so array indexing works on the
        # restored attribute. Non-list values are passed through untouched.
        if isinstance(oob_decision, list):
            oob_decision = np.array(oob_decision)
        model.oob_decision_function_ = oob_decision

    # n_classes_ is either a scalar or a list of per-output counts; only the
    # list form is converted to an array.
    n_classes = model_dict['n_classes_']
    if isinstance(n_classes, list):
        n_classes = np.array(n_classes)
    model.n_classes_ = n_classes

    return model
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Load the data set.
datas = pd.read_csv("datas.csv")

# Features: columns from index 3 up to (but excluding) the third from last.
# Target: the second-to-last column.
x = datas.iloc[:, 3:-3].values
y = datas.iloc[:, -2].values

# 90% of the rows for training, 10% for testing; fixed seed for the split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.10, random_state=0
)

# Single-tree forest using the entropy criterion ('gini' is the alternative),
# with tree depth capped at 100.
rfc = RandomForestClassifier(
    n_estimators=1,
    criterion="entropy",
    max_depth=100,
)
rfc.fit(x_train, y_train)

# Evaluate on the held-out rows and print the confusion matrix.
y_pred = rfc.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("RFC")
print(cm)
# Demo: builds a 20-sample, 2-feature, 2-centre make_blobs data set, fits a
# RandomForestClassifier (random_state=8) with max_depth=1 and n_estimators=1,
# draws the result through plotBoundary, then refits with n_estimators set to
# each value in range(3, 10) and plots again with an "<i> estimators" title.
# NOTE(review): the statements below are collapsed onto one physical line, so
# this is not runnable as written; statement boundaries and the extent of the
# `for` body (e.g. whether plt.show() runs per iteration) must be restored
# from the original source.
# NOTE(review): plt, plotBoundary, x_min, x_max, y_min and y_max are not
# defined in this span -- presumably the leading plt.xlim/plt.ylim calls are
# the tail of a helper defined elsewhere; confirm against the full file.
plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) from sklearn.datasets import make_blobs X, y = make_blobs(n_samples=20, n_features=2, centers=2, cluster_std=2, random_state=3) plt.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolors='k') from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(random_state=8) clf.max_depth = 1 clf.n_estimators = 1 clf.fit(X, y) plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k') plotBoundary(X, clf) for i in range(3, 10): clf.n_estimators = i clf.fit(X, y) plt.scatter(X[:, 0], X[:, 1], marker='o', c=y) plotBoundary(X, clf) plt.title("{0} estimators".format(i)) plt.show()
# Fragment of a model-selection script: runs GridSearchCV (cv=4) for `model`
# over `parameters`, copies values from the search result onto `model`, falls
# back to fixed settings in an `else:` branch, optionally swaps in an
# SGDClassifier when args.SGD is set, then calls cross_val_score (cv=3) and
# fits `model` on X_trn / y_trn.
# NOTE(review): the statements are collapsed onto one physical line and the
# `if` that pairs with the `else:` below is not visible, so which assignments
# belong to which branch cannot be determined from here.
# NOTE(review): uses Python 2 print statements.
# NOTE(review): `clf.best_estimator_['oob_score']` subscripts the best
# estimator, whereas the preceding assignment reads `clf.best_params_[...]`;
# presumably `best_params_` was intended -- confirm.
# NOTE(review): `sklearn.grid_search` and `sklearn.cross_validation` look like
# legacy module paths (presumably superseded by `sklearn.model_selection`);
# confirm against the scikit-learn version in use.
from sklearn import grid_search clf = grid_search.GridSearchCV(model, parameters, cv=4, verbose=10, n_jobs=1) print 'Grid Search for the model' clf.fit(X_trn, y_trn) print clf.best_params_ model.n_estimators = clf.best_params_['n_estimators'] model.oob_score = clf.best_estimator_['oob_score'] else: model.n_estimators = 600 model.oob_score = False model.max_depth = 20 model.n_jobs = 20 from sklearn import cross_validation as cv if args.SGD: from SGDRank import SGDClassifier model = SGDClassifier() print 'CV' cv.cross_val_score(model, X_trn, y_trn, cv=3, n_jobs=3) print 'Fit the model' model.fit(X_trn, y_trn)