def buildForest(self, X_train, y_train):
    """Fit a random-forest regressor on the training data and return it.

    Args:
        X_train: Training feature matrix handed to ``RandomForestRegressor.fit``.
        y_train: Training targets handed to ``RandomForestRegressor.fit``.

    Returns:
        The fitted ``RandomForestRegressor`` (100 trees, ``max_depth=25``,
        ``max_features="sqrt"``, out-of-bag scoring enabled).
    """
    NUM_TREES = 100
    NUM_JOBS = 1
    FEATURES_IN_EACH_TREE = "sqrt"

    rf = RandomForestRegressor(
        n_estimators=NUM_TREES,
        verbose=1,
        n_jobs=NUM_JOBS,
        max_features=FEATURES_IN_EACH_TREE,
        oob_score=True,
        max_depth=25,
    )
    # Plain ``fit``: the transformed matrix returned by ``fit_transform`` was
    # never used, and current scikit-learn forests no longer expose it.
    rf.fit(X_train, y_train)
    return rf
def predict_on_test():
    """Train a random forest on the train sample and evaluate it on the test sample.

    Reads ``sample_train.csv`` and ``sample_test.csv`` from
    ``SAMPLES_FILE_PATH``, runs both through ``Preprocess``, fits a
    100-tree forest on the training features/labels, predicts on the test
    features, prints the value returned by ``MSE_prediction`` and plots the
    fitted feature importances.
    """
    sample = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_train.csv"))
    test = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_test.csv"))

    preprocessed = Preprocess(sample)
    # NOTE(review): criterion "mse" is spelled "squared_error" in
    # scikit-learn >= 1.0 -- adjust if the project upgrades.
    rf = RandomForestRegressor(
        n_estimators=100,
        criterion="mse",
        bootstrap=True,
        max_features="sqrt",
        # The estimator's keyword is ``max_depth``; ``depth`` is rejected
        # with a TypeError.
        max_depth=40,
    )
    # ``fit`` instead of ``fit_transform``: only the fitted model is needed.
    rf.fit(X=preprocessed.features, y=preprocessed.labels.values.ravel())

    test_preprocessed = Preprocess(test)
    predicted_values = rf.predict(test_preprocessed.features)

    # The returned values are not used below; the call is kept in case
    # ``benchmark`` reports results itself -- TODO confirm.
    error_rate, _ = benchmark(predicted_values.ravel(),
                              test_preprocessed.labels.values)

    print("Mean Square Prediction Erorr = %s"
          % MSE_prediction(predicted_values, test_preprocessed.labels))
    plot_feature_importances(preprocessed.features.columns.values,
                             rf.feature_importances_)
def cross_validate_depth():
    """Compare out-of-bag error of 60-tree forests across several depths.

    Loads ``sample_train.csv`` from ``SAMPLES_FILE_PATH``, preprocesses it,
    fits one forest per candidate ``max_depth`` and records
    ``1.0 - rf.oob_score_`` for each. Every value is printed as it is
    computed and the full series is passed to ``plot_oob_error_depth``.
    """
    sample = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_train.csv"))
    preprocessed = Preprocess(sample)
    features = preprocessed.features
    labels = preprocessed.labels.values.ravel()

    depths = (2, 40, 60, 80)
    oob_scores = []
    for depth in depths:
        rf = RandomForestRegressor(
            n_estimators=60,
            criterion="mse",
            bootstrap=True,
            oob_score=True,
            max_features="sqrt",
            max_depth=depth,
        )
        # ``fit`` instead of ``fit_transform``: only ``oob_score_`` is read.
        rf.fit(X=features, y=labels)
        score = 1.0 - rf.oob_score_
        oob_scores.append(score)
        print("Out-of-Bag Error for Depth %s: %s" % (depth, score))

    # The leftover ``pdb.set_trace()`` breakpoint was removed so the function
    # runs to completion without dropping into the debugger.
    plot_oob_error_depth(depths, oob_scores)
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

dftrain_X, feature_names = dense2sparse(dftrain_Xdense)
dftest_X, feature_names = dense2sparse(dftest_Xdense)

# X is your feature space, Y is your target variable
dftrain_y = df1[df1.date <= date1]['FanDuelPts']
dftest_y = df1[df1.date > date1]['FanDuelPts']

#==============================================================================
# Predict last two weeks
#==============================================================================
rf1 = RandomForestRegressor(verbose=True)  # Parameters need to be tuned
# Train the model. ``fit`` is used because the matrix returned by
# ``fit_transform`` was discarded and only the fitted estimator is needed.
rf1.fit(dftrain_X, dftrain_y)
rf1_preds = rf1.predict(dftest_X)  # Predict against the test set

# Performance metrics: keep and report them instead of discarding the values.
rf1_mae = mean_absolute_error(dftest_y, rf1_preds)
rf1_rmse = np.sqrt(mean_squared_error(dftest_y, rf1_preds))
print("MAE: %s" % rf1_mae)
print("RMSE: %s" % rf1_rmse)

# Plot model results.
# Predictions are attached positionally: merging on index against a fresh
# 0..n-1 frame would not line up with the row labels dftest_y inherits from
# df1. Assumes dftest_X rows are in the same order as dftest_y -- TODO confirm.
dfplot = dftest_y.to_frame('Actual')
dfplot['Pred'] = rf1_preds
dfplot.plot()

# x must have exactly one entry per prediction (1..n inclusive).
plt.plot(np.arange(1, len(rf1_preds) + 1), rf1_preds)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Importing the dataset: column 1 is the single feature, column 2 the target.
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, [1]].values
y = dataset.iloc[:, [2]].values

# Splitting the dataset into the Training set and Test set (not applied):
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling (not applied):
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)

# Fit the forest on every row. ``fit`` replaces ``fit_transform`` (its result
# was unused) and the column-vector target is flattened to the 1-D shape the
# estimator expects.
rfr = RandomForestRegressor(n_estimators=50, random_state=0)
rfr.fit(X, y.ravel())

# ``predict`` needs a 2-D (n_samples, n_features) input, not a bare scalar.
y_pred = rfr.predict([[6.5]])
print(y_pred)

# Dense grid over the observed feature range, as a single-column matrix, so
# the plotted prediction curve shows the forest's step-wise output.
X_grid = np.arange(X.min(), X.max(), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))

plt.scatter(X, y, color='blue')
plt.plot(X_grid, rfr.predict(X_grid), color='green')
plt.title('RFR')
plt.xlabel('positionsalaries')
plt.ylabel('salary')
plt.show()
import pickle

# parameters
n_estimators = 200
max_depth = 25
min_samples_split = 15
min_samples_leaf = 2

# Random forest regressor
clf = RandomForestRegressor(n_estimators=n_estimators,
                            max_depth=max_depth,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf)

# "2 epochs" in the original note: the training set is stacked on itself, so
# every sample is presented to the forest twice.
X = np.concatenate((X, X), axis=0)
Y = np.concatenate((Y, Y), axis=0)
# ``fit`` instead of ``fit_transform``: the transformed matrix was never used.
clf.fit(X, Y)
print("Feature Importance ranking %s" % (clf.feature_importances_,))

# Saving the model for predictions.
# The dump was commented out, so opening with 'wb' only truncated the file
# and left it empty while still reporting success; write the model and let
# the context manager close the handle.
file_name = "model-final"
with open(file_name, 'wb') as fileObject:
    pickle.dump(clf, fileObject)
print("---- Saved at -----" + file_name)

print("--- %s seconds ---" % (time.time() - start_time))