Ejemplo n.º 1
0
 def buildForest(self, X_train, y_train):
     """Fit a random-forest regressor on the training data and return it.

     Args:
         X_train: Training feature matrix, passed straight to ``fit``.
         y_train: Training targets, passed straight to ``fit``.

     Returns:
         The fitted ``RandomForestRegressor``. Out-of-bag scoring is
         enabled, so ``oob_score_`` is available on the result.
     """
     NUM_TREES = 100
     NUM_JOBS = 1
     FEATURES_IN_EACH_TREE = "sqrt"
     rf = RandomForestRegressor(
         n_estimators=NUM_TREES,
         verbose=1,
         n_jobs=NUM_JOBS,
         max_features=FEATURES_IN_EACH_TREE,
         oob_score=True,
         max_depth=25,
     )
     # Use fit(): the old fit_transform() shortcut on forest estimators is
     # gone from current scikit-learn releases, and its transformed output
     # was never used here anyway.
     rf.fit(X_train, y_train)
     return rf
def predict_on_test():
    """Train a random forest on the training sample and evaluate it on the test sample.

    Reads ``sample_train.csv`` and ``sample_test.csv`` from
    ``SAMPLES_FILE_PATH``, runs both through ``Preprocess``, fits a
    100-tree regressor on the training features/labels, predicts the test
    set, prints the mean squared prediction error and plots the feature
    importances of the fitted model.
    """
    sample = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_train.csv"))
    test = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_test.csv"))

    preprocessed = Preprocess(sample)
    test_preprocessed = Preprocess(test)

    # The tree-depth limit is spelled ``max_depth``; the original ``depth``
    # keyword is rejected by the constructor with a TypeError.
    # ``criterion`` is left at its default (mean squared error), which is
    # what the original "mse" selected; the string name for it has changed
    # between scikit-learn releases.
    rf = RandomForestRegressor(
        n_estimators=100,
        bootstrap=True,
        max_features='sqrt',
        max_depth=40,
    )
    # fit() instead of fit_transform(): the latter no longer exists on
    # forest estimators and its return value was discarded.
    rf.fit(preprocessed.features, preprocessed.labels.values.ravel())

    predicted_values = rf.predict(test_preprocessed.features)

    # NOTE(review): error_rate is never used afterwards; the call is kept
    # in case benchmark() reports or records something itself - confirm.
    error_rate, _ = benchmark(predicted_values.ravel(), test_preprocessed.labels.values)

    # Single-argument print() behaves the same on Python 2 and 3.
    print("Mean Square Prediction Erorr = %s" % MSE_prediction(predicted_values, test_preprocessed.labels))

    plot_feature_importances(preprocessed.features.columns.values, rf.feature_importances_)
def cross_validate_depth():
    """Compare out-of-bag error of the random forest across several tree depths.

    Reads ``sample_train.csv`` from ``SAMPLES_FILE_PATH``, preprocesses it
    with ``Preprocess``, fits one 60-tree forest per candidate
    ``max_depth`` with out-of-bag scoring enabled, prints the resulting
    error for each depth and finally plots error against depth.
    """
    sample = pd.read_csv(join(SAMPLES_FILE_PATH, "sample_train.csv"))
    preprocessed = Preprocess(sample)

    # Loop-invariant inputs, extracted once.
    features = preprocessed.features
    labels = preprocessed.labels.values.ravel()

    depths = (2, 40, 60, 80)
    oob_scores = []
    for depth in depths:
        # ``criterion`` is left at its default (mean squared error), which
        # is what the original "mse" selected; the string name for it has
        # changed between scikit-learn releases.
        rf = RandomForestRegressor(
            n_estimators=60,
            bootstrap=True,
            oob_score=True,
            max_features='sqrt',
            max_depth=depth,
        )
        # fit() instead of fit_transform(): the latter no longer exists on
        # forest estimators and its return value was discarded.
        rf.fit(features, labels)

        # For a regressor oob_score_ is an R^2-style score (higher is
        # better), so 1 - score is used as the "error" to plot.
        score = 1.0 - rf.oob_score_

        oob_scores.append(score)
        print("Out-of-Bag Error for Depth %s: %s" % (depth, score))

    # The leftover pdb.set_trace() breakpoint was removed here: it stopped
    # every run in the debugger before the plot could be drawn.
    plot_oob_error_depth(depths, oob_scores)
Ejemplo n.º 4
0
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt

# NOTE(review): train and test are encoded by two separate dense2sparse()
# calls; confirm both produce the same feature columns in the same order,
# otherwise the model is scored on mismatched features.
dftrain_X, feature_names = dense2sparse(dftrain_Xdense)
dftest_X,  feature_names = dense2sparse(dftest_Xdense)


# X is your feature space, Y is your target variable
dftrain_y = df1[df1.date <= date1]['FanDuelPts']
dftest_y  = df1[df1.date >  date1]['FanDuelPts']

#==============================================================================
# Predict the rows dated after date1
#==============================================================================

rf1 = RandomForestRegressor(verbose=True)   # Parameters need to be tuned
# fit() instead of fit_transform(): the latter no longer exists on forest
# estimators and its return value was discarded.
rf1.fit(dftrain_X, dftrain_y)               # Train the model
rf1_preds = rf1.predict(dftest_X)           # Predict against the test set

# Performance metrics (previously computed and thrown away)
mae = mean_absolute_error(dftest_y, rf1_preds)
rmse = np.sqrt(mean_squared_error(dftest_y, rf1_preds))
print("MAE:  %s" % mae)
print("RMSE: %s" % rmse)

# Plot model results.
# Build the frame directly on the test target's index: predictions come
# back as a plain array in test-row order, so merging a fresh 0..n-1 index
# against df1's original index would misalign or drop rows.
dfplot = pd.DataFrame(
    {'Actual': dftest_y.values, 'Pred': rf1_preds},
    index=dftest_y.index,
    columns=['Actual', 'Pred'],
)

dfplot.plot()

# x runs 1..n inclusive so it has the same length as the predictions
# (np.arange(1, n) is one element short and makes plt.plot raise).
plt.plot(np.arange(1, len(rf1_preds) + 1), rf1_preds)
Ejemplo n.º 5
0
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Importing the dataset
dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, [1]].values   # 2-D: one feature column
y = dataset.iloc[:, 2].values     # 1-D target, the shape fit() expects

# Splitting the dataset into the Training set and Test set (disabled).
# Note that sklearn.cross_validation was later renamed to
# sklearn.model_selection.
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling (disabled)
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)

rfr = RandomForestRegressor(n_estimators=50, random_state=0)
# fit() instead of fit_transform(): the latter no longer exists on forest
# estimators and its return value was discarded.
rfr.fit(X, y)

# predict() needs a 2-D array of shape (n_samples, n_features), so the
# single query value is wrapped as one row with one column.
y_pred = rfr.predict([[6.5]])
print(y_pred)

# Dense grid over the observed feature range to draw the fitted curve.
X_grid = np.arange(X.min(), X.max(), 0.01).reshape(-1, 1)
plt.scatter(X, y, color='blue')
plt.plot(X_grid, rfr.predict(X_grid), color='green')
plt.title('RFR')
plt.xlabel('positionsalaries')
plt.ylabel('salary')
plt.show()
Ejemplo n.º 6
0
# parameters
n_estimators = 200
max_depth = 25
min_samples_split = 15
min_samples_leaf = 2

# Random forest regressor (the variable keeps its historical name ``clf``)
clf = RandomForestRegressor(n_estimators=n_estimators,
                            max_depth=max_depth,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf)

# The training data is stacked on top of itself, so every row appears twice.
# NOTE(review): the original comment called this "2 epochs", but a forest
# is fit in a single pass; duplicating rows mainly makes the
# min_samples_split / min_samples_leaf thresholds easier to reach.
# Confirm this is intended.
X = np.concatenate((X, X), axis=0)
Y = np.concatenate((Y, Y), axis=0)

# fit() instead of fit_transform(): the latter no longer exists on forest
# estimators and its return value was discarded.
clf.fit(X, Y)

# Single-argument print() gives the same output on Python 2 and 3.
print("Feature Importance ranking %s" % (clf.feature_importances_,))

# Saving the model for predictions
file_name = "model-final"
# ``with`` closes the file even if pickling fails part-way.
with open(file_name, 'wb') as fileObject:
    pickle.dump(clf, fileObject)
# Announce the save only after it has actually succeeded.
print("---- Saved at -----" + file_name)

print("--- %s seconds ---" % (time.time() - start_time))