Beispiel #1
0
#norm age and rating
#data[0:,-2] = data[0:,-2] / data[0:,-2].max()
#data[0:,-1] = data[0:,-1] / data[0:,-1].max()
#data_word_age = data[0:,0:-1]

# Split the data matrix column-wise: everything except the last column is
# used as features, the last column is the target, flattened to one value
# per row.
train_x = data[:, :-1]
train_y = np.array(data[:, -1:]).reshape(data.shape[0])

# Hold out 30% of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    train_x, train_y, test_size=0.3, random_state=0)

# Fit a default random forest on the training part, then predict and score
# on the held-out part.
reg = RandomForestRegressor().fit(X_train, y_train)
p = reg.predict(X_test)
s = reg.score(X_test, y_test)
print(s)

# Binarize the predictions: 1.0 where the predicted value is strictly above
# the average target value, 0.0 otherwise.
# NOTE(review): the threshold is the average of `train_y`, which is built from
# every row of `data` (not only the `y_train` split) -- confirm that including
# the held-out rows in the threshold is intended.
train_rating_average = np.average(train_y)
# Vectorized comparison replaces the manual counter loop; the result is a
# float array with one entry per prediction.
binary_p = np.where(p > train_rating_average, 1.0, 0.0)
binary_ytest = np.zeros([p.shape[0]])
iteration = 0
for i in y_test:
# Draw one strip plot per entry of `features` on a 4x4 subplot grid, with
# `target` on the x axis and the feature values on the y axis.
# NOTE(review): the guard compares against len(features); if `features` is a
# DataFrame that is the row count, not the column count, and the 4x4 grid only
# has 16 slots -- confirm the intended limit.
for column in features:
    if plotnumber <= len(features):
        ax = plt.subplot(4, 4, plotnumber)
        # x/y are passed by keyword: recent seaborn releases accept only
        # `data` positionally, while older releases accept the keywords too.
        sns.stripplot(x=target, y=features[column])
    plotnumber += 1
plt.show()

# In[15]:

# Import from the public package: the private module path
# `sklearn.ensemble.forest` was deprecated and later removed from scikit-learn.
from sklearn.ensemble import RandomForestRegressor

# In[16]:

# Random forest regressor with default hyper-parameters.
rand_clf = RandomForestRegressor()

# In[272]:

# Hold out 30% of the samples for evaluation (unseeded, so the split differs
# between runs).
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# In[273]:

rand_clf.fit(x_train, y_train)

# In[274]:

# Score on the data the model was fitted on (shown as the cell output).
rand_clf.score(x_train, y_train)

# In[275]:

# Score on the held-out data (shown as the cell output).
rand_clf.score(x_test, y_test)
Beispiel #3
0
 # Generate the training and test sets.
 X_train, y_train = OrganizeData(nucleus, 'train')
 X_test, y_test = OrganizeData(nucleus, 'test')

 # Feature scaling. The mean/std are learned from the training set only and
 # the same transform is applied to the test set; scaling the test set with
 # its own statistics would put it on a different scale than the one the
 # forest's split thresholds were learned on.
 # NOTE(review): assumes `preprocessing` is sklearn.preprocessing -- confirm.
 scaler = preprocessing.StandardScaler().fit(X_train)
 X_train_scaled = scaler.transform(X_train)
 X_test_scaled = scaler.transform(X_test)

 # Set the parameters for the random forest estimator.
 estimator = RandomForestRegressor(n_estimators=50, max_features=16,
                                   max_depth=25, min_samples_split=5,
                                   min_samples_leaf=5, random_state=0)

 # Build the random forest of regression trees from the training set.
 estimator = estimator.fit(X_train_scaled, y_train)

 # Report the estimator's score on the training and on the test set.
 # Single-argument print(...) behaves the same on Python 2 and Python 3.
 print(estimator.score(X_train_scaled, y_train))
 print(estimator.score(X_test_scaled, y_test))

 # Predict the regression target for the training set and correlate the
 # predictions with the true targets.
 predicted = estimator.predict(X_train_scaled)
 cc = np.corrcoef(y_train, predicted)
 print(cc)
 print(estimator)
 #my_plotting.simple_plot_overlay(y_train,predicted)

 # Same for the test set.
 predicted = estimator.predict(X_test_scaled)
 cc = np.corrcoef(y_test, predicted)
 print(cc)
 print(estimator)
 #my_plotting.simple_plot_overlay(y_test,predicted)
 
Beispiel #4
0
#             if res[i][j] == 100:
#                 res[i][j] = 0
#             else:
#                 res[i][j] = -0.01 * res[i][j]
#     return res

# def normalizeY(arr):
#     arr=arr/100
#     return arr

if __name__ == '__main__':
    # Fit a random forest and a decision tree regressor, pickle each fitted
    # model, and print its scores.

    train_x, test_x, train_y, test_y, x_data, y_data = load(train_data_path)

    # NOTE(review): both models are fitted on x_data/y_data while
    # train_x/train_y are never used. If test_x is a subset of x_data the
    # "test score" below is measured on data the model has already seen --
    # confirm against load().
    rf_model = RandomForestRegressor()
    rf_model.fit(x_data, y_data)
    # Persist the fitted random forest to `filename`.
    with open(filename, 'wb') as model_file:
        pickle.dump(rf_model, model_file)
    rf_train_score = rf_model.score(x_data, y_data)
    rf_test_score = rf_model.score(test_x, test_y)
    print("RF train score:", rf_train_score)
    print("RF test score:", rf_test_score)

    dt_model = DecisionTreeRegressor()
    dt_model.fit(x_data, y_data)
    # Persist the fitted decision tree to `filename2`.
    with open(filename2, 'wb') as model_file:
        pickle.dump(dt_model, model_file)
    dt_train_score = dt_model.score(x_data, y_data)
    dt_test_score = dt_model.score(test_x, test_y)
    print("DT train score:", dt_train_score)
    print("DT test score:", dt_test_score)
# Baseline: fit a plain linear regression with default settings.
model = linear_model.LinearRegression()
model.fit(train_X, train_Y)

# Look at the score (described as R squared by the author) on the training
# data and the test data. These are bare expressions: the values are only
# displayed in an interactive session and are discarded when run as a script.
model.score(train_X, train_Y)
model.score(test_X, test_Y)

# See whether more complex models do better on the test data.
# (The original note said "lower"; presumably a higher R squared was meant.)
# Random forest:
forest = RandomForestRegressor()

# Fit the data without using cross-validation to select parameters.
# Author's note: the train score is much higher than the test score.
forest.fit(train_X, train_Y)
forest.score(train_X, train_Y)
forest.score(test_X, test_Y)


# Fit a random forest regressor using grid search to select the number of
# trees and the max depth (cv=10), then score the search on the test data and
# inspect the best estimator it found.
new_forest = RandomForestRegressor()
params_grid = [{'max_depth': [3, 5,10, None], 'n_estimators': [5,10,15,20, 50, 80]} ]
grid_search = GridSearchCV(new_forest, params_grid, cv=10)
grid_search.fit(train_X, train_Y)
grid_search.score(test_X, test_Y)
grid_search.best_estimator_

# Set up a boosted regression and a parameter grid for it.
# NOTE(review): a learning rate of 2 stands out next to .05 and .1 -- possibly
# a typo for .2; confirm before running the search.
boost = GradientBoostingRegressor()
params_grid = [{'learning_rate': [.05,.1,2], 'n_estimators': [20,50,100,150]} ]