def train(self, model, save=False, make_chart=False):
    """
    Trains a grid-searched random forest and reports its errors.

    Reads and preprocesses the input data, splits it into train/test
    sets (stored on ``self``), fits a ``GridSearchCV`` over a
    ``RandomForestRegressor`` on the training split, prints the feature
    importances and the error metrics, and optionally saves the best
    estimator and/or a real-vs-predicted chart.

    Parameters
    ----------
    model: SKLearn Model
        Currently unused: this method always trains its own random
        forest grid search. Kept in the signature so existing callers
        keep working.
    save: Boolean
        Whether or not the best estimator should be saved
        (to ``../models/<name>_2017.joblib``)
    make_chart: Boolean
        Whether or not to make/save a chart

    Returns
    -------
    float, float, float:
        The Average CV Mean Squared Error (as a positive number, the
        same value that is printed), the Test Mean Absolute Error, and
        the Test MSE
    """
    # get/split data
    reader = DataReader()
    df = self.preprocess(reader.create_input_data())
    self.X_train, self.X_test, self.y_train, self.y_test = self.split_data(
        df)

    parameters = {
        'n_estimators': [1, 5, 10, 20, 30],
        'max_depth': [1, 5, 10]
    }
    self.model = GridSearchCV(RandomForestRegressor(), parameters, cv=10)

    # train model
    self.model.fit(self.X_train, self.y_train)

    # Feature importance, one "<column> <importance>" line per feature
    importances = self.model.best_estimator_.feature_importances_
    for column, importance in zip(self.X_train.columns, importances):
        print(column, importance)

    if save:
        joblib.dump(self.model.best_estimator_,
                    "../models/" + self.name + "_2017.joblib")

    print("------------------------")
    # The estimator passed here is the GridSearchCV wrapper, so every one
    # of the 8 folds runs the whole grid search again on its own training
    # part (nested cross-validation).
    neg_mses = cross_val_score(estimator=self.model,
                               X=self.X_train,
                               y=self.y_train,
                               scoring='neg_mean_squared_error',
                               cv=8)
    # 'neg_mean_squared_error' yields negated MSEs; flip the sign once so
    # the printed and the returned value agree.
    cv_mse = abs(np.mean(neg_mses))

    # Predict once and reuse the result for every metric and the chart.
    predicted = self.model.predict(self.X_test)
    test_mae = mean_absolute_error(self.y_test, predicted)
    test_mse = mean_squared_error(self.y_test, predicted)

    print("Average CV Mean Squared Error: ", cv_mse)
    print("Testing Mean Absolute Error: ", test_mae)
    print("Testing MSE: ", test_mse)

    if make_chart:
        self._plot_real_vs_predicted(predicted)

    return cv_mse, test_mae, test_mse

def _plot_real_vs_predicted(self, predicted):
    """
    Saves a chart of true vs predicted values for a test-set sample.

    Every 5th row of the first 100 test rows is plotted, labelled with
    its ``codmun`` value, and the figure is written to
    ``<name>_real_v_predicted``. The sampled predictions and labels are
    also printed.

    Parameters
    ----------
    predicted: array-like
        Predictions for ``self.X_test``, in the same row order
    """
    print("Generating Chart...")
    sample = slice(0, 100, 5)
    true_sample = self.y_test[sample]
    predicted_sample = predicted[sample]
    x_labels = self.X_test.iloc[sample]['codmun'].tolist()
    # Size the x axis from the sample itself so a test split with fewer
    # than 100 rows still plots instead of raising a shape mismatch.
    positions = np.arange(len(x_labels))

    plt.style.use('dark_background')
    fig, ax = plt.subplots(nrows=1, ncols=1)
    ax.set_ylabel('HDI')
    ax.set_xlabel("Municipality Codmun ID")
    ax.set_title(self.name + ' Real vs Predicted')
    green, = ax.plot(positions, true_sample, 'g', label='True')
    red, = ax.plot(positions, predicted_sample, 'r', label='Predicted')
    ax.set_xticks(positions)
    ax.set_xticklabels([str(int(label)) for label in x_labels],
                       rotation='vertical')
    plt.legend(handles=[green, red], labels=["True", "Predicted"])
    plt.tight_layout()
    fig.savefig(self.name + "_real_v_predicted")

    for value, label in zip(predicted_sample, x_labels):
        print(value, label)
    print(x_labels, predicted_sample)
print("Please add --train or --test after py Regressor.py") options = None if options == "--train": r = Regressor("Random Forest", load_model=False) mod = Regressor("Random Forest", load_model=False) cv, ma, mse = r.train(mod, save=False, make_chart=False) print(cv, ma, mse) elif options == "--test": model_name = sys.argv[2] + " " + sys.argv[ 3] #Random Forest_2017 or Random Forest_2016 year = int(model_name.split("_")[-1]) r = Regressor(model_name, load_model=True) reader = DataReader() df = reader.create_input_data() predictions = r.predict(df, year) print("Actual || Predicted") for i in range(len(predictions)): print(df.iloc[i]['hdi'], "||", predictions[i]) ######Training Code######### #cv_error = [] #testing_ma_error = [] #testing_mse = [] #mod = RandomForestRegressor(bootstrap=True, criterion='mae', n_estimators=100) #mod = RandomForestRegressor() #r = Regressor("Random Forest_2017", load_model=True) #importances = r.model.feature_importances_ #reader = DataReader()