x_features = x.drop(['Zillow Price Index'], axis=1)
x_target = x['Zillow Price Index']
y_features = y.drop(['Zillow Price Index'], axis=1)
y_target = y['Zillow Price Index']

print(x_features.shape)
print(x_target.shape)
print(y_features.shape)
print(y_target.shape)
print('-' * 100)

print("Most Important Features in Order: ", x_features.columns[[17, 37, 36, 155]])

model = RFR(n_jobs=-1)
# model = xgb.XGBRegressor()
model.fit(x_features, x_target)
y_pred = model.predict(y_features)

mse = mean_squared_error(y_target, y_pred)
print("MSE: ", mse)
rmse = np.sqrt(mse)
print("RMSE: ", rmse)
print('-' * 100)

print(model.feature_importances_)
num_objects = np.arange(len(model.feature_importances_))
plt.bar(num_objects, model.feature_importances_)
z = model.feature_importances_
# Splitting the dataset into the Training set and Test set
"""from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)"""

# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""

# Fitting the Random Forest Regression Model to the dataset
from sklearn.ensemble import RandomForestRegressor as RFR
regressor = RFR(
    n_estimators=300,  # n_estimators is the number of trees
    random_state=0,
)
regressor.fit(X, y)

# Predicting a new result
y_pred = regressor.predict([[6.5]])

# Visualising the Random Forest Regression results (for higher resolution and
# a smoother curve). Needed because the random forest prediction is piecewise
# constant rather than smooth.
X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Random Forest Regression Model)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
# Model 2 : Support Vector Regression
from sklearn.svm import SVR
model = SVR(kernel='rbf', C=20)
model.fit(num_feat_train, y_train)

# Model 3 : Linear/Polynomial Regression
from sklearn.linear_model import ElasticNet
model = ElasticNet(fit_intercept=True, normalize=True, alpha=0.1,
                   l1_ratio=1, precompute=True)
model.fit(num_feat_train, y_train)

# Model 4 : Random Forest Regression
from sklearn.ensemble import RandomForestRegressor as RFR
model = RFR(n_estimators=300, max_depth=8)
model.fit(num_feat_train_pca, y_train)

# Model 5 : ANN
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization, Activation
from keras.optimizers import SGD

# Defining custom R2 metric for ANN
# (the body was cut off after SS_res; the remainder is the standard
# R^2 = 1 - SS_res / SS_tot formula)
def r2_metric(y_true, y_pred):
    SS_res = K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - SS_res / (SS_tot + K.epsilon())
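# The snippet above stops right after the custom metric. Wiring it into a
# Keras model would look roughly like the sketch below; the layer sizes and
# training settings are hypothetical, and only the r2_metric hookup and the
# imports come from the snippet above.
model = Sequential()
model.add(Dense(64, input_dim=num_feat_train.shape[1]))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(optimizer=SGD(lr=0.01), loss='mse', metrics=[r2_metric])
model.fit(num_feat_train, y_train, epochs=50, batch_size=32, verbose=0)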
]
for algorithm_name in ["rf", "etr"]:  # ["xgBoost", "rf", "etr"]
    for estimator_output_length in [4, 5, 6]:
        iterOrCopy = "iterative"
        required_prediction_length = 14
        if algorithm_name != "xgBoost":
            for n_estimators in [500]:
                for min_samples_split in [2]:
                    for min_samples_leaf in [1]:
                        if algorithm_name == "rf":
                            estimator_withParams = RFR(
                                n_estimators=n_estimators,
                                max_features="auto",
                                min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                oob_score=False,
                                n_jobs=-1,
                                random_state=2017)
                        if algorithm_name == "etr":
                            estimator_withParams = ETR(
                                n_estimators=n_estimators,
                                max_features="auto",
                                min_samples_split=min_samples_split,
                                min_samples_leaf=min_samples_leaf,
                                oob_score=False,
                                n_jobs=-1,
                                random_state=2017)
                        Model_for_competition(
                            algorithm_name=algorithm_name,
def test_ml(stock='F', forecast_out=5, month=None, day=None, year=2019,
            plot=False, volume=False):
    # Assume the input day is a valid trading day.
    # We want to separate 1 percent of the data to forecast.
    # Today's info
    if (month == None or day == None):
        today = datetime.datetime.now()
        month = today.month
        day = today.day
    end_date = dt(year, month, day)
    trading_days = get_trading_days([2017, 2018, 2019])
    end_idx = np.where(end_date == trading_days)[0][0]
    end = trading_days[end_idx - forecast_out]
    new_start = trading_days[end_idx - forecast_out]
    new_end = trading_days[end_idx]

    # For prediction
    start = datetime.datetime(2016, 4, 1)
    #df = web.DataReader(stock, 'yahoo', start, end)
    df = read_data(stock, start, end)
    if (df.empty):
        return [0] * 10, "ERROR"
    df = df[df.index <= end]

    dfreg = df.loc[:, ['adjusted close', 'volume']]
    dfreg['HL_PCT'] = (df['high'] - df['low']) / df['adjusted close'] * 100.0
    dfreg['PCT_change'] = (df['adjusted close'] - df['open']) / df['open'] * 100.0

    # For volume testing
    if (volume):
        dfreg['adjusted close'] = dfreg['volume']

    dfreg['EMA'] = get_ema(dfreg, forecast_out)
    if (dfreg['EMA'].empty):
        return [0] * 10, "ERROR"
    dfreg['old close'] = dfreg['adjusted close']
    dfreg['adjusted close'] = dfreg['EMA']

    # For validation
    #new_df = web.DataReader(stock, 'yahoo', new_start, new_end)
    new_df = read_data(stock, new_start, new_end)
    if (new_df.empty):
        return [0] * 10, "ERROR"
    new_df = new_df[new_df.index <= new_end]
    new_dfreg = new_df.loc[:, ['adjusted close', 'volume']]
    new_dfreg['HL_PCT'] = (new_df['high'] - new_df['low']) / new_df['adjusted close'] * 100.0
    new_dfreg['PCT_change'] = (new_df['adjusted close'] - new_df['open']) / new_df['open'] * 100.0

    # Drop missing values
    dfreg.fillna(value=-99999, inplace=True)
    new_dfreg.fillna(value=-99999, inplace=True)

    # Separating out the label here; we want to predict the adjusted close.
    forecast_col = 'adjusted close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(['label'], axis=1))

    # Scale X for linear regression
    X = preprocessing.scale(X)

    # Finally we want late X and early X for the model
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate the label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    # Training and testing sets
    X_train = X[:len(X) - forecast_out]
    X_test = X[len(X) - forecast_out:]
    y_train = y[:len(y) - forecast_out]
    y_test = y[len(y) - forecast_out:]

    # LinReg
    clfreg = LinearRegression(n_jobs=-1)
    # QuadReg2
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    # QuadReg3
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    # QuadReg4
    clfpoly4 = make_pipeline(PolynomialFeatures(4), Ridge())
    # QuadReg5
    clfpoly5 = make_pipeline(PolynomialFeatures(5), Ridge())
    # KNN Regression
    clfknn = KNeighborsRegressor(n_neighbors=2)
    # Bayesian Ridge
    clfbayr = BayesianRidge()
    # Neural Network
    clfmlp = MLPRegressor(hidden_layer_sizes=(100, 100, 100),
                          learning_rate='adaptive', solver='adam',
                          max_iter=5, verbose=False)
    # Random Forest Regressor
    clfrfr = RFR(n_estimators=15)
    # Support Vector Regressor
    clfsvr = SVR(gamma='auto')

    # Fit each model in its own thread
    threads = []
    models = [
        clfreg, clfpoly2, clfpoly3, clfpoly4, clfpoly5, clfknn, clfbayr,
        clfrfr, clfsvr
    ]
    fits = [''] * len(models)
    for i in range(len(models)):
        process = Thread(target=fitting,
                         args=[models[i], X_train, y_train, fits, i],
                         name=stock)
        process.start()
        threads.append(process)
    for process in threads:
        process.join()

    start = time.time()
    try:
        reg_forecast = fits[0].predict(X_lately)
        poly2_forecast = fits[1].predict(X_lately)
        poly3_forecast = fits[2].predict(X_lately)
        poly4_forecast = fits[3].predict(X_lately)
        poly5_forecast = fits[4].predict(X_lately)
        try:
            knn_forecast = fits[5].predict(X_lately)
        except ValueError:
            knn_forecast = np.zeros(poly5_forecast.shape)
        bayr_forecast = fits[6].predict(X_lately)
        rfr_forecast = fits[7].predict(X_lately)
        svr_forecast = fits[8].predict(X_lately)
        # NOTE: clfmlp is never added to `models`, so this reuses the
        # Bayesian Ridge fit (fits[6]) as a stand-in for the MLP forecast.
        mlp_forecast = fits[6].predict(X_lately)
    except AttributeError:
        return [0] * 10, {}

    # Set up dataframe
    dfreg['reg_forecast'] = np.nan
    dfreg['poly2_forecast'] = np.nan
    dfreg['poly3_forecast'] = np.nan
    dfreg['poly4_forecast'] = np.nan
    dfreg['poly5_forecast'] = np.nan
    dfreg['knn_forecast'] = np.nan
    dfreg['bayr_forecast'] = np.nan
    dfreg['mlp_forecast'] = np.nan
    dfreg['rfr_forecast'] = np.nan
    dfreg['svr_forecast'] = np.nan

    last_date = dfreg.iloc[-1].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in zip(reg_forecast, poly2_forecast, poly3_forecast, poly4_forecast,
                 poly5_forecast, knn_forecast, bayr_forecast, mlp_forecast,
                 rfr_forecast, svr_forecast):
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = list(
            [np.nan for _ in range(len(dfreg.columns) - 10)] + list(i))

    dfreg['mean_forecast'] = dfreg[[
        'reg_forecast',
        'poly2_forecast',
        'poly3_forecast',
        'knn_forecast',
        'bayr_forecast',
        # 'mlp_forecast',
        'rfr_forecast',
        'svr_forecast'
    ]].mean(axis=1)

    # Realign the appended forecast rows onto actual trading days
    as_list = dfreg.index.tolist()
    as_list[-forecast_out:] = new_df.index.tolist()[1:]
    try:
        dfreg.index = as_list
    except:
        print("DATA MISALIGNMENT FOR: {}".format(stock))
        return [0] * 10, {}
    dfreg[-forecast_out:].index = new_df.index.tolist()[:forecast_out]

    # Trying to do all combinations
    forecasts = [
        'reg_forecast', 'poly2_forecast', 'poly3_forecast', 'poly4_forecast',
        'poly5_forecast', 'knn_forecast', 'bayr_forecast', 'rfr_forecast',
        'svr_forecast'
    ]

    if (plot):
        dfreg['old close'].tail(20).plot(figsize=(20, 12), lw=2)
        dfreg['adjusted close'].tail(20).plot(figsize=(20, 12), lw=2)
        dfreg['reg_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly2_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly3_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly4_forecast'].tail(20).plot(lw=0.5)
        dfreg['poly5_forecast'].tail(20).plot(lw=0.5)
        dfreg['knn_forecast'].tail(20).plot(lw=0.5)
        dfreg['bayr_forecast'].tail(20).plot(lw=0.5)
        dfreg['mean_forecast'].tail(20).plot(c='k')
        #dfreg['mlp_forecast'].tail(20).plot()
        dfreg['rfr_forecast'].tail(20).plot(lw=0.5)
        dfreg['svr_forecast'].tail(20).plot(lw=0.5)

    new_dfreg['Actual close'] = new_df['adjusted close']
    if (plot):
        new_dfreg['Actual close'].tail(20).plot(c='g', lw=2)

    fit = np.polyfit([i for i in range(forecast_out)],
                     dfreg['mean_forecast'].values[-forecast_out:], deg=1)

    # Calculate the correlation between each method (and each combination
    # of methods) and the actual prices
    actual = new_dfreg['Actual close'].tail(forecast_out)
    highest_corr = 0
    best_comb = ''
    num_combs = 0
    correlations = []
    good_combinations = []
    for comb in all_combinations:
        num_combs += 1
        comb_dat = dfreg[[*list(comb)]].mean(axis=1).tail(forecast_out)
        new_correlation = corr(comb_dat, actual)[0]
        correlations.append(new_correlation)
        if (new_correlation > 0.4):
            good_combinations.append(comb)
        if (new_correlation > highest_corr):
            highest_corr = new_correlation
            best_comb = comb

    reg_dat = dfreg['reg_forecast'].tail(forecast_out)
    reg_corr = corr(reg_dat, actual)
    poly2_dat = dfreg['poly2_forecast'].tail(forecast_out)
    poly2_corr = corr(poly2_dat, actual)
    poly3_dat = dfreg['poly3_forecast'].tail(forecast_out)
    poly3_corr = corr(poly3_dat, actual)
    poly4_dat = dfreg['poly4_forecast'].tail(forecast_out)
    poly4_corr = corr(poly4_dat, actual)
    poly5_dat = dfreg['poly5_forecast'].tail(forecast_out)
    poly5_corr = corr(poly5_dat, actual)
    knn_dat = dfreg['knn_forecast'].tail(forecast_out)
    knn_corr = corr(knn_dat, actual)
    bayr_dat = dfreg['bayr_forecast'].tail(forecast_out)
    bayr_corr = corr(bayr_dat, actual)
    rfr_dat = dfreg['rfr_forecast'].tail(forecast_out)
    rfr_corr = corr(rfr_dat, actual)
    svr_dat = dfreg['svr_forecast'].tail(forecast_out)
    svr_corr = corr(svr_dat, actual)
    mean_dat = dfreg['mean_forecast'].tail(forecast_out)
    mean_corr = corr(mean_dat, actual)

    if (plot):
        plt.legend(loc='best')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.title(stock)
        plt.savefig("./test_plots/{1}_{2}/{0}_{1}_{2}_{3}".format(
            stock, month, day, forecast_out))
        plt.close()

    return (reg_corr[0], poly2_corr[0], poly3_corr[0], poly4_corr[0],
            poly5_corr[0], knn_corr[0], bayr_corr[0], rfr_corr[0],
            mean_corr[0], svr_corr[0]), good_combinations
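# The `fitting` helper targeted by each Thread above is not defined in this
# section. A minimal sketch consistent with how fits[i] is consumed after
# join() (a hypothetical reconstruction; it assumes each estimator's fit()
# returns the fitted model, which holds for sklearn estimators):
def fitting(model, X_train, y_train, fits, i):
    # Fit the model and store the fitted estimator so the main thread
    # can call fits[i].predict() once the thread has joined.
    fits[i] = model.fit(X_train, y_train)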
def garch(input_col=['p_var', 'mean_return_square', 'sum_abs_sent_square'],
          file_name='modified_garch'):
    # data = resample_data.process_all_codes()
    data = pd.read_csv(settings.get_home_path() + 'data/week_data/total.csv',
                       index_col='date')
    indexs = data.index.drop_duplicates()
    file = open(settings.get_home_path() + 'data/{}.csv'.format(file_name), 'w')
    file.write(
        "time_window_forecast_output,time_window_forecast_input,"
        "number of train,number of test,adj_svr_R^2,adj_rfr_R^2,"
        "svr_trend,rfr_trend\n")

    svr = SVR(kernel='rbf', C=64, gamma=1 / 3)
    rfr = RFR(max_features=1, warm_start=False)
    origin_col = ['p_var', 'mean_return', 'sum_abs_sent']

    # The first training window
    train_input = data[data.index == indexs[0]].set_index('code')[origin_col]
    train_output = data[data.index == indexs[1]].set_index('code')['p_var']
    train = train_input.join(train_output, rsuffix='_out')
    train = train.dropna()
    train['mean_return_square'] = np.square(train['mean_return'])
    train['sum_abs_sent_square'] = np.square(train['sum_abs_sent'])

    for i in range(1, len(indexs) - 1):
        # The number of companies and features in the training set
        num = len(train.index)
        num_features = len(input_col)

        # Train the models
        svr.fit(train[input_col], train.p_var_out.values)
        rfr.fit(train[input_col], train.p_var_out.values)

        # Build the test window
        test_input = data[data.index == indexs[i]].set_index('code')[origin_col]
        test_output = data[data.index == indexs[i + 1]].set_index('code')['p_var']
        test = test_input.join(test_output, rsuffix='_out')
        test = test.dropna()
        test['mean_return_square'] = np.square(test['mean_return'])
        test['sum_abs_sent_square'] = np.square(test['sum_abs_sent'])
        num_test = len(test.index)

        # Predict
        test['p_var_pre'] = svr.predict(test[input_col])
        test['p_var_pre2'] = rfr.predict(test[input_col])

        # Get the (training) R squared
        r1 = svr.score(train[input_col], train.p_var_out.values).round(4)
        r2 = rfr.score(train[input_col], train.p_var_out.values).round(4)

        # Get the adjusted R squared
        adj_r1 = 1 - (1 - r1) * (num - 1) / (num - num_features - 1)
        adj_r2 = 1 - (1 - r2) * (num - 1) / (num - num_features - 1)

        # Calculate the hit rate for predicting the trend direction
        # (.ix is deprecated; .loc performs the same label-based lookup here)
        test = test.assign(trend=0)
        test.loc[(test.p_var_out >= test.p_var) &
                 (test.p_var_pre >= test.p_var), 'trend'] = 1
        test.loc[(test.p_var_out <= test.p_var) &
                 (test.p_var_pre <= test.p_var), 'trend'] = 1
        test = test.assign(trend2=0)
        test.loc[(test.p_var_out >= test.p_var) &
                 (test.p_var_pre2 >= test.p_var), 'trend2'] = 1
        test.loc[(test.p_var_out <= test.p_var) &
                 (test.p_var_pre2 <= test.p_var), 'trend2'] = 1
        t1 = test.trend.mean().round(4)
        t2 = test.trend2.mean().round(4)

        # Output to file
        file.write("{},{},{},{},{},{},{},{}\n".format(
            indexs[i + 1], indexs[i], num, num_test, adj_r1, adj_r2, t1, t2))

        # Save all predicted volatilities
        out = test[['p_var_pre', 'p_var_pre2', 'p_var_out']]
        out = out.assign(date=indexs[i + 1])
        out.to_csv(settings.get_home_path() +
                   'data/total_predict_{}.csv'.format(file_name),
                   mode='a', header=False, float_format='%.3f')

        # Training set for the next time window
        train = test[input_col + ['p_var_out']]
        print("finish: ", indexs[i])
    file.close()
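# For reference, the adjusted R^2 computed inline in garch() above, factored
# into a standalone helper (same formula; the function name is ours):
def adjusted_r2(r2, n_samples, n_features):
    # R2_adj = 1 - (1 - R2) * (n - 1) / (n - k - 1),
    # where n is the number of samples and k the number of predictors.
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)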
YtCV = []
YvCV = []
for tr_idx, va_idx in Xsp0:
    XtCV.append(X[tr_idx])
    XvCV.append(X[va_idx])
    YtCV.append(Y[tr_idx])
    YvCV.append(Y[va_idx])

errTLF = []
errVLF = []
leaves = [2**k for k in range(1, 11)]
for lf in leaves:
    errti = []
    errvi = []
    for i in range(5):
        rfr = RFR(n_estimators=50, max_features=8, min_samples_leaf=lf)
        rfr.fit(XtCV[0], YtCV[0])
        errti.append(mse(YtCV[0], rfr.predict(XtCV[0])))
        errvi.append(mse(YvCV[0], rfr.predict(XvCV[0])))
    errti = np.array(errti)
    errvi = np.array(errvi)
    errTLF.append(np.mean(errti))
    errVLF.append(np.mean(errvi))
#%%
plt.semilogx(leaves, errTLF, '*-', label='Train Err')
plt.semilogx(leaves, errVLF, '*-', label='Valid Err')
plt.legend()
plt.title('RandomForest Err vs MinLeaf')
plt.xticks(leaves, leaves)
plt.xlabel('min leaves')
# In[12]:

CVS(reg, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# In[13]:

# List all the scoring metrics sklearn provides for model evaluation
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

# In[14]:

# Use random forest and linear regression as baselines for comparison
# Random forest
rfr = RFR(n_estimators=100)
CVS(rfr, xtrain, ytrain, cv=5).mean()

# In[15]:

CVS(rfr, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# In[16]:

# Linear regression
lr = LinearR()
CVS(lr, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# In[17]:

# Enable the `silent` parameter: when the data is huge and training is slow,
# this parameter lets you monitor the model's progress
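# The comment above refers to xgboost's legacy `silent` flag. A minimal
# sketch of what the next cell presumably does, assuming the pre-1.0
# xgboost sklearn API (silent=False prints per-round messages; releases
# >= 1.0 replaced it with `verbosity`):
import xgboost as xgb
reg = xgb.XGBRegressor(n_estimators=100, silent=False)
reg.fit(xtrain, ytrain)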
def __init__(self, featureset=None, target=None, mode='predict', path=''):
    if (mode == 'train'):
        self.__svm = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                         decision_function_shape='ovr', degree=3, gamma='auto',
                         kernel='rbf', max_iter=-1, probability=False,
                         random_state=None, shrinking=True, tol=0.001,
                         verbose=False)
        self.__svr = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                         epsilon=0.1, gamma='auto', kernel='rbf', max_iter=-1,
                         shrinking=True, tol=0.001, verbose=False)
        self.__nusvm = NuSVC(cache_size=200, class_weight=None, coef0=0.0,
                             decision_function_shape='ovr', degree=3,
                             gamma='auto', kernel='rbf', max_iter=-1, nu=0.5,
                             probability=False, random_state=None,
                             shrinking=True, tol=0.001, verbose=False)
        self.__nusvr = NuSVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                             gamma='auto', kernel='rbf', max_iter=-1, nu=0.5,
                             shrinking=True, tol=0.001, verbose=False)
        self.__linsvm = LinearSVC(C=1.0, class_weight=None, dual=True,
                                  fit_intercept=True, intercept_scaling=1,
                                  loss='squared_hinge', max_iter=1000,
                                  multi_class='ovr', penalty='l2',
                                  random_state=None, tol=0.0001, verbose=0)
        self.__linsvr = LinearSVR(C=1.0, dual=True, epsilon=0.0,
                                  fit_intercept=True, intercept_scaling=1.0,
                                  loss='epsilon_insensitive', max_iter=1000,
                                  random_state=None, tol=0.0001, verbose=0)
        self.__mlpc = MLPC(activation='relu', alpha=1e-05, batch_size='auto',
                           beta_1=0.9, beta_2=0.999, early_stopping=False,
                           epsilon=1e-08, hidden_layer_sizes=(100, 25),
                           learning_rate='constant', learning_rate_init=0.001,
                           max_iter=200, momentum=0.9, nesterovs_momentum=True,
                           power_t=0.5, random_state=1, shuffle=True,
                           solver='lbfgs', tol=0.0001, validation_fraction=0.1,
                           verbose=False, warm_start=False)
        self.__mlpr = MLPR(activation='relu', alpha=0.0001, batch_size='auto',
                           beta_1=0.9, beta_2=0.999, early_stopping=False,
                           epsilon=1e-08, hidden_layer_sizes=(100, 25),
                           learning_rate='constant', learning_rate_init=0.001,
                           max_iter=200, momentum=0.9, nesterovs_momentum=True,
                           power_t=0.5, random_state=None, shuffle=True,
                           solver='adam', tol=0.0001, validation_fraction=0.1,
                           verbose=False, warm_start=False)
        self.__dtc = DTC(class_weight=None, criterion='gini', max_depth=None,
                         max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort=False,
                         random_state=None, splitter='best')
        self.__dtr = DTR(criterion='mse', max_depth=None, max_features=None,
                         max_leaf_nodes=None, min_impurity_decrease=0.0,
                         min_impurity_split=None, min_samples_leaf=1,
                         min_samples_split=2, min_weight_fraction_leaf=0.0,
                         presort=False, random_state=None, splitter='best')
        self.__rfc = RFC(bootstrap=True, class_weight=None, criterion='gini',
                         max_depth=100, max_features='auto',
                         max_leaf_nodes=None, min_impurity_decrease=0.0,
                         min_impurity_split=None, min_samples_leaf=1,
                         min_samples_split=2, min_weight_fraction_leaf=0.0,
                         n_estimators=50, n_jobs=1, oob_score=False,
                         random_state=None, verbose=0, warm_start=False)
        self.__rfr = RFR(bootstrap=True, criterion='mse', max_depth=None,
                         max_features='auto', max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=10,
                         n_jobs=1, oob_score=False, random_state=None,
                         verbose=0, warm_start=False)
        (self.__svm, self.__svr, self.__nusvm, self.__nusvr, self.__linsvm,
         self.__linsvr, self.__mlpc, self.__mlpr, self.__dtc, self.__dtr,
         self.__rfc, self.__rfr) = self.__trainAll(X=list(featureset),
                                                   Y=list(target))
        self.__saveModelsToFile(path)
    else:
        self.__svm = joblib.load(path + 'Mel_SVM.pkl')
        self.__svr = joblib.load(path + 'Mel_SVR.pkl')
        self.__nusvm = joblib.load(path + 'Mel_NuSVM.pkl')
        self.__nusvr = joblib.load(path + 'Mel_NuSVR.pkl')
        self.__linsvm = joblib.load(path + 'Mel_LinSVM.pkl')
        self.__linsvr = joblib.load(path + 'Mel_LinSVR.pkl')
        self.__mlpc = joblib.load(path + 'Mel_MLPC.pkl')
        self.__mlpr = joblib.load(path + 'Mel_MLPR.pkl')
        self.__dtc = joblib.load(path + 'Mel_DTC.pkl')
        self.__dtr = joblib.load(path + 'Mel_DTR.pkl')
        self.__rfc = joblib.load(path + 'Mel_RFC.pkl')
        self.__rfr = joblib.load(path + 'Mel_RFR.pkl')
dpaperi['collab_prestige'] = int(
    np.nan_to_num(
        np.median(np.exp(50) / (np.exp(50) + np.exp(prestige_ca_vec)))) >= 0.35)
dpaperi['collab_citation'] = np.nan_to_num(np.mean(citation_ca_vec))
dpaperi['review'] = scipy.special.expit(20 * (review - 0.5))
df_s_a_emb[index] = dpaperi

dsa = pd.DataFrame.from_dict(df_s_a_emb, orient='index')
dsa11 = dsa.loc[dsa['prestige'] == 1].loc[dsa['collab_prestige'] == 1]
dsa10 = dsa.loc[dsa['prestige'] == 1].loc[dsa['collab_prestige'] == 0]
dsa01 = dsa.loc[dsa['prestige'] == 0].loc[dsa['collab_prestige'] == 1]
dsa00 = dsa.loc[dsa['prestige'] == 0].loc[dsa['collab_prestige'] == 0]

# One regressor per (prestige, collab_prestige) stratum
rfs11 = RFR(n_estimators=50)
rfs10 = RFR(n_estimators=50)
rfs01 = RFR(n_estimators=50)
rfs00 = RFR(n_estimators=50)
rfs11 = rfs11.fit(dsa11[['citation', 'collab_citation']], dsa11['review'])
rfs10 = rfs10.fit(dsa10[['citation', 'collab_citation']], dsa10['review'])
rfs01 = rfs01.fit(dsa01[['citation', 'collab_citation']], dsa01['review'])
rfs00 = rfs00.fit(dsa00[['citation', 'collab_citation']], dsa00['review'])

ie1 = rfs11.predict(dsa[['citation', 'collab_citation']]) - rfs01.predict(
    dsa[['citation', 'collab_citation']])
ie0 = rfs10.predict(dsa[['citation', 'collab_citation']]) - rfs00.predict(
    dsa[['citation', 'collab_citation']])
re1 = rfs11.predict(dsa[['citation', 'collab_citation']]) - rfs10.predict(
    dsa[['citation', 'collab_citation']])
print("Accuracy: %0.2f (± %0.2f)" % (scores.mean(), scores.std() * 2))

keys = list(range(0, len(confs.keys())))
conf_keys = list(confs.keys())
acc = np.zeros((len(confs.keys()), ))
blind = np.zeros((len(confs.keys()), ))
for i in range(0, len(confs.keys())):
    d_temp = []
    for j in range(0, len(data)):
        if data[j, 1] == i:
            d_temp.append(list(data[j, :]) + [target[j]])
    if len(d_temp) > 0:
        d_temp = np.array(d_temp)
        # Fit a random forest per conference (the variable name `lr` is
        # kept from the original even though this is not a linear model)
        lr = RFR()
        lr = lr.fit(np.array(d_temp)[:, 2].reshape(-1, 1),
                    np.array(d_temp)[:, 3])
        scores = lr.score(np.array(d_temp)[:, 2].reshape(-1, 1),
                          np.array(d_temp)[:, 3])  # , cv=5)
        acc[i] = 1 - scores
        blind[i] = np.mean(d_temp[:, 0])
        fig = plt.figure(figsize=(8.75, 7))
        plt.scatter(np.array(d_temp)[:, 2], np.array(d_temp)[:, 3])
        plt.title("Conference %s" % (conf_keys[i]))
        fig.savefig("output/status_review_conference_%d.png" % (keys[i]))
# plt.scatter(range(0, len(acc)), acc, c=blind)
# display_score(rmse_scores)
# Mean: 71227.31692492112
# Standard deviation: 2926.49161963209

# Compare against the cross-validation scores of linear regression
lr_scores = cvs(lr, housing_prepared, housing_labels,
                scoring="neg_mean_squared_error", cv=10)
lr_rmse = np.sqrt(-lr_scores)
# display_score(lr_rmse)
# Mean: 69052.46136345083
# Standard deviation: 2731.6740017983425

# Comparing the two results, the decision tree regressor has clearly overfit
# so badly that it performs worse than linear regression, so let's try one
# last model: RandomForestRegressor.
from sklearn.ensemble import RandomForestRegressor as RFR
rfr = RFR()
# rfr.fit(housing_prepared, housing_labels)
# housing_predicted = rfr.predict(housing_prepared)
# rmse = np.sqrt(mean_squared_error(housing_labels, housing_predicted))
# print("RMSE: ", rmse)
# RMSE: 18620.70199601925
# rfr_scores = cvs(rfr, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
# rfr_rmse = np.sqrt(-rfr_scores)
# display_score(rfr_rmse)
# Mean: 50243.380660403775
# Standard deviation: 1997.2178724397745

# The result is still overfit, since the training score (rmse) is far below
# the validation score (rfr_rmse).
# Note: don't spend too much time tuning hyperparameters at this stage; the
# goal is to shortlist a few (2-5) promising models, then fine-tune.

# 6. Fine-tune the model
# Grid search
# Using Scikit-Learn GridSearchCV (see the sketch below)
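# The grid-search step announced above is not shown in this section. A
# minimal sketch, assuming the same housing_prepared / housing_labels
# variables; the parameter-grid values are illustrative, not from the source:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
grid_search = GridSearchCV(RFR(), param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
print(grid_search.best_params_)
print(np.sqrt(-grid_search.best_score_))  # RMSE of the best combination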
def RandomForest_regression(self):
    model = RFR(n_estimators=1000, max_depth=10)
    model.fit(self.train_X, self.train_y)
    path = model.decision_path(self.train_X)
    self.y_pre_train = model.predict(self.train_X)
    self.y_pre_valid = model.predict(self.valid_X)
def train_model():
    data = get_data()
    X_train, X_test, y_train, y_test = split_data(data)
    X_train, y_train = remove_county_state(X_train, y_train)
    X_test, y_test = remove_county_state(X_test, y_test)
    print('y_train', list(y_train))
    print('y_test', list(y_test))
    print('all y', list(y_train) + list(y_test))

    # Data preprocessing (removing the mean and scaling to unit variance
    # with StandardScaler)
    pipeline = make_pipeline(StandardScaler(), RFR())

    # Set hyperparameters (alternative grids kept commented out for reference)
    hyperparameters = {
        # 'randomforestregressor__max_features': ['auto', 'sqrt'],
        # 'randomforestregressor__max_depth': [3, 5, None],
        # 'randomforestregressor__bootstrap': [True, False],
        # 'randomforestregressor__min_samples_leaf': [3, 5, 7],
        # 'randomforestregressor__min_samples_split': [5, 10, 15],
        # 'randomforestregressor__n_estimators': [5, 8, 10, 15],
        # 'randomforestregressor__max_features': ['sqrt'],
        # 'randomforestregressor__max_depth': [100],
        # 'randomforestregressor__bootstrap': [False],
        # 'randomforestregressor__min_samples_leaf': [1],
        # 'randomforestregressor__min_samples_split': [2],
        # 'randomforestregressor__n_estimators': [200],
        # 'randomforestregressor__max_leaf_nodes': [None],
        # 'randomforestregressor__min_impurity_decrease': [0.0],
        # 'randomforestregressor__min_impurity_split': [None],
        # 'randomforestregressor__min_weight_fraction_leaf': [0.0],
        'randomforestregressor__max_features': ['auto'],
        'randomforestregressor__max_depth': [None],
        'randomforestregressor__bootstrap': [True],
        'randomforestregressor__min_samples_leaf': [5],
        'randomforestregressor__min_samples_split': [10],
        'randomforestregressor__n_estimators': [10],
        # 'randomforestregressor__n_estimators': [10, 30, 50, 70, 100],
    }

    # Tune the model via the pipeline
    clf = GridSearchCV(pipeline, hyperparameters, cv=3)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    # print('feature importances:', clf.best_estimator_[-1].feature_importances_)
    print('r2 score:', r2_score(y_test, pred))
    print('mse:', mean_squared_error(y_test, pred))
    print('*' * 20)
    print('best params:', clf.best_params_)
    print('best grid:', clf.best_estimator_)
    print('^' * 20)
    eval_model(clf.best_estimator_, X_train, y_train, X_test, y_test)
    print('#' * 20)
    print('score', clf.score(X_test, y_test))  # was printing the bound method
    return clf
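# eval_model() is called above but not defined in this section. A plausible
# minimal sketch (hypothetical helper) that reports R^2 and MSE on both
# splits for the tuned estimator:
def eval_model(model, X_train, y_train, X_test, y_test):
    for name, X, y in [('train', X_train, y_train), ('test', X_test, y_test)]:
        pred = model.predict(X)
        print('{} r2: {:.4f} mse: {:.4f}'.format(
            name, r2_score(y, pred), mean_squared_error(y, pred)))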
for file in files:
    # Load data
    with open(file, "rb") as f:
        datas = pickle.load(f)
    results = {}

    # Box-Cox shift params
    lambda_ = datas['cat_data']['lambda']
    shift = datas['cat_data']['shift']

    # Models
    models = {}
    models["RF"] = GridSearchCV(
        RFR(n_jobs=-1),
        param_grid={
            "n_estimators": [10, 100, 1000, 10000],
            "max_features": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        },
        cv=5,
        n_jobs=20)
    models["LASSO"] = LassoCV(max_iter=100000, cv=5, n_jobs=20)
    models["RIDGE"] = RidgeCV(cv=5)
    models["LASSOLARS"] = LassoLarsCV(max_iter=5000, cv=5, n_jobs=-1)
    models["SVR_POLY2"] = GridSearchCV(
        SVR(kernel='poly', degree=2),
        param_grid={
            "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
            "gamma": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "epsilon": [0.01, 0.1, 0.5, 1, 2, 4]
def rfrcv(n_estimators, min_samples_split, max_features):
    # Reconstructed signature: only the tail of this definition survived.
    # The parameters mirror the bounds passed to BayesianOptimization below;
    # the integer-valued ones are cast because RFR requires ints.
    val = cross_val_score(RFR(n_estimators=int(n_estimators),
                              min_samples_split=int(min_samples_split),
                              max_features=max_features,
                              random_state=42),
                          X_train, y_train, cv=2).mean()
    return val


rfrBO = BayesianOptimization(
    rfrcv, {
        'n_estimators': (100, 400),
        'min_samples_split': (20, 100),
        'max_features': (0.1, 0.999)
    })
gp_params = {"alpha": 1e-5}
rfrBO.maximize(n_iter=10, **gp_params)

rf = RFR(n_estimators=268, min_samples_split=20, max_features=9,
         random_state=42)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)


def ridgecv(alpha):
    val = cross_val_score(Ridge(alpha=alpha, random_state=42),
                          X_train, y_train, cv=2).mean()
    return val


ridgeBO = BayesianOptimization(ridgecv, {'alpha': (0.01, 11)})
ridgeBO.maximize(n_iter=20, **gp_params)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
var = pca.explained_variance_ratio_

# Regressor
from sklearn.ensemble import RandomForestRegressor as RFR
regressor = RFR(n_estimators=100)
regressor.fit(X_train, y_train)

# Prediction
y_pred = regressor.predict(X_test)

# Plotting (works because PCA reduced the features to a single component)
plt.scatter(X_train, y_train, color="red")
plt.plot(X_train, regressor.predict(X_train), color="blue")
plt.show()
plt.scatter(X_test, y_test, color="red")
plt.plot(X_test, y_pred, color="blue")
plt.show()
train_test_combine.name = train_test_combine.name.astype('category')
train_test_combine.brand_name = train_test_combine.brand_name.astype('category')
train_test_combine.general_cat = train_test_combine.general_cat.astype('category')
train_test_combine.subcat_1 = train_test_combine.subcat_1.astype('category')
train_test_combine.subcat_2 = train_test_combine.subcat_2.astype('category')

# Replace the categorical columns with their integer codes
train_test_combine.name = train_test_combine.name.cat.codes
train_test_combine.brand_name = train_test_combine.brand_name.cat.codes
train_test_combine.general_cat = train_test_combine.general_cat.cat.codes
train_test_combine.subcat_1 = train_test_combine.subcat_1.cat.codes
train_test_combine.subcat_2 = train_test_combine.subcat_2.cat.codes

# Modeling
train_test_combine = train_test_combine.drop(["test_id", "train_id"], axis=1)
train_test_combined = pd.concat([
    train_test_combine.reset_index(drop=True),
    train_test_tfidf.reset_index(drop=True)
], axis=1)
df_train = train_test_combined.loc[train_test_combined["is_train"] == 1]
df_test = train_test_combined.loc[train_test_combined["is_train"] == 0]
df_test = df_test.drop(["is_train"], axis=1)
df_train = df_train.drop(["is_train"], axis=1)
df_train["log_price"] = train.log_price.values

x_train, y_train = df_train.drop(['log_price'], axis=1), df_train.log_price
model = RFR(n_estimators=4)
model.fit(x_train, y_train)
y_test = model.predict(df_test)

# Invert the log1p transform applied to the target
submission = pd.DataFrame({"test_id": list(test["test_id"])})
submission["price"] = y_test
submission["price"] = submission["price"].apply(lambda x: np.exp(x) - 1)
def fun_rfr(x):
    # Fit a random forest with out-of-bag scoring and return the OOB R^2
    clf = RFR(n_estimators=500, oob_score=True)
    rf_fit = clf.fit(X=x, y=pheno)
    return rf_fit.oob_score_
print([*zip(poly.get_feature_names(), reg.coef_)][:10])

# Put the coefficients into a dataframe and sort them
coeff = pd.DataFrame([poly.get_feature_names(), reg.coef_.tolist()]).T
coeff.columns = ["feature", "coef"]
coeff.sort_values(by="coef", inplace=True)

# In[]:
from time import time
time0 = time()
print("R2:{}".format(reg.score(X_, y)))
print("time:{}".format(time() - time0))

# In[]:
# What if we use another model?
from sklearn.ensemble import RandomForestRegressor as RFR
time0 = time()
print("R2:{}".format(RFR(n_estimators=100).fit(X, y).score(X, y)))
# R2:0.9743205003727138
print("time:{}".format(time() - time0))
for tr_idx, va_idx in Xsp0:
    XtCV.append(X[tr_idx])
    XvCV.append(X[va_idx])
    YtCV.append(Y[tr_idx])
    YvCV.append(Y[va_idx])

errTP = []
errVP = []
#%%
parents = [2**k for k in range(1, 11)]
#%%
for pr in parents:
    errti = []
    errvi = []
    for i in range(5):
        rfr = RFR(n_estimators=50, max_features=8, min_samples_split=pr)
        rfr.fit(XtCV[0], YtCV[0])
        errti.append(mse(YtCV[0], rfr.predict(XtCV[0])))
        errvi.append(mse(YvCV[0], rfr.predict(XvCV[0])))
    errti = np.array(errti)
    errvi = np.array(errvi)
    errTP.append(np.mean(errti))
    errVP.append(np.mean(errvi))
#%%
plt.semilogx(parents, errTP, '*-', label='Train Err')
plt.legend()
plt.title('RandomForest Train Err vs MinParent')
plt.xticks(parents, parents)
plt.xlabel('min parent')
plt.ylabel('err')
# KNeighbors Regression
knr = KNR(n_neighbors=4, weights='distance', p=4)
# Training
knr.fit(X_train, y_train)
# Testing
y_pred_knr = knr.predict(X_test)
# R squared
r_2_knr = r2_score(y_test, y_pred_knr)

# Random Forest Regression
rfr = RFR(n_estimators=100, max_features='auto', random_state=1)
# Training
rfr.fit(X_train, y_train)
# Testing
y_pred_rfr = rfr.predict(X_test)
# R squared
r_2_rfr = r2_score(y_test, y_pred_rfr)
# Feature importance
fet = rfr.feature_importances_
    url_data['station'],
    np.dot(url_data['day'], 24) + url_data['hour']
]).T
if city == 'bj':
    y_train_data = np.array(
        [url_data['pm25'], url_data['pm10'], url_data['o3']]).T
else:
    y_train_data = np.array([url_data['pm25'], url_data['pm10']]).T

for i in range(2):
    station_list = []
    for station in range(STATION_NUM[city]):
        station_list = np.concatenate((station_list, [station] * HOUR_NUM))
    day_delta_list = [url_data['day_delta'] + i] * HOUR_NUM * STATION_NUM[city]
    hour_list = list(range(HOUR_NUM)) * STATION_NUM[city]
    x_data = np.array([station_list, day_delta_list, hour_list]).T
    hour_list = list(range(HOUR_NUM)) * STATION_NUM[city] + np.dot(day_delta_list, 24)
    x_predict_data = np.array([station_list, hour_list]).T

    # Multi-output random forest: predicts all pollutant columns at once
    regr_rf = RFR(max_depth=MAX_DEPTH, random_state=2)
    regr_rf.fit(x_train_data, y_train_data)
    y_rf = regr_rf.predict(x_predict_data)
    y_rf[y_rf < 0] = 0

    filename = 'sub.csv'
    csv_saver = utils.CsvSaver(x_data, y_rf, city, filename, day=i)
    csv_saver.save()
def testPCA(components):
    #pca_trans = PCA(n_components=components, random_state=1)
    pca_trans = tsvd(n_components=components, random_state=7, n_iter=10)
    pca_trans.fit(data)
    data2 = pca_trans.transform(data)

    # MinMax normalizer
    scaler = MinMaxScaler()
    scaler.fit(data2)
    data2 = scaler.transform(data2)
    y["target"] = np.log1p(y["target"])

    # Train/test split
    x_train, x_test, y_train, y_test = tts(data2, y["target"], test_size=0.20)

    # ----------------------------- Algos -----------------------------
    ranfor = RFR(n_estimators=500, verbose=0, n_jobs=-1, random_state=7)
    extratrees = ETR(n_estimators=500, random_state=7)
    bagging = BR(ETR(n_estimators=10, random_state=1), n_estimators=100,
                 random_state=7)

    # --- XGBoost ---
    xgb_train = xgb.DMatrix(x_train, label=y_train)
    xgb_validate = xgb.DMatrix(x_test, label=y_test)
    xgb_test_pred = xgb.DMatrix(x_test)
    param = {}
    param['objective'] = 'reg:linear'
    param['eta'] = 0.001
    param['max_depth'] = 6
    param['alpha'] = 0.001
    param['colsample_bytree'] = 0.6
    param['subsample'] = 0.6
    param['silent'] = 0
    param['nthread'] = 4
    param['random_state'] = 42
    param['eval_metric'] = 'rmse'
    watchlist = [(xgb_train, 'train'), (xgb_validate, 'validation')]

    # --- Fit ---
    ranfor.fit(x_train, y_train)
    extratrees.fit(x_train, y_train)
    bst = xgb.train(param, xgb_train, 10000, watchlist,
                    early_stopping_rounds=100, verbose_eval=100,
                    maximize=False)
    y_pred = ranfor.predict(x_test)
    y_pred_ada = extratrees.predict(x_test)
    y_pred_xgb = bst.predict(xgb_test_pred, ntree_limit=bst.best_ntree_limit)

    # Blending: stack the base-model predictions and fit a bagged regressor
    blending_X = pd.DataFrame()
    blending_X['xgb'] = bst.predict(xgb.DMatrix(x_train),
                                    ntree_limit=bst.best_ntree_limit)
    blending_X['ExtraTrees'] = extratrees.predict(x_train)
    blending_X['ranfor'] = ranfor.predict(x_train)
    bagging.fit(blending_X, y_train)

    blending_test = pd.DataFrame()
    blending_test['xgb'] = y_pred_xgb
    blending_test['ExtraTrees'] = y_pred_ada
    blending_test['ranfor'] = y_pred
    y_pred_grad = bagging.predict(blending_test)

    # Weighted average of the two best models
    y_pred_2best = (0.6 * y_pred_ada) + (0.4 * y_pred_xgb)

    print("PCA: %s --- Ranfor RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred))))
    print("PCA: %s --- ExtraTrees RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_ada))))
    print("PCA: %s --- XGBoost RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_xgb))))
    print("PCA: %s --- blended bagging RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_grad))))
    print("PCA: %s --- XGBoost+ExtraTrees RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_2best))))

    return {
        "pca": pca_trans,
        "scaler": scaler,
        "ranfor": ranfor,
        'extratrees': extratrees,
        'bagging': bagging,
        'xgboost': bst
    }
def buy_ml_vol(stock, forecast_out=5, month=None, day=None, plot=False,
               year=2019, best_combination=None):
    # We want to separate 1 percent of the data to forecast
    days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

    # Today's info
    if ((month == None) or (day == None)):
        today = datetime.datetime.now()
        month = (today.month
                 if ((today.day + forecast_out) <= days[today.month - 1])
                 else today.month + 1)
        day = (today.day + forecast_out
               if ((today.day - forecast_out) <= days[today.month - 1])
               else today.day + forecast_out - days[today.month - 1])
        day = (today.day + forecast_out
               if (today.day + forecast_out == days[today.month - 1])
               else (today.day + forecast_out) % days[today.month - 1])

    # For prediction
    start = datetime.datetime(2016, 4, 1)
    end = datetime.datetime(year, month, day)
    #df = web.DataReader(stock, 'yahoo', start, end)
    df = read_data(stock, start, end)
    if (df.empty):
        return [0] * 10, "ERROR"

    dfreg = df.loc[:, ['adjusted close', 'volume']]
    dfreg['HL_PCT'] = (df['high'] - df['low']) / df['adjusted close'] * 100.0
    dfreg['PCT_change'] = (df['adjusted close'] - df['open']) / df['open'] * 100.0

    # Drop missing values
    dfreg.fillna(value=-99999, inplace=True)

    # Separating out the label here; we want to predict the volume
    forecast_col = 'volume'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(['label'], axis=1))

    # Scale X for linear regression
    try:
        X = preprocessing.scale(X)
    except ValueError:
        print("DATA: {}".format(X))
        print("STOCK: {}".format(stock))
        print("START PERIOD: {}".format(start))
        print("END PERIOD: {}".format(end))

    # Finally we want late X and early X for the model
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate the label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]

    # Training and testing sets
    X_train = X[:len(X) - forecast_out]
    X_test = X[len(X) - forecast_out:]
    y_train = y[:len(y) - forecast_out]
    y_test = y[len(y) - forecast_out:]

    # LinReg
    clfreg = LinearRegression(n_jobs=-1)
    # QuadReg2
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    # QuadReg3
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    # QuadReg4
    clfpoly4 = make_pipeline(PolynomialFeatures(4), Ridge())
    # QuadReg5
    clfpoly5 = make_pipeline(PolynomialFeatures(5), Ridge())
    # KNN Regression
    clfknn = KNeighborsRegressor(n_neighbors=2)
    # Bayesian Ridge
    clfbayr = BayesianRidge()
    # Neural Network (unused)
    #clfmlp = MLPRegressor(hidden_layer_sizes=(100, 100, 100),
    #                      learning_rate='adaptive', solver='adam',
    #                      max_iter=5, verbose=False)
    #clfmlp.fit(X_train, y_train)
    # Random Forest Regressor
    clfrfr = RFR(n_estimators=15, random_state=0)
    # Support Vector Regressor
    clfsvr = SVR(gamma='auto')

    # Fitting (one thread per model; `fitting` stores each fit in `fits`)
    threads = []
    models = [
        clfreg, clfpoly2, clfpoly3, clfpoly4, clfpoly5, clfknn, clfbayr,
        clfrfr, clfsvr
    ]
    fits = [''] * len(models)
    for i in range(len(models)):
        process = Thread(target=fitting,
                         args=[models[i], X_train, y_train, fits, i],
                         name=stock)
        process.start()
        threads.append(process)
    for process in threads:
        process.join()

    # Evaluation (unused)
    #confidencereg = clfreg.score(X_train, y_train)
    #confidencepoly2 = clfpoly2.score(X_train, y_train)
    #confidencepoly3 = clfpoly3.score(X_train, y_train)
    #confidenceknn = clfknn.score(X_train, y_train)
    #confidencebayr = clfbayr.score(X_train, y_train)

    # Predictions
    reg_forecast = fits[0].predict(X_lately)
    poly2_forecast = fits[1].predict(X_lately)
    poly3_forecast = fits[2].predict(X_lately)
    poly4_forecast = fits[3].predict(X_lately)
    poly5_forecast = fits[4].predict(X_lately)
    knn_forecast = fits[5].predict(X_lately)
    bayr_forecast = fits[6].predict(X_lately)
    #mlp_forecast = clfmlp.predict(X_lately)
    rfr_forecast = fits[7].predict(X_lately)
    svr_forecast = fits[8].predict(X_lately)

    # Set up dataframe
    dfreg['reg_forecast'] = np.nan
    dfreg['poly2_forecast'] = np.nan
    dfreg['poly3_forecast'] = np.nan
    dfreg['poly4_forecast'] = np.nan
    dfreg['poly5_forecast'] = np.nan
    dfreg['knn_forecast'] = np.nan
    dfreg['bayr_forecast'] = np.nan
    dfreg['mlp_forecast'] = np.nan
    dfreg['rfr_forecast'] = np.nan
    dfreg['svr_forecast'] = np.nan

    last_date = dfreg.iloc[-1].name
    last_unix = last_date
    next_unix = last_unix + datetime.timedelta(days=1)

    for i in zip(reg_forecast, poly2_forecast, poly3_forecast, poly4_forecast,
                 poly5_forecast, knn_forecast, bayr_forecast, rfr_forecast,
                 svr_forecast):
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = list(
            [np.nan for _ in range(len(dfreg.columns) - 9)] + list(i))

    dfreg['mean_forecast'] = dfreg[[
        'reg_forecast',
        'poly2_forecast',
        'poly3_forecast',
        'knn_forecast',
        'bayr_forecast',
        # 'mlp_forecast',
        'rfr_forecast'
    ]].mean(axis=1)

    if (plot):
        dfreg['volume'].tail(50).plot(lw=2, figsize=(20, 12))
        dfreg['mean_forecast'].tail(50).plot(lw=2, c='k')
        dfreg['bayr_forecast'].tail(50).plot(lw=0.5)
        dfreg['knn_forecast'].tail(50).plot(lw=0.5)
        dfreg['reg_forecast'].tail(50).plot(lw=0.5)
        dfreg['poly2_forecast'].tail(50).plot(lw=0.5)
        dfreg['poly3_forecast'].tail(50).plot(lw=0.5)
        #dfreg['mlp_forecast'].tail(50).plot(lw=0.5)
        dfreg['rfr_forecast'].tail(50).plot(lw=0.5)
        plt.legend(loc='best')
        plt.xlabel('Date')
        plt.ylabel('Price')
        plt.title(stock)
        # NOTE: `today` is only defined when month/day are not passed in
        plt.savefig("./pred_plots/{}_{}/volume/{}_{}_{}".format(
            today.day, today.month, stock, today.day, today.month))
        plt.close()

    #if (not (best_combination == None)):
    #    dfreg['best_mean_forecast'] = dfreg[[*list(best_combination)]].mean(axis=1)
    #    fit = np.polyfit([i for i in range(forecast_out)],
    #                     dfreg['best_mean_forecast'].values[-forecast_out:], deg=1)
    #else:
    try:
        # Fit a line to the forecast window; the slope gives the trend
        fit = np.polyfit([i for i in range(forecast_out)],
                         dfreg['mean_forecast'].values[-forecast_out:], deg=1)
    except:
        print("FORECASTING {} DAY OUT".format(forecast_out))
        fit = [
            dfreg['mean_forecast'].values[-1] -
            dfreg['adjusted close'].values[-1], 2
        ]
    string = "VOLUME SHOULD GO UP" if (fit[0] > 0) else "VOLUME SHOULD GO DOWN"
    #print("{} {}".format(stock, string))
    return fit[0], dfreg['volume'].values[-forecast_out - 1]
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('Position_Salaries.csv')
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2].values

from sklearn.ensemble import RandomForestRegressor as RFR
regressor = RFR(n_estimators=300, random_state=0)
regressor.fit(X, y)
y_pred = regressor.predict([[6.5]])

# Plot on a fine grid for a higher-resolution curve
X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Regression Model)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
# There seems to be a large amount of noise in this model, perhaps due to
# overfitting.

# ## Random Forest Algorithm

# In[35]:

from sklearn.ensemble import RandomForestRegressor as RFR

# set seed for consistency
np.random.seed(171)

# fit random forest
random_forest = RFR()
random_forest.fit(X_train, y_train)

# In[36]:

random_forest.score(X=X_test, y=y_test)

# In[37]:

# predict using the random forest model
y_pred = random_forest.predict(X_test)
def render(img, features='coordinates', ratio=0.00025, iterations=1, lab=True,
           depth=None, npxs=5e5, anti_aliasing=False, verbose=False):
    """
    features decides the inputs for the model; current options are
    'coordinates' for coordinate-based features and 'landmarks' for
    distance-to-landmarks features. Render time and memory usage are about
    2x when using landmarks features. Default is 'coordinates'.

    ratio corresponds to the ratio of the size of the smallest details the
    model is allowed to use compared to the whole image. Default is 0.00025;
    1 would correspond to not fitting anything, while 0 would fit down to
    individual pixels.

    iterations is how many randomized runs of the base model to use for
    averaging in the final prediction. The default of 1 corresponds to sharp
    boundaries; 10-100 would give a much smoother, more painterly result.
    Render time and memory usage increase linearly with the number of
    iterations.

    lab decides whether to fit the model in Lab color space instead of RGB
    color space. Default is True.

    depth decides how many levels of splits the regressor is allowed to
    have. Default is None, which corresponds to as many as needed.

    npxs decides how many pixels to resize the source image to internally
    for fitting. Default is 500,000.

    anti_aliasing decides whether or not to use 2x grid supersampling.
    Default is False. Render time and memory usage will be increased over 2x.

    verbose controls whether to print info about a render. Default is False.
    """
    t = time()
    w, h = img.shape[:2]
    wrender, hrender = w, h
    if anti_aliasing:
        wrender, hrender = w * 2, h * 2
    img_o = pixel_scale(img, npxs)
    wfit, hfit = img_o.shape[:2]
    if lab:
        img = rgb2lab(img_o)
    else:
        img = img_o  # also fit on the rescaled image in RGB mode
    if features == 'landmarks':
        locations = list(np.linspace(0, 1, 7))
        landmarks = list(product(locations, locations))
    else:
        landmarks = None
    X, Y = gen_xy(img, landmarks)
    xrender = gen_x(wrender, hrender, landmarks)
    min_samples = int(round(ratio * len(X)))
    model = RFR(n_estimators=iterations, n_jobs=-1, max_depth=depth,
                random_state=42, min_samples_leaf=min_samples)
    model.fit(X[:, 2:], Y)
    pred = model.predict(xrender[:, 2:])
    pred_img = pred_to_img(pred, xrender, wrender, hrender)
    if lab:
        pred_img = lab2rgb(pred_img)
    error = np.mean(np.square(resize(pred_img, (wfit, hfit)) - img_o)) * 255.
    if anti_aliasing:
        pred_img = resize(pred_img, (w, h))
    if verbose:
        s = "%08.3f seconds to render\n" % (time() - t)
        s += "%08.3f error (0-255 scaled)\n" % (error)
        s += "%08.3f min pixels considered\n" % (min_samples)
        print(s)
    return pred_img
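# A usage sketch for render(), assuming scikit-image for I/O; the file
# names are illustrative:
from skimage import io

img = io.imread('input.jpg') / 255.0         # scale to [0, 1] floats
out = render(img, features='landmarks',      # distance-to-landmark features
             iterations=25,                  # smoother, more painterly output
             verbose=True)
io.imsave('render.png', (out * 255).astype('uint8'))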
    mean_squared_error(y_pred_knr, y_test_energy)))

# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor as DTR
dtr_energy = DTR(max_depth=11, min_samples_split=16, min_samples_leaf=2,
                 random_state=37).fit(X_train_energy_stand, y_train_energy)
y_pred_dtr = dtr_energy.predict(X_test_energy_stand)
print("Mean squared error for DTR: {:.3f}.".format(
    mean_squared_error(y_pred_dtr, y_test_energy)))

# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor as RFR
rfr_energy = RFR(n_estimators=100, min_samples_leaf=2, max_leaf_nodes=1000,
                 random_state=37).fit(X_train_energy, y_train_energy)
y_pred_rfr = rfr_energy.predict(X_test_energy)
print("Mean squared error for RFR: {:.3f}.".format(
    mean_squared_error(y_pred_rfr, y_test_energy)))

# Support Vector Regressor
from sklearn.svm import SVR
svr_energy = SVR().fit(X_train_energy_stand, y_train_energy)
y_pred_svr = svr_energy.predict(X_test_energy_stand)
print("Mean squared error for SVR: {:.3f}.".format(
    mean_squared_error(y_pred_svr, y_test_energy)))

# Multi-layer Perceptron Regressor
from sklearn.neural_network import MLPRegressor as MLPR
mlpr_energy = MLPR(hidden_layer_sizes=(100, 100), alpha=.3,
YtCV = []
YvCV = []
for tr_idx, va_idx in Xsp0:
    XtCV.append(X[tr_idx])
    XvCV.append(X[va_idx])
    YtCV.append(Y[tr_idx])
    YvCV.append(Y[va_idx])

errTD = []
errVD = []
D = list(range(5, 60, 5))
for d in D:
    errti = []
    errvi = []
    for i in range(5):
        rfr = RFR(n_estimators=50, max_depth=d)
        rfr.fit(XtCV[0], YtCV[0])
        YtHat = rfr.predict(XtCV[0])
        YvHat = rfr.predict(XvCV[0])
        errti.append(mse(YtCV[0], YtHat))
        errvi.append(mse(YvCV[0], YvHat))
    errti = np.array(errti)
    errvi = np.array(errvi)
    errTD.append(np.mean(errti))
    errVD.append(np.mean(errvi))
#%%
plt.plot(D, errTD, '*-', label='Train Err')
plt.plot(D, errVD, '*-', label='Valid Err')
plt.legend()
plt.title('RandomForest Err vs MaxDepth')