def load_and_split_data(self, x, y, test_ratio, purge_ratio, units, epochs, batch_size):
    """Split the data, scale the training part, then build and fit the network.

    The training features and targets are stored on the instance after being
    scaled with ``self.scale_x`` / ``self.scale_y``; the features are also
    reshaped to ``(-1, self.time_steps, self.input_size)``.  The slice of
    ``y`` lying between the training and the test portion is kept in
    ``self.y_purge``.  A stack of four LSTM layers (each followed by a
    ``Dropout(0.2)``) and a final ``Dense`` layer is added to ``self.model``,
    which is then compiled and fitted on the training data.

    :param x: feature data handed to ``tools.split_data``.
    :param y: target data handed to ``tools.split_data``.
    :param test_ratio: forwarded to ``tools.split_data``.
    :param purge_ratio: forwarded to ``tools.split_data``.
    :param units: number of units used by every LSTM layer.
    :param epochs: forwarded to ``self.model.fit``.
    :param batch_size: forwarded to ``self.model.fit``.
    :return: ``(x_test, y_test)`` exactly as produced by ``tools.split_data``
        (neither scaled nor reshaped here).
    """
    x_train, x_test, y_train, y_test = tools.split_data(
        x, y, test_ratio=test_ratio, purge_ratio=purge_ratio)
    self.x_train, self.y_train = x_train, y_train

    # Targets that belong to neither the training nor the test portion.
    purge_start = len(self.y_train)
    purge_stop = len(y) - len(y_test)
    self.y_purge = y[purge_start:purge_stop]

    # Fit the scalers on the training data only.
    target_matrix = np.array(self.y_train).reshape(-1, self.output_size)
    self.y_train = self.scale_y.fit_transform(target_matrix)
    self.x_train = self.scale_x.fit_transform(self.x_train)

    sequence_shape = (-1, self.time_steps, self.input_size)
    self.x_train = np.reshape(self.x_train, sequence_shape)
    # The test features are deliberately left untouched; the equivalent
    # transformation is disabled:
    # x_test = self.scale_x.transform(x_test)
    # x_test = np.reshape(x_test, (-1, self.time_steps, self.input_size))

    add_layer = self.model.add
    add_layer(LSTM(units=units, return_sequences=True,
                   input_shape=(self.time_steps, self.input_size)))
    add_layer(Dropout(0.2))
    for _ in range(2):
        add_layer(LSTM(units=units, return_sequences=True))
        add_layer(Dropout(0.2))
    # The last recurrent layer returns only its final output.
    add_layer(LSTM(units=units))
    add_layer(Dropout(0.2))
    add_layer(Dense(units=self.output_size))

    self.model.compile(optimizer='adam', loss='mean_squared_error')
    self.model.fit(self.x_train, self.y_train,
                   epochs=epochs, batch_size=batch_size)
    return x_test, y_test
def main():
    """Score the classifier on ``train.csv`` and plot the resulting scores.

    Loads the training data, label-encodes the ``target`` column, uses every
    column except ``target`` and ``id`` as a feature, computes the scores
    with ``clf_score`` and shows them as a line plot whose x tick labels are
    taken from ``clf_scores.index``.
    """
    train = load_data('train.csv')

    lbl_enc = preprocessing.LabelEncoder()
    train['target'] = lbl_enc.fit_transform(train['target'])

    feature_cols = [col for col in train.columns if col not in ['target', 'id']]
    X_train, y_train = split_data(train, feature_cols)

    clf_scores = clf_score(create_clf(), X_train[feature_cols], y_train)
    # Parenthesised form prints the same on Python 2 and is valid on Python 3
    # (the bare ``print clf_scores`` statement is a SyntaxError there).
    print(clf_scores)

    plt.plot(clf_scores)
    plt.xticks(range(len(clf_scores)), clf_scores.index, fontsize=14, rotation=90)
    plt.show()
def split_data(cls, data, ratio=0.7):
    """Classmethod that splits data into a training set and a test set.

    The data is first passed through ``tools.first_answers`` and the result
    is handed to ``tools.split_data``.

    :param data: The object containing data.
    :type data: :class:`pandas.DataFrame`.
    :param ratio: What portion of data to include in the training set
        and the test set. :obj:`0.5` means that the data will be
        distributed equally.
    :type ratio: float
    :return: Whatever ``tools.split_data`` returns for the filtered data.
    """
    return tools.split_data(tools.first_answers(data), ratio=ratio)
def get_data(path, training_percent=0.7, percent_zeroes=0.5, n_nonzero_repeat=0,
             shuffle=True, normalize=False):
    """Extract the data at *path* and return it as train/test splits.

    :param path: location handed to ``extract_data``.
    :param training_percent: forwarded to ``split_data``.
    :param percent_zeroes: forwarded to ``split_data``.
    :param n_nonzero_repeat: forwarded to ``split_data``.
    :param shuffle: when true, the training pair and the test pair are each
        passed through ``shuffle_data`` (separately) before being returned.
    :param normalize: forwarded to ``extract_data``.
    :return: ``(X_train, y_train, X_test, y_test)``.
    """
    features, labels = extract_data(path, normalize)

    split_options = dict(
        training_percent=training_percent,
        percent_zeroes=percent_zeroes,
        n_nonzero_repeat=n_nonzero_repeat,
    )
    X_train, y_train, X_test, y_test = split_data(features, labels, **split_options)

    if not shuffle:
        return X_train, y_train, X_test, y_test

    # Train and test sets are shuffled independently of each other.
    X_train, y_train = shuffle_data(X_train, y_train)
    X_test, y_test = shuffle_data(X_test, y_test)
    return X_train, y_train, X_test, y_test
# In[149]:

# %pdb
# NOTE(review): this cell reads `feat` and `dt`, which are only assigned in
# cell In[151] below -- presumably they come from an earlier notebook
# execution; confirm before running the exported script top to bottom.
print (" Plotting the Decision Surface of Training Set... ")
t.plot_decision_regions(X[:,feat],Y,clf=dt, res=0.1, cycle_marker=True, legend=1)

# In[150]:

# Split your data into training and test-set...
# see the documentation of split_data in tools for further information...
Xtrain,Ytrain,Xtest,Ytest=t.split_data(X,Y)
print (" Training Data Set Dimensions=", Xtrain.shape, "Training True Class labels dimensions", Ytrain.shape)
# Report the shape of the *test* labels here (this line used to print
# Ytrain.shape a second time).
print (" Test Data Set Dimensions=", Xtest.shape, "Test True Class labels dimensions", Ytest.shape)

# In[151]:

# Let's train a Decision Tree Classifier on Petal Length and Width
# (the features selected are columns 0 and 1 of X).
feat=[0,1]
dt=DecisionTree(0.95,5)
dt.train(Xtrain[:,feat],Ytrain)

# In[152]:
def main():
    """Run ``grid_search`` over the classifiers from ``get_clfs`` on ``train.csv``.

    Every column of the loaded data except ``target`` and ``id`` is used as a
    feature.
    """
    frame = load_data('train.csv')
    excluded = ['target', 'id']
    features = [column for column in frame.columns if column not in excluded]
    X_train, y_train = split_data(frame, features)
    grid_search(X_train[features], y_train, get_clfs())
param_list = list(ParameterGrid(param_grid)) for i in range(0,len(param_list)): reg=clfs[name]['est'].set_params(**param_list[i]) cv=cv_score1(reg,X_train,y) print [cv.mean(),name,param_list[i]] def grid_search1(X_train,y,clfs): for name, clf in clfs.iteritems(): clf = GridSearchCV(clfs[name]['est'], clfs[name]['grid'], n_jobs=16, verbose=1, cv=5) clf.fit(X_train,y) print clf.score print clf.best_score_ print clf.best_params_ def main(): train=load_data('train.csv') feature_cols= [col for col in train.columns if col not in ['target','id']] X_train,y_train=split_data(train,feature_cols) grid_search(X_train[feature_cols],y_train,get_clfs()) #if __name__ == '__main__': # main() le = preprocessing.LabelEncoder() data=load_data('train.csv') train=data.loc[np.random.choice(data.index,np.around(len(data)*0.5), replace=False)] le.fit(train['target']) train['target']=le.transform(train['target']) feature_cols= [col for col in train.columns if col not in ['target','id']] X_train,y_train=split_data(train,feature_cols) grid_search(X_train[feature_cols],y_train,get_extra_trees())
# Experiment settings used by the script below.
percentage_to_evaluate = 0.05  # third argument of tools.split_data
interval_length = 1  # `interval` argument of tools.make_return_df
window_length = 10  # second argument of tools.build_x_y
ema_hyper_parameter = 0.2  # third argument of tools.build_x_y
k_fold_hyperparameter = 6  # `cv` argument of RidgeCV

# Build the return data for 'BMW' from `first_day` up to `yesterday`
# (both defined outside this section) and plot it.
my_df = tools.make_return_df('BMW', first_day,
                             yesterday.strftime('%Y-%m-%d'),
                             interval=interval_length)
graph_returns(my_df)

# Build the x/y data, then split off an evaluation set; `x` and `y` are
# rebound to the remaining part.
x, y = tools.build_x_y(my_df, window_length, ema_hyper_parameter)
x, evaluation_set_x, y, evaluation_set_y = tools.split_data(
    x, y, percentage_to_evaluate)

# NOTE(review): these two grids are not used in the visible code -- RidgeCV
# below is given its own grid starting at 0.01; confirm whether they are
# needed further down the file.
alphas_ridge = np.arange(0, 1, 0.01)
alphas_lasso = np.arange(0, 100, 1)

# Cross-validated ridge regression without an intercept term.
clf = RidgeCV(np.arange(0.01, 1, 0.01),
              cv=k_fold_hyperparameter,
              fit_intercept=False)
clf.fit(x, y)
print(f'coefficient is {clf.coef_}')


def perform_arma():
    """Fit ``sm.tsa.ARMA`` to the ``'Return'`` column of the module-level ``my_df``.

    The model is built with ``(10, 5)`` as its second argument (presumably
    the ARMA order -- confirm against the statsmodels version in use).  In
    the visible code the fitted result is bound to ``res`` and is neither
    returned nor used.
    """
    # ARMA TESTING
    model = sm.tsa.ARMA(my_df['Return'], (10, 5))
    res = model.fit()
def train_with_lstm(self):
    """Train an LSTM on ``self.data``, plot the fit and report its scores.

    Steps performed:

    1. Split ``self.data`` with ``split_data`` using the ``lookback`` entry
       of ``self.input_params`` and convert the splits to torch tensors.
    2. Train an ``LSTM`` model for ``num_epochs`` epochs on the whole
       training set with an MSE loss and the Adam optimiser.
    3. Show a seaborn plot of the training targets against the training
       predictions (both mapped back with ``self.scaler.inverse_transform``).
    4. Predict on the test set, print the train/test RMSE and show a plotly
       figure with the train prediction, test prediction and actual values.

    Reads ``self.data``, ``self.scaler`` and the ``self.input_params`` keys
    ``lookback``, ``input_dim``, ``hidden_dim``, ``output_dim``,
    ``num_layers``, ``lr`` and ``num_epochs``.

    :return: list ``[train RMSE, test RMSE, training time in seconds]``.
    """
    lookback = self.input_params['lookback']
    num_epochs = self.input_params['num_epochs']

    x_train, y_train, x_test, y_test = split_data(self.data, lookback)
    print('x_train.shape = ', x_train.shape)
    print('y_train.shape = ', y_train.shape)
    print('x_test.shape = ', x_test.shape)
    print('y_test.shape = ', y_test.shape)

    x_train = torch.from_numpy(x_train).type(torch.Tensor)
    x_test = torch.from_numpy(x_test).type(torch.Tensor)
    y_train_lstm = torch.from_numpy(y_train).type(torch.Tensor)
    y_test_lstm = torch.from_numpy(y_test).type(torch.Tensor)

    model = LSTM(input_dim=self.input_params['input_dim'],
                 hidden_dim=self.input_params['hidden_dim'],
                 output_dim=self.input_params['output_dim'],
                 num_layers=self.input_params['num_layers'])
    criterion = torch.nn.MSELoss(reduction='mean')
    # Adam: a first-order optimiser that can replace classic stochastic
    # gradient descent; it iteratively updates the network weights.
    optimiser = torch.optim.Adam(model.parameters(),
                                 lr=self.input_params['lr'])

    # Per-epoch training loss (consumed by the disabled loss plot below).
    hist = np.zeros(num_epochs)
    start_time = time.time()
    lstm = []

    # Training loop: every epoch is one pass over the full training set.
    for t in range(num_epochs):
        y_train_pred = model(x_train)
        loss = criterion(y_train_pred, y_train_lstm)
        print("Epoch ", t, "MSE: ", loss.item())
        hist[t] = loss.item()

        # Reset the parameter gradients to zero.
        optimiser.zero_grad()
        # Back-propagate to compute the gradients.
        loss.backward()
        # Update all parameters.
        optimiser.step()

    training_time = time.time() - start_time
    print("Training time: {}".format(training_time))

    # Map the scaled values back to the original data range.
    # NOTE: ``y_train_pred`` is the forward pass of the last epoch, i.e. it
    # was computed before that epoch's optimiser step.
    predict = pd.DataFrame(
        self.scaler.inverse_transform(y_train_pred.detach().numpy()))
    original = pd.DataFrame(
        self.scaler.inverse_transform(y_train_lstm.detach().numpy()))
    print(predict)

    fig = plt.figure()
    # Adjust the subplot layout.
    fig.subplots_adjust(hspace=0.2, wspace=0.2)

    # Stock price.
    plt.subplot(1, 2, 1)
    ax = sns.lineplot(x=original.index, y=original[0],
                      label="Data", color='royalblue')
    ax = sns.lineplot(x=predict.index, y=predict[0],
                      label="Training Prediction (LSTM)", color='tomato')
    ax.set_title('Stock price', size=14, fontweight='bold')
    ax.set_xlabel("Days", size=14)
    ax.set_ylabel("Cost (USD)", size=14)
    ax.set_xticklabels('', size=10)
    plt.show()

    # # Training loss (disabled)
    # plt.subplot(1, 2, 2)
    # print(hist)
    # ax = sns.lineplot(data=hist, color='royalblue')
    # ax.set_xlabel("Epoch", size=14)
    # ax.set_ylabel("Loss", size=14)
    # ax.set_title("Training Loss", size=14, fontweight='bold')
    # fig.set_figheight(6)
    # fig.set_figwidth(16)
    # fig.show()

    # > Prediction
    # No gradients are needed for inference, so skip building the graph.
    with torch.no_grad():
        y_test_pred = model(x_test)

    # Invert the scaling of predictions and targets.
    y_train_pred = self.scaler.inverse_transform(
        y_train_pred.detach().numpy())
    y_train = self.scaler.inverse_transform(y_train_lstm.detach().numpy())
    y_test_pred = self.scaler.inverse_transform(
        y_test_pred.detach().numpy())
    y_test = self.scaler.inverse_transform(y_test_lstm.detach().numpy())

    # Root mean squared error on both splits.
    trainScore = math.sqrt(
        mean_squared_error(y_train[:, 0], y_train_pred[:, 0]))
    print('Train Score: %.2f RMSE' % (trainScore))
    testScore = math.sqrt(
        mean_squared_error(y_test[:, 0], y_test_pred[:, 0]))
    print('Test Score: %.2f RMSE' % (testScore))
    lstm.append(trainScore)
    lstm.append(testScore)
    lstm.append(training_time)

    # > Train and test
    # NaN-filled arrays shaped like the data; predictions are written at the
    # positions they refer to so the traces line up with the actual values.
    trainPredictPlot = np.empty_like(self.data)
    trainPredictPlot[:, :] = np.nan
    trainPredictPlot[lookback:len(y_train_pred) + lookback, :] = y_train_pred

    # Shift the test predictions for plotting.
    testPredictPlot = np.empty_like(self.data)
    testPredictPlot[:, :] = np.nan
    testPredictPlot[len(y_train_pred) + lookback - 1:len(self.data) - 1,
                    :] = y_test_pred

    original = self.scaler.inverse_transform(
        self.data['Close'].values.reshape(-1, 1))

    # Columns: 0 = train prediction, 1 = test prediction, 2 = actual value.
    predictions = np.append(trainPredictPlot, testPredictPlot, axis=1)
    predictions = np.append(predictions, original, axis=1)
    result = pd.DataFrame(predictions)

    # >> Figure
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=result.index, y=result[0], mode='lines',
                   name='Train prediction'))
    fig.add_trace(
        go.Scatter(x=result.index, y=result[1], mode='lines',
                   name='Test prediction'))
    fig.add_trace(
        go.Scatter(x=result.index, y=result[2], mode='lines',
                   name='Actual Value'))
    fig.update_layout(
        xaxis=dict(showline=True,
                   showgrid=True,
                   showticklabels=False,
                   linecolor='white',
                   linewidth=2),
        yaxis=dict(
            # ``title.font`` replaces the deprecated ``titlefont`` attribute.
            title=dict(
                text='Close (USD)',
                font=dict(
                    family='Rockwell',
                    size=12,
                    color='white',
                ),
            ),
            showline=True,
            showgrid=True,
            showticklabels=True,
            linecolor='white',
            linewidth=2,
            ticks='outside',
            tickfont=dict(
                family='Rockwell',
                size=12,
                color='white',
            ),
        ),
        showlegend=True,
        template='plotly_dark')

    annotations = [
        dict(xref='paper',
             yref='paper',
             x=0.0,
             y=1.05,
             xanchor='left',
             yanchor='bottom',
             text='Results (LSTM)',
             font=dict(family='Rockwell', size=26, color='white'),
             showarrow=False),
    ]
    fig.update_layout(annotations=annotations)
    fig.show()

    return lstm