def get_train_data(self, fname, asset_class, cols):
    """Build the feature matrix and the direction labels for one instrument.

    fname : forwarded unchanged to ``create_features``.
    asset_class : forwarded unchanged to ``create_features``.
    cols : labels of the feature columns to keep in the feature matrix.

    Returns a tuple ``(X_train, y_classifier)`` holding the ``cols`` columns
    and the ``return_sign`` column of every row whose ``return_sign`` is
    non-zero.
    """
    features = create_features(fname, asset_class)

    # Rows with a zero return are dropped so that only two classes remain
    # and the problem stays a binary classification.
    nonzero = features[features['return_sign'] != 0.0]

    return nonzero.loc[:, cols], nonzero['return_sign']
def pnl_backtesting(self, fname, asset_class, cols):
    """Backtest the model's directional calls against realised returns.

    The data returned by ``get_train_data`` (minus its last row) is split in
    its original order into a 70% training part and a 30% test part. The
    model is fitted on the training part, and for every test observation the
    daily PnL is computed as

        realised return * predicted direction * |P(up) - P(down)|

    where the absolute probability difference is used as the bet size.

    fname : forwarded to ``get_train_data`` / ``create_features``.
    asset_class : forwarded to ``get_train_data`` / ``create_features``.
    cols : labels of the feature columns used to train the model.

    Returns a DataFrame with the columns 'Prob-Down', 'Prob-Up',
    'Predicted-Move', 'Real-Move' and 'Daily PnL', one row per test
    observation.
    """
    X, y = self.get_train_data(fname, asset_class, cols)

    # shuffle=False keeps the original row order, so the test set is the
    # final 30% of the rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X[:-1], y[:-1], test_size=0.3, shuffle=False)

    # NOTE(review): presumably required so that SVC-style estimators expose
    # predict_proba; it must be set before fitting - confirm for each
    # classifier this method is mixed into.
    self.probability = True
    self.fit(X_train, y_train)

    # Evaluate the class probabilities once and slice both columns from the
    # same result. Column 0 is read as the "down" class and column 1 as the
    # "up" class (assumes labels -1/+1 ordered as in sklearn's ``classes_``).
    probabilities = self.predict_proba(X_test)
    probability_down = probabilities[:, 0]
    probability_up = probabilities[:, 1]
    predicted_direction = self.predict(X_test)

    # Rebuild the realised returns with exactly the steps applied to X:
    # first drop the zero-return rows, then drop the last row, then keep the
    # trailing rows that make up the test set. Dropping the last row
    # *before* filtering picks different rows whenever the final
    # observation has a zero return, which misaligns returns and
    # predictions by one row.
    features = create_features(fname, asset_class)
    features = features[features['return_sign'] != 0.0]
    returns = features['return'].iloc[:-1]
    true_realised_return = returns.iloc[len(returns) - X_test.shape[0]:]

    # Bet size: the absolute edge between the two class probabilities. The
    # sign of the position comes from predicted_direction.
    kelly_optimal_fraction = abs(probability_up - probability_down)
    realised_daily_profit = np.multiply(
        np.multiply(true_realised_return, predicted_direction),
        kelly_optimal_fraction)

    return pd.DataFrame({
        'Prob-Down': probability_down,
        'Prob-Up': probability_up,
        'Predicted-Move': predicted_direction,
        'Real-Move': y_test,
        'Daily PnL': realised_daily_profit,
    })
def get_feature_scatter_plt(self, fname, asset_class, cols, ax):
    """Scatter two features against each other, coloured by move direction.

    fname : forwarded unchanged to ``create_features``.
    asset_class : forwarded unchanged to ``create_features``.
    cols : feature labels; ``cols[0]`` is drawn on the x axis and
        ``cols[1]`` on the y axis.
    ax : axes object the points, legend, title and axis labels are drawn on.

    Up moves (``return_sign == 1.0``) are drawn in green and down moves
    (``return_sign == -1.0``) in red. Nothing is returned.
    """
    data = create_features(fname, asset_class)
    # Zero returns belong to neither class, so they are left out.
    data = data[data['return_sign'] != 0.0]

    x_name = cols[0]
    y_name = cols[1]

    # One scatter call per class, up moves first so the legend order is
    # 'Up' then 'Down'.
    for sign, colour, label in ((1.0, 'g', 'Up'), (-1.0, 'r', 'Down')):
        moves = data[data['return_sign'] == sign]
        ax.scatter(moves[x_name], moves[y_name],
                   color=colour, s=1, alpha=1, label=label)

    ax.legend(fontsize=10)
    ax.set_title('Decision Boundary')
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)
def get_train_data(self, fname, asset_class, cols):
    """Return the features and labels used for training (or training + test).

    fname : name of the file under which the data is stored in the system's
        path.
    asset_class : name of the financial instrument to consider, as contained
        in the file given by fname.
    cols : labels of the columns to use as features.

    Returns ``(X_train, y_classifier)``: the ``cols`` columns and the
    ``return_sign`` column, restricted to rows with a non-zero
    ``return_sign``.
    """
    frame = create_features(fname, asset_class)

    # Keep only genuine up/down moves; zero returns would add a third class.
    is_move = frame['return_sign'] != 0.0
    frame = frame[is_move]

    X_train = frame.loc[:, cols]
    y_classifier = frame['return_sign']
    return X_train, y_classifier
def get_total_up_down_moves(self, fname, asset_class, cols):
    """Print how many realised down moves and up moves the data contains.

    fname : forwarded unchanged to ``create_features``.
    asset_class : forwarded unchanged to ``create_features``.
    cols : not used by this method.

    The counts are printed (down moves first, then up moves); nothing is
    returned.
    """
    data = create_features(fname, asset_class)
    # Zero returns count as neither an up nor a down move.
    data = data[data['return_sign'] != 0.0]

    for caption, sign in (('real down moves: ', -1.0),
                          ('real up moves: ', 1.0)):
        print(caption, data[data['return_sign'] == sign].shape[0])
def pnl_backtesting(self, fname, asset_class, cols):
    """In-sample PnL backtest of the model against realised returns.

    The model is fitted on all rows returned by ``get_train_data`` and then
    asked to predict those same rows. For every observation the daily PnL is

        realised return * predicted direction * |P(up) - P(down)|

    where the absolute probability difference is used as the bet size.

    fname : forwarded to ``get_train_data`` / ``create_features``.
    asset_class : forwarded to ``get_train_data`` / ``create_features``.
    cols : labels of the feature columns used to train the model.

    Returns a DataFrame with the columns 'Prob-Down', 'Prob-Up',
    'Predic-Move', 'Real-Move' and 'Daily PnL', one row per observation.
    """
    X, y = self.get_train_data(fname, asset_class, cols)
    self.fit(X, y)

    # Evaluate the class probabilities once and slice both columns from the
    # same result. Column 0 is read as the "down" class and column 1 as the
    # "up" class (assumes labels -1/+1 ordered as in sklearn's ``classes_``).
    probabilities = self.predict_proba(X)
    probability_down = probabilities[:, 0]
    probability_up = probabilities[:, 1]
    predicted_direction = self.predict(X)

    # Same zero-return filter as get_train_data, so the returns line up
    # row for row with X and y.
    features = create_features(fname, asset_class)
    features = features[features['return_sign'] != 0.0]
    true_realised_return = features['return']

    # The bet size must be non-negative: the sign of the position already
    # comes from predicted_direction. With a signed difference the two signs
    # cancel and every day is booked as a long position.
    kelly_optimal_fraction = abs(probability_up - probability_down)
    realised_daily_profit = np.multiply(
        np.multiply(true_realised_return, predicted_direction),
        kelly_optimal_fraction)

    # Hand the table back to the caller; without the return the result was
    # built and then discarded.
    return pd.DataFrame({
        'Prob-Down': probability_down,
        'Prob-Up': probability_up,
        'Predic-Move': predicted_direction,
        'Real-Move': y,
        'Daily PnL': realised_daily_profit,
    })
def get_feature_scatter_plt(self, fname, asset_class, cols, ax):
    """Draw a 2D scatter of two features, split into up and down moves.

    fname : forwarded unchanged to ``create_features``.
    asset_class : forwarded unchanged to ``create_features``.
    cols : feature labels; ``cols[0]`` is drawn on the x axis and
        ``cols[1]`` on the y axis.
    ax : axes object the points, legend, title and axis labels are drawn on.

    Up moves (``return_sign == 1.0``) are drawn in green and down moves
    (``return_sign == -1.0``) in red. Nothing is returned.
    """
    data = create_features(fname, asset_class)
    # Rows containing null values are removed in place on the frame that
    # create_features returned.
    data.dropna(inplace=True)
    # Zero returns belong to neither class, so they are left out.
    data = data[data['return_sign'] != 0.0]

    x_name = cols[0]
    y_name = cols[1]

    # One scatter call per class, up moves first so the legend order is
    # 'Up' then 'Down'.
    for sign, colour, label in ((1.0, 'g', 'Up'), (-1.0, 'r', 'Down')):
        moves = data[data['return_sign'] == sign]
        ax.scatter(moves[x_name], moves[y_name],
                   color=colour, s=1, alpha=1, label=label)

    ax.legend(fontsize=10)
    ax.set_title('2D Representation for 2 Features')
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)
# Distance metrics available to the (currently disabled) KNN classifier.
metric = ['manhattan', 'euclidean', 'mahalanobis']

# Feature combination to investigate: one entry picked by position from all
# the possible combinations in the powerset of all_cols.
feature_combination = powerset(all_cols)[205]

# KNN classifier (disabled). The best parameters to use depend on the output
# of the GridSearchCV class methods.
# knn_object = KNNClassifier(n_neighbors=10, weights='uniform', metric=metric[1], algorithm='brute')
# df1 = knn_object.pnl_backtesting(fname, asset_class, feature_combination)

# Logistic regression classifier. A high C corresponds to (almost) no
# regularization.
logit_object = LogisticRegressionClassifier(C=50, solver='liblinear', penalty='l2')
df2 = logit_object.pnl_backtesting(fname, asset_class, feature_combination)

# SVM classifier. A high C means hard margins, and hence the optimization
# problem takes longer to solve.
svm_object = SVMClassifier(C=1, kernel='linear', cache_size=1000)
df3 = svm_object.pnl_backtesting(fname, asset_class, feature_combination)

# Frame carrying all the feature information, trimmed to as many trailing
# rows as the backtest produced. The explicit copy makes the frame own its
# data, so the column assignments below are not chained assignments on a
# slice of the original frame.
df = create_features(fname, asset_class)
df = df.iloc[-df2.shape[0]:].copy()

# Put the predicted PnL of each classifier into the one frame (the
# assignment aligns on the index), then plot them together to compare.
# df['pred_ret_knn'] = df1['Daily PnL']
df['pred_ret_svm'] = df3['Daily PnL']
df['pred_ret_logit'] = df2['Daily PnL']

# Cumulative sum of the daily PnL, exponentiated, one line per classifier.
df[['pred_ret_svm', 'pred_ret_logit']].cumsum().apply(np.exp).plot(figsize=(15, 10))
plt.savefig('USDZAR.png')