# Imports assumed by the functions below; Log, make_data, add_result, config,
# output_path, split_data and the create_* helpers are repo-internal and
# defined elsewhere.
import os

import numpy as np
from sklearn import svm, tree
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from keras.layers import LSTM
from keras.models import Sequential
from keras.utils import to_categorical


def fit_naive_bayes(asset, y_col, X_col, window_size, results):
    y, X = make_data(asset, response_col=y_col, input_col=X_col,
                     window_size=window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    Log.info("*** Gaussian Naive Bayes")
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    results = add_result(results, "Gaussian Naive Bayes", y_col, X_col,
                         window_size, score)
    return results
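# `make_data` and `add_result` live elsewhere in the repo. A minimal sketch of
# what `add_result` plausibly does (hypothetical, only to make the functions
# below easier to read): append one scored configuration and return the list.
def _add_result_sketch(results, model_name, y_col, X_col, window_size, score):
    results.append({"model": model_name, "response": y_col, "input": X_col,
                    "window": window_size, "score": score})
    return results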
def fit_adaboost(asset, y_col, X_col, window_size, results):
    y, X = make_data(asset, response_col=y_col, input_col=X_col,
                     window_size=window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    Log.info("*** AdaBoost")
    clf = AdaBoostClassifier(n_estimators=100, random_state=0)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    results = add_result(results, "AdaBoost", y_col, X_col, window_size, score)
    return results
def fit_KNN(asset, y_col, X_col, window_size, results):
    y, X = make_data(asset, response_col=y_col, input_col=X_col,
                     window_size=window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    Log.info("*** KNN")
    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    results = add_result(results, "KNN", y_col, X_col, window_size, score)
    return results
def fit_support_vector_machines(asset, y_col, X_col, window_size, results):
    y, X = make_data(asset, response_col=y_col, input_col=X_col,
                     window_size=window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    Log.info("*** Support Vector Machines")
    clf = svm.SVC(gamma='scale', decision_function_shape='ovo')
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    results = add_result(results, "Support Vector Machines", y_col, X_col,
                         window_size, score)
    return results
def fit_binary_logistic_regression(asset, y_col, X_col, window_size, results):
    y, X = make_data(asset, response_col=y_col, input_col=X_col,
                     window_size=window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    Log.info("*** Binary logistic regression")
    clf = LogisticRegression(penalty='l2', solver='lbfgs')
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    results = add_result(results, "Binary logistic regression", y_col, X_col,
                         window_size, score)
    return results
def fit_decision_trees(asset, y_col, X_col, window_size, results):
    y, X = make_data(asset, response_col=y_col, input_col=X_col,
                     window_size=window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    Log.info("*** Decision Trees")
    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    results = add_result(results, "Decision Trees", y_col, X_col, window_size,
                         score)
    return results
def fit_multinomial_logistic_regression(asset, y_col, X_col, window_size,
                                        results):
    y, X = make_data(asset, response_col=y_col, input_col=X_col,
                     window_size=window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    Log.info("*** Multinomial logistic regression")
    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                             max_iter=1000)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    results = add_result(results, "Multinomial logistic regression", y_col,
                         X_col, window_size, score)
    return results
def fit_bagging_logistic_regression(asset, y_col, X_col, window_size, results):
    y, X = make_data(asset, response_col=y_col, input_col=X_col,
                     window_size=window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    Log.info("*** Bagging (Logistic Regression)")
    clf = BaggingClassifier(
        LogisticRegression(solver='lbfgs', multi_class='multinomial'),
        n_estimators=5, max_samples=0.5, max_features=0.5)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    results = add_result(results, "Bagging (log. regression)", y_col, X_col,
                         window_size, score)
    return results
def fit_ANN(asset, y_col, X_col, window_size, results):
    y, X = make_data(asset, response_col=y_col, input_col=X_col,
                     window_size=window_size)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    Log.info("*** ANN")
    # Scale the inputs; the MLP is sensitive to feature scale.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(6, 2),
                        random_state=1)
    clf.fit(X_train_scaled, y_train)
    score = clf.score(X_test_scaled, y_test)
    results = add_result(results, "ANN(6,2)", y_col, X_col, window_size, score)
    return results
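# Usage sketch (an assumption, not part of the original module): all fit_*
# classifiers above share one signature, so a small driver can sweep response
# columns, input columns and window sizes and accumulate every score in one
# results collection. `asset` is whatever object `make_data` expects.
def run_classifier_grid(asset, y_cols, X_cols, window_sizes):
    """Hypothetical driver: fit every classifier over a parameter grid."""
    fitters = [fit_naive_bayes, fit_adaboost, fit_KNN,
               fit_support_vector_machines, fit_binary_logistic_regression,
               fit_decision_trees, fit_multinomial_logistic_regression,
               fit_bagging_logistic_regression, fit_ANN]
    results = []
    for y_col in y_cols:
        for X_col in X_cols:
            for window_size in window_sizes:
                for fitter in fitters:
                    results = fitter(asset, y_col, X_col, window_size, results)
    return results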
def fit_LSTM_model_signal(asset, response_var, input_vars, window_size,
                          model_layers, epochs=3, outfile=None):
    Log.info("Fitting categorical LSTM model %s",
             str((response_var, input_vars, window_size, model_layers, epochs)))
    # Create input features if necessary
    input_cols = []
    for input_col in input_vars:
        asset = create_input_data(asset, input_col)
        input_cols.append(input_col)
    # Create response features if necessary
    response_col = make_response_col(response_var)
    asset = create_response_data(asset, response_var)
    # Build the response vector y and the input matrix X
    X, y = make_data(asset, response_col, input_cols, window_size,
                     config().days())
    # Stack the windowed inputs into a 3-D array (samples, time, features)
    X = np.array([x_i.values for x_i in X])
    # Shift the signal from {-1, 0, 1} to {0, 1, 2} and one-hot encode it
    y = to_categorical(np.array(y) + 1, num_classes=3)
    # Split into training, validation and test sets
    X_val_train, X_test, y_val_train, y_test = split_data(X, y)
    X_train, X_validate, y_train, y_validate = train_test_split(
        X_val_train, y_val_train)
    model_name = "LSTM_Signal_" + str(int(np.random.uniform(10000, 99999)))
    # Training is pointless if either split contains only one class
    if len(np.unique(np.argmax(y_validate, axis=1))) == 1 or len(
            np.unique(np.argmax(y_train, axis=1))) == 1:
        Log.error("Only one class in y_train or y_validate. Skipping training.")
        if outfile is not None:
            outfile.write("%s;%s;%s;%s;%s;%d;%d;%s;%s;%s;%s\n" %
                          (asset.symbol, response_var, str(input_vars),
                           model_name, str(model_layers), len(X_train),
                           len(X_test), "n/a", "n/a", "n/a", "n/a"))
            outfile.flush()
        return asset
    # Construct the model: stacked LSTM layers, then a final LSTM layer that
    # collapses the sequence into a 3-way softmax over the signal classes
    model = Sequential()
    for layer in model_layers:
        model.add(LSTM(layer, return_sequences=True))
    model.add(LSTM(3, activation="softmax"))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adagrad',
                  metrics=['categorical_accuracy'])
    # Train, logging test-set metrics after each epoch
    logfile = open(os.path.join(output_path(), model_name + ".log"), "w")
    model.fit(X_train, y_train, epochs=epochs, validation_split=0.2,
              callbacks=[TestCallback(X_test, y_test, logfile)])
    logfile.close()
    # Evaluate the model on the validation split
    loss, accuracy = model.evaluate(X_validate, y_validate)
    # predict_classes was removed in newer Keras; taking the argmax of the
    # softmax output is the portable equivalent
    y_predicted_class = np.argmax(model.predict(X_validate), axis=1)
    y_true_class = np.argmax(y_validate, axis=1)
    precision = precision_score(y_true_class, y_predicted_class,
                                average="macro")
    recall = recall_score(y_true_class, y_predicted_class, average="macro")
    # Write results to file
    if outfile is not None:
        outfile.write("%s;%s;%s;%s;%s;%d;%d;%.4f;%.4f;%.4f;%.4f\n" %
                      (asset.symbol, response_var, str(input_vars), model_name,
                       str(model_layers), len(X_train), len(X_test), loss,
                       accuracy, precision, recall))
        outfile.flush()
    # Save the trained model to file
    model.save(os.path.join(output_path(), model_name + ".h5"))
    return asset
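# `TestCallback` is referenced above but not defined in this module. A
# plausible sketch, assuming it evaluates the held-out test set after every
# epoch and appends the metrics to the supplied log file (the class name comes
# from the call site; the body is an assumption):
from keras.callbacks import Callback


class TestCallback(Callback):
    def __init__(self, X_test, y_test, logfile):
        super().__init__()
        self.X_test = X_test
        self.y_test = y_test
        self.logfile = logfile

    def on_epoch_end(self, epoch, logs=None):
        # self.model is attached by Keras during fit()
        loss, acc = self.model.evaluate(self.X_test, self.y_test, verbose=0)
        self.logfile.write("epoch %d; test_loss %.4f; test_acc %.4f\n"
                           % (epoch, loss, acc))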