def evaluateRandomForest():
    """Evaluate a random forest regressor on every dataset in regDatasets.

    For each dataset: loads it via importDataset, runs k-fold cross-validation
    using the module-level `kfold` splitter, averages the regression metrics
    over the folds, appends a summary row to the module-level `regResults`
    list and prints the averages.
    """
    print("\nEvaluating Random Forest")
    regResults.append(["Results for Random Forest"])
    # Hoisted out of the loops: the import is invariant across folds/datasets
    # (the original re-imported it on every fold).
    from randomForest import randomForest
    for data in regDatasets:
        # Import the dataset and separate features (X) and target (y).
        data_to_test = "regression/" + data + '.csv'
        X_before, y_before = importDataset(data_to_test)
        count = 0
        avg_explained_variance_score = 0
        avg_max_error = 0
        avg_mae = 0
        avg_mse = 0
        avg_r2_score = 0
        for train, test in kfold.split(X_before):
            print("Test:", count + 1, "for", data_to_test)
            X_train, X_test = X_before.iloc[train], X_before.iloc[test]
            y_train, y_true = y_before[train], y_before[test]
            # Feature scaling: fit on the training fold only, then apply the
            # same transform to the test fold (avoids train/test leakage).
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            # Train the model on this fold and collect its predictions.
            rfModel = randomForest(X_train, y_train, X_test, y_true, X_before)
            predictions = rfModel.getPredictions()
            # Accumulate per-fold metrics; averaged after the loop.
            avg_explained_variance_score += metrics.explained_variance_score(y_true, predictions)
            avg_max_error += metrics.max_error(y_true, predictions)
            avg_mae += metrics.mean_absolute_error(y_true, predictions)
            avg_mse += metrics.mean_squared_error(y_true, predictions)
            avg_r2_score += metrics.r2_score(y_true, predictions)
            count += 1
        # Average the accumulated metrics over the number of folds run.
        avg_explained_variance_score = avg_explained_variance_score / count
        avg_max_error = avg_max_error / count
        avg_mae = avg_mae / count
        avg_mse = avg_mse / count
        avg_r2_score = avg_r2_score / count
        regResults.append(['', data_to_test, float(avg_explained_variance_score), float(avg_max_error), float(avg_mae), float(avg_mse), float(avg_r2_score)])
        print("Random Forest evaluation results")
        print("Average explained variance score:", avg_explained_variance_score)
        print("Average mean absolute error:", avg_mae)
        print("Average mean squared error:", avg_mse)
        print("Average r2 score:", avg_r2_score)
def evaluateANN():
    """Evaluate the sequential ANN regressor on every dataset in regDatasets.

    For each dataset: loads it via importDataset, runs k-fold cross-validation
    using the module-level `kfold` splitter, averages the regression metrics
    (most supplied by the model's getEvaluationMetrics, r2 computed here) over
    the folds, appends a summary row to `regResults` and prints the averages.
    """
    regResults.append(["Results for ANN"])
    # Hoisted out of the loops: the import is invariant across folds/datasets
    # (the original re-imported it on every fold).
    from regressionAnalysis import sequentialNN
    for data in regDatasets:
        # Import the dataset and separate features (X) and target (y).
        data_to_test = "regression/" + data + '.csv'
        X_before, y_before = importDataset(data_to_test)
        count = 0
        avg_explained_variance_score = 0
        avg_max_error = 0
        avg_mae = 0
        avg_mse = 0
        avg_r2_score = 0
        for train, test in kfold.split(X_before):
            print("Test:", count + 1, " for", data_to_test)
            X_train, X_test = X_before.iloc[train], X_before.iloc[test]
            y_train, y_true = y_before[train], y_before[test]
            # Feature scaling: fit on the training fold only, then apply the
            # same transform to the test fold (avoids train/test leakage).
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            # Train the ANN on this fold; it reports its own metrics.
            regressor = sequentialNN(X_train, y_train, X_test, y_true)
            exp_variance_score, max_error, loss, mae, mse = regressor.getEvaluationMetrics()
            # Accumulate per-fold metrics; averaged after the loop.
            avg_explained_variance_score += exp_variance_score
            avg_max_error += max_error
            avg_mae += mae
            avg_mse += mse
            avg_r2_score += metrics.r2_score(y_true, regressor.getPredictions())
            count += 1
        # Average the accumulated metrics over the number of folds run.
        avg_explained_variance_score = avg_explained_variance_score / count
        avg_max_error = avg_max_error / count
        avg_mae = avg_mae / count
        avg_mse = avg_mse / count
        avg_r2_score = avg_r2_score / count
        regResults.append(['', data_to_test, float(avg_explained_variance_score), float(avg_max_error), float(avg_mae), float(avg_mse), float(avg_r2_score)])
        print("ANN evaluation results")
        print("Average explained variance score:", avg_explained_variance_score)
        print("Average mean absolute error:", avg_mae)
        print("Average mean squared error:", avg_mse)
        print("Average r2 score:", avg_r2_score)
def selectDataset():
    """Interactively pick a regression dataset, import it and split/scale it.

    Prompts the user with a numbered menu (options 1-8 are bundled datasets,
    option 9 is a user-supplied path), then loads and splits the chosen file.

    Returns:
        Tuple (X_before, y_before, X_train, X_test, y_train, y_test, choice)
        where `choice` is the dataset path that was loaded.
    """
    print("Select a file to use:")
    print("1 - Regression Original 1437 rows")
    print("2 - Regression Balanced (83% deleted)")
    print("3 - Regression Encoded variables all 1437 rows")
    print("4 - Regression Encoded variables balanced (83% deleted)")
    print("5 - Regression no 365 days")
    print("6 - Regression only 365 days")
    print("7 - Regression only synthetic 3211 rows")
    print("8 - Regression synthetic plus 365 days")
    print("9 - Another dataset")
    # Menu number -> dataset path (option 9 is a custom path, handled below).
    paths = {
        1: "regression/regAll.csv",
        2: "regression/regBalanced.csv",
        3: "regression/regEncoded.csv",
        4: "regression/regEncodedBalanced.csv",
        5: "regression/regNo365.csv",
        6: "regression/regOnly365.csv",
        7: "regression/regSynthetic.csv",
        8: "regression/regSyntheticWith365.csv",
    }
    number = 0
    acceptedDataset = False
    while acceptedDataset is False:
        try:
            number = int(input("Select number to import dataset: "))
        except ValueError:
            # BUG FIX: non-numeric input used to crash with ValueError even
            # though this loop exists to re-prompt; treat it as invalid instead.
            number = 0
        if number > 0 and number < 10:
            acceptedDataset = True
        else:
            print(
                "Invalid number, select a dataset by selecting its number (1 to 9)"
            )
    if number == 9:
        choice = input("input full path of dataset: ")
    else:
        choice = paths[number]
    print("dataset chosen:", choice)
    # Import the Dataset and separate X and y
    X_before, y_before = importDataset(choice)
    # Split the dataset into train/test sets and scale the features.
    X_train, X_test, y_train, y_test = splitAndScale(X_before, y_before)
    return X_before, y_before, X_train, X_test, y_train, y_test, choice
def findBestMatch():
    """Predict donor suitability for every recipient dataset with three models.

    Trains an ANN, a random forest and an SVR on the synthetic regression
    dataset, reloads the persisted model files, then runs each recipient row
    through all three models, appending one prediction row per donor to the
    module-level `predict_results` list.
    """
    print("\nEvaluating different recipients")
    X_before, y_before = importDataset('regression/regSyntheticWith365.csv')
    # NOTE(review): assumes splitAndScale fits the module-level `scaler` on
    # the training data — confirm, since that fitted scaler is reused below.
    X_train, X_test, y_train, y_true = splitAndScale(X_before, y_before)
    # Train each model on the synthetic dataset, then reload the persisted copy.
    from regressionAnalysis import sequentialNN
    sequentialNN(X_train, y_train, X_test, y_true)
    ann = tf.keras.models.load_model('models/ann.h5')
    from randomForest import randomForest
    randomForest(X_train, y_train, X_test, y_true, X_before)
    rf = joblib.load('models/rf.sav')
    from svr import svr
    svr(X_train, y_train, X_test, y_true)
    # Loaded into a distinct name: the original rebound `svr`, shadowing the
    # imported svr() function just called above.
    svrModel = joblib.load('models/svr.sav')
    MLmodels = [ann, rf, svrModel]
    for data in recipientDatasets:
        predict_results.append([data])
        print("Predicting for", data)
        dataset = pd.read_csv('datasets/' + data + '.csv')
        to_predict = dataset.iloc[:, :-1].values
        count = 1
        for row in to_predict:
            # BUG FIX: the original called scaler.fit_transform(row.reshape(-1, 1)),
            # which re-fit the scaler on a single recipient and standardized
            # across that row's *features*. Apply the scaler fitted on the
            # training data to the row as one (1, n_features) sample instead.
            transform = scaler.transform(row.reshape(1, -1))
            prediction = ['', 'donor' + str(count)]
            for model in MLmodels:
                new_pred = model.predict(transform)
                if 'Sequential' in str(type(model)):
                    # The Keras model returns a 2-D array; sklearn models 1-D.
                    prediction.append(new_pred[0][0])
                else:
                    prediction.append(new_pred[0])
            predict_results.append(prediction)
            count += 1
    # NOTE(review): the actual CSV write is not in this function — presumably
    # done elsewhere from predict_results; verify before trusting this message.
    print('Predictions saved to file RecipientsPredictions.csv')