"""Load the pKa regression data, split it into train/test sets, and define
an evaluation helper that reports RMSE and R^2 for a fitted model."""

import os

import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency)
# and only fall back to the vendored copy on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from data_utils import DataUtils

# basic_only | acidic only
# NOTE(review): the second option is presumably spelled "acidic_only" --
# confirm against DataUtils.get_regression_data.
DATA_CATEGORY = "basic_only"
FEATURE_TYPE = "morgan+macc"

# Resolve the data file relative to this script rather than the current
# working directory, so the script can be launched from anywhere.
cur_dir = os.path.dirname(__file__)
d_utils = DataUtils(filepath=os.path.join(cur_dir, "data/pKaInWater.csv"))
X_data, y_data = d_utils.get_regression_data(
    data_category=DATA_CATEGORY,
    feature_type=FEATURE_TYPE,
)

# train test split
# A fixed seed keeps the 80/20 split reproducible across runs.
seed = 7
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=seed,
)

print("\n ========================= \n")
print("X_train.shape:", X_train.shape, "X_test.shape", X_test.shape)
print("\n ========================= \n")


def model_evaluation(model, x_input, y_input):
    """Score a fitted regressor on the given inputs.

    Args:
        model: Any object exposing ``predict(x_input)``.
        x_input: Feature data passed straight to ``model.predict``.
        y_input: Ground-truth targets compared against the predictions.

    Returns:
        A ``(rmse_value, r2_value)`` tuple: the root mean squared error and
        the R^2 score of the predictions against ``y_input``.
    """
    y_pred = model.predict(x_input)
    # RMSE is the square root of scikit-learn's mean squared error.
    rmse_value = np.sqrt(mean_squared_error(y_true=y_input, y_pred=y_pred))
    r2_value = r2_score(y_true=y_input, y_pred=y_pred)
    return rmse_value, r2_value