def predict(course_code, user_id):
    """Return the class score distribution and one user's predicted final score.

    The scaler and the model are cached per course in the module-level
    ``data_transformer`` and ``model_cache`` dictionaries. The model is
    loaded from disk when a saved file exists, otherwise it is trained on
    the scaled course data and saved.

    :param course_code: identifier of the course whose data/model is used
    :param user_id: index label of the user inside the course feature frame
    :return: dict with the histogram of predicted scores for the whole class,
        the user's chapter grades and the user's predicted final exam score
    :raises KeyError: if ``user_id`` is not present in the course data
    """
    filename = get_path(course_code, '%s_model.xgb' % course_code)
    X, y = load_data(course_code)

    # Normalization: fit the scaler once per course and reuse it afterwards.
    if course_code not in data_transformer:
        scaler = MinMaxScaler()
        scaler.fit(X)
        data_transformer[course_code] = scaler
    scaler = data_transformer[course_code]

    # Scale exactly once. The previous code scaled X a second time after
    # training, and predicted the user's score from the *unscaled* row.
    X_scaled = scaler.transform(X)
    # Double brackets keep the row 2-D, which is what transform/predict expect.
    user_X = scaler.transform(X.loc[[user_id]])

    if course_code not in model_cache:
        model = XGBRegressor()
        if os.path.isfile(filename):
            model.load_model(filename)
        else:
            model.fit(X_scaled, y)
            model.save_model(filename)
        model_cache[course_code] = model
    model = model_cache[course_code]

    y_ = model.predict(X_scaled)
    hist, _ = np.histogram(y_, bins=10, range=[0, 1])
    return {
        "classFinalExamDistribution": hist.tolist(),
        "myChapterScore": get_user_chapter_grades(course_code, user_id),
        "myPredictedFinalExamScore": float(model.predict(user_X)[0])
    }
def __init__(self, json_file):
    """Load an XGBRegressor from ``json_file`` and the input list stored next to it.

    Sets ``self.name`` (derived from the last four path components),
    ``self.model`` (the loaded regressor) and ``self.inputs`` (the input
    names declared by ``inputs_for_models_in_this_dir`` in the model's
    directory, renamed through ``var_names_at_KIT`` where a mapping exists).
    """
    # Build an identifier such as "XGB_a_b_c_model" from the path tail.
    path_tail = json_file.split("/")[-4:]
    label = "_".join(path_tail).replace("/", "_").replace('.json', '')
    self.name = ("XGB_" + label).replace("-", "_")

    # load json and create model
    regressor = XGBRegressor()
    regressor.load_model(json_file)
    print("Loaded XGBRegressor model from disk:")
    print("\t{}".format(json_file))
    self.model = regressor

    # load list of inputs for the model
    # rstrip() removes trailing *characters* of the file name; it stops at the
    # last "/" because a "/" can never be part of the file name itself.
    file_name = json_file.split('/')[-1]
    sys.path.insert(0, json_file.rstrip(file_name))
    import inputs_for_models_in_this_dir
    # avoid being stuck with previous versions
    reload(inputs_for_models_in_this_dir)

    declared_inputs = inputs_for_models_in_this_dir.inputs
    self.inputs = [
        var_names_at_KIT.get(input_name, input_name)
        for input_name in declared_inputs
    ]
def main():
    """Train or load the example model, report per-era scores, write a submission."""
    print("Loading data...")
    # The training data is used to train the model how to predict the targets;
    # the tournament data is what Numerai uses to evaluate the model.
    training_data = read_csv("numerai_training_data.csv")
    tournament_data = read_csv("numerai_tournament_data.csv")

    feature_names = [
        column for column in training_data.columns
        if column.startswith("feature")
    ]
    print(f"Loaded {len(feature_names)} features")

    # This is the model that generates the included example predictions file.
    # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make this
    # run faster. Remember to delete example_model.xgb if you change any of
    # the parameters below.
    model = XGBRegressor(max_depth=5,
                         learning_rate=0.01,
                         n_estimators=2000,
                         n_jobs=-1,
                         colsample_bytree=0.1)
    if not MODEL_FILE.is_file():
        print("Training model...")
        model.fit(training_data[feature_names], training_data[TARGET_NAME])
        model.save_model(MODEL_FILE)
    else:
        print("Loading pre-trained model...")
        model.load_model(MODEL_FILE)

    # Generate predictions on both training and tournament data
    print("Generating predictions...")
    for frame in (training_data, tournament_data):
        frame[PREDICTION_NAME] = model.predict(frame[feature_names])

    # Check the per-era correlations on the training set (in sample)
    train_correlations = training_data.groupby("era").apply(score)
    print(
        f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}"
    )
    print(
        f"On training the average per-era payout is {payout(train_correlations).mean()}"
    )

    # Check the per-era correlations on the validation set (out of sample)
    is_validation = tournament_data["data_type"] == "validation"
    validation_data = tournament_data[is_validation]
    validation_correlations = validation_data.groupby("era").apply(score)
    print(
        f"On validation the correlation has mean {validation_correlations.mean()} and "
        f"std {validation_correlations.std()}")
    print(
        f"On validation the average per-era payout is {payout(validation_correlations).mean()}"
    )

    # Save predictions as a CSV and upload to https://numer.ai
    submission_path = TOURNAMENT_NAME + "_submission.csv"
    tournament_data[PREDICTION_NAME].to_csv(submission_path)
class GDPGrowthPredictor:
    """Thin wrapper around an XGBRegressor with train/test/predict/save/load."""

    def __init__(self, *args, **kwargs):
        """Create the underlying XGBRegressor with the given parameters."""
        self.model = XGBRegressor(*args, **kwargs)

    def train(self, filename, split, previous_year, plot, *args, **kwargs):
        """Fit the model, save it to ``filename`` and test it when ``split != 0``.

        Extra positional/keyword arguments are forwarded to ``fit``.
        """
        dataset = _io.retrieve_training_dataset(split, previous_year)
        X_train, X_test, y_train, y_test, features = dataset
        self.model.fit(X_train, y_train, *args, **kwargs)
        self.save(filename)
        if split == 0:
            # No hold-out data was requested, so there is nothing to evaluate.
            return
        self.test(X_test, y_test, features, split, plot)

    def test(self, X_test, y_test, features, split, plot):
        """Log RMSE and R^2 on the test set and optionally draw the plots."""
        model_y_pred = self.model.predict(X_test)

        # Per-row error table built from the non-feature columns of X_test.
        results_df = X_test.drop(columns=features)
        results_df["y_real"] = y_test
        results_df["y_pred"] = model_y_pred
        results_df["err"] = np.absolute(results_df["y_real"] -
                                        results_df["y_pred"])
        results_df["%_err"] = ((results_df["err"]) /
                               (np.absolute(results_df["y_real"])) * 100)

        rmse = mean_squared_error(y_test, model_y_pred)**0.5
        logging.info("Test results with %s split:", split)
        logging.info("\t RMSE: %.3f", rmse)
        logging.info("\t R^2: %.3f", r2_score(y_test, model_y_pred))

        if not plot:
            return
        logging.info("Generating plots")
        plots.plot_performance_results(y_test, model_y_pred)
        plots.plot_shap_results(X_test, features, self.model)

    def predict(self, filename, previous_year, year, *args, **kwargs):
        """Load the model from ``filename`` and predict; returns a pandas df.

        The predictions are written to the ``"Value"`` column of the frame
        returned by ``_io.retrieve_predict_dataset``.
        """
        self.load(filename)
        predictions, X_predict = _io.retrieve_predict_dataset(
            previous_year, year)
        predicted_values = self.model.predict(X_predict, *args, **kwargs)
        predictions["Value"] = predicted_values
        return predictions

    def save(self, filename):
        """Save the model to ``filename``."""
        self.model.save_model(filename)
        logging.info("Model saved")

    def load(self, filename):
        """Load the model from ``filename``."""
        self.model.load_model(filename)
        logging.info("Model loaded")
def get_regressors():
    """Load the two saved regressors from ``STORAGE`` and return them as ``(a, b)``."""
    loaded = []
    for file_name in ("modela.xgb", "modelb.xgb"):
        regressor = XGBRegressor()
        regressor.load_model(os.path.join(STORAGE, file_name))
        loaded.append(regressor)
    a, b = loaded
    return a, b
def pred_psm(self, path, main_df_col, historical_postal_code_area,
             area_centroids, sch_gdf, train_gdf, police_centre_gdf,
             avg_cases_by_npc):
    '''
    Scale the property's features, run the saved XGBoost model and convert the result.

    :param path: takes in path where model weights and scalers are stored
    :param main_df_col: list of training dataset column names so that prediction df tallies
    :param historical_postal_code_area: to get planning area/region of postal code if it is in our dataset
    instead of using distance measures due to differences in areas returned for some postal codes
    :param area_centroids: to get the planning area property is in
    :param sch_gdf: to get nearest school distance
    :param train_gdf: to get nearest stations/lines
    :param police_centre_gdf: to get nearest police centre
    :param avg_cases_by_npc: to get avg crime cases per year for nearest police centre
    :return: the model's first prediction divided by 10.7639 (the SQM -> SQFT conversion)
    '''
    property_df = self.convert_to_df(main_df_col,
                                     historical_postal_code_area,
                                     area_centroids, sch_gdf, train_gdf,
                                     police_centre_gdf, avg_cases_by_npc)

    s_scaler = joblib.load(path + 'standard_scaler.bin')
    mm_scaler = joblib.load(path + 'mm_scaler.bin')

    standardScale_vars = [
        'Area (SQM)', 'Floor Number', 'PPI', 'Average Cases Per Year',
        'Nearest Primary School', 'nearest_station_distance'
    ]
    minMax_vars = ['Remaining Lease']

    def _scaled_frame(scaler, columns):
        # Transform a copy of the selected columns and keep their names.
        selected = property_df.loc[:, columns].copy()
        return pd.DataFrame(scaler.transform(selected), columns=columns)

    s_scaled = _scaled_frame(s_scaler, standardScale_vars)
    mm_scaled = _scaled_frame(mm_scaler, minMax_vars)

    # Columns from 'Ang Mo Kio' through 'Executive Condominium' are passed unscaled.
    unscaled = property_df.loc[:, 'Ang Mo Kio':'Executive Condominium'].copy()
    property_df_scaled = pd.concat([s_scaled, mm_scaled, unscaled], axis=1)

    # Initialize the model and load its saved weights
    model = XGBRegressor()
    model.load_model(path + 'model_xgboost.bin')

    # Use the loaded model to make predictions, then convert SQM to SQFT
    prediction = model.predict(property_df_scaled)[0]
    return prediction / 10.7639
class XGBModel(GenericModel):
    """GenericModel backed by an XGBoost classifier or regressor."""

    def __init__(self, name, version=1, classifier=True, xgb_kwargs=None):
        """Create the estimator.

        :param name: model name, forwarded to ``GenericModel``
        :param version: model version, forwarded to ``GenericModel``
        :param classifier: build an ``XGBClassifier`` when true, otherwise
            an ``XGBRegressor``
        :param xgb_kwargs: keyword arguments for the estimator; ``None``
            means "use the estimator defaults"
        """
        super().__init__(name, version)
        self.xgb_kwargs = xgb_kwargs
        # ``**None`` raises TypeError, so the default must become an empty dict.
        kwargs = {} if xgb_kwargs is None else xgb_kwargs
        estimator_cls = XGBClassifier if classifier else XGBRegressor
        self.model = estimator_cls(**kwargs)

    def train(self):
        """Placeholder: training is done by calling ``self.model.fit(...)``."""
        print(
            'No custom train method implemented. Instead call self.model.fit(...)'
        )

    def save_model(self,
                   notes=None,
                   update_version=False,
                   config=None,
                   save_attributes=True):
        """Save the estimator as ``v<version>.json`` plus optional metadata.

        :param notes: saved through ``_save_notes`` when not ``None``
        :param update_version: bump ``self.version`` before saving
        :param config: saved through ``_save_config`` when not ``None``
        :param save_attributes: call ``_save_attributes`` after saving
        :raises Exception: any error raised while saving is printed and
            re-raised
        """
        if update_version:
            self.version += 1
        try:
            model_path = self.model_dir / Path(f'v{self.version}.json')
            self.model.save_model(model_path.as_posix())
        except Exception as e:
            print('Error saving model')
            print(e)
            raise
        if save_attributes:
            self._save_attributes()
        if notes is not None:
            self._save_notes(notes)
        if config is not None:
            self._save_config(config)

    def load_model(self, version, load_attributes=True):
        """Reload saved attributes and the estimator stored for ``self.version``.

        NOTE(review): ``version`` and ``load_attributes`` are accepted but not
        used; the path is built from ``self.version``. Confirm whether
        callers expect ``version`` to select the file.

        :raises AssertionError: if the model file does not exist
        """
        # First load the xgb_kwargs so that we can create a new instance of XGB
        self._load_attributes(self.attr_dir)
        if hasattr(self, 'xgb_kwargs'):
            # Rebuild from the estimator's class; the instance is not callable.
            self.model = type(self.model)(**(self.xgb_kwargs or {}))
        # Next load the model
        model_path = self.model_dir / Path(f'v{self.version}.json')
        assert model_path.exists(
        ), f'No model exists at {model_path.as_posix()}'
        # Pass a string path, matching what save_model() hands to XGBoost.
        self.model.load_model(model_path.as_posix())
def model_predict(s):
    """Predict with the saved model and merge the result into the history CSV.

    Adds a ``"predictions"`` column to ``s`` (in place), appends the current
    predictions to ``data/prediction/h_predictions.csv`` (rows that are
    already present for the same Date/shop_id/item_id are kept, new
    duplicates dropped) and returns the Date/shop_id/item_id/predictions
    columns of ``s``.

    :param s: DataFrame holding the feature columns listed below plus "Date"
    :return: DataFrame view of ``s`` with the four output columns
    """
    param = {
        'colsample_bytree': 0.8,
        'subsample': 0.75,
        'eta': 0.02,
        'n_estimators': 1100,
        'max_depth': 7,
        'min_child_weight': 1
    }
    model = XGBRegressor(**param)
    try:
        model.load_model("./models/xgbmodelprime")
    except Exception:
        # Best-effort fallback to the base model; ``Exception`` (instead of a
        # bare except) lets KeyboardInterrupt/SystemExit propagate.
        model.load_model("./models/xgbmodel")

    feature_columns = [
        "date_block_num", "shop_id", "item_id", "id_struct", "item_category",
        "Price_agg", "keyz", "item_cnt_month_lag1", "item_cnt_month_lag2",
        "item_cnt_month_lag3", "item_cnt_month_lag4", "item_cnt_month_lag5",
        "item_cnt_month_lag6", "item_cnt_month_lag7", "Price_agg_lag1",
        "Price_agg_lag2"
    ]
    y_pred = model.predict(s[feature_columns])

    # create current preds file
    output_columns = ["Date", "shop_id", "item_id", "predictions"]
    history_path = "data/prediction/h_predictions.csv"
    s["predictions"] = y_pred
    # Explicit copy so the dtype change below never targets a slice of ``s``.
    c_pred = s[output_columns].copy()
    c_pred["Date"] = c_pred["Date"].astype("str")

    try:
        h_preds = pd.read_csv(history_path)
    except Exception:
        # History file missing or unreadable: start over from an empty one.
        hp_df = pd.DataFrame({
            'Date': pd.Series([], dtype='str'),
            'shop_id': pd.Series([], dtype='int'),
            'item_id': pd.Series([], dtype='int'),
            'predictions': pd.Series([], dtype='float')
        })
        hp_df["Date"] = pd.to_datetime(hp_df["Date"])
        hp_df.to_csv(history_path, index=False)
        h_preds = pd.read_csv(history_path)

    new_dff = pd.concat([h_preds[output_columns], c_pred], axis=0, sort=False)
    new_dff1 = new_dff.drop_duplicates(["Date", "shop_id",
                                        "item_id"]).reset_index(drop=True)
    new_dff1.to_csv(history_path, index=False)
    return s[output_columns]
def main():
    """Scale the course data, load or train the model and print its predictions."""
    course = 'VJx__VJx_2__3T2016'
    filename = 'model.xgb'
    X, y = load_data(course)

    # Normalization
    scaler = MinMaxScaler()
    X = scaler.fit(X).transform(X)

    model = XGBRegressor()
    if not os.path.isfile(filename):
        # No saved model yet: train on the scaled data and persist it.
        model.fit(X, y)
        model.save_model(filename)
    else:
        model.load_model(filename)

    y_ = model.predict(X)
    print(y_)
class XGBModel(Model):
    """Model implementation backed by a GPU-trained XGBRegressor."""

    def Build(self):
        """Create a fresh regressor with the fixed hyper-parameters."""
        hyper_params = dict(max_depth=10,
                            n_estimators=1000,
                            objective='reg:squarederror',
                            seed=config.random_state,
                            nthread=12,
                            tree_method='gpu_hist')
        self.model = XGBRegressor(**hyper_params)

    def Load(self, fileName):
        """Rebuild the regressor and load its weights from ``fileName + '.xgb'``."""
        self.Build()
        self.model.load_model(fileName + '.xgb')

    def Save(self, fileName):
        """Save the regressor to ``fileName + '.xgb'``."""
        self.model.save_model(fileName + '.xgb')

    def Fit(self, X_trn, y_trn, X_tst, y_tst, plot=False):
        """Fit with RMSE early stopping; optionally plot train/test loss curves."""
        # validation_0 is the training set, validation_1 the test set.
        evaluation_sets = [(X_trn, y_trn), (X_tst, y_tst)]
        self.model.fit(X_trn,
                       y_trn,
                       eval_metric='rmse',
                       eval_set=evaluation_sets,
                       verbose=True,
                       early_stopping_rounds=50)
        if not plot:
            return
        history = self.model.evals_result()
        train_curve = history['validation_0']['rmse']
        test_curve = history['validation_1']['rmse']
        plot_loss(train_curve, test_curve)

    def Predict(self, X):
        """Return predictions for ``X`` as a column vector of shape (-1, 1)."""
        raw = self.model.predict(X)
        return raw.reshape(-1, 1)
def bulid_models(x_train, y_train, x_test, y_test, best_grida, best_gridb):
    """Retrain models A and B and overwrite each saved model that is not beaten.

    For each target column (0 -> model A, 1 -> model B) the saved model's
    mean absolute error on the test set is the baseline. A new model is
    trained with the supplied grid parameters and saved when its MAE is lower
    than or equal to that baseline.

    :param x_train: training features
    :param y_train: 2-D training targets; column 0 is A, column 1 is B
    :param x_test: test features
    :param y_test: 2-D test targets with the same column layout
    :param best_grida: XGBRegressor keyword arguments for model A
    :param best_gridb: XGBRegressor keyword arguments for model B
    """
    root_folder = lib.features.STORAGE
    # (label, saved file, target column, hyper-parameters)
    targets = (
        ("A", os.path.join(root_folder, "modela.xgb"), 0, best_grida),
        ("B", os.path.join(root_folder, "modelb.xgb"), 1, best_gridb),
    )

    # Baselines first, so a missing saved model fails before any retraining.
    base_scores = {}
    for label, file_path, column, _ in targets:
        saved = XGBRegressor()
        saved.load_model(file_path)
        # Each baseline uses its OWN saved model (B used to be scored with
        # model A's predictions).
        base_scores[label] = mean_absolute_error(y_test[:, column],
                                                 saved.predict(x_test))

    for label, file_path, column, grid in targets:
        candidate = XGBRegressor(**grid)
        candidate = candidate.fit(x_train,
                                  y_train[:, column],
                                  eval_set=[(x_test, y_test[:, column])],
                                  early_stopping_rounds=100,
                                  verbose=False)
        score = mean_absolute_error(y_test[:, column],
                                    candidate.predict(x_test))
        print("score {} : {} vs {}".format(label, score, base_scores[label]))
        if score <= base_scores[label]:
            candidate.save_model(file_path)
            print("model {} saved !".format(label))
# 각 훈련별 loss값이 반환 aaa = model.score(x_test, y_test) # print("model.score : ", aaa) y_pred = model.predict(x_test) r2 = r2_score(y_test, y_pred) print("r2 : ", r2) # aaa : 0.9329663244922279 # r2 : 0.9329663244922279 # print("===========================") # result = model.evals_result() # print(result) #저장 import pickle # pickle.dump(model, open('../data/xgb_save/m39.pickle.dat','wb')) import joblib # joblib.dump(model,'../data/xgb_save/m39.joblib.dat') # model.save_model('../data/xgb_save/m39.xgb.model') # print('저장완료') #불러오기 # model2 = pickle.load(open('../data/xgb_save/m39.pickle.dat','rb')) # model2 = joblib.load('../data/xgb_save/m39.pickle.dat') model2 = XGBRegressor() model2.load_model('../data/xgb_save/m39.xgb.model') print('불러오기') r22 = model2.score(x_test, y_test) print('r22 : ', r22)
XGboost_model = XGBoosting( 0.7, # subsample 20, # max_depth 5, # min_samples_split 0.09, # learning_rate 'mae', # eval_metric 1, # num_parallel_tree 15) # number of trees XGboost_model.fit(X=x_train, y=y_train) XGboost_model.save_model('xgboost_model') if __name__ == '__main__': test_data = pd.read_csv('./x_test.csv') test_labels = pd.read_csv('./y_test.csv') # train_data = pd.read_csv('./x_train.csv') # train_labels = pd.read_csv('./y_train.csv') # build_models(train_data, train_labels) ''' Load the models from their files ''' XGboost_model = XGBRegressor() XGboost_model.load_model('xgboost_model') # boost_RF_model = XGBRegressor() # boost_RF_model.load_model('RF_model') # '''' Initiate score check on the XGBoost model ''' predictions = XGboost_model.predict(test_data) print(test_labels.values()) labels_arr = test_labels.to_numpy().reshape(-1) print(explained_variance_score(predictions, test_labels))
from xgboost import XGBRegressor import flask import locale import pandas as pd from df_schema import df_dict model = XGBRegressor() model.load_model('model/best_model.json') app = flask.Flask(__name__, template_folder='templates') def brl(value): locale.setlocale(locale.LC_ALL, 'pt_BR.UTF-8') return 'R$ {}'.format(locale.currency(value, grouping=True, symbol=False)) @app.route('/', methods=['GET', 'POST']) def main(): if flask.request.method == 'GET': return flask.render_template('main.html') if flask.request.method == 'POST': type_ = flask.request.form['type'] neighborhood = flask.request.form['neighborhood'] if type_ != 'Apartamento': df_dict[type_] = 1.
class Regressor:
    """Accumulate train/test points one by one and wrap an XGBRegressor.

    Change the tree method to gpu_hist (through ``params``) if you want
    xgboost to run on a GPU.
    """

    def __init__(self, params=None):
        """Initialize the train/test sets, the model and the result slots.

        :param params: keyword arguments for ``XGBRegressor``; ``None`` uses
            ``{'objective': 'reg:squarederror', 'verbosity': 0}``
        """
        # Build the default per call instead of sharing one mutable dict.
        if params is None:
            params = {'objective': 'reg:squarederror', 'verbosity': 0}
        self.X_train = []
        self.X_labels = []
        self.test = []
        self.test_labels = []
        self.model = XGBRegressor(**params)
        self.prediction = 0
        self.error = 0

    def size(self):
        """Return the number of training points.

        ``len`` counts rows for both the list and the ndarray form, so the
        answer no longer changes after ``train()`` converts the data.
        """
        return len(self.X_train)

    # adding the data points
    def input_train(self, features, feature):
        """Append one training sample (``features``) and its label (``feature``)."""
        # train() turns the containers into ndarrays; go back to lists so
        # append works (also for an empty ndarray).
        if isinstance(self.X_train, np.ndarray):
            self.X_train = self.X_train.tolist()
        if isinstance(self.X_labels, np.ndarray):
            self.X_labels = self.X_labels.tolist()
        self.X_train.append(features)
        self.X_labels.append(feature)

    # train the data
    def train(self):
        """Fit the model on everything added through ``input_train``."""
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        self.model.fit(self.X_train, self.X_labels)

    def train_eval(self, metric='error'):
        """Fit on a random 67/33 split and summarise the evaluation history.

        :param metric: evaluation metric passed to ``fit``
        :return: for ``'error'``, one minus the mean (over both evaluation
            sets) of each set's average error; otherwise a list with the
            last recorded metric value of each evaluation set
        """
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        X_train, X_test, y_train, y_test = train_test_split(self.X_train,
                                                            self.X_labels,
                                                            test_size=0.33)
        self.model.fit(X_train,
                       y_train,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       eval_metric=metric)
        evals_result = self.model.evals_result()
        if metric == 'error':
            validations = []
            for val in evals_result.values():
                lst = val.get("error")
                validations.append(sum(lst) / len(lst))
            return 1 - (sum(validations) / len(validations))
        return [val.get(metric)[-1] for val in evals_result.values()]

    # input test labels if you want to check accuracy
    def label(self, label):
        """Append one test label."""
        self.test_labels.append(label)

    def input_test(self, features):
        """Append one test sample."""
        # predict() converts ``self.test`` to an ndarray; undo that first.
        if isinstance(self.test, np.ndarray):
            self.test = self.test.tolist()
        self.test.append(features)

    # test data
    def predict(self):
        """Predict for all test samples, store and return the predictions."""
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict(self.test)
        return self.prediction

    # if you have the test labels you can check the error rate
    # (you want error close to 0)
    def check_error(self):
        """Return the mean absolute error of the last prediction."""
        self.test_labels = np.asarray(self.test_labels)
        self.error = metrics.mean_absolute_error(self.test_labels,
                                                 self.prediction)
        return self.error

    # save classifier
    def save_classifier(self, file):
        """Save the model to ``file``."""
        self.model.save_model(file)

    # open saved classifier
    def open_classifier(self, file):
        """Load the model from ``file``."""
        self.model.load_model(file)

    # removes all training data
    def clean_train(self):
        """Drop all training samples and labels."""
        self.X_train = []
        self.X_labels = []

    # removes all testing data
    def clean_test(self):
        """Drop all test samples and labels."""
        self.test = []
        self.test_labels = []
def load(name):
    """Return an XGBRegressor loaded from ``MODEL_DIRECTORY + name + ".json"``."""
    model_path = MODEL_DIRECTORY + name + ".json"
    regressor = XGBRegressor()
    regressor.load_model(model_path)
    return regressor
# Score the fitted model, then reload a saved copy and score it again.
y_pred = model.predict(x_test)

r2 = r2_score(y_test, y_pred)
print('r2 : ', r2)

result = model.evals_result()
# print(result)

# Save the model (three alternatives, all currently disabled)
import pickle
import joblib
# pickle.dump(model, open('../data/xgb_save/m_39.pickle.dat','wb')) # dump: same as save
# print('저장')

# joblib.dump(model,('../data/xgb_save/m_40.jolib.dat'))
# print('저장')

# model.save_model('../data/xgb_save/m_41.xgb.dat')
# print('저장')

# print('============================================================')

# # Load the model
# # model2 = pickle.load(open('../data/xgb_save/m_39.pickle.dat','rb'))
# model2 = joblib.load('../data/xgb_save/m_40.jolib.dat')
# Only the native XGBoost format is active; it loads into a fresh estimator.
model2 = XGBRegressor()
model2.load_model('../data/xgb_save/m_41.xgb.dat')

print('불러오기')
r22 = model2.score(x_test, y_test)
print(r22)
def load_model(model_name):
    """Return an XGBRegressor loaded from ``resources/<model_name>.json``."""
    model_path = f"resources/{model_name}.json"
    regressor = XGBRegressor()
    regressor.load_model(model_path)
    return regressor
plt.xticks(rotation=90, fontsize=12) plt.yticks(fontsize=12) plt.grid(alpha=0.6) plt.ylim(0, 0.6) plt.xlabel('Feature', fontsize=20) plt.ylabel('Importance', fontsize=20) plt.axes().set_axisbelow(True) plt.savefig( f'ModelComparison/plots/PROACT_{model_name}_feature_imprtance_last_30.png', quality=100, bbox_inches='tight') plt.show() i = 1 model = XGBRegressor() model.load_model( f'C:/Users/Ben/Desktop/Results/PROACT/XGB/models/model_{i}.model') model.feature_importances_ PROACT_feature_importances = pd.DataFrame( columns=['feature', 'importance', 'iter']) for i in range(60): try: temp = pd.DataFrame( dict(feature=columns, importance=a.feature_importances_, iter=i)) PROACT_feature_importances = pd.concat( [PROACT_feature_importances, temp]) except: pass aa = PROACT_feature_importances.groupby(
class BostonHouseFeatures(BaseModel):
    """Request body: one value per feature, in the order the model receives them."""

    crim: float  # per capita crime rate by town
    zn: float  # proportion of residential land zoned for lots over 25,000 sq.ft.
    indus: float  # proportion of non-retail business acres per town
    chas: float  # Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
    nox: float  # nitric oxides concentration (parts per 10 million)
    rm: float  # average number of rooms per dwelling
    age: float  # proportion of owner-occupied units built prior to 1940
    dis: float  # weighted distances to five Boston employment centres
    rad: float  # index of accessibility to radial highways
    tax: float  # full-value property-tax rate per $10,000
    ptratio: float  # pupil-teacher ratio by town
    b: float  # 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
    lstat: float  # % lower status of the population


# uvicorn boston_inference:app --reload
app = FastAPI()

# The model is loaded once, when the module is imported.
xgb = XGBRegressor()
xgb.load_model("xgbregressor_boston.json")


@app.post("/predict")
async def predict_house_price(features: BostonHouseFeatures):
    """Predict for one feature set and return it as ``{"price": <float>}``."""
    feature_values = list(features.dict().values())
    # The model is given a single row, hence the added leading axis.
    single_row = np.expand_dims(np.array(feature_values), axis=0)
    predicted = xgb.predict(single_row)
    return {"price": float(predicted[0])}
# importing packages import flask from flask import Flask import pandas as pd import pickle import xgboost from xgboost import XGBRegressor print(pickle.format_version) # importing model and features ml = XGBRegressor() #with open('model/xgb_new.pkl', 'rb') as file: # modelo_simples = pickle.load(file) ml.load_model('model/xgb_new.pkl') with open('model/features.names', 'rb') as file: features = pickle.load(file) app = Flask(__name__, template_folder='templates') @app.route('/', methods=['GET', 'POST']) def main(): if flask.request.method == 'GET': return flask.render_template('airbnb.html') if flask.request.method == 'POST': user_inputs = { 'Latitude': flask.request.form['latitude'], 'Longitude': flask.request.form['longitude'], 'Minimum Nights': flask.request.form['minimum_nights'], 'Available Days In A Year': flask.request.form['availability_365'],
col_list = list(template.columns)


def complete_cols(userinp):
    """Set to 1 every template column whose name contains ``userinp`` (lower-cased)."""
    for column in col_list:
        colname = str(column)
        if userinp.lower() in colname:
            template[colname] = 1


# converting the binary input
for flag_value, flag_column in (
        (quilts, 'has_quilts'),
        (logo, 'has_logo'),
        (chain, 'has_chain'),
        (otherdef, 'other_defects'),
        (odor, 'has_smell'),
):
    bininput(flag_value, flag_column)

# one-hot style inputs: switch on the matching template columns
for selection in (material, color, style, size, year, cond):
    complete_cols(selection)

template['acc_included'] = acc

# predict value
loaded_model = XGBRegressor(random_state=1)
loaded_model.load_model("xgb1_tunedCHANEL_alldata.model")

# The prediction array is printed as text and the brackets are stripped to
# obtain the single number it contains.
prediction = str(loaded_model.predict(template))
prediction = float(prediction.strip('[').strip(']'))

# FRONT-END: OUTPUT
st.markdown("---")
st.header("The current resale value of this bag is:")
st.title(f'$%10.f'%prediction)
from numpy.core.numeric import load
from preprocessing import load_data
from feature_engineering import *
from xgboost import XGBRegressor
import pandas as pd

# Destination of the generated predictions file.
prediction_path = "../Case_material/predictions/predictions.csv"

# Only the features are needed; the second element is ignored.
X_test, _ = load_data(train=False)

xgb_model = XGBRegressor(n_jobs=8)
xgb_model.load_model("models/xgb_handin.model")

predictions = xgb_model.predict(X_test)

# Write a single-column CSV without the index.
prediction_frame = pd.DataFrame({"Prediction": predictions})
prediction_frame.to_csv(prediction_path, sep=",", index=False)

print("First five predictions:", predictions[:5])
print("The predictions were place in:", prediction_path)
# Retrain on progressively smaller feature subsets: every feature importance
# of the fitted model is used once as the selection threshold.
thresholds = np.sort(model.feature_importances_)
print(thresholds)

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)

    selection_model = XGBRegressor()
    selection_model.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)
    y_pred = selection_model.predict(select_x_test)

    score = r2_score(y_test, y_pred)

    # "%.2f" prints two decimals; the previous "%2.f" printed none.
    print("thresh=%.3f, n = %d, R2 : %.2f%%" %
          (thresh, select_x_train.shape[1], score * 100.0))

# Save the full model, then reload it into a fresh estimator and re-score it.
model.save_model('./model/xgb_save/boston_rmse')
print("저장 됬다.")

model2 = XGBRegressor()
model2.load_model('./model/xgb_save/boston_rmse')
print("불러왔다.")

y_pred = model2.predict(x_test)
# r2_score is not symmetric: the true values come first, as in the loop above.
score = r2_score(y_test, y_pred)
print("score : ", score)
def linear_model(x_train, y_train):
    """Fit a LinearRegression, print its score on the training data, return it."""
    reg = LinearRegression()
    reg.fit(x_train, y_train)
    print(reg.score(x_train, y_train))
    return reg


if __name__ == '__main__':
    # build the models and put them into files
    train_data = pd.read_csv('./x_train.csv')
    train_labels = pd.read_csv('./y_train.csv')
    # build_models(train_data, train_labels)

    # Load the models from their files
    XGboost_model = XGBRegressor()
    XGboost_model.load_model('xgboost_model')
    boost_RF_model = XGBRegressor()
    boost_RF_model.load_model('RF_model')

    # Initiate score check on the XGBoost model
    test_data = pd.read_csv('./x_test.csv')
    test_labels = pd.read_csv('./y_test.csv')

    # Put test_labels into an array (first column, rounded)
    first_label_column = test_labels.iloc[:, 0]
    y_test_num = [round(label) for label in first_label_column.tolist()]

    # XGBoost Predictor
    raw_predictions = XGboost_model.predict(test_data)
    predictions = [round(estimate) for estimate in raw_predictions]
class FeatureRegressorXGB():
    """XGBoost regressor applied to summary features of adjacent planet trios."""

    def __init__(self, modelfile='featureregressor_xgb.bin'):
        """Load ``modelfile`` from the ``models`` directory next to this file."""
        package_dir = os.path.dirname(__file__)
        self.model = XGBRegressor()
        self.model.load_model(package_dir + '/models/' + modelfile)

    def check_errors(self, sim):
        """Raise AttributeError when ``sim.N_real`` is below 4."""
        if sim.N_real >= 4:
            return
        raise AttributeError(
            "SPOCK Error: SPOCK only applicable to systems with 3 or more planets"
        )

    def predict(self, sim):
        """
        Predict instability time (log10(T)) of passed simulation

        Parameters:

        sim (rebound.Simulation): Orbital configuration to test

        Returns:

        float: Estimated instability log10(time); 4.0 when feature
               generation reports the system as not stable.
        """
        triofeatures, stable = self.generate_features(sim)
        if stable == False:
            return 4.0

        per_trio = self.predict_from_features(triofeatures)
        # minimum time among all trios tested
        return per_trio.min()

    def generate_features(self, sim):
        """
        Generates the set of summary features used for prediction.

        Works on a copy of ``sim``, so the passed simulation is not modified.

        Parameters:

        sim (rebound.Simulation): Orbital configuration to test

        Returns:

        List of OrderedDicts:   A list of sets of features for each adjacent
                                trio of planets in system.

        stable (int):           Flag returned by ``features``; per the
                                original notes, 1 if the N-body integration
                                survived the 10^4 orbits and 0 if it went
                                unstable or an orbit is hyperbolic.
        """
        sim = sim.copy()
        init_sim_parameters(sim)
        self.check_errors(sim)

        # list of adjacent trios: [1, 2, 3], [2, 3, 4], ...
        trios = [[first, first + 1, first + 2]
                 for first in range(1, sim.N_real - 2)]
        featureargs = [10000, 80, trios]
        # original note: sim.dt = nan in init_sim_parameters
        triofeatures, stable = features(sim, featureargs)

        return triofeatures, stable

    def predict_from_features(self, triofeatures):
        """
        Run the model on the list of features created by generate_features.

        Parameters:

        triofeatures (List of Ordered Dicts):   Sets of features for each
                                                adjacent planet trio
                                                (returned from generate_features)

        Returns:

        The model's predictions, one per set of features passed (``predict``
        treats them as log10 instability times).
        """
        # xgboost model expects a 2D array of shape (Npred, Nfeatures) where
        # Npred is number of samples to predict, Nfeatures is # of features
        # per sample. The column order below is the order the model is fed.
        expected_features = [
            "EMcrossnear", "MMRstrengthnear", "MMRstrengthfar", "EPstdnear",
            "EMfracstdfar", "EMfracstdnear", "EMcrossfar", "EPstdfar",
            "MEGNOstd", "MEGNO",
        ]
        rows = [[trio[feature_name] for feature_name in expected_features]
                for trio in triofeatures]
        featurevals = np.array(rows)
        return self.model.predict(featurevals)
class RaceStrategyModel(object):
    """Three XGBoost lap-time regressors (regular, pit and safety laps)."""

    def __init__(self, year: int, verbose=False, n_cores=1):
        """Create the untrained models.

        :param year: season between 2014 and 2019; stored as the matching
            ``"year_N"`` dummy-column name
        :param verbose: print progress messages
        :param n_cores: ``n_jobs`` passed to every XGBRegressor
        :raises ValueError: if ``year`` is outside 2014-2019
        """
        print("XGB using {} threads".format(n_cores))
        self.regular_model = XGBRegressor(n_jobs=n_cores)
        self.pit_model = XGBRegressor(n_jobs=n_cores)
        self.safety_model = XGBRegressor(n_jobs=n_cores)
        self.test_race = None
        self.scaler = None
        self.test_race_pit_model = None
        self.dummy_columns = None
        self.n_cores = n_cores
        # self.start_lap = start_lap

        year_columns = {
            2014: "year_1",
            2015: "year_2",
            2016: "year_3",
            2017: "year_4",
            2018: "year_5",
            2019: "year_6",
        }
        if year not in year_columns:
            raise ValueError("No race available for year " + str(year))
        self.year = year_columns[year]
        self.verbose = verbose

    def split_train_test(self, df: pd.DataFrame, split_fraction: float):
        """ Split the dataset randomly but keeping whole races together

        :param df: full dataset
        :param split_fraction: share of this year's races moved to the test
            set; 0 leaves exactly one race out
        :return: ``(train_df, test_df)``
        """
        test_data = pd.DataFrame(columns=df.columns)

        races = df[df[self.year] == 1]['raceId'].unique()

        if split_fraction != 0:
            split_size = int(round(split_fraction * len(races)))
        else:
            # Leave only one race out from the training
            split_size = 1

        test_races = np.random.choice(races, size=split_size)
        held_out = [test_data]
        for race in test_races:
            held_out.append(df.loc[df['raceId'] == race])
            df = df[df.raceId != race]
        # DataFrame.append was removed in pandas 2.0; concat is the supported
        # equivalent.
        test_data = pd.concat(held_out)

        return df, test_data

    def normalize_dataset(self, df):
        """ Normalize integer-valued columns of the dataset

        The first call (no scaler yet) only fits the scaler and returns the
        data unscaled; later calls apply the fitted scaler.
        """
        data = df.copy()
        # print(df.columns)

        # Remove columns not to be normalized
        zero_one = [
            'battle', 'drs', "circuitId_1", "circuitId_2", "circuitId_3",
            "circuitId_4", "circuitId_6", "circuitId_7", "circuitId_9",
            "circuitId_10", "circuitId_11", "circuitId_13", "circuitId_14",
            "circuitId_15", "circuitId_17", "circuitId_18", "circuitId_22",
            "circuitId_24", "circuitId_32", "circuitId_34", "circuitId_69",
            "circuitId_70", "circuitId_71", "circuitId_73", "tyre_1",
            "tyre_2", "tyre_3", "tyre_4", "tyre_5", "tyre_6", "year_1",
            "year_2", "year_3", "year_4", "year_5", "year_6", "nextLap",
            'pit', 'safety', "unnorm_lap"
        ]
        #'milliseconds',
        #'cumulative', 'unnorm_lap']

        temp_df = data[zero_one].copy()
        data.drop(zero_one, axis=1, inplace=True)

        # if self.columns is not None and len(data.columns) != len(self.columns):
        #     print(set(data.columns).difference(set(self.columns)))
        #     exit(-1)

        if not self.scaler:
            self.scaler = MinMaxScaler(feature_range=(-1, 1))
            self.scaler.fit(data)
            scaled = data
        else:
            scaled = self.scaler.transform(data)
        data.loc[:, :] = scaled
        data = data.join(temp_df)

        del temp_df
        return data

    def __process_dataset(self, dataset):
        """ Pre-process the dataset to obtain training data and its labels"""

        # Discard wet and suspended races
        old_races = len(dataset['raceId'].unique())
        dataset = discard_wet(dataset)
        dataset = discard_suspended_races(dataset)
        new_races = len(dataset['raceId'].unique())
        if self.verbose:
            print(
                "{} wet and suspended races were discarded".format(old_races -
                                                                   new_races))

        # Eliminate the last lap from the training data, as it has 0 target
        dataset = dataset[dataset['nextLap'] > 0]

        # Express the next lap target as a delta to the pole lap
        dataset['nextLap'] = (dataset['nextLap'] - dataset['pole'])

        # Duplicate columns to use them after normalization
        dataset['base'] = dataset['pole'].astype(int)
        dataset['true'] = dataset['milliseconds'].astype(int)
        dataset['true_cumulative'] = dataset['cumulative'].astype(int)

        # Normalize the dataset, but normalize the lap time and cumulative
        # time individually, in order to be able to normalize them at runtime

        # Remove the duplicated unnormalized columns from the train data
        dataset = dataset.drop(columns=['base', 'true', 'true_cumulative'])
        # First call: fits the scaler, so the test race below stays unscaled.
        dataset = self.normalize_dataset(dataset)

        _, self.test_race = self.split_train_test(dataset, split_fraction=0)
        self.__compute_pitstop_model(dataset)

        self.dummy_columns = dataset.columns

        # Second call: applies the scaler fitted above.
        train_data = self.normalize_dataset(dataset)

        # train_data = train_data[train_data['unnorm_lap'] > self.start_lap]  # Take laps after a threshold

        # Remove columns used only to identify the laps in testing
        train_data = train_data.drop(
            columns=['unnorm_lap', "raceId", "driverId", "race_length"])

        # Split the dataset into three separate datasets, one per each model
        # to be trained
        train_pit = deepcopy(train_data.loc[train_data['pit'] != 0])
        train_safety = deepcopy(train_data.loc[(train_data['safety'] != 0)
                                               & (train_data['pit'] == 0)])
        train_regular = deepcopy(train_data.loc[(train_data['pit'] == 0)
                                                & (train_data['safety'] == 0)])

        # Remove features related to pit and safety in the "regular" laps model
        train_regular = train_regular.drop(
            columns=['safety', 'pit', 'pit-cost', 'pitstop-milliseconds'])

        # Extract the target labels
        labels_pit = train_pit.pop('nextLap')
        labels_safety = train_safety.pop('nextLap')
        labels_regular = train_regular.pop('nextLap')

        train_data = {
            'regular': train_regular,
            'safety': train_safety,
            'pit': train_pit
        }
        labels = {
            'regular': labels_regular,
            'safety': labels_safety,
            'pit': labels_pit
        }

        return train_data, labels

    def __compute_pitstop_model(self, full_dataset: pd.DataFrame):
        """Compute a normal distribution's parameters for the pit-stop times

        Uses the pit stops recorded on the test race's circuit; stores
        ``(mean, std)`` pairs for regular and safety-car stops, ``(0, 0)``
        when no stop of that kind exists.
        """
        circuit = get_current_circuit(self.test_race)

        stop_laps = full_dataset[(full_dataset['pitstop-milliseconds'] > 0) & (
            full_dataset[circuit] == 1)].sort_values('lap')

        pit_times = stop_laps[stop_laps['safety'] ==
                              0]['pitstop-milliseconds'].values
        pit_safety_times = stop_laps[
            stop_laps['safety'] > 0]['pitstop-milliseconds'].values

        safety_mean = np.mean(
            pit_safety_times) if len(pit_safety_times) > 0 else 0
        safety_std = np.std(
            pit_safety_times) if len(pit_safety_times) > 0 else 0

        mean = np.mean(pit_times) if len(pit_times) > 0 else 0
        std = np.std(pit_times) if len(pit_times) > 0 else 0

        self.test_race_pit_model = {
            'regular': (mean, std),
            'safety': (safety_mean, safety_std)
        }

    def train(self):
        """ Train the regression models """
        if self.verbose:
            print('Training models...')
        # Force normalize_dataset to refit the scaler on the fresh dataset.
        self.scaler = None
        if self.verbose:
            print("Model uses {} cores".format(self.n_cores))
        # self.regular_model = XGBRegressor(n_jobs=self.n_cores)
        # self.pit_model = XGBRegressor(n_jobs=self.n_cores)
        # self.safety_model = XGBRegressor(n_jobs=self.n_cores)

        dataset = load_dataset()
        datasets, labels = self.__process_dataset(dataset)

        self.regular_model.fit(datasets['regular'], labels['regular'])
        self.pit_model.fit(datasets['pit'], labels['pit'])
        self.safety_model.fit(datasets['safety'], labels['safety'])

        if self.verbose:
            print('Done!\n')

    def resplit(self):
        """Pick a new test race and rebuild the per-lap lookup table."""
        # TODO fix the invalidation of scaler to avoid the normalization of test races
        self.scaler = None
        dataset = load_dataset()
        self.__process_dataset(dataset)
        self._test_race = fix_data_types(self.test_race)
        self.laps_database = defaultdict(lambda: None)
        self.race_id = self.test_race["raceId"].values[0]

        # Index every lap row by (driverId, lap).
        for i in range(self.test_race["lap"].count()):
            row = self.test_race.iloc[[i]]
            self.laps_database[(row["driverId"].values[0],
                                row["lap"].values[0])] = row

    def load(self):
        """ Restore prediction models and scaler from previously saved files

        Exits the process with status -1 when a required file is missing.
        """
        if self.verbose:
            print("Loading prediction models from pickled files...")

        saved_models = (
            ('regular', self.regular_model),
            ('safety', self.safety_model),
            ('pit', self.pit_model),
        )
        for name, model in saved_models:
            model_path = './envs/race_strategy_model/pickled_models/{}.model'.format(
                name)
            if not os.path.isfile(model_path):
                print("ERROR: {}.model is missing".format(name))
                exit(-1)
            model.load_model(model_path)

        scaler_path = './envs/race_strategy_model/pickled_models/scaler.pickle'
        if not os.path.isfile(scaler_path):
            print("ERROR: scaler.pickle is missing")
            exit(-1)
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load scaler files produced by save().
        with open(scaler_path, 'rb') as scaler_file:
            self.scaler = pickle.load(scaler_file)

        # if not os.path.isfile("pickled_models/test_race.pickle"):
        #     print("ERROR: test_race.pickle is missing")
        #     exit(-1)
        # else:
        #     with open('pickled_models/test_race.pickle', 'rb') as pit_file:
        #         self.pit_model = pickle.load(pit_file)
        #         pit_file.close()

        if self.verbose:
            print("Done!\n")
        # self.regular_model.set_params(**{"n_jobs": self.n_cores})
        # self.safety_model.set_params(**{"n_jobs": self.n_cores})
        # self.pit_model.set_params(**{"n_jobs": self.n_cores})
        print(self.regular_model.get_params())

    def save(self):
        """ Save the three models and pickle the scaler to avoid retraining """
        for model, name in zip(
            [self.regular_model, self.safety_model, self.pit_model],
            ['regular', 'safety', 'pit']):
            model.save_model(
                './envs/race_strategy_model/pickled_models/{}.model'.format(
                    name))

        with open('./envs/race_strategy_model/pickled_models/scaler.pickle',
                  'wb') as savefile:
            pickle.dump(self.scaler, savefile)
        #self.test_race.to_csv(".envs/race_strategy_model/dataset/test_race.csv")

    def predict(self, state, lap_type):
        """Predict the next lap with the model matching ``lap_type``.

        :param state: DataFrame of lap features; it is not modified
        :param lap_type: ``'regular'``, ``'pit'``; any other value uses the
            safety model
        :return: the selected model's prediction
        """
        if lap_type == 'regular':
            # The regular model is trained without these columns. drop()
            # returns a new frame, so the result must be kept.
            state = state.drop(
                columns=['safety', 'pit', 'pit-cost', 'pitstop-milliseconds'])
            return self.regular_model.predict(state)
        elif lap_type == 'pit':
            # Pit laps use the pit model (this used to call the regular one).
            return self.pit_model.predict(state)
        else:
            return self.safety_model.predict(state)

    def get_prediction_model(self, state: str):
        """Return the model for ``'regular'``, ``'safety'`` or ``'pit'``.

        :raises ValueError: for any other ``state``
        """
        if state == 'regular':
            return self.regular_model
        if state == 'safety':
            return self.safety_model
        if state == 'pit':
            return self.pit_model
        else:
            raise ValueError(
                "The specified state is not valid, allowed model states are 'regular', 'safety' and 'pit'"
            )
import json
import os

import numpy as np
from xgboost import XGBRegressor

from utils import (
    add_handler,
    init_logger,
)

ROOT = os.path.abspath(os.path.dirname(__file__))
PATH = 'xgb_trained.bin'

# Load the trained model once at import time so every handler call reuses it.
xgb_trained = XGBRegressor()
xgb_trained.load_model(os.path.join(ROOT, PATH))


def handler(event, context):
    """Parse the JSON request body and log the received input.

    Args:
        event: mapping whose 'body' entry is a JSON string containing an
            'input_X' key.
        context: unused here.

    NOTE(review): as visible here the handler stops after logging; it does
    not yet use ``input_X`` or ``xgb_trained`` and returns ``None``. Confirm
    whether the remainder of the function is missing.
    """
    # Initialize Logger
    log = init_logger()
    log = add_handler(log)

    input_data = json.loads(event['body'])
    log.info(f"Input data: {input_data}")

    # Retrieve inputs
    input_X = input_data['input_X']

    # Process input data
    log.info("INFO -- Processing input data")
def main():
    """Train or load the example model, print diagnostics, write a submission.

    Reads ``numerai_training_data.csv`` and ``numerai_tournament_data.csv``,
    fits an ``XGBRegressor`` (or loads it from ``MODEL_FILE`` when that file
    exists), prints training and validation metrics, and writes the
    tournament predictions to ``submission.csv``.
    """
    print("Loading data...")
    # The training data is used to train your model how to predict the targets.
    training_data = read_csv("numerai_training_data.csv")
    # The tournament data is the data that Numerai uses to evaluate your model.
    tournament_data = read_csv("numerai_tournament_data.csv")

    feature_names = [
        f for f in training_data.columns if f.startswith("feature")
    ]
    print(f"Loaded {len(feature_names)} features")

    # This is the model that generates the included example predictions file.
    # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make this run faster.
    # Remember to delete example_model.xgb if you change any of the parameters below.
    model = XGBRegressor(max_depth=5,
                         learning_rate=0.01,
                         n_estimators=2000,
                         n_jobs=-1,
                         colsample_bytree=0.1)
    if MODEL_FILE.is_file():
        print("Loading pre-trained model...")
        model.load_model(MODEL_FILE)
    else:
        print("Training model...")
        model.fit(training_data[feature_names], training_data[TARGET_NAME])
        model.save_model(MODEL_FILE)

    # Generate predictions on both training and tournament data
    print("Generating predictions...")
    training_data[PREDICTION_NAME] = model.predict(
        training_data[feature_names])
    tournament_data[PREDICTION_NAME] = model.predict(
        tournament_data[feature_names])

    # Check the per-era correlations on the training set (in sample)
    train_correlations = training_data.groupby("era").apply(score)
    print(
        f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}"
    )
    print(
        f"On training the average per-era payout is {payout(train_correlations).mean()}"
    )

    # --- Validation metrics ---
    # Check the per-era correlations on the validation set (out of sample).
    # Copy the slice: a column is added to it further down, which must not
    # be attempted on a view of tournament_data.
    validation_data = tournament_data[tournament_data.data_type ==
                                      "validation"].copy()
    validation_correlations = validation_data.groupby("era").apply(score)
    print(
        f"On validation the correlation has mean {validation_correlations.mean()} and "
        f"std {validation_correlations.std(ddof=0)}")
    print(
        f"On validation the average per-era payout is {payout(validation_correlations).mean()}"
    )

    # Check the "sharpe" ratio on the validation set
    validation_sharpe = validation_correlations.mean(
    ) / validation_correlations.std(ddof=0)
    print(f"Validation Sharpe: {validation_sharpe}")

    print("checking max drawdown...")
    # Compound the per-era correlations once and reuse the series.
    daily_value = (validation_correlations + 1).cumprod()
    rolling_max = daily_value.rolling(window=100, min_periods=1).max()
    max_drawdown = -(rolling_max - daily_value).max()
    print(f"max drawdown: {max_drawdown}")

    # Check the feature exposure of your validation predictions:
    # per era, the largest absolute feature/prediction correlation.
    max_per_era = validation_data.groupby("era").apply(
        lambda d: d[feature_names].corrwith(d[PREDICTION_NAME]).abs().max())
    max_feature_exposure = max_per_era.mean()
    print(f"Max Feature Exposure: {max_feature_exposure}")

    # Check feature neutral mean
    print("Calculating feature neutral mean...")
    feature_neutral_mean = get_feature_neutral_mean(validation_data)
    print(f"Feature Neutral Mean is {feature_neutral_mean}")

    # Load example preds to get MMC metrics
    example_preds = pd.read_csv("example_predictions.csv").set_index(
        "id")["prediction"]
    validation_example_preds = example_preds.loc[validation_data.index]
    validation_data["ExamplePreds"] = validation_example_preds

    print("calculating MMC stats...")
    # MMC over validation
    mmc_scores = []
    corr_scores = []
    for _, x in validation_data.groupby("era"):
        series = neutralize_series(pd.Series(unif(x[PREDICTION_NAME])),
                                   pd.Series(unif(x["ExamplePreds"])))
        mmc_scores.append(np.cov(series, x[TARGET_NAME])[0, 1] / (0.29**2))
        corr_scores.append(
            correlation(unif(x[PREDICTION_NAME]), x[TARGET_NAME]))

    val_mmc_mean = np.mean(mmc_scores)
    corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)]
    corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs)
    corr_plus_mmc_sharpe_diff = corr_plus_mmc_sharpe - validation_sharpe

    print(f"MMC Mean: {val_mmc_mean}\n"
          f"Corr Plus MMC Sharpe:{corr_plus_mmc_sharpe}\n"
          f"Corr Plus MMC Diff:{corr_plus_mmc_sharpe_diff}")

    # Check correlation with example predictions
    full_df = pd.concat([
        validation_example_preds, validation_data[PREDICTION_NAME],
        validation_data["era"]
    ],
                        axis=1)
    full_df.columns = ["example_preds", "prediction", "era"]
    per_era_corrs = full_df.groupby('era').apply(
        lambda d: correlation(unif(d["prediction"]), unif(d["example_preds"])))
    corr_with_example_preds = per_era_corrs.mean()
    print(f"Corr with example preds: {corr_with_example_preds}")

    # Save predictions as a CSV and upload to https://numer.ai
    tournament_data[PREDICTION_NAME].to_csv("submission.csv", header=True)
### Method 1 ###
# Serialization provided by Python itself
import pickle

# Context managers make sure the file handles are closed again.
with open('../data/xgb_save/m39.pickle.dat', 'wb') as pickle_out:  # dump == save, write binary
    pickle.dump(model, pickle_out)
print('pickle 저장 완료')

# NOTE: only unpickle files from a trusted source.
with open('../data/xgb_save/m39.pickle.dat', 'rb') as pickle_in:
    model_pic = pickle.load(pickle_in)
print('pickle 불러오기 완료')
r2_pic = model_pic.score(x_test, y_test)
print('r2 pickle :', r2_pic)

### Method 2 ###
import joblib

# Unlike pickle, joblib takes the path directly; no open() is needed.
joblib.dump(model, '../data/xgb_save/m40.joblib.dat')
print('joblib 저장하기 완료')

model_job = joblib.load('../data/xgb_save/m40.joblib.dat')
print('joblib 불러오기 완료')
r2_job = model_job.score(x_test, y_test)
print('r2 joblib :', r2_job)

### Method 3 ###
# XGBoost's own save/load
model.save_model("../data/xgb_save/m41.xgb.model")
print('xgb model 저장하기 완료')

model_xgb = XGBRegressor()
model_xgb.load_model('../data/xgb_save/m41.xgb.model')
print('xgb model 불러오기 완료')
r2_xgb = model_xgb.score(x_test, y_test)
print('r2 xgb model : ', r2_xgb)