import pickle
import sys
from time import time, strftime

import mlflow
import numpy as np
import optuna
import pandas as pd
from sklearn.model_selection import cross_validate

import features
import utils
# Project-level constants (ID, IGNORED, PARAMS, DEV, WITH_CROSS_VALIDATION,
# OPTUNA_*, ...) are assumed to come from config.py, as referenced in the
# docstrings below. objective(), get_model(), train_and_save_linear_model()
# and data_extraction() are assumed to be defined elsewhere in the project.
from config import *


def make_prediction_and_save(X_test, linear_model_file, lgbm_model_file,
                             y_predicted_file):
    """Make a prediction for the given dataset based on the given linear and
    lgbm models and save the result to a CSV file

    Parameters
    ----------
    X_test : Pandas dataframe
        Input testing data.
    linear_model_file : string
        Path to a linear model pickle file
    lgbm_model_file : string
        Path to a multioutput gradient boosting model pickle file
    y_predicted_file : string
        CSV data file where to save the predictions
    """
    if __debug__:
        print("In make_prediction_and_save()")

    linear_model = load_linear_model(linear_model_file)
    lgbm_model = load_lgbm_model(lgbm_model_file)
    df = make_prediction(X_test, linear_model, lgbm_model)

    print(strftime('%H:%M:%S'), "- Saving the prediction under",
          y_predicted_file)
    start_time = time()
    df.to_csv(y_predicted_file, sep=',', encoding='utf-8')
    print("Prediction saved in", utils.time_me(time() - start_time), "\n")
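# Usage sketch (hypothetical file paths; assumes X_test was loaded the same
# way as in the main script below):
#
#     X_test = pd.read_csv("data/x_test_preprocessed.csv", index_col=ID)
#     make_prediction_and_save(X_test,
#                              "models/linear_model.pkl",
#                              "models/lgbm_model.pkl",
#                              "output/y_predicted.csv")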
def load_study_and_return_best_params(optuna_study_name, optuna_storage):
    """Load an existing Optuna study (https://optuna.readthedocs.io) and
    return its best hyperparameters

    Parameters
    ----------
    optuna_study_name : string
        Study's name. Each study has a unique name as an identifier.
    optuna_storage : string
        Database URL such as sqlite:///example.db. Please see also the
        documentation of create_study() for further details.

    Returns
    -------
    params : object
        Such as
        {
            "data": {
                "center_decay": 0.14281578186170577,
                "use_cyclical": True,
                "vehicle_decay": 0.17590059703294494,
            },
            "model": {
                "boosting_type": "gbdt",
                "colsample_bytree": 0.5279207022532362,
                "learning_rate": 0.012081577123096265,
                "min_child_samples": 45,
                "min_child_weight": 0.007084184412851127,
                "n_estimators": 568,
                "num_leaves": 483,
                "reg_alpha": 0.10389662610302736,
                "reg_lambda": 0.026121337399318097,
                "subsample": 0.9076986626277991,
                "subsample_freq": 0,
            },
        }
    """
    if __debug__:
        print("In load_study_and_return_best_params()")

    start_time = time()
    # Load the study (optuna.load_study() raises an error if it does not exist)
    study = optuna.load_study(study_name=optuna_study_name,
                              storage=optuna_storage)
    print("Optuna study loaded in", utils.time_me(time() - start_time), "\n")

    # Retrieve the best hyperparameters
    trial = study.best_trial
    params = utils.sample_params(optuna.trial.FixedTrial(trial.params))

    return params
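# Usage sketch, assuming a study was previously created under that name in a
# local SQLite database (URL format taken from the docstring above):
#
#     params = load_study_and_return_best_params("my_study",
#                                                "sqlite:///example.db")
#     print(params["model"]["learning_rate"])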
def load_linear_model(linear_model_file):
    """Load a linear model from the given file

    Parameters
    ----------
    linear_model_file : string
        Path to a pickle file

    Returns
    -------
    linear_model : sklearn.linear_model.LinearRegression
        Trained linear model
    """
    if __debug__:
        print("In load_linear_model()")

    print(strftime('%H:%M:%S'), "- Load the linear model")
    start_time = time()
    with open(linear_model_file, 'rb') as f:
        linear_model = pickle.load(f)
    print("Linear model loaded in", utils.time_me(time() - start_time), "\n")

    return linear_model
def load_lgbm_model(lgbm_model_file):
    """Load a multioutput gradient boosting model from the given file

    Parameters
    ----------
    lgbm_model_file : string
        Path to a multioutput gradient boosting model pickle file

    Returns
    -------
    lgbm_model : object
        Multioutput gradient boosting model.
    """
    if __debug__:
        print("In load_lgbm_model()")

    print(strftime('%H:%M:%S'), "- Load the lgbm model")
    start_time = time()
    with open(lgbm_model_file, 'rb') as f:
        lgbm_model = pickle.load(f)
    print("Lgbm model loaded in", utils.time_me(time() - start_time), "\n")

    return lgbm_model
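# For reference, a minimal sketch of the save counterpart of the two loaders
# above (the project may persist models differently, e.g. via
# utils.save_object(), which is not shown in this section):
def save_model_sketch(model, model_file):
    # Serialize any fitted model to a pickle file, closing the file handle
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)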
def run_optuna_study_and_return_best_params(X_train, y_train):
    """Run an Optuna study and return the best hyperparameters

    Parameters
    ----------
    X_train : Pandas dataframe
        Input training data.
    y_train : Pandas dataframe
        Output training data.

    Returns
    -------
    params : object
        Best hyperparameters from the Optuna study
    """
    if __debug__:
        print("In run_optuna_study_and_return_best_params()")

    # Create the study, or load it if it already exists
    study = optuna.create_study(study_name=OPTUNA_STUDY_NAME,
                                storage=OPTUNA_STORAGE,
                                direction=OPTUNA_OPTIMIZATION_DIRECTION,
                                load_if_exists=OPTUNA_LOAD_STUDY_IF_EXIST)

    print("")
    print(strftime('%H:%M:%S'),
          "- Start an Optuna hyperparameter optimization")
    start_time = time()

    # Optimize the objective function
    study.optimize(lambda trial: objective(
        trial, OPTUNA_OBJECTIVE_AGGREGATION_FUNCTION, X_train, y_train),
        n_trials=OPTUNA_NUMBER_OF_TRIALS,
        n_jobs=OPTUNA_NUMBER_OF_PARALLEL_JOBS)
    print("Optuna hyperparameter optimization done in",
          utils.time_me(time() - start_time), "\n")

    trial = study.best_trial

    return utils.sample_params(optuna.trial.FixedTrial(trial.params))
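# Neither objective() nor utils.sample_params() is shown in this section. A
# minimal sketch of what the sampling side could look like, assuming the
# search space mirrors the params structure documented in
# load_study_and_return_best_params() (the ranges are illustrative only, not
# the project's real ones):
def sample_params_sketch(trial):
    return {
        "data": {
            "center_decay": trial.suggest_float("center_decay", 0.0, 1.0),
            "use_cyclical": trial.suggest_categorical("use_cyclical",
                                                      [True, False]),
            "vehicle_decay": trial.suggest_float("vehicle_decay", 0.0, 1.0),
        },
        "model": {
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1,
                                                 log=True),
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "num_leaves": trial.suggest_int("num_leaves", 31, 512),
        },
    }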
def train_and_save_model(X_train, y_train, linear_model_file, lgbm_model_file,
                         run_optuna_study=False):
    """Train and save linear and gradient boosting models

    Parameters
    ----------
    X_train : Pandas dataframe
        Input training data.
    y_train : Pandas dataframe
        Output training data.
    linear_model_file : string
        Path to a linear model pickle file
    lgbm_model_file : string
        Path to a multioutput gradient boosting model pickle file
    run_optuna_study : bool
        False (default): use the hyperparameters defined in the config.py file
        True: run an Optuna study and use the best hyperparameters found by it

    Returns
    -------
    params : object
        Used hyperparameters
    """
    if __debug__:
        print("In train_and_save_model()")

    # Train and save the linear model, if its input feature is available
    linear_model = None
    if 'routing engine estimated duration' in X_train.columns:
        print(strftime('%H:%M:%S'), "- Start the linear model training")
        start_time = time()
        linear_model = train_and_save_linear_model(
            X_train[["routing engine estimated duration"]],
            y_train["delta selection-presentation"],
            linear_model_file)
        print("Linear model trained and saved in",
              utils.time_me(time() - start_time), "\n")
    elif __debug__:
        print("Do not train linear model")

    print(strftime('%H:%M:%S'),
          "- Start the computation for a first features set")
    start_time = time()
    if linear_model is not None:
        X_train = features.compute_feature_set_one(X_train, linear_model)
    else:
        X_train = features.compute_feature_set_one(X_train)
    print("First features set computed in",
          utils.time_me(time() - start_time), "\n")

    if run_optuna_study:
        params = run_optuna_study_and_return_best_params(X_train, y_train)
    else:
        params = PARAMS

    print(strftime('%H:%M:%S'),
          "- Start the computation of another feature set")
    start_time = time()
    X_train = features.compute_feature_set_two(X_train, ID, **params["data"])
    print("Features computed in", utils.time_me(time() - start_time), "\n")

    print("Drop useless parameters\n")
    X_train.drop(IGNORED, axis=1, inplace=True, errors='ignore')

    model = get_model(params)

    if WITH_CROSS_VALIDATION:
        print(strftime('%H:%M:%S'),
              "- Start the training with cross validation")
        start_time = time()
        cv = cross_validate(model, X_train, y_train, cv=5, scoring="r2",
                            return_estimator=True)
        print("Training with cross validation completed in",
              utils.time_me(time() - start_time), "\n")

        utils.save_object(cv, lgbm_model_file)
        print("Model with cross validation saved in", lgbm_model_file, "\n")
    else:
        print(strftime('%H:%M:%S'),
              "- Start the training without cross validation")
        start_time = time()
        model.fit(X_train, y_train)
        print("Training without cross validation completed in",
              utils.time_me(time() - start_time), "\n")

        with open(lgbm_model_file, 'wb') as f:
            pickle.dump(model, f)
        print("Model without cross validation saved in", lgbm_model_file, "\n")

    return params
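# get_model() is defined elsewhere in the project. Given that the model is
# described as a multioutput gradient boosting model with LightGBM-style
# hyperparameters, a plausible sketch (an assumption, not the project's
# actual implementation) is:
#
#     from lightgbm import LGBMRegressor
#     from sklearn.multioutput import MultiOutputRegressor
#
#     def get_model(params):
#         # Wrap one LGBMRegressor per output column of y_train
#         return MultiOutputRegressor(LGBMRegressor(**params["model"]))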
print("*** EQUIVALENT LAUNCHED COMMAND ***") print(sys.executable, sys.argv[0], x_train_file, y_train_file, linear_model_file, lgbm_model_file, "\n") print("*** INPUT FILES ***") print("- Input training data file:", x_train_file) print("- Output training data file:", y_train_file) print("- Gradient boosting model will be saved into:", lgbm_model_file) print("") print("*** START DATA PROCESSING ***") print(strftime('%H:%M:%S'), "- Start loading training data") start_time = time() X_train = pd.read_csv(x_train_file, index_col=ID) y_train = pd.read_csv(y_train_file, index_col=ID) print("Training data loaded in", utils.time_me(time() - start_time), "\n") if DEV == False: print("The full dataset will be used for training purpose.") else: print("Only", NUMBER_OF_ROWS_USED_FOR_DEV, "rows will be used for training purpose.") X_train = X_train.head(NUMBER_OF_ROWS_USED_FOR_DEV) y_train = y_train.loc[X_train.index, :] train_and_save_model(X_train, y_train, linear_model_file, lgbm_model_file, RUN_OPTUNA_STUDY) print("*** SCRIPT COMPLETED IN", utils.time_me(time() - script_start_time), "***")
# MLflow parameters to record
mlflow.log_param("DEV", DEV)
mlflow.log_param("Amount of data used for testing", y_test.shape[0])
mlflow.log_param("RUN_OPTUNA_STUDY", RUN_OPTUNA_STUDY)
mlflow.log_param("PARAMS_FROM_BEST_OPTUNA_STUDY_IN_DB",
                 PARAMS_FROM_BEST_OPTUNA_STUDY_IN_DB)
mlflow.log_param("WITH_CROSS_VALIDATION", WITH_CROSS_VALIDATION)
mlflow.log_param("LINEAR_MODEL_FILE", LINEAR_MODEL_FILE)
if WITH_CROSS_VALIDATION:
    mlflow.log_param("LGBM_MODEL_FILE", LGBM_MODEL_WITH_CV_FILE)
else:
    mlflow.log_param("LGBM_MODEL_FILE", LGBM_MODEL_WITHOUT_CV_FILE)
mlflow.log_param("X_TRAIN_PREPROCESSED_FILE", X_TRAIN_PREPROCESSED_FILE)
mlflow.log_param("X_TEST_PREPROCESSED_FILE", X_TEST_PREPROCESSED_FILE)
mlflow.log_param("Y_TRAIN_FILE", Y_TRAIN_FILE)
mlflow.log_param("Y_TEST_FILE", Y_TEST_FILE)
mlflow.log_param("Y_PREDICTED_WITHOUT_CV_FILE", Y_PREDICTED_WITHOUT_CV_FILE)
mlflow.log_param("Y_PREDICTED_WITH_CV_FILE", Y_PREDICTED_WITH_CV_FILE)

# MLflow metrics to record
mlflow.log_metric("Selection-departure R2 score",
                  selection_departure_r2_score)
mlflow.log_metric("Departure-presentation R2 score",
                  departure_presentation_r2_score)
mlflow.log_metric("Selection-presentation R2 score",
                  selection_presentation_r2_score)
mlflow.log_metric("Mean R2 scores", mean_r2_scores)
mlflow.log_metric("Root mean squared logarithmic error", rmsle)
mlflow.log_metric("Root mean squared error", root_mean_squared_error)
mlflow.log_metric("Median error in seconds", median_error)
mlflow.log_metric("Mean error in seconds", mean_error)

print("*** SCRIPT COMPLETED IN", utils.time_me(time() - script_start_time),
      "***")
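# The log_param()/log_metric() calls above must run inside an active MLflow
# run. A minimal sketch of how such a block is typically scoped (the
# experiment name is hypothetical):
#
#     mlflow.set_experiment("delivery-time-prediction")
#     with mlflow.start_run():
#         mlflow.log_param("DEV", DEV)
#         ...  # remaining log_param()/log_metric() calls from above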
print("*** START DATA PROCESSING ***") print(strftime('%H:%M:%S'), "- Start loading raw data") start_time = time() X = pd.read_csv(x_train_file, index_col=ID, parse_dates=["selection time"]) additional_data = pd.read_csv(x_train_additional_file, index_col=ID) X = X.merge(additional_data, how="left", on=ID) X_train_index = X.index X_test = pd.read_csv(x_test_file, index_col=ID, parse_dates=["selection time"]) additional_data_test = pd.read_csv(x_test_additional_file, index_col=ID) X_test = X_test.merge(additional_data_test, how="left", on=ID) X_test_index = X_test.index X = pd.concat((X, X_test), sort=False) print("Raw data loaded in", utils.time_me(time() - start_time), "\n") print(strftime('%H:%M:%S'), "- Start data extraction") start_time = time() X = data_extraction(X) print("Extraction completed in", utils.time_me(time() - start_time), "\n") print(len(list(X)), "columns to export:") print([X.index.name] + list(X)) print("") # Export training input dataset print(strftime('%H:%M:%S'), "- Start data export to", x_train_preprocessed_file) start_time = time() compression_opts = dict(method='zip',
def make_prediction(X_test, linear_model, lgbm_model):
    """Make a prediction for the given dataset based on the given linear and
    lgbm models

    Parameters
    ----------
    X_test : Pandas dataframe
        Input testing data.
    linear_model : object
        Linear model
    lgbm_model : object
        Multioutput gradient boosting model

    Returns
    -------
    df : Pandas dataframe
        Predicted "delta selection-departure", "delta departure-presentation"
        and "delta selection-presentation" values.
    """
    if __debug__:
        print("In make_prediction()")

    print(strftime('%H:%M:%S'),
          "- Start the computation for a first features set")
    start_time = time()
    X_test = features.compute_feature_set_one(X_test, linear_model)
    print("First features set computed in",
          utils.time_me(time() - start_time), "\n")

    if PARAMS_FROM_BEST_OPTUNA_STUDY_IN_DB:
        params = load_study_and_return_best_params(OPTUNA_STUDY_NAME,
                                                   OPTUNA_STORAGE)
    else:
        params = PARAMS

    print(strftime('%H:%M:%S'),
          "- Start the computation of another feature set")
    start_time = time()
    X_test = features.compute_feature_set_two(X_test, ID, **params["data"])
    print("Features computed in", utils.time_me(time() - start_time), "\n")

    print("Drop useless parameters")
    X_test.drop(IGNORED, axis=1, inplace=True, errors='ignore')
    print(len(list(X_test)), "parameters left: ", list(X_test))

    if WITH_CROSS_VALIDATION:
        print(strftime('%H:%M:%S'),
              "- Start computing the predictions with cross validation")
        start_time = time()
        # Predict with the estimator trained on each fold
        y_cv = [loaded_model.predict(X_test)
                for loaded_model in lgbm_model["estimator"]]
        print("Predictions computed in", utils.time_me(time() - start_time),
              "for", np.shape(y_cv)[1], "entries\n")

        print("Cross validation test scores:")
        for item in lgbm_model["test_score"]:
            print(item)
        print("")

        # Average the per-fold predictions
        y = np.mean(y_cv, axis=0)
    else:
        print(strftime('%H:%M:%S'), "- Start computing the predictions")
        start_time = time()
        y = lgbm_model.predict(X_test)
        print("Predictions with no cross validation computed in",
              utils.time_me(time() - start_time),
              "for", np.shape(y)[0], "entries\n")

    df = pd.DataFrame(data=y, index=X_test.index,
                      columns=["delta selection-departure",
                               "delta departure-presentation",
                               "delta selection-presentation"])

    return df