Beispiel #1
0
def make_prediction_and_save(X_test, linear_model_file, lgbm_model_file,
                             y_predicted_file):
    """Predict with the pickled linear and lgbm models and export the result as CSV

    Both models are loaded from their pickle files, handed to
    make_prediction() together with the input data, and the resulting
    dataframe is written to a comma-separated, UTF-8 encoded file.

    Parameters
    ----------
    X_test : Pandas dataframe
        Input testing data.

    linear_model_file : string
        Path to a linear model pickle file

    lgbm_model_file : string
        Path to a multioutput gradient boosting model pickle file

    y_predicted_file : string
        CSV data file where to save the predictions

    """
    if __debug__:
        print("In make_prediction_and_save()")

    # Arguments are evaluated left to right: linear model first, then lgbm.
    predictions = make_prediction(
        X_test,
        load_linear_model(linear_model_file),
        load_lgbm_model(lgbm_model_file),
    )

    print(strftime('%H:%M:%S'), "- Saving the prediction under",
          y_predicted_file)
    export_started_at = time()
    predictions.to_csv(y_predicted_file, sep=',', encoding='utf-8')
    elapsed = utils.time_me(time() - export_started_at)
    print("Prediction saved in", elapsed, "\n")
Beispiel #2
0
def load_study_and_return_best_params(optuna_study_name, optuna_storage):
    """Load an Optuna study (https://optuna.readthedocs.io) and return its best parameters

    The parameters of the study's best trial are wrapped in an
    optuna.trial.FixedTrial and passed through utils.sample_params().

    Parameters
    ----------
    optuna_study_name :
        Study's name. Each study has a unique name as an identifier.

    optuna_storage :
        Database URL such as sqlite:///example.db.

    Returns
    -------
    params : object
        Value returned by utils.sample_params(), such as
        {
            "data": {
                "center_decay": 0.14281578186170577,
                "use_cyclical": True,
                "vehicle_decay": 0.17590059703294494,
            },
            "model": {
                "boosting_type": "gbdt",
                "colsample_bytree": 0.5279207022532362,
                "learning_rate": 0.012081577123096265,
                "min_child_samples": 45,
                "min_child_weight": 0.007084184412851127,
                "n_estimators": 568,
                "num_leaves": 483,
                "reg_alpha": 0.10389662610302736,
                "reg_lambda": 0.026121337399318097,
                "subsample": 0.9076986626277991,
                "subsample_freq": 0,
            },
        }
    """
    if __debug__:
        print("In load_study_and_return_best_params()")
    load_started_at = time()

    # The study is only loaded here; nothing in this function creates one.
    loaded_study = optuna.load_study(storage=optuna_storage,
                                     study_name=optuna_study_name)
    elapsed = utils.time_me(time() - load_started_at)
    print("Optuna study loaded in", elapsed, "\n")

    # Replay the best trial's values through the regular sampling routine
    best_values = loaded_study.best_trial.params
    replayed_trial = optuna.trial.FixedTrial(best_values)
    return utils.sample_params(replayed_trial)
Beispiel #3
0
def load_linear_model(linear_model_file):
    """Load a linear model from the given pickle file

    Parameters
    ----------
    linear_model_file : string
        Path to a pickle file

    Returns
    -------
    linear_model : object
        The unpickled object, expected to be a trained linear model
        (e.g. sklearn.linear_model.LinearRegression)
    """
    if __debug__:
        print("In load_linear_model()")
    print(strftime('%H:%M:%S'), "- Load the linear model")
    start_time = time()

    # The context manager closes the file handle even if unpickling fails.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(linear_model_file, 'rb') as model_file:
        linear_model = pickle.load(model_file)

    print("Linar model loaded in", utils.time_me(time() - start_time), "\n")

    return linear_model
Beispiel #4
0
def load_lgbm_model(lgbm_model_file):
    """Load a multioutput gradient boosting model from the given pickle file

    Parameters
    ----------
    lgbm_model_file : string
        Path to a multioutput gradient boosting model pickle file

    Returns
    -------
    lgbm_model : object
        The unpickled object, expected to be a multioutput gradient
        boosting model.
    """
    if __debug__:
        print("In load_lgbm_model()")
    print(strftime('%H:%M:%S'), "- Load the lgbm model")
    start_time = time()

    # The context manager closes the file handle even if unpickling fails.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(lgbm_model_file, 'rb') as model_file:
        lgbm_model = pickle.load(model_file)

    print("Lgbm model loaded in", utils.time_me(time() - start_time), "\n")

    return lgbm_model
def run_optuna_study_and_return_best_params(X_train, y_train):
    """Optimize hyperparameters with Optuna and return the best ones found

    The study is obtained through optuna.create_study() using the
    OPTUNA_* module constants, optimized against objective(), and the
    best trial's values are passed through sample_params().

    Parameters
    ----------
    X_train : Pandas dataframe
        Input training data.

    y_train : Pandas dataframe
        Output training data.

    Returns
    -------
    params : object
        Best hyperparameters from the Optuna study

    """
    if __debug__:
        print("In run_optuna_study_and_return_best_params()")

    def _objective_for_trial(trial):
        # Bind the training data so Optuna only has to supply the trial
        return objective(trial, OPTUNA_OBJECTIVE_AGGREGATION_FUNCTION,
                         X_train, y_train)

    # create_study() is asked to reuse an existing study according to
    # OPTUNA_LOAD_STUDY_IF_EXIST
    optuna_study = optuna.create_study(
        study_name=OPTUNA_STUDY_NAME,
        storage=OPTUNA_STORAGE,
        direction=OPTUNA_OPTIMIZATION_DIRECTION,
        load_if_exists=OPTUNA_LOAD_STUDY_IF_EXIST,
    )
    print("")

    print(strftime('%H:%M:%S'),
          "- Start an Optuna hyperparameter optimization")
    optimization_started_at = time()
    optuna_study.optimize(_objective_for_trial,
                          n_trials=OPTUNA_NUMBER_OF_TRIALS,
                          n_jobs=OPTUNA_NUMBER_OF_PARALLEL_JOBS)
    elapsed = utils.time_me(time() - optimization_started_at)
    print("Optuna hyperparameter optimization done in", elapsed, "\n")

    best_values = optuna_study.best_trial.params
    replayed_trial = optuna.trial.FixedTrial(best_values)
    return sample_params(replayed_trial)
def train_and_save_model(X_train,
                         y_train,
                         linear_model_file,
                         lgbm_model_file,
                         run_optuna_study=False):
    """Train and save linear and gradient boosting models

    The linear model is trained only when X_train has a
    'routing engine estimated duration' column. Both feature sets are then
    computed, the IGNORED columns are dropped, and the gradient boosting
    model is trained with or without cross validation depending on
    WITH_CROSS_VALIDATION.

    Parameters
    ----------
    X_train : Pandas dataframe
        Input training data.

    y_train : Pandas dataframe
        Output training data.

    linear_model_file : string
        Path to a linear model pickle file

    lgbm_model_file : string
        Path to a multioutput gradient boosting model pickle file

    run_optuna_study : bool
        False (default) : To use the hyperparameters from PARAMS
        True : To run an Optuna study to use the best hyperparameters found from it

    Returns
    -------
    params : object
        Used hyperparameters

    """
    if __debug__:
        print("In train_and_save_model()")

    # Train the linear model only when its single input column is available
    linear_model_trained = ('routing engine estimated duration'
                            in X_train.columns)
    if linear_model_trained:
        print(strftime('%H:%M:%S'), "- Start the linear model training")
        start_time = time()
        linear_model = train_and_save_linear_model(
            X_train[["routing engine estimated duration"]],
            y_train["delta selection-presentation"], linear_model_file)
        print("Linear model trained and saved in",
              utils.time_me(time() - start_time), "\n")
    elif __debug__:
        print("Do not train linear model")

    print(strftime('%H:%M:%S'),
          "- Start the computation for a first features set")
    start_time = time()
    if linear_model_trained:
        X_train = features.compute_feature_set_one(X_train, linear_model)
    else:
        X_train = features.compute_feature_set_one(X_train)
    print("First features set computed in", utils.time_me(time() - start_time),
          "\n")

    if run_optuna_study:
        params = run_optuna_study_and_return_best_params(X_train, y_train)
    else:
        params = PARAMS

    print(strftime('%H:%M:%S'),
          "- Start the computation of another feature set")
    start_time = time()
    X_train = features.compute_feature_set_two(X_train, ID, **params["data"])
    print("Features computed in", utils.time_me(time() - start_time), "\n")

    print("Drop useless parameters\n")
    X_train.drop(IGNORED, axis=1, inplace=True, errors='ignore')

    if WITH_CROSS_VALIDATION:
        print(strftime('%H:%M:%S'),
              "- Start the training with cross validation")
        # Restart the timer so the reported duration covers the training
        # only, not the feature computation above
        start_time = time()
        cv = cross_validate(get_model(params),
                            X_train,
                            y_train,
                            cv=5,
                            scoring="r2",
                            return_estimator=True)
        print("Training with cross validation completed in",
              utils.time_me(time() - start_time), "\n")

        utils.save_object(cv, lgbm_model_file)
        print("Model with cross validation saved in", lgbm_model_file, "\n")

    else:
        # The standalone model is only needed on this branch
        model = get_model(params)
        print(strftime('%H:%M:%S'),
              "- Start the training without cross validation")
        # Restart the timer so the reported duration covers the training
        # only, not the feature computation above
        start_time = time()
        model.fit(X_train, y_train)
        print("Training without cross validation completed in",
              utils.time_me(time() - start_time), "\n")

        # Close the file deterministically so the pickle is fully flushed
        with open(lgbm_model_file, 'wb') as model_file:
            pickle.dump(model, model_file)
        print("Model without cross validation saved in", lgbm_model_file, "\n")

    return params
    print("*** EQUIVALENT LAUNCHED COMMAND ***")
    print(sys.executable, sys.argv[0], x_train_file, y_train_file,
          linear_model_file, lgbm_model_file, "\n")

    print("*** INPUT FILES ***")
    print("- Input training data file:", x_train_file)
    print("- Output training data file:", y_train_file)
    print("- Gradient boosting model will be saved into:", lgbm_model_file)
    print("")

    print("*** START DATA PROCESSING ***")
    print(strftime('%H:%M:%S'), "- Start loading training data")
    start_time = time()
    X_train = pd.read_csv(x_train_file, index_col=ID)
    y_train = pd.read_csv(y_train_file, index_col=ID)
    print("Training data loaded in", utils.time_me(time() - start_time), "\n")

    if DEV == False:
        print("The full dataset will be used for training purpose.")
    else:
        print("Only", NUMBER_OF_ROWS_USED_FOR_DEV,
              "rows will be used for training purpose.")
        X_train = X_train.head(NUMBER_OF_ROWS_USED_FOR_DEV)
        y_train = y_train.loc[X_train.index, :]

    train_and_save_model(X_train, y_train, linear_model_file, lgbm_model_file,
                         RUN_OPTUNA_STUDY)

    print("*** SCRIPT COMPLETED IN", utils.time_me(time() - script_start_time),
          "***")
Beispiel #8
0
	# ML Flow log param to record
	mlflow.log_param("DEV", DEV)
	mlflow.log_param("Amount of data used for testing", y_test.shape[0])
	mlflow.log_param("RUN_OPTUNA_STUDY", RUN_OPTUNA_STUDY)
	mlflow.log_param("PARAMS_FROM_BEST_OPTUNA_STUDY_IN_DB", PARAMS_FROM_BEST_OPTUNA_STUDY_IN_DB)
	mlflow.log_param("WITH_CROSS_VALIDATION", WITH_CROSS_VALIDATION)
	mlflow.log_param("LINEAR_MODEL_FILE", LINEAR_MODEL_FILE)
	if WITH_CROSS_VALIDATION == True:
		mlflow.log_param("LGBM_MODEL_FILE", LGBM_MODEL_WITH_CV_FILE)
	else:
		mlflow.log_param("LGBM_MODEL_FILE", LGBM_MODEL_WITHOUT_CV_FILE)

	mlflow.log_param("X_TRAIN_PREPROCESSED_FILE", X_TRAIN_PREPROCESSED_FILE)
	mlflow.log_param("X_TEST_PREPROCESSED_FILE", X_TEST_PREPROCESSED_FILE)
	mlflow.log_param("Y_TRAIN_FILE", Y_TRAIN_FILE)
	mlflow.log_param("Y_TEST_FILE", Y_TEST_FILE)
	mlflow.log_param("Y_PREDICTED_WITHOUT_CV_FILE", Y_PREDICTED_WITHOUT_CV_FILE)
	mlflow.log_param("Y_PREDICTED_WITH_CV_FILE", Y_PREDICTED_WITH_CV_FILE)

	# ML Flow log metric to record
	mlflow.log_metric("Selection-departure R2 score", selection_departure_r2_score)
	mlflow.log_metric("Departure-presentation R2 score", departure_presentation_r2_score)
	mlflow.log_metric("Selection-presentation R2 score", selection_presentation_r2_score)
	mlflow.log_metric("Mean R2 scores", mean_r2_scores)
	mlflow.log_metric("Root mean squared logarithmic error", rmsle)
	mlflow.log_metric("Root mean squared error", root_mean_squared_error)
	mlflow.log_metric("Median error in seconds", median_error)
	mlflow.log_metric("Mean error in seconds", mean_error)

	print("*** SCRIPT COMPLETED IN", utils.time_me(time() - script_start_time), "***")
    print("*** START DATA PROCESSING ***")

    print(strftime('%H:%M:%S'), "- Start loading raw data")
    start_time = time()
    X = pd.read_csv(x_train_file, index_col=ID, parse_dates=["selection time"])
    additional_data = pd.read_csv(x_train_additional_file, index_col=ID)
    X = X.merge(additional_data, how="left", on=ID)
    X_train_index = X.index
    X_test = pd.read_csv(x_test_file,
                         index_col=ID,
                         parse_dates=["selection time"])
    additional_data_test = pd.read_csv(x_test_additional_file, index_col=ID)
    X_test = X_test.merge(additional_data_test, how="left", on=ID)
    X_test_index = X_test.index
    X = pd.concat((X, X_test), sort=False)
    print("Raw data loaded in", utils.time_me(time() - start_time), "\n")

    print(strftime('%H:%M:%S'), "- Start data extraction")
    start_time = time()
    X = data_extraction(X)
    print("Extraction completed in", utils.time_me(time() - start_time), "\n")

    print(len(list(X)), "columns to export:")
    print([X.index.name] + list(X))
    print("")

    # Export training input dataset
    print(strftime('%H:%M:%S'), "- Start data export to",
          x_train_preprocessed_file)
    start_time = time()
    compression_opts = dict(method='zip',
Beispiel #10
0
def make_prediction(X_test, linear_model, lgbm_model):
    """Make a prediction for the given dataset based on the given linear and lgbm models

    Both feature sets are computed on X_test and the IGNORED columns are
    dropped before predicting. When WITH_CROSS_VALIDATION is set, the
    predictions of every estimator in lgbm_model["estimator"] are averaged;
    otherwise lgbm_model.predict() is used directly.

    Parameters
    ----------
    X_test : Pandas dataframe
        Input testing data.

    linear_model : object
        Linear model

    lgbm_model : object
        Multioutput gradient boosting model. With cross validation it is
        indexed with the "estimator" and "test_score" keys.

    Returns
    -------
    df : Pandas dataframe
        Predictions indexed like the feature-engineered X_test, with the
        columns "delta selection-departure",
        "delta departure-presentation" and "delta selection-presentation".

    """
    if __debug__:
        print("In make_prediction()")

    print(strftime('%H:%M:%S'),
          "- Start the computation for a first features set")
    start_time = time()
    X_test = features.compute_feature_set_one(X_test, linear_model)
    print("First features set computed in", utils.time_me(time() - start_time),
          "\n")

    if PARAMS_FROM_BEST_OPTUNA_STUDY_IN_DB:
        params = load_study_and_return_best_params(OPTUNA_STUDY_NAME,
                                                   OPTUNA_STORAGE)
    else:
        params = PARAMS

    print(strftime('%H:%M:%S'),
          "- Start the computation of another feature set")
    start_time = time()
    X_test = features.compute_feature_set_two(X_test, ID, **params["data"])
    print("Features computed in", utils.time_me(time() - start_time), "\n")

    print("Drop useless parameters")
    X_test.drop(IGNORED, axis=1, inplace=True, errors='ignore')
    print(len(list(X_test)), "parameters left: ", list(X_test))

    print("Load the model\n")

    if WITH_CROSS_VALIDATION:
        print(strftime('%H:%M:%S'),
              "- Start computing the predictions with cross validation")
        start_time = time()
        fold_predictions = [
            fold_estimator.predict(X_test)
            for fold_estimator in lgbm_model["estimator"]
        ]
        # Report the number of predicted rows; the first axis of the stacked
        # fold predictions would be the number of estimators instead
        print("Predictions computed in", utils.time_me(time() - start_time),
              "for",
              X_test.shape[0], "entries\n")

        print("Cross validation test scores:")
        for item in lgbm_model["test_score"]:
            print(item)
        print("")

        # Average the per-estimator predictions element-wise
        y = np.mean(fold_predictions, axis=0)

    else:
        print(strftime('%H:%M:%S'), "- Start computing the predictions")
        start_time = time()
        y = lgbm_model.predict(X_test)
        print("Predictions with no cross validation computed in",
              utils.time_me(time() - start_time), "for",
              np.shape(y)[0], "entries\n")

    # Both branches produce the same output layout
    return pd.DataFrame(data=y,
                        index=X_test.index,
                        columns=[
                            "delta selection-departure",
                            "delta departure-presentation",
                            "delta selection-presentation"
                        ])