Ejemplo n.º 1
0
def log_evaluated_results(
    eval_results,
    mlflow_tracking,
    fraction,
    n_splits,
    n_repeats,
):
    """Logs the genereted plot

    Args:
        plots (list of pandas.plot): genereted plots via Pandas
        df_names (list of strings): short names of dataset used
        model_name (string): short name of chosen model
    """
    if mlflow_tracking:
        print(f"Log artifacts for model evaluated...")
        model_name = eval_results[0]
        plots = eval_results[1]
        scores = eval_results[2]
        df_names = eval_results[3]
        time_spent = eval_results[4]

        exp_id = mlflow_set_exp_id("Model:Choose")
        run_name = f"{model_name} : Best"
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
            for i in range(2):
                fig = plots[i].get_figure()
                path = f"./plots/{model_name}_on_{df_names[i]}.png"
                mlflow.log_figure(fig, path)
                mlflow.log_params(
                    {
                        "time_spent": time_spent,
                        "fraction": fraction,
                        "cv_n_splits": n_splits,
                        "cv_n_repeats": n_repeats,
                        "random_state": rnd_state,
                    }
                )
                mlflow.log_metrics(
                    {
                        "score_on_train": scores[0],
                        "score_on_val": scores[1],
                    }
                )
Ejemplo n.º 2
0
def trivial_fit(
    X_fit,
    y_fit,
    X_train,
    y_train,
    X_val,
    y_val,
    model,
    log_residuals,
):
    print(
        f"\n-------------- Trivial model training w\o any parameters' searching started...."
    )
    model_name = type(model).__name__
    # Setup MLflow tracking server
    exp_id = mlflow_set_exp_id("Model:Fit")
    run_name = f"{model_name}-None"
    ## Enable autologging
    mlflow.sklearn.autolog(log_model_signatures=False)
    print(f"Autologging {model_name} started...")
    ##* Fit model with MLflow logging
    with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
        tic = time.time()
        model.fit(X_fit, y_fit)
        min, sec = divmod(time.time() - tic, 60)
        ## Disable autologging
        mlflow.sklearn.autolog(disable=True)
        # Log custom metrics and data
        print(f"Training took: {int(min)}min {int(sec)}sec")
        print(f"Log custom metrics...")
        log_custom_metrics(model, X_train, y_train, X_val, y_val)
        if log_residuals:
            log_model_residuals(model, X_train, y_train, X_val, y_val)

    print(f"{model_name.title()} model:")
    print_custom_metrics(model, X_train, y_train, X_val, y_val)
    winsound.Beep(frequency=2000, duration=300)
    return model
Ejemplo n.º 3
0
def train_model(
    model,
    X_dev,
    y_dev,
    X_train,
    y_train,
    X_val,
    y_val,
    mlflow_tracking=True,
    log_residuals=True,
    save_mlmodel_separatly=True,
):
    print(f"\nTrain final model on Development set...")

    tic = time.time()
    model_name = type(model).__name__
    if mlflow_tracking:
        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Train")
        run_name = f"{model_name}"
        ## Enable autologging
        mlflow.sklearn.autolog()
        ##* Fit model with MLflow logging
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name) as run:
            run_id = run.info.run_id
            print(f"Active run_id: {run_id} ...\n")
            model = model.fit(X_dev, y_dev)
            toc = time.time()
            ## Disable autologging
            mlflow.sklearn.autolog(disable=True)

            ##* Log custom metrics
            mare_on_dev = mare(y_dev, model.predict(X_dev))
            mare_on_train = mare(y_train, model.predict(X_train))
            mare_on_val = mare(y_val, model.predict(X_val))
            print(f"\nMARE on DEV: {mare_on_dev}")
            print(f"MARE on TRAIN: {mare_on_train}")
            print(f"MARE on VAL: {mare_on_val}")
            mlflow.log_metrics({
                "mare_on_dev": mare_on_dev,
                "mare_on_train": mare_on_train,
                "mare_on_val": mare_on_val,
            })
            ##* Log custom plots
            if log_residuals:
                print(f"\nCalculate and log model's residuals...")
                fig = plot_residuals_errors(model, X_train, y_train, X_val,
                                            y_val)
                mlflow.log_figure(fig, "./plots/residuals_errors.png")
    else:
        ##* Fit trivial
        model = model.fit(X_dev, y_dev)
        toc = time.time()
        exp_id, run_id = None, None

    ## Evaluate time spent
    min, sec = divmod(toc - tic, 60)
    print(f"Model training took: {int(min)}min {int(sec)}sec\n")

    ## Save trained pipeline
    if save_mlmodel_separatly:
        folder = save_mlmodel_aside(model, run_id)
    else:
        print(f"No one model was NOT saved separatly...")
        folder = None

    print(f"\nExperiment ID: {exp_id}")
    print(f"Run ID: {run_id}")
    print(f"Folder: {folder}")

    return exp_id, run_id, folder
Ejemplo n.º 4
0
def choose_model(X, y, fraction, n_splits, n_repeats, n_jobs, mlflow_tracking):
    print(f"\nStart model selection...")

    # Define dataset for modeling
    X_fit, y_fit = get_fractioned_data(X, y, fraction)

    # Get list of basic models will being estimated
    basic_models = get_list_of_basic_models()

    # # Create dict for modeling results
    # baseline_score = get_baseline_score(y)
    #  = {
    #     "Baseline": {
    #         "cv_score_mean": baseline_score,
    #         "cv_score_std": None,
    #         "time_spent": None,
    #     }
    # }
    basic_results = {}
    # Define num. of CV splits and K-repeats
    scorer, cv = set_custom_scorer_cv(n_splits, n_repeats)

    # Starts MLflow Tracking
    if mlflow_tracking:
        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Choose")

    # Run loop through list of basic models
    for basic_model in basic_models:
        model_name = type(basic_model).__name__
        print(f"Modeling {model_name}...")
        # Fit each basic model via cross-validation
        tic = time.time()
        basic_model_scores = model_selection.cross_val_score(
            X=X_fit,
            y=y_fit,
            estimator=basic_model,
            scoring=scorer,
            cv=cv,
            n_jobs=n_jobs,  # -1 means using all processors
            verbose=0,  # The verbosity level. default=0
        )
        # Calculate time spent
        min, sec = divmod(time.time() - tic, 60)
        time_spent = f"{int(min)}min {int(sec)}sec"
        # Save results to dict
        basic_results.update(
            {
                basic_model: {
                    "cv_score_mean": basic_model_scores.mean(),
                    "cv_score_std": basic_model_scores.std(),
                    "time_spent": time_spent,
                }
            }
        )

        ##* Log models with MLflow logging
        if mlflow_tracking:
            print(f"\tLogging {model_name} results to runs...")
            with mlflow.start_run(experiment_id=exp_id, run_name=model_name):
                mlflow.log_params(
                    {
                        "time_spent": time_spent,
                        "fraction": fraction,
                        "cv_n_splits": n_splits,
                        "cv_n_repeats": n_repeats,
                        "random_state": rnd_state,
                    }
                )
                mlflow.log_metrics(
                    {
                        "cv_score_mean": basic_model_scores.mean(),
                        "cv_score_std": basic_model_scores.std(),
                    }
                )

    # Sort dict by score
    basic_results = dict(
        sorted(
            basic_results.items(),
            key=lambda x: (
                x[1]["cv_score_mean"],
                x[1]["cv_score_std"],
                x[1]["time_spent"],
            ),
        )
    )
    print(" ")
    print("-------------- Models' rating --------------")
    pprint(basic_results, sort_dicts=False)
    # Pick up best model from basic set of
    chosen_model = list(basic_results.keys())[0]

    return basic_results, chosen_model
Ejemplo n.º 5
0
def grid_search_cv(
    X_fit,
    y_fit,
    X_train,
    y_train,
    X_val,
    y_val,
    model,
    params_grid,
    scorer,
    cv,
    simple_grid_search,
    n_jobs,
    log_residuals,
):
    if simple_grid_search:
        print(f"\n-------------- Simple Grid SearchCV started....")
        pprint(f"Parameters' grid: {params_grid}")
        model_name = type(model).__name__
        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Fit")
        run_name = f"{model_name}-grid"
        # Enable autologging
        mlflow.sklearn.autolog(log_model_signatures=False)
        # Define SIMPLE grid search
        grid_search = model_selection.GridSearchCV(
            model,
            param_grid=params_grid,
            scoring=scorer,
            n_jobs=n_jobs,
            cv=cv,
            refit=True,
            return_train_score=True,
            verbose=3,
        )
        ##* Fit model with MLflow logging
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
            tic = time.time()
            model_grid_search = grid_search.fit(
                X_fit,
                y_fit,
            )
            min, sec = divmod(time.time() - tic, 60)
            # Disable autologging
            mlflow.sklearn.autolog(disable=True)
            # Log custom metrics and data
            print(f"Simple grid search took: {int(min)}min {int(sec)}sec")
            print(f"Log custom metrics...")
            log_custom_metrics(model_grid_search, X_train, y_train, X_val,
                               y_val)
            if log_residuals:
                log_model_residuals(model_grid_search, X_train, y_train, X_val,
                                    y_val)

        print(
            f"Simple search: Best params are:\n {model_grid_search.best_params_}"
        )
        print(f"{model_name.title()}: Simple search:")
        print_custom_metrics(model_grid_search, X_train, y_train, X_val, y_val)
        winsound.Beep(frequency=2000, duration=300)
        return model, model_grid_search.best_estimator_, model_grid_search.best_params_
    else:
        print(f"\nSkip a Simple Grid SearchCV....")
        return model, None, None
Ejemplo n.º 6
0
def bayesian_search_cv(
    X_fit,
    y_fit,
    X_train,
    y_train,
    X_val,
    y_val,
    model,
    bayes_space,
    scorer,
    cv,
    n_jobs,
    bayesian_search_params,
    log_residuals,
):
    if bayesian_search_params[0]:
        print(
            f"\n-------------- Bayesian optimization of hyper-params started...."
        )
        pprint(f"Parameters' space: {bayes_space}")
        model_name = type(model).__name__
        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Fit")
        run_name = f"{model_name}-bayes"
        ## Enable autologging
        mlflow.sklearn.autolog(log_model_signatures=False)
        # Define bayesian search
        bayes_search = BayesSearchCV(
            model,
            search_spaces=bayes_space,
            n_iter=bayesian_search_params[1],  # default 50
            scoring=scorer,
            n_jobs=n_jobs,
            cv=cv,
            refit=True,
            return_train_score=True,
            verbose=3,
            random_state=rnd_state,
        )

        # Callback handler
        def on_step(optim_result):
            """ Print scores after each iteration while performing optimization """
            score = bayes_search.best_score_
            print(f"...current best score: {score}")
            if score <= 2:
                print("Interrupting!")
                return True

        ##* Fit model with MLflow logging
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
            tic = time.time()
            model_bayes_search = bayes_search.fit(
                X_fit,
                y_fit,
                callback=on_step,
            )
            min, sec = divmod(time.time() - tic, 60)
            ## Disable autologging
            mlflow.sklearn.autolog(disable=True)
            # Log custom metrics and data
            print(f"Bayesian search took: {int(min)}min {int(sec)}sec")
            print(f"Log custom metrics...")
            log_custom_metrics(model_bayes_search, X_train, y_train, X_val,
                               y_val)
            if log_residuals:
                log_model_residuals(model_bayes_search, X_train, y_train,
                                    X_val, y_val)

        print(
            f"Bayesian search: Best params are:\n {model_bayes_search.best_params_}"
        )
        print(f"{model_name.title()}: Bayesian search:")
        print_custom_metrics(model_bayes_search, X_train, y_train, X_val,
                             y_val)
        winsound.Beep(frequency=2000, duration=300)
        return model, model_bayes_search.best_estimator_, model_bayes_search.best_params_
    else:
        print(f"\nSkip Bayesian Optimization....")
        return model, None, None
Ejemplo n.º 7
0
def randomized_search_cv(
    X_fit,
    y_fit,
    X_train,
    y_train,
    X_val,
    y_val,
    model,
    params_dist,
    scorer,
    cv,
    n_jobs,
    random_search_params,
    log_residuals,
):
    if random_search_params[0]:
        print(f"\n-------------- Randomized Grid SearchCV started....")
        pprint(f"Parameters' distributions: {params_dist}")
        model_name = type(model).__name__
        # Setup MLflow tracking server
        exp_id = mlflow_set_exp_id("Model:Fit")
        run_name = f"{model_name}-rand"
        ## Enable autologging
        mlflow.sklearn.autolog(log_model_signatures=False)
        print(f"Autologging {model_name} started...")
        # Define RANDOMIZED grid search
        random_search = model_selection.RandomizedSearchCV(
            model,
            param_distributions=params_dist,
            n_iter=random_search_params[1],  # default 10
            scoring=scorer,
            n_jobs=n_jobs,
            cv=cv,
            refit=True,
            return_train_score=True,
            verbose=3,
            random_state=rnd_state,
        )
        ##* Fit model with MLflow logging
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
            tic = time.time()
            model_random_search = random_search.fit(
                X_fit,
                y_fit,
            )
            min, sec = divmod(time.time() - tic, 60)
            ## Disable autologging
            mlflow.sklearn.autolog(disable=True)
            # Log custom metrics and data
            print(f"Randomized grid search took: {int(min)}min {int(sec)}sec")
            print(f"Log custom metrics...")
            log_custom_metrics(model_random_search, X_train, y_train, X_val,
                               y_val)
            if log_residuals:
                log_model_residuals(model_random_search, X_train, y_train,
                                    X_val, y_val)

        print(
            f"Randomized search: Best params are:\n {model_random_search.best_params_}"
        )
        print(f"{model_name.title()}: Random search:")
        print_custom_metrics(model_random_search, X_train, y_train, X_val,
                             y_val)
        winsound.Beep(frequency=2000, duration=300)
        return model, model_random_search.best_estimator_, model_random_search.best_params_
    else:
        print(f"\nSkip a Randomized Grid SearchCV....")
        return model, None, None