Example #1
0
def run_decisiontree(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    config: Dict[str, Any],
) -> Any:
    """Find optimal parameters for a decision tree classifier via hyperopt,
    cross-validate it, optionally report test-set metrics, and return the model.

    Args:
        X_train: Training features.
        X_test: Test features (only used when ``config["test"]`` is truthy).
        y_train: Training labels.
        y_test: Test labels (only used when ``config["test"]`` is truthy).
        config: Run configuration; reads the ``"test"`` flag.

    Returns:
        The (fitted, when ``config["test"]`` is set) sklearn pipeline wrapping
        the decision tree.
    """
    LOGGER.info("Finding best decision tree..")
    search_space = {
        "type": "decision_tree",
        "max_depth": hp.uniformint("max_depth", 2, 15),
        # Fixed: the hyperopt label must match the parameter key — it was
        # previously "n_estimators", which mislabels the sampled value.
        "min_samples_split": hp.uniformint("min_samples_split", 2, 20),
    }

    best_params = hyperopt_search(X_train, y_train, search_space, config)
    model = make_pipeline(DecisionTreeClassifier(**best_params))

    mean_cross_val_score = cross_validate_model(model, X_train, y_train)
    LOGGER.info(
        f"Decision tree cross validation score: {mean_cross_val_score}")

    if config["test"]:
        # The pipeline was never fitted in this function, so fit it before
        # predicting (previously this raised NotFittedError).
        model.fit(X_train, y_train)
        # classification_report expects (y_true, y_pred) in that order.
        print(classification_report(y_test, model.predict(X_test)))

    return model
Example #2
0
def run_knn(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    config: Dict[str, Any],
) -> Any:
    """Find optimal model parameters for a KNN classifier, evaluate the model
    and return the model object.

    Args:
        X_train: Training features.
        X_test: Test features (only used when ``config["test"]`` is truthy).
        y_train: Training labels.
        y_test: Test labels (only used when ``config["test"]`` is truthy).
        config: Run configuration; reads the ``"test"`` flag and whatever
            ``get_scaler`` consumes.

    Returns:
        The sklearn pipeline of scaler + KNN classifier.
    """
    LOGGER.info("Finding best knn..")
    search_space = {
        "type": "knn",
        "n_neighbors": hp.uniformint("n_neighbors", 2, 15),
        "weights": hp.choice("weights", ["uniform", "distance"]),
    }

    best_params = hyperopt_search(X_train, y_train, search_space, config)
    model = make_pipeline(get_scaler(config),
                          KNeighborsClassifier(**best_params))

    mean_cross_val_score = cross_validate_model(model, X_train, y_train)
    LOGGER.info(f"KNN cross validation score: {mean_cross_val_score}")

    if config["test"]:
        # Fit before predicting — the pipeline was never fitted above
        # (previously this raised NotFittedError).
        model.fit(X_train, y_train)
        # classification_report expects (y_true, y_pred) in that order.
        print(classification_report(y_test, model.predict(X_test)))

    return model
Example #3
0
def run_svm_vote(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    config: Dict[str, Any],
) -> Any:
    """Find optimal parameters for an RBF SVM (probability-enabled, for use in
    a voting ensemble), cross-validate it, and return the model.

    Args:
        X_train: Training features.
        X_test: Test features (only used when ``config["test"]`` is truthy).
        y_train: Training labels.
        y_test: Test labels (only used when ``config["test"]`` is truthy).
        config: Run configuration; reads the ``"test"`` flag and whatever
            ``get_scaler`` consumes.

    Returns:
        The sklearn pipeline of scaler + SVC.
    """
    LOGGER.info("Finding best svm..")
    search_space = {
        "type": "svm",
        "C": hp.lognormal("C", 0, 100.0),
        "gamma": hp.lognormal("gamma", 0, 1.0),
        "kernel": hp.choice("kernel", ["rbf"]),
    }

    best_params = hyperopt_search(X_train, y_train, search_space, config)
    # probability=True is needed for soft voting downstream.
    model = make_pipeline(
        get_scaler(config),
        SVC(**best_params, class_weight="balanced", probability=True),
    )

    mean_cross_val_score = cross_validate_model(model, X_train, y_train)
    LOGGER.info(f"SVM cross validation score: {mean_cross_val_score}")

    if config["test"]:
        # Fit before predicting — the pipeline was never fitted above
        # (previously this raised NotFittedError).
        model.fit(X_train, y_train)
        # classification_report expects (y_true, y_pred) in that order.
        print(classification_report(y_test, model.predict(X_test)))

    return model
Example #4
0
def run_xgboost(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    config: Dict[str, Any],
) -> Tuple[Any, Any]:
    """Train an XGBoost classifier, either tuning it with hyperopt or using
    parameters from the config, and optionally evaluate it on the test set.

    Args:
        X_train: Training features.
        X_test: Test features (only used when ``config["test"]`` is truthy).
        y_train: Training labels.
        y_test: Test labels (only used when ``config["test"]`` is truthy).
        config: Run configuration; reads ``"find_optimal_model"``, ``"test"``
            and ``config["models"]["xgboost"]``.

    Returns:
        ``(model, y_pred)`` where ``y_pred`` is the test-set predictions when
        ``config["test"]`` is truthy, otherwise ``None``.
    """
    LOGGER.info("Running XGboost..")
    if config["find_optimal_model"]:
        search_space = {
            "type": "xgboost",
            "max_depth": hp.choice("max_depth", range(5, 15, 1)),
            "learning_rate": hp.quniform("learning_rate", 0.01, 0.5, 0.01),
            "n_estimators": hp.choice("n_estimators", range(20, 205, 5)),
            "gamma": hp.quniform("gamma", 0, 0.50, 0.01),
            "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
            "subsample": hp.quniform("subsample", 0.1, 1, 0.01),
        }

        best_params = hyperopt_search(X_train, y_train, search_space, config)
        model = make_pipeline(XGBClassifier(**best_params)).fit(
            X_train, y_train)

        mean_cross_val_score = cross_validate_model(model, X_train, y_train)
        LOGGER.info(
            f"XGboost classifier cross validation score: {mean_cross_val_score}"
        )

    else:
        model = XGBClassifier(**config["models"]["xgboost"]).fit(
            X_train, y_train)
        if config["test"]:
            # classification_report expects (y_true, y_pred) in that order.
            print(classification_report(y_test, model.predict(X_test)))

    if config["test"]:
        y_pred = model.predict(X_test)
        score = accuracy_score(y_pred=y_pred, y_true=y_test)

        # The score is computed on the held-out test set, not the train set.
        LOGGER.info(f"Xgboost has a test accuracy of {score}")

        return model, y_pred

    return model, None
Example #5
0
def run_gradient_boosting_classifier(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    config: Dict[str, Any],
) -> Tuple[Any, Any]:
    """Train a gradient boosting classifier, either tuning it with hyperopt or
    using parameters from the config, and optionally evaluate it on the test set.

    Args:
        X_train: Training features.
        X_test: Test features (only used when ``config["test"]`` is truthy).
        y_train: Training labels.
        y_test: Test labels (only used when ``config["test"]`` is truthy).
        config: Run configuration; reads ``"find_optimal_model"``, ``"test"``
            and ``config["models"]["gradient_boosting"]``.

    Returns:
        ``(model, y_pred)`` where ``y_pred`` is the test-set predictions when
        ``config["test"]`` is truthy, otherwise ``None``.
    """
    LOGGER.info("Running Gradient boosting classifier..")
    if config["find_optimal_model"]:
        search_space = {
            "type": "gradient_boosting",
            "max_depth": hp.uniformint("max_depth", 2, 15),
            "n_estimators": hp.uniformint("n_estimators", 50, 300),
            "max_features": hp.choice("max_features", ("auto", "sqrt", None)),
            "learning_rate": hp.quniform("learning_rate", 0.025, 0.5, 0.025),
        }

        best_params = hyperopt_search(X_train, y_train, search_space, config)
        model = make_pipeline(GradientBoostingClassifier(**best_params)).fit(
            X_train, y_train)

        mean_cross_val_score = cross_validate_model(model, X_train, y_train)
        LOGGER.info(
            f"Gradient boosting classifier cross validation score: {mean_cross_val_score}"
        )

    else:
        model = make_pipeline(
            GradientBoostingClassifier(
                **config["models"]["gradient_boosting"])).fit(
                    X_train, y_train)
        if config["test"]:
            # classification_report expects (y_true, y_pred) in that order.
            print(classification_report(y_test, model.predict(X_test)))

    if config["test"]:
        y_pred = model.predict(X_test)
        score = accuracy_score(y_pred=y_pred, y_true=y_test)

        # The score is computed on the held-out test set, not the train set.
        LOGGER.info(
            f"The gradient boosting classifier has a test accuracy of {score}"
        )

        return model, y_pred

    return model, None
Example #6
0
def run_random_forest(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    config: Dict[str, Any],
) -> Tuple[Any, Any]:
    """Train a random forest classifier, either tuning it with hyperopt or
    using parameters from the config, and optionally evaluate it on the test set.

    Args:
        X_train: Training features.
        X_test: Test features (only used when ``config["test"]`` is truthy).
        y_train: Training labels.
        y_test: Test labels (only used when ``config["test"]`` is truthy).
        config: Run configuration; reads ``"find_optimal_model"``, ``"test"``
            and ``config["models"]["random_forest"]``.

    Returns:
        ``(model, y_pred)`` where ``y_pred`` is the test-set predictions when
        ``config["test"]`` is truthy, otherwise ``None``.
    """
    LOGGER.info("Running Random Forest model..")
    if config["find_optimal_model"]:
        search_space = {
            "type": "random_forest",
            "max_depth": hp.uniformint("max_depth", 2, 30),
            "n_estimators": hp.uniformint("n_estimators", 10, 1000),
            "max_features": hp.choice("max_features", ("auto", "sqrt", None)),
        }

        best_params = hyperopt_search(X_train, y_train, search_space, config)
        # Fit the tuned pipeline, mirroring the other branch and the sibling
        # model runners — previously this branch never fitted the model, so
        # model.predict(X_test) below raised NotFittedError.
        model = make_pipeline(RandomForestClassifier(**best_params)).fit(
            X_train, y_train)

        mean_cross_val_score = cross_validate_model(model, X_train, y_train)
        LOGGER.info(
            f"Random Forest classifier cross validation score: {mean_cross_val_score}"
        )

    else:
        model = make_pipeline(
            RandomForestClassifier(**config["models"]["random_forest"])).fit(
                X_train, y_train)
        if config["test"]:
            # classification_report expects (y_true, y_pred) in that order.
            print(classification_report(y_test, model.predict(X_test)))

    if config["test"]:
        y_pred = model.predict(X_test)
        score = accuracy_score(y_pred=y_pred, y_true=y_test)

        # The score is computed on the held-out test set, not the train set.
        LOGGER.info(f"Random forest model has a test accuracy of {score}")

        return model, y_pred

    return model, None
Example #7
0
def run_svm(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    config: Dict[str, Any],
) -> Tuple[Any, Any]:
    """Train an SVM classifier, either tuning it with hyperopt or using
    parameters from the config, and optionally evaluate it on the test set.

    Args:
        X_train: Training features.
        X_test: Test features (only used when ``config["test"]`` is truthy).
        y_train: Training labels.
        y_test: Test labels (only used when ``config["test"]`` is truthy).
        config: Run configuration; reads ``"find_optimal_model"``, ``"test"``
            and ``config["models"]["svm"]``.

    Returns:
        ``(model, y_pred)`` where ``y_pred`` is the test-set predictions when
        ``config["test"]`` is truthy, otherwise ``None``.
    """
    LOGGER.info("Running SVM..")
    if config["find_optimal_model"]:
        search_space = {
            "type": "svm",
            "C": hp.lognormal("C", 0, 10.0),
            "gamma": hp.lognormal("gamma", 0, 1.0),
            "kernel": hp.choice("kernel", ["rbf"]),
        }

        best_params = hyperopt_search(X_train, y_train, search_space, config)
        model = make_pipeline(SVC(**best_params)).fit(X_train, y_train)

        mean_cross_val_score = cross_validate_model(model, X_train, y_train)
        LOGGER.info(
            f"SVM classifier cross validation score: {mean_cross_val_score}")

    else:
        model = make_pipeline(SVC(**config["models"]["svm"])).fit(
            X_train, y_train)
        mean_cross_val_score = cross_validate_model(model, X_train, y_train)
        LOGGER.info(f"SVM cross validation score: {mean_cross_val_score}")
        if config["test"]:
            # classification_report expects (y_true, y_pred) in that order.
            print(classification_report(y_test, model.predict(X_test)))

    if config["test"]:
        y_pred = model.predict(X_test)
        score = accuracy_score(y_pred=y_pred, y_true=y_test)

        LOGGER.info(f"SVM has a test accuracy of {score}")

        return model, y_pred

    return model, None