Beispiel #1
0
    def test_set_cursor(self, base):
        model_test = KMeans("kmeans_cursor_test",
                            cursor=base.cursor,
                            init="kmeanspp")
        # TODO: creat a new cursor
        model_test.set_cursor(base.cursor)
        model_test.drop()
        model_test.fit("public.iris", ["SepalLengthCm", "SepalWidthCm"])

        base.cursor.execute(
            "SELECT model_name FROM models WHERE model_name = 'kmeans_cursor_test'"
        )
        assert base.cursor.fetchone()[0] == "kmeans_cursor_test"
        model_test.drop()
Beispiel #2
0
 def test_get_plot(self, base, winequality_vd):
     base.cursor.execute("DROP MODEL IF EXISTS model_test_plot")
     model_test = KMeans("model_test_plot", cursor=base.cursor)
     model_test.fit(winequality_vd, ["alcohol", "quality"])
     result = model_test.plot(color="b", )
     assert len(result.get_default_bbox_extra_artists()) == 16
     plt.close("all")
     model_test.drop()
Beispiel #3
0
 def test_get_voronoi_plot(self, iris_vd):
     current_cursor().execute("DROP MODEL IF EXISTS model_test_plot")
     model_test = KMeans("model_test_plot", )
     model_test.fit(iris_vd, ["SepalLengthCm", "SepalWidthCm"])
     result = model_test.plot_voronoi(color="b")
     assert len(result.gca().get_default_bbox_extra_artists()) == 21
     plt.close("all")
     model_test.drop()
Beispiel #4
0
 def test_model_from_vDF(self, base, iris_vd):
     base.cursor.execute("DROP MODEL IF EXISTS kmeans_vDF")
     model_test = KMeans("kmeans_vDF", cursor=base.cursor, init="random")
     model_test.fit(iris_vd, ["SepalLengthCm", "SepalWidthCm"])
     base.cursor.execute(
         "SELECT model_name FROM models WHERE model_name = 'kmeans_vDF'")
     assert base.cursor.fetchone()[0] == "kmeans_vDF"
     model_test.drop()
Beispiel #5
0
def model(iris_vd):
    model_class = KMeans(
        "kmeans_model_test",
        n_cluster=3,
        max_iter=10,
        init=[[7.2, 3.0, 5.8, 1.6], [6.9, 3.1, 4.9, 1.5], [5.7, 4.4, 1.5,
                                                           0.4]],
    )
    model_class.drop()
    model_class.fit(
        "public.iris",
        ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    )
    yield model_class
    model_class.drop()
Beispiel #6
0
    def test_drop(self, base):
        base.cursor.execute("DROP MODEL IF EXISTS kmeans_model_test_drop")
        model_test = KMeans("kmeans_model_test_drop", cursor=base.cursor)
        model_test.fit("public.iris", ["SepalLengthCm", "SepalWidthCm"])

        base.cursor.execute(
            "SELECT model_name FROM models WHERE model_name = 'kmeans_model_test_drop'"
        )
        assert base.cursor.fetchone()[0] == "kmeans_model_test_drop"

        model_test.drop()
        base.cursor.execute(
            "SELECT model_name FROM models WHERE model_name = 'kmeans_model_test_drop'"
        )
        assert base.cursor.fetchone() is None
Beispiel #7
0
    def test_drop(self):
        current_cursor().execute("DROP MODEL IF EXISTS kmeans_model_test_drop")
        model_test = KMeans("kmeans_model_test_drop", )
        model_test.fit("public.iris", ["SepalLengthCm", "SepalWidthCm"])

        current_cursor().execute(
            "SELECT model_name FROM models WHERE model_name = 'kmeans_model_test_drop'"
        )
        assert current_cursor().fetchone()[0] == "kmeans_model_test_drop"

        model_test.drop()
        current_cursor().execute(
            "SELECT model_name FROM models WHERE model_name = 'kmeans_model_test_drop'"
        )
        assert current_cursor().fetchone() is None
Beispiel #8
0
def model(base, iris_vd):
    base.cursor.execute("DROP MODEL IF EXISTS kmeans_model_test")

    model_class = KMeans(
        "kmeans_model_test",
        cursor=base.cursor,
        n_cluster=3,
        max_iter=10,
        init=[[7.2, 3.0, 5.8, 1.6], [6.9, 3.1, 4.9, 1.5], [5.7, 4.4, 1.5,
                                                           0.4]],
    )
    model_class.fit(
        "public.iris",
        ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    )
    yield model_class
    model_class.drop()
Beispiel #9
0
def best_k(
    input_relation: (str, vDataFrame),
    X: list = [],
    cursor=None,
    n_cluster: (tuple, list) = (1, 100),
    init: (str, list) = "kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    elbow_score_stop: float = 0.8,
):
    """
---------------------------------------------------------------------------
Finds the KMeans K based on a score.

Parameters
----------
input_relation: str/vDataFrame
    Relation to use to train the model.
X: list, optional
	List of the predictor columns. If empty, all the numerical columns will
    be used.
cursor: DBcursor, optional
	Vertica DB cursor.
n_cluster: tuple/list, optional
	Tuple representing the number of cluster to start with and to end with.
	It can also be customized list with the different K to test.
init: str/list, optional
	The method to use to find the initial cluster centers.
		kmeanspp : Use the KMeans++ method to initialize the centers.
		random   : The initial centers
	It can be also a list with the initial cluster centers to use.
max_iter: int, optional
	The maximum number of iterations the algorithm performs.
tol: float, optional
	Determines whether the algorithm has converged. The algorithm is considered 
	converged after no center has moved more than a distance of 'tol' from the 
	previous iteration.
elbow_score_stop: float, optional
	Stops the Parameters Search when this Elbow score is reached.

Returns
-------
int
	the KMeans K
	"""
    check_types([
        (
            "X",
            X,
            [list],
        ),
        (
            "input_relation",
            input_relation,
            [str, vDataFrame],
        ),
        (
            "n_cluster",
            n_cluster,
            [list],
        ),
        (
            "init",
            init,
            ["kmeanspp", "random"],
        ),
        (
            "max_iter",
            max_iter,
            [int, float],
        ),
        (
            "tol",
            tol,
            [int, float],
        ),
        (
            "elbow_score_stop",
            elbow_score_stop,
            [int, float],
        ),
    ])

    from verticapy.learn.cluster import KMeans

    cursor, conn = check_cursor(cursor, input_relation)[0:2]
    if isinstance(n_cluster, tuple):
        L = range(n_cluster[0], n_cluster[1])
    else:
        L = n_cluster
        L.sort()
    schema, relation = schema_relation(input_relation)
    if isinstance(input_relation, vDataFrame):
        if not (schema):
            schema = "public"
    schema = str_column(schema)
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}.__VERTICAPY_TEMP_MODEL_KMEANS_{}__".
            format(schema, get_session(cursor)))
        model = KMeans(
            "{}.__VERTICAPY_TEMP_MODEL_KMEANS_{}__".format(
                schema, get_session(cursor)),
            cursor,
            i,
            init,
            max_iter,
            tol,
        )
        model.fit(input_relation, X)
        score = model.metrics_.values["value"][3]
        if score > elbow_score_stop:
            return i
        score_prev = score
    if conn:
        conn.close()
    print(
        "\u26A0 The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(i, score))
    return i
Beispiel #10
0
def elbow(
    input_relation: (str, vDataFrame),
    X: list = [],
    cursor=None,
    n_cluster: (tuple, list) = (1, 15),
    init: (str, list) = "kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    ax=None,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws an Elbow Curve.

Parameters
----------
input_relation: str/vDataFrame
    Relation to use to train the model.
X: list, optional
    List of the predictor columns. If empty all the numerical vcolumns will
    be used.
cursor: DBcursor, optional
    Vertica DB cursor.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with.
    It can also be customized list with the different K to test.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is considered 
    converged after no center has moved more than a distance of 'tol' from the 
    previous iteration.
ax: Matplotlib axes object, optional
    The axes to plot on.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        (
            "X",
            X,
            [list],
        ),
        (
            "input_relation",
            input_relation,
            [str, vDataFrame],
        ),
        (
            "n_cluster",
            n_cluster,
            [list],
        ),
        (
            "init",
            init,
            ["kmeanspp", "random"],
        ),
        (
            "max_iter",
            max_iter,
            [int, float],
        ),
        (
            "tol",
            tol,
            [int, float],
        ),
    ])
    cursor, conn = check_cursor(cursor, input_relation)[0:2]
    version(cursor=cursor, condition=[8, 0, 0])
    if isinstance(n_cluster, tuple):
        L = range(n_cluster[0], n_cluster[1])
    else:
        L = n_cluster
        L.sort()
    schema, relation = schema_relation(input_relation)
    all_within_cluster_SS = []
    if isinstance(n_cluster, tuple):
        L = [i for i in range(n_cluster[0], n_cluster[1])]
    else:
        L = n_cluster
        L.sort()
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}.VERTICAPY_KMEANS_TMP_{}".format(
                schema, get_session(cursor)))
        from verticapy.learn.cluster import KMeans

        model = KMeans(
            "{}.VERTICAPY_KMEANS_TMP_{}".format(schema, get_session(cursor)),
            cursor,
            i,
            init,
            max_iter,
            tol,
        )
        model.fit(input_relation, X)
        all_within_cluster_SS += [float(model.metrics_.values["value"][3])]
        model.drop()
    if conn:
        conn.close()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
        ax.grid(axis="y")
    param = {
        "color": gen_colors()[0],
        "marker": "o",
        "markerfacecolor": "white",
        "markersize": 7,
        "markeredgecolor": "black",
    }
    ax.plot(
        L,
        all_within_cluster_SS,
        **updated_dict(param, style_kwds),
    )
    ax.set_title("Elbow Curve")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Between-Cluster SS / Total SS")
    values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
    return tablesample(values=values)
Beispiel #11
0
def load_model(name: str, cursor=None, input_relation: str = "", test_relation: str = ""):
    """
---------------------------------------------------------------------------
Loads a Vertica model and returns the associated object.

Parameters
----------
name: str
    Model Name.
cursor: DBcursor, optional
    Vertica database cursor.
input_relation: str, optional
    Some automated functions may depend on the input relation. If the 
    load_model function cannot find the input relation from the call string, 
    you should fill it manually.
test_relation: str, optional
    Relation to use to do the testing. All the methods will use this relation 
    for the scoring. If empty, the training relation will be used as testing.

Returns
-------
model
    The model.
    """
    check_types([("name", name, [str],), 
                 ("test_relation", test_relation, [str],),
                 ("input_relation", input_relation, [str],),])
    cursor = check_cursor(cursor)[0]
    does_exist = does_model_exist(name=name, cursor=cursor, raise_error=False)
    schema, name = schema_relation(name)
    schema, name = schema[1:-1], name[1:-1]
    assert does_exist, NameError("The model '{}' doesn't exist.".format(name))
    if does_exist == 2:
        cursor.execute(
            "SELECT attr_name, value FROM verticapy.attr WHERE LOWER(model_name) = LOWER('{}')".format(
                str_column(name.lower())
            )
        )
        result = cursor.fetchall()
        model_save = {}
        for elem in result:
            ldic = {}
            try:
                exec("result_tmp = {}".format(elem[1]), globals(), ldic)
            except:
                exec(
                    "result_tmp = '{}'".format(elem[1].replace("'", "''")),
                    globals(),
                    ldic,
                )
            result_tmp = ldic["result_tmp"]
            try:
                result_tmp = float(result_tmp)
            except:
                pass
            if result_tmp == None:
                result_tmp = "None"
            model_save[elem[0]] = result_tmp
        if model_save["type"] == "NearestCentroid":
            from verticapy.learn.neighbors import NearestCentroid

            model = NearestCentroid(name, cursor, model_save["p"])
            model.centroids_ = tablesample(model_save["centroids"])
            model.classes_ = model_save["classes"]
        elif model_save["type"] == "KNeighborsClassifier":
            from verticapy.learn.neighbors import KNeighborsClassifier

            model = KNeighborsClassifier(
                name, cursor, model_save["n_neighbors"], model_save["p"]
            )
            model.classes_ = model_save["classes"]
        elif model_save["type"] == "KNeighborsRegressor":
            from verticapy.learn.neighbors import KNeighborsRegressor

            model = KNeighborsRegressor(
                name, cursor, model_save["n_neighbors"], model_save["p"]
            )
        elif model_save["type"] == "KernelDensity":
            from verticapy.learn.neighbors import KernelDensity

            model = KernelDensity(
                name,
                cursor,
                model_save["bandwidth"],
                model_save["kernel"],
                model_save["p"],
                model_save["max_leaf_nodes"],
                model_save["max_depth"],
                model_save["min_samples_leaf"],
                model_save["nbins"],
                model_save["xlim"],
            )
            model.y = "KDE"
            model.map = model_save["map"]
            model.tree_name = model_save["tree_name"]
        elif model_save["type"] == "LocalOutlierFactor":
            from verticapy.learn.neighbors import LocalOutlierFactor

            model = LocalOutlierFactor(
                name, cursor, model_save["n_neighbors"], model_save["p"]
            )
            model.n_errors_ = model_save["n_errors"]
        elif model_save["type"] == "DBSCAN":
            from verticapy.learn.cluster import DBSCAN

            model = DBSCAN(
                name,
                cursor,
                model_save["eps"],
                model_save["min_samples"],
                model_save["p"],
            )
            model.n_cluster_ = model_save["n_cluster"]
            model.n_noise_ = model_save["n_noise"]
        elif model_save["type"] == "CountVectorizer":
            from verticapy.learn.preprocessing import CountVectorizer

            model = CountVectorizer(
                name,
                cursor,
                model_save["lowercase"],
                model_save["max_df"],
                model_save["min_df"],
                model_save["max_features"],
                model_save["ignore_special"],
                model_save["max_text_size"],
            )
            model.vocabulary_ = model_save["vocabulary"]
            model.stop_words_ = model_save["stop_words"]
        elif model_save["type"] == "SARIMAX":
            from verticapy.learn.tsa import SARIMAX

            model = SARIMAX(
                name,
                cursor,
                model_save["p"],
                model_save["d"],
                model_save["q"],
                model_save["P"],
                model_save["D"],
                model_save["Q"],
                model_save["s"],
                model_save["tol"],
                model_save["max_iter"],
                model_save["solver"],
                model_save["max_pik"],
                model_save["papprox_ma"],
            )
            model.transform_relation = model_save["transform_relation"]
            model.coef_ = tablesample(model_save["coef"])
            model.ma_avg_ = model_save["ma_avg"]
            if isinstance(model_save["ma_piq"], dict):
                model.ma_piq_ = tablesample(model_save["ma_piq"])
            else:
                model.ma_piq_ = None
            model.ts = model_save["ts"]
            model.exogenous = model_save["exogenous"]
            model.deploy_predict_ = model_save["deploy_predict"]
        elif model_save["type"] == "VAR":
            from verticapy.learn.tsa import VAR

            model = VAR(
                name,
                cursor,
                model_save["p"],
                model_save["tol"],
                model_save["max_iter"],
                model_save["solver"],
            )
            model.transform_relation = model_save["transform_relation"]
            model.coef_ = []
            for i in range(len(model_save["X"])):
                model.coef_ += [tablesample(model_save["coef_{}".format(i)])]
            model.ts = model_save["ts"]
            model.deploy_predict_ = model_save["deploy_predict"]
            model.X = model_save["X"]
            if not(input_relation):
                model.input_relation = model_save["input_relation"]
            else:
                model.input_relation = input_relation
            model.X = model_save["X"]
            if model_save["type"] in (
                "KNeighborsRegressor",
                "KNeighborsClassifier",
                "NearestCentroid",
                "SARIMAX",
            ):
                model.y = model_save["y"]
                model.test_relation = model_save["test_relation"]
            elif model_save["type"] not in ("CountVectorizer", "VAR"):
                model.key_columns = model_save["key_columns"]
    else:
        model_type = does_model_exist(name="{}.{}".format(schema, name), cursor=cursor, raise_error=False, return_model_type=True,)
        if model_type.lower() == "kmeans":
            cursor.execute(
                "SELECT GET_MODEL_SUMMARY (USING PARAMETERS model_name = '"
                + name
                + "')"
            )
            info = cursor.fetchone()[0].replace("\n", " ")
            info = "kmeans(" + info.split("kmeans(")[1]
        elif model_type.lower() == "normalize_fit":
            from verticapy.learn.preprocessing import Normalizer

            model = Normalizer(name, cursor)
            model.param_ = model.get_attr("details")
            model.X = [
                '"' + item + '"' for item in model.param_.values["column_name"]
            ]
            if "avg" in model.param_.values:
                model.parameters["method"] = "zscore"
            elif "max" in model.param_.values:
                model.parameters["method"] = "minmax"
            else:
                model.parameters["method"] = "robust_zscore"
            return model
        else:
            cursor.execute(
                "SELECT GET_MODEL_ATTRIBUTE (USING PARAMETERS model_name = '"
                + name
                + "', attr_name = 'call_string')"
            )
            info = cursor.fetchone()[0].replace("\n", " ")
        if "SELECT " in info:
            info = info.split("SELECT ")[1].split("(")
        else:
            info = info.split("(")
        model_type = info[0].lower()
        info = info[1].split(")")[0].replace(" ", "").split("USINGPARAMETERS")
        if model_type == "svm_classifier" and "class_weights='none'" not in " ".join(info).lower():
            parameters = "".join(info[1].split("class_weights=")[1].split("'"))
            parameters = parameters[3 : len(parameters)].split(",")
            del parameters[0]
            parameters += [
                "class_weights=" + info[1].split("class_weights=")[1].split("'")[1]
            ]
        elif model_type != "svd":
            parameters = info[1].split(",")
        else:
            parameters = []
        parameters = [item.split("=") for item in parameters]
        parameters_dict = {}
        for item in parameters:
            if len(item) > 1:
                parameters_dict[item[0]] = item[1]
        info = info[0]
        for elem in parameters_dict:
            if isinstance(parameters_dict[elem], str):
                parameters_dict[elem] = parameters_dict[elem].replace("'", "")
        if model_type == "rf_regressor":
            from verticapy.learn.ensemble import RandomForestRegressor

            model = RandomForestRegressor(
                name,
                cursor,
                int(parameters_dict["ntree"]),
                int(parameters_dict["mtry"]),
                int(parameters_dict["max_breadth"]),
                float(parameters_dict["sampling_size"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["min_leaf_size"]),
                float(parameters_dict["min_info_gain"]),
                int(parameters_dict["nbins"]),
            )
        elif model_type == "rf_classifier":
            from verticapy.learn.ensemble import RandomForestClassifier

            model = RandomForestClassifier(
                name,
                cursor,
                int(parameters_dict["ntree"]),
                int(parameters_dict["mtry"]),
                int(parameters_dict["max_breadth"]),
                float(parameters_dict["sampling_size"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["min_leaf_size"]),
                float(parameters_dict["min_info_gain"]),
                int(parameters_dict["nbins"]),
            )
        elif model_type == "xgb_classifier":
            from verticapy.learn.ensemble import XGBoostClassifier

            model = XGBoostClassifier(
                name,
                cursor,
                int(parameters_dict["max_ntree"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["nbins"]),
                parameters_dict["objective"],
                parameters_dict["split_proposal_method"],
                float(parameters_dict["epsilon"]),
                float(parameters_dict["learning_rate"]),
                float(parameters_dict["min_split_loss"]),
                float(parameters_dict["weight_reg"]),
                float(parameters_dict["sampling_size"]),
            )
        elif model_type == "xgb_regressor":
            from verticapy.learn.ensemble import XGBoostRegressor

            model = XGBoostRegressor(
                name,
                cursor,
                int(parameters_dict["max_ntree"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["nbins"]),
                parameters_dict["objective"],
                parameters_dict["split_proposal_method"],
                float(parameters_dict["epsilon"]),
                float(parameters_dict["learning_rate"]),
                float(parameters_dict["min_split_loss"]),
                float(parameters_dict["weight_reg"]),
                float(parameters_dict["sampling_size"]),
            )
        elif model_type == "logistic_reg":
            from verticapy.learn.linear_model import LogisticRegression

            model = LogisticRegression(
                name,
                cursor,
                parameters_dict["regularization"],
                float(parameters_dict["epsilon"]),
                float(parameters_dict["lambda"]),
                int(parameters_dict["max_iterations"]),
                parameters_dict["optimizer"],
                float(parameters_dict["alpha"]),
            )
        elif model_type == "linear_reg":
            from verticapy.learn.linear_model import (
                LinearRegression,
                Lasso,
                Ridge,
                ElasticNet,
            )
            if parameters_dict["regularization"] == "none":
                model = LinearRegression(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                )
            elif parameters_dict["regularization"] == "l1":
                model = Lasso(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    float(parameters_dict["lambda"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                )
            elif parameters_dict["regularization"] == "l2":
                model = Ridge(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    float(parameters_dict["lambda"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                )
            else:
                model = ElasticNet(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    float(parameters_dict["lambda"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                    float(parameters_dict["alpha"]),
                )
        elif model_type == "naive_bayes":
            from verticapy.learn.naive_bayes import NaiveBayes

            model = NaiveBayes(name, cursor, float(parameters_dict["alpha"]))
        elif model_type == "svm_regressor":
            from verticapy.learn.svm import LinearSVR

            model = LinearSVR(
                name,
                cursor,
                float(parameters_dict["epsilon"]),
                float(parameters_dict["C"]),
                True,
                float(parameters_dict["intercept_scaling"]),
                parameters_dict["intercept_mode"],
                float(parameters_dict["error_tolerance"]),
                int(parameters_dict["max_iterations"]),
            )
        elif model_type == "svm_classifier":
            from verticapy.learn.svm import LinearSVC

            class_weights = parameters_dict["class_weights"].split(",")
            for idx, elem in enumerate(class_weights):
                try:
                    class_weights[idx] = float(class_weights[idx])
                except:
                    class_weights[idx] = None
            model = LinearSVC(
                name,
                cursor,
                float(parameters_dict["epsilon"]),
                float(parameters_dict["C"]),
                True,
                float(parameters_dict["intercept_scaling"]),
                parameters_dict["intercept_mode"],
                class_weights,
                int(parameters_dict["max_iterations"]),
            )
        elif model_type == "kmeans":
            from verticapy.learn.cluster import KMeans

            model = KMeans(
                name,
                cursor,
                int(info.split(",")[-1]),
                parameters_dict["init_method"],
                int(parameters_dict["max_iterations"]),
                float(parameters_dict["epsilon"]),
            )
            model.cluster_centers_ = model.get_attr("centers")
            result = model.get_attr("metrics").values["metrics"][0]
            values = {
                "index": [
                    "Between-Cluster Sum of Squares",
                    "Total Sum of Squares",
                    "Total Within-Cluster Sum of Squares",
                    "Between-Cluster SS / Total SS",
                    "converged",
                ]
            }
            values["value"] = [
                float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]),
                float(result.split("Total Sum of Squares: ")[1].split("\n")[0]),
                float(
                    result.split("Total Within-Cluster Sum of Squares: ")[1].split("\n")[0]
                ),
                float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0])
                / float(result.split("Total Sum of Squares: ")[1].split("\n")[0]),
                result.split("Converged: ")[1].split("\n")[0] == "True",
            ]
            model.metrics_ = tablesample(values)
        elif model_type == "bisecting_kmeans":
            from verticapy.learn.cluster import BisectingKMeans

            model = BisectingKMeans(
                name,
                cursor,
                int(info.split(",")[-1]),
                int(parameters_dict["bisection_iterations"]),
                parameters_dict["split_method"],
                int(parameters_dict["min_divisible_cluster_size"]),
                parameters_dict["distance_method"],
                parameters_dict["kmeans_center_init_method"],
                int(parameters_dict["kmeans_max_iterations"]),
                float(parameters_dict["kmeans_epsilon"]),
            )
            model.metrics_ = model.get_attr("Metrics")
            model.cluster_centers_ = model.get_attr("BKTree")
        elif model_type == "pca":
            from verticapy.learn.decomposition import PCA

            model = PCA(name, cursor, 0, bool(parameters_dict["scale"]))
            model.components_ = model.get_attr("principal_components")
            model.explained_variance_ = model.get_attr("singular_values")
            model.mean_ = model.get_attr("columns")
        elif model_type == "svd":
            from verticapy.learn.decomposition import SVD

            model = SVD(name, cursor)
            model.singular_values_ = model.get_attr("right_singular_vectors")
            model.explained_variance_ = model.get_attr("singular_values")
        elif model_type == "one_hot_encoder_fit":
            from verticapy.learn.preprocessing import OneHotEncoder

            model = OneHotEncoder(name, cursor)
            try:
                model.param_ = to_tablesample(
                    query="SELECT category_name, category_level::varchar, category_level_index FROM (SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'integer_categories')) VERTICAPY_SUBTABLE UNION ALL SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'varchar_categories')".format(
                        model.name, model.name
                    ),
                    cursor=model.cursor,
                )
            except:
                try:
                    model.param_ = model.get_attr("integer_categories")
                except:
                    model.param_ = model.get_attr("varchar_categories")
        if not(input_relation):
            model.input_relation = info.split(",")[1].replace("'", "").replace("\\", "")
        else:
            model.input_relation = input_relation
        model.test_relation = test_relation if (test_relation) else model.input_relation
        if model_type not in ("kmeans", "pca", "svd", "one_hot_encoder_fit"):
            model.X = info.split(",")[3 : len(info.split(","))]
            model.X = [item.replace("'", "").replace("\\", "") for item in model.X]
            model.y = info.split(",")[2].replace("'", "").replace("\\", "")
        elif model_type in (
            "svd",
            "pca",
            "one_hot_encoder_fit",
            "normalizer",
            "kmeans",
            "bisectingkmeans",
        ):
            model.X = info.split(",")[2 : len(info.split(","))]
            model.X = [item.replace("'", "").replace("\\", "") for item in model.X]
        if model_type in ("naive_bayes", "rf_classifier", "xgb_classifier"):
            try:
                cursor.execute(
                    "SELECT DISTINCT {} FROM {} WHERE {} IS NOT NULL ORDER BY 1".format(
                        model.y, model.input_relation, model.y
                    )
                )
                classes = cursor.fetchall()
                model.classes_ = [item[0] for item in classes]
            except:
                model.classes_ = [0, 1]
        elif model_type in ("svm_classifier", "logistic_reg"):
            model.classes_ = [0, 1]
        if model_type in ("svm_classifier", "svm_regressor", "logistic_reg", "linear_reg",):
            model.coef_ = model.get_attr("details")
    return model
Beispiel #12
0
 def test_repr(self, model):
     assert "kmeans" in model.__repr__()
     model_repr = KMeans("model_repr")
     model_repr.drop()
     assert model_repr.__repr__() == "<KMeans>"
Beispiel #13
0
    def test_init_method(self, base):
        model_test_kmeanspp = KMeans("kmeanspp_test",
                                     cursor=base.cursor,
                                     init="kmeanspp")
        model_test_kmeanspp.drop()
        model_test_kmeanspp.fit("public.iris")

        base.cursor.execute(
            "SELECT model_name FROM models WHERE model_name = 'kmeanspp_test'")
        assert base.cursor.fetchone()[0] == "kmeanspp_test"
        model_test_kmeanspp.drop()

        model_test_random = KMeans("random_test",
                                   cursor=base.cursor,
                                   init="random")
        model_test_random.drop()
        model_test_random.fit("public.iris")

        base.cursor.execute(
            "SELECT model_name FROM models WHERE model_name = 'random_test'")
        assert base.cursor.fetchone()[0] == "random_test"
        model_test_random.drop()
def best_k(
    X: list,
    input_relation: str,
    cursor=None,
    n_cluster=(1, 100),
    init="kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    elbow_score_stop: float = 0.8,
):
    """
---------------------------------------------------------------------------
Finds the KMeans K based on a score.

Parameters
----------
X: list
	List of the predictor columns.
input_relation: str
	Relation to use to train the model.
cursor: DBcursor, optional
	Vertica DB cursor.
n_cluster: int, optional
	Tuple representing the number of cluster to start with and to end with.
	It can also be customized list with the different K to test.
init: str/list, optional
	The method to use to find the initial cluster centers.
		kmeanspp : Use the KMeans++ method to initialize the centers.
		random   : The initial centers
	It can be also a list with the initial cluster centers to use.
max_iter: int, optional
	The maximum number of iterations the algorithm performs.
tol: float, optional
	Determines whether the algorithm has converged. The algorithm is considered 
	converged after no center has moved more than a distance of 'tol' from the 
	previous iteration.
elbow_score_stop: float, optional
	Stops the Parameters Search when this Elbow score is reached.

Returns
-------
int
	the KMeans K
	"""
    check_types([
        (
            "X",
            X,
            [list],
        ),
        (
            "input_relation",
            input_relation,
            [str],
        ),
        (
            "n_cluster",
            n_cluster,
            [list],
        ),
        (
            "init",
            init,
            ["kmeanspp", "random"],
        ),
        (
            "max_iter",
            max_iter,
            [int, float],
        ),
        (
            "tol",
            tol,
            [int, float],
        ),
        (
            "elbow_score_stop",
            elbow_score_stop,
            [int, float],
        ),
    ])

    from verticapy.learn.cluster import KMeans

    if not (cursor):
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    if not (isinstance(n_cluster, Iterable)):
        L = range(n_cluster[0], n_cluster[1])
    else:
        L = n_cluster
        L.sort()
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = "".join(ch for ch in relation if ch.isalnum())
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}.__vpython_kmeans_tmp_model_{}__".format(
                schema, relation_alpha))
        model = KMeans(
            "{}.__vpython_kmeans_tmp_model_{}__".format(
                schema, relation_alpha),
            cursor,
            i,
            init,
            max_iter,
            tol,
        )
        model.fit(input_relation, X)
        score = model.metrics.values["value"][3]
        if score > elbow_score_stop:
            return i
        score_prev = score
    if conn:
        conn.close()
    print(
        "\u26A0 The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(i, score))
    return i
def elbow(
    X: list,
    input_relation: str,
    cursor=None,
    n_cluster=(1, 15),
    init="kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    ax=None,
):
    """
---------------------------------------------------------------------------
Draws an Elbow Curve.

Parameters
----------
X: list
    List of the predictor columns.
input_relation: str
    Relation to use to train the model.
cursor: DBcursor, optional
    Vertica DB cursor.
n_cluster: int, optional
    Tuple representing the number of cluster to start with and to end with.
    It can also be customized list with the different K to test.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : The initial centers
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is considered 
    converged after no center has moved more than a distance of 'tol' from the 
    previous iteration.
ax: Matplotlib axes object, optional
    The axes to plot on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        (
            "X",
            X,
            [list],
        ),
        (
            "input_relation",
            input_relation,
            [str],
        ),
        (
            "n_cluster",
            n_cluster,
            [list],
        ),
        (
            "init",
            init,
            ["kmeanspp", "random"],
        ),
        (
            "max_iter",
            max_iter,
            [int, float],
        ),
        (
            "tol",
            tol,
            [int, float],
        ),
    ])
    if not (cursor):
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    version(cursor=cursor, condition=[8, 0, 0])
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = "".join(ch for ch in relation if ch.isalnum())
    all_within_cluster_SS = []
    if isinstance(n_cluster, tuple):
        L = [i for i in range(n_cluster[0], n_cluster[1])]
    else:
        L = n_cluster
        L.sort()
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}.VERTICAPY_KMEANS_TMP_{}".format(
                schema, relation_alpha))
        from verticapy.learn.cluster import KMeans

        model = KMeans(
            "{}.VERTICAPY_KMEANS_TMP_{}".format(schema, relation_alpha),
            cursor,
            i,
            init,
            max_iter,
            tol,
        )
        model.fit(input_relation, X)
        all_within_cluster_SS += [float(model.metrics_.values["value"][3])]
        model.drop()
    if conn:
        conn.close()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_facecolor("#F5F5F5")
    ax.grid()
    ax.plot(L, all_within_cluster_SS, marker="s", color="#FE5016")
    ax.set_title("Elbow Curve")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Between-Cluster SS / Total SS")
    values = {"index": L, "Within-Cluster SS": all_within_cluster_SS}
    return tablesample(values=values)