Example #1
0
def classification_report(
    y_true: str = "",
    y_score: list = None,
    input_relation: str = "",
    cursor=None,
    labels: list = None,
    cutoff=None,
    estimator=None,
):
    """
---------------------------------------------------------------------------
Computes a classification report using multiple metrics (AUC, accuracy, PRC 
AUC, F1...). It will consider each category as positive and switch to the 
next one during the computation.

Parameters
----------
y_true: str, optional
	Response column.
y_score: list, optional
	List containing the probability and the prediction.
input_relation: str, optional
	Relation to use to do the scoring. The relation can be a view or a table
	or even a customized relation. For example, you could write:
	"(SELECT ... FROM ...) x" as long as an alias is given at the end of the
	relation.
cursor: DBcursor, optional
	Vertica DB cursor.
labels: list, optional
	List of the response column categories to use.
cutoff: float / list, optional
	Cutoff for which the tested category will be accepted as prediction. 
	In case of multiclass classification, the list will represent the 
	the classes threshold. If it is empty, the best cutoff will be used.
estimator: object, optional
	Estimator to use to compute the classification report.

Returns
-------
tablesample
 	An object containing the result. For more information, see
 	utilities.tablesample.
	"""
    # Normalize the former mutable default arguments ([]) without changing
    # downstream behavior: every use below only tests truthiness or indexes
    # a user-supplied list.
    y_score = [] if y_score is None else y_score
    labels = [] if labels is None else labels
    cutoff = [] if cutoff is None else cutoff
    check_types([
        ("y_true", y_true, [str],),
        ("y_score", y_score, [list],),
        ("input_relation", input_relation, [str],),
        ("labels", labels, [list],),
        ("cutoff", cutoff, [int, float, list],),
    ])
    if estimator:
        # Binary estimators are scored on their positive class only.
        num_classes = len(estimator.classes_)
        labels = labels if (num_classes > 2) else [estimator.classes_[1]]
    else:
        if not (cursor):
            conn = read_auto_connect()
            cursor = conn.cursor()
        else:
            conn = False
            check_cursor(cursor)
        labels = [1] if not (labels) else labels
    values = {
        "index": [
            "auc",
            "prc_auc",
            "accuracy",
            "log_loss",
            "precision",
            "recall",
            "f1_score",
            "mcc",
            "informedness",
            "markedness",
            "csi",
            "cutoff",
        ]
    }
    # One column per class: treat the class as positive, everything else as
    # negative, and compute all the binary metrics from the confusion matrix.
    for idx, elem in enumerate(labels):
        pos_label = elem
        non_pos_label = 0 if (elem == 1) else "Non-{}".format(elem)
        if estimator:
            # Resolve the cutoff for this class: best cutoff, per-class list,
            # single shared value, or scalar.
            if not (cutoff):
                current_cutoff = estimator.score(method="best_cutoff",
                                                 pos_label=pos_label)
            elif isinstance(cutoff, Iterable):
                current_cutoff = cutoff[0] if len(cutoff) == 1 else cutoff[idx]
            else:
                current_cutoff = cutoff
            try:
                matrix = estimator.confusion_matrix(pos_label, current_cutoff)
            except Exception:
                # Some estimators do not accept a cutoff argument.
                matrix = estimator.confusion_matrix(pos_label)
        else:
            y_s, y_p, y_t = (
                y_score[0].format(elem),
                y_score[1],
                "DECODE({}, '{}', 1, 0)".format(y_true, elem),
            )
            # NOTE(review): the matrix is built from the raw y_true column with
            # pos_label, not from the binarized y_t — confirm this is intended.
            matrix = confusion_matrix(y_true, y_p, input_relation, cursor,
                                      pos_label)
        # The matrix may be keyed by label, by int position, or by str digits
        # depending on how it was produced; try each layout in turn.
        try:
            tn, fn, fp, tp = (
                matrix.values[non_pos_label][0],
                matrix.values[non_pos_label][1],
                matrix.values[pos_label][0],
                matrix.values[pos_label][1],
            )
        except Exception:
            try:
                tn, fn, fp, tp = (
                    matrix.values[0][0],
                    matrix.values[0][1],
                    matrix.values[1][0],
                    matrix.values[1][1],
                )
            except Exception:
                tn, fn, fp, tp = (
                    matrix.values["0"][0],
                    matrix.values["0"][1],
                    matrix.values["1"][0],
                    matrix.values["1"][1],
                )
        ppv = tp / (tp + fp) if (tp + fp != 0) else 0  # precision
        tpr = tp / (tp + fn) if (tp + fn != 0) else 0  # recall
        tnr = tn / (tn + fp) if (tn + fp != 0) else 0  # specificity
        npv = tn / (tn + fn) if (tn + fn != 0) else 0  # negative predictive value
        # BUGFIX: F1 is the harmonic mean of precision (ppv) and recall (tpr);
        # the previous version mixed recall with specificity (tnr).
        f1 = 2 * (ppv * tpr) / (ppv + tpr) if (ppv + tpr != 0) else 0
        csi = tp / (tp + fn + fp) if (tp + fn + fp != 0) else 0  # critical success index
        bm = tpr + tnr - 1  # informedness
        mk = ppv + npv - 1  # markedness
        mcc = ((tp * tn - fp * fn) / math.sqrt(
            (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) if
               (tp + fp != 0) and (tp + fn != 0) and (tn + fp != 0) and
               (tn + fn != 0) else 0)
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        if estimator:
            auc_score = estimator.score(pos_label=pos_label, method="auc")
            logloss = estimator.score(pos_label=pos_label, method="log_loss")
            prc_auc_score = estimator.score(pos_label=pos_label, method="prc_auc")
        else:
            auc_score = auc(y_t, y_s, input_relation, cursor, 1)
            prc_auc_score = prc_auc(y_t, y_s, input_relation, cursor, 1)
            y_p = "DECODE({}, '{}', 1, 0)".format(y_p, elem)
            logloss = log_loss(y_t, y_s, input_relation, cursor, 1)
            if not (cutoff):
                current_cutoff = roc_curve(y_t,
                                           y_p,
                                           input_relation,
                                           cursor,
                                           best_threshold=True)
            elif isinstance(cutoff, Iterable):
                current_cutoff = cutoff[0] if len(cutoff) == 1 else cutoff[idx]
            else:
                current_cutoff = cutoff
        # A single class is reported under the generic "value" column name.
        elem = "value" if (len(labels) == 1) else elem
        values[elem] = [
            auc_score,
            prc_auc_score,
            accuracy,
            logloss,
            ppv,
            tpr,
            f1,
            mcc,
            bm,
            mk,
            csi,
            current_cutoff,
        ]
    if not (estimator):
        if conn:
            conn.close()
    return tablesample(values)
Example #2
0
def regression_report(y_true: str,
                      y_score: str,
                      input_relation: str,
                      cursor=None):
    """
---------------------------------------------------------------------------
Builds a regression report gathering several metrics at once (explained
variance, max error, median/mean absolute error, MSE, r2).

Parameters
----------
y_true: str
	Response column.
y_score: str
	Prediction.
input_relation: str
	Relation used for the scoring: a table, a view, or any customized
	relation such as "(SELECT ... FROM ...) x" (an alias is mandatory at
	the end).
cursor: DBcursor, optional
	Vertica DB cursor.

Returns
-------
tablesample
 	An object containing the result. For more information, see
 	utilities.tablesample.
	"""
    check_types([
        ("y_true", y_true, [str],),
        ("y_score", y_score, [str],),
        ("input_relation", input_relation, [str],),
    ])
    if cursor:
        conn = False
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    # Compute every aggregate metric in a single pass over the relation.
    query = ("SELECT 1 - VARIANCE({0} - {1}) / VARIANCE({0}), "
             "MAX(ABS({0} - {1})), "
             "APPROXIMATE_MEDIAN(ABS({0} - {1})), AVG(ABS({0} - {1})), "
             "AVG(POW({0} - {1}, 2)) FROM {2}").format(y_true, y_score,
                                                       input_relation)
    # r2 is delegated to the dedicated scorer (runs its own query first).
    r2 = r2_score(y_true, y_score, input_relation, cursor)
    metric_names = [
        "explained_variance",
        "max_error",
        "median_absolute_error",
        "mean_absolute_error",
        "mean_squared_error",
        "r2",
    ]
    cursor.execute(query)
    row = cursor.fetchone()
    if conn:
        conn.close()
    return tablesample({"index": metric_names, "value": list(row) + [r2]})
Example #3
0
def accuracy_score(y_true: str,
                   y_score: str,
                   input_relation: str,
                   cursor=None,
                   pos_label=1):
    """
---------------------------------------------------------------------------
Computes the Accuracy Score.

Parameters
----------
y_true: str
	Response column.
y_score: str
	Prediction.
input_relation: str
	Relation to use to do the scoring. The relation can be a view or a table
	or even a customized relation. For example, you could write:
	"(SELECT ... FROM ...) x" as long as an alias is given at the end of the
	relation.
cursor: DBcursor, optional
	Vertica DB cursor.
pos_label: int/float/str, optional
	Label to use to identify the positive class. If pos_label is NULL then the
	global accuracy will be computed.

Returns
-------
float
	score
	"""
    check_types([
        ("y_true", y_true, [str],),
        ("y_score", y_score, [str],),
        ("input_relation", input_relation, [str],),
    ])
    if not (cursor):
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    # Idiom fix: identity comparison with None ("is not"), not "!=".
    if pos_label is not None:
        # Binary accuracy derived from the 2x2 confusion matrix.
        matrix = confusion_matrix(y_true, y_score, input_relation, cursor,
                                  pos_label)
        non_pos_label = 0 if (pos_label == 1) else "Non-{}".format(pos_label)
        tn, fn, fp, tp = (
            matrix.values[non_pos_label][0],
            matrix.values[non_pos_label][1],
            matrix.values[pos_label][0],
            matrix.values[pos_label][1],
        )
        acc = (tp + tn) / (tp + tn + fn + fp)
    else:
        # Global (multiclass) accuracy computed directly in SQL.
        try:
            query = "SELECT AVG(CASE WHEN {} = {} THEN 1 ELSE 0 END) AS accuracy FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL"
            query = query.format(y_true, y_score, input_relation, y_true,
                                 y_score)
            cursor.execute(query)
        except Exception:
            # Fallback: the two columns have incompatible types, compare
            # their varchar representations instead.
            query = "SELECT AVG(CASE WHEN {}::varchar = {}::varchar THEN 1 ELSE 0 END) AS accuracy FROM {} WHERE {} IS NOT NULL AND {} IS NOT NULL"
            query = query.format(y_true, y_score, input_relation, y_true,
                                 y_score)
            cursor.execute(query)
        acc = cursor.fetchone()[0]
    if conn:
        conn.close()
    return acc
Example #4
0
def specificity_score(y_true: str,
                      y_score: str,
                      input_relation: str,
                      cursor=None,
                      pos_label=1):
    """
---------------------------------------------------------------------------
Computes the Specificity Score (true negative rate).

Parameters
----------
y_true: str
	Response column.
y_score: str
	Prediction.
input_relation: str
	Relation used for the scoring: a table, a view, or any customized
	relation such as "(SELECT ... FROM ...) x" (an alias is mandatory at
	the end).
cursor: DBcursor, optional
	Vertica DB cursor.
pos_label: int/float/str, optional
	Class of the response column treated as the positive one.

Returns
-------
float
	score
	"""
    check_types([
        ("y_true", y_true, [str],),
        ("y_score", y_score, [str],),
        ("input_relation", input_relation, [str],),
    ])
    if cursor:
        conn = False
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    cm = confusion_matrix(y_true, y_score, input_relation, cursor, pos_label)
    if conn:
        conn.close()
    neg_label = 0 if (pos_label == 1) else "Non-{}".format(pos_label)
    # Unpack the 2x2 matrix; only tn and fp feed the specificity formula.
    tn, fn = cm.values[neg_label][0], cm.values[neg_label][1]
    fp, tp = cm.values[pos_label][0], cm.values[pos_label][1]
    denominator = tn + fp
    return tn / denominator if denominator != 0 else 0
Example #5
0
def quantile_error(q: float,
                   y_true: str,
                   y_score: str,
                   input_relation: str,
                   cursor=None):
    """
---------------------------------------------------------------------------
Computes the requested quantile of the absolute error.

Parameters
----------
q: float
    Input Quantile
y_true: str
    Response column.
y_score: str
    Prediction.
input_relation: str
    Relation used for the scoring: a table, a view, or any customized
    relation such as "(SELECT ... FROM ...) x" (an alias is mandatory at
    the end).
cursor: DBcursor, optional
    Vertica DB cursor.

Returns
-------
float
    score
    """
    check_types([
        ("q", q, [int, float],),
        ("y_true", y_true, [str],),
        ("y_score", y_score, [str],),
        ("input_relation", input_relation, [str],),
    ])
    if cursor:
        conn = False
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    sql = ("SELECT APPROXIMATE_PERCENTILE(ABS({} - {}) USING PARAMETERS "
           "percentile = {}) FROM {}").format(y_true, y_score, q,
                                              input_relation)
    cursor.execute(sql)
    score = cursor.fetchone()[0]
    if conn:
        conn.close()
    return score
def lift_chart(
    y_true: str,
    y_score: str,
    input_relation: str,
    cursor=None,
    pos_label=1,
    nbins: int = 1000,
    ax=None,
):
    """
---------------------------------------------------------------------------
Draws the Lift Chart.

Parameters
----------
y_true: str
    Response column.
y_score: str
    Prediction Probability.
input_relation: str
    Relation used for the scoring: a table, a view, or any customized
    relation such as "(SELECT ... FROM ...) x" (an alias is mandatory at
    the end).
cursor: DBcursor, optional
    Vertica DB cursor.
pos_label: int/float/str, optional
    Class of the response column treated as the positive one.
nbins: int, optional
    Curve number of bins.
ax: Matplotlib axes object, optional
    The axes to plot on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        ("y_true", y_true, [str],),
        ("y_score", y_score, [str],),
        ("input_relation", input_relation, [str],),
        ("nbins", nbins, [int, float],),
    ])
    if cursor:
        conn = False
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    version(cursor=cursor, condition=[8, 0, 0])
    # Delegate the binning to Vertica's LIFT_TABLE analytic function.
    sql = ("SELECT LIFT_TABLE(obs, prob USING PARAMETERS num_bins = {}) "
           "OVER() FROM (SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) "
           "AS obs, {}::float AS prob FROM {}) AS prediction_output").format(
               nbins, y_true, pos_label, y_score, input_relation)
    cursor.execute(sql)
    rows = cursor.fetchall()
    if conn:
        conn.close()
    decision_boundary = [row[0] for row in rows]
    positive_prediction_ratio = [row[1] for row in rows]
    lift = [row[2] for row in rows]
    decision_boundary.reverse()
    if not (ax):
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_facecolor("#F5F5F5")
    ax.set_xlabel("Cumulative Data Fraction")
    # Lift curve first, capture-rate curve on top of it.
    ax.plot(decision_boundary, lift, color="#FE5016")
    ax.plot(decision_boundary, positive_prediction_ratio, color="#444444")
    ax.set_title("Lift Table")
    ax.set_axisbelow(True)
    ax.grid()
    lift_patch = mpatches.Patch(color="#FE5016", label="Cumulative Lift")
    capture_patch = mpatches.Patch(color="#444444",
                                   label="Cumulative Capture Rate")
    ax.legend(handles=[lift_patch, capture_patch])
    return tablesample(values={
        "decision_boundary": decision_boundary,
        "positive_prediction_ratio": positive_prediction_ratio,
        "lift": lift,
    }, )
Example #7
0
def multilabel_confusion_matrix(y_true: str,
                                y_score: str,
                                input_relation: str,
                                labels: list,
                                cursor=None):
    """
---------------------------------------------------------------------------
Computes the Multi Label Confusion Matrix.

Parameters
----------
y_true: str
	Response column.
y_score: str
	Prediction.
input_relation: str
	Relation used for the scoring: a table, a view, or any customized
	relation such as "(SELECT ... FROM ...) x" (an alias is mandatory at
	the end).
labels: list
	List of the response column categories.
cursor: DBcursor, optional
	Vertica DB cursor.

Returns
-------
tablesample
 	An object containing the result. For more information, see
 	utilities.tablesample.
	"""
    check_types([
        ("y_true", y_true, [str],),
        ("y_score", y_score, [str],),
        ("input_relation", input_relation, [str],),
        ("labels", labels, [list],),
    ])
    if cursor:
        conn = False
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    version(cursor=cursor, condition=[8, 0, 0])
    # DECODE mapping shared by both columns: label -> its index.
    encoding = "".join(", '{}', {}".format(label, idx)
                       for idx, label in enumerate(labels))
    query = ("SELECT CONFUSION_MATRIX(obs, response USING PARAMETERS "
             "num_classes = {}) OVER() FROM (SELECT DECODE({}").format(
                 str(len(labels)), y_true)
    query += encoding
    query += ") AS obs, DECODE({}".format(y_score)
    query += encoding
    query += ") AS response FROM {}) VERTICAPY_SUBTABLE".format(input_relation)
    result = to_tablesample(query, cursor)
    if conn:
        conn.close()
    del result.values["comment"]
    # Relabel the axes with the original categories via a double transpose.
    result = result.transpose()
    result.values["actual_class"] = labels
    result = result.transpose()
    relabeled = {"index": labels}
    for column in result.values:
        if column != "actual_class":
            relabeled[column] = result.values[column]
    result.values = relabeled
    return result
Example #8
0
def load_winequality(cursor=None, schema: str = "public", name: str = "winequality"):
    """
---------------------------------------------------------------------------
Ingests the winequality dataset in the Vertica DB (Dataset ideal for Regression
and Classification). When a relation with the same schema and name already
exists, the function simply wraps it in a vDataFrame.

Parameters
----------
cursor: DBcursor, optional
	Vertica DB cursor. 
schema: str, optional
	Schema of the new relation. The default schema is public.
name: str, optional
	Name of the new relation.

Returns
-------
vDataFrame
	the winequality vDataFrame.

See Also
--------
load_amazon       : Ingests the amazon dataset in the Vertica DB.
	(Time Series / Regression).
load_commodities  : Ingests the commodities dataset in the Vertica DB.
    (Time Series / Regression).
load_iris         : Ingests the iris dataset in the Vertica DB.
	(Clustering / Classification).
load_market       : Ingests the market dataset in the Vertica DB.
	(Basic Data Exploration).
load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
	(Time Series / Regression).
load_titanic      : Ingests the titanic dataset in the Vertica DB.
	(Classification).
	"""
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if cursor:
        check_cursor(cursor)
    else:
        cursor = read_auto_connect().cursor()
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    except:
        # The relation does not exist yet: create it and load the shipped CSV.
        cursor.execute(
            'CREATE TABLE {}.{}("fixed_acidity" Numeric(6,3), "volatile_acidity" Numeric(7,4), "citric_acid" Numeric(6,3), "residual_sugar" Numeric(7,3), "chlorides" Float, "free_sulfur_dioxide" Numeric(7,2), "total_sulfur_dioxide" Numeric(7,2), "density" Float, "pH" Numeric(6,3), "sulphates" Numeric(6,3), "alcohol" Float, "quality" Integer, "good" Integer, "color" Varchar(20));'.format(
                str_column(schema), str_column(name)
            )
        )
        try:
            csv_path = os.path.dirname(verticapy.__file__) + "/learn/data/winequality.csv"
            copy_query = 'COPY {}.{}("fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density", "pH", "sulphates", "alcohol", "quality", "good", "color") FROM {} DELIMITER \',\' NULL \'\' ENCLOSED BY \'"\' ESCAPE AS \'\\\' SKIP 1;'.format(
                str_column(schema), str_column(name), "{}"
            )
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native cursor: stream the file through STDIN.
                with open(csv_path, "r") as source:
                    cursor.copy(copy_query.format("STDIN"), source)
            else:
                cursor.execute(copy_query.format("LOCAL '{}'".format(csv_path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except:
            # The load failed: drop the half-created table before re-raising.
            cursor.execute(
                "DROP TABLE {}.{}".format(str_column(schema), str_column(name))
            )
            raise
    return vdf
Example #9
0
def load_amazon(cursor=None, schema: str = "public", name: str = "amazon"):
    """
---------------------------------------------------------------------------
Ingests the amazon dataset in the Vertica DB (Dataset ideal for TS and
Regression). When a relation with the same schema and name already exists,
the function simply wraps it in a vDataFrame.

Parameters
----------
cursor: DBcursor, optional
	Vertica DB cursor. 
schema: str, optional
	Schema of the new relation. The default schema is public.
name: str, optional
	Name of the new relation.

Returns
-------
vDataFrame
	the amazon vDataFrame.

See Also
--------
load_commodities  : Ingests the commodities dataset in the Vertica DB.
    (Time Series / Regression).
load_iris         : Ingests the iris dataset in the Vertica DB.
	(Clustering / Classification).
load_market       : Ingests the market dataset in the Vertica DB.
	(Basic Data Exploration).
load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
	(Time Series / Regression).
load_titanic      : Ingests the titanic dataset in the Vertica DB.
	(Classification).
load_winequality  : Ingests the winequality dataset in the Vertica DB.
	(Regression / Classification).
	"""
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if cursor:
        check_cursor(cursor)
    else:
        cursor = read_auto_connect().cursor()
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    except:
        # The relation does not exist yet: create it and load the shipped CSV.
        cursor.execute(
            'CREATE TABLE {}.{}("date" Date, "state" Varchar(32), "number" Integer);'.format(
                str_column(schema), str_column(name)
            )
        )
        try:
            csv_path = os.path.dirname(verticapy.__file__) + "/learn/data/amazon.csv"
            copy_query = "COPY {}.{}(\"date\", \"state\", \"number\") FROM {} DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;".format(
                str_column(schema), str_column(name), "{}"
            )
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native cursor: stream the file through STDIN.
                with open(csv_path, "r") as source:
                    cursor.copy(copy_query.format("STDIN"), source)
            else:
                cursor.execute(copy_query.format("LOCAL '{}'".format(csv_path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except:
            # The load failed: drop the half-created table before re-raising.
            cursor.execute(
                "DROP TABLE {}.{}".format(str_column(schema), str_column(name))
            )
            raise
    return vdf
def train_test_split(input_relation: str,
                     cursor=None,
                     test_size: float = 0.33,
                     schema_writing: str = ""):
    """
---------------------------------------------------------------------------
Creates a temporary table and 2 views which can be to use to evaluate a model. 
The table will include all the main relation information with a test column 
(boolean) which represents if the data belong to the test or train set.

Parameters
----------
input_relation: str
	Input Relation.
cursor: DBcursor, optional
	Vertica DB cursor.
test_size: float, optional
	Proportion of the test set comparint to the training set.
schema_writing: str, optional
	Schema to use to write the main relation.

Returns
-------
tuple
 	(name of the train view, name of the test view)
	"""
    check_types([
        ("test_size", test_size, [float],),
        ("schema_writing", schema_writing, [str],),
        ("input_relation", input_relation, [str],),
    ])
    if cursor:
        conn = False
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    schema, relation = schema_relation(input_relation)
    if schema_writing:
        schema = schema_writing
    else:
        schema = str_column(schema)
    # Keep only alphanumeric characters so the name is a valid identifier.
    alnum_relation = "".join(ch for ch in relation if ch.isalnum())
    test_name = "{}_{}".format(alnum_relation, int(test_size * 100))
    train_name = "{}_{}".format(alnum_relation, int(100 - test_size * 100))
    split_table = "{}.VERTICAPY_SPLIT_{}".format(schema, alnum_relation)
    # Best-effort cleanup of any leftover split table from a previous run.
    try:
        cursor.execute("DROP TABLE IF EXISTS {}".format(split_table))
    except:
        pass
    cursor.execute("DROP VIEW IF EXISTS {}.VERTICAPY_SPLIT_{}_TEST".format(
        schema, test_name))
    cursor.execute("DROP VIEW IF EXISTS {}.VERTICAPY_SPLIT_{}_TRAIN".format(
        schema, train_name))
    # Materialize the random split flag once, then expose it via two views.
    cursor.execute(
        "CREATE TABLE {} AS SELECT *, (CASE WHEN RANDOM() < {} THEN True ELSE False END) AS test FROM {}".format(
            split_table, test_size, input_relation))
    cursor.execute(
        "CREATE VIEW {}.VERTICAPY_SPLIT_{}_TEST AS SELECT * FROM {} WHERE test".format(
            schema, test_name, split_table))
    cursor.execute(
        "CREATE VIEW {}.VERTICAPY_SPLIT_{}_TRAIN AS SELECT * FROM {} WHERE NOT(test)".format(
            schema, train_name, split_table))
    if conn:
        conn.close()
    return (
        "{}.VERTICAPY_SPLIT_{}_TRAIN".format(schema, train_name),
        "{}.VERTICAPY_SPLIT_{}_TEST".format(schema, test_name),
    )
Example #11
0
def load_titanic(cursor=None, schema: str = "public", name: str = "titanic"):
    """
---------------------------------------------------------------------------
Ingests the titanic dataset in the Vertica DB (Dataset ideal for 
Classification). When a relation with the same schema and name already
exists, the function simply wraps it in a vDataFrame.

Parameters
----------
cursor: DBcursor, optional
	Vertica DB cursor. 
schema: str, optional
	Schema of the new relation. The default schema is public.
name: str, optional
	Name of the new relation.

Returns
-------
vDataFrame
	the titanic vDataFrame.

See Also
--------
load_amazon       : Ingests the amazon dataset in the Vertica DB.
	(Time Series / Regression).
load_commodities  : Ingests the commodities dataset in the Vertica DB.
    (Time Series / Regression).
load_iris         : Ingests the iris dataset in the Vertica DB.
	(Clustering / Classification).
load_market       : Ingests the market dataset in the Vertica DB.
	(Basic Data Exploration).
load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
	(Time Series / Regression).
load_winequality  : Ingests the winequality dataset in the Vertica DB.
	(Regression / Classification).
	"""
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if cursor:
        check_cursor(cursor)
    else:
        cursor = read_auto_connect().cursor()
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    except:
        # The relation does not exist yet: create it and load the shipped CSV.
        cursor.execute(
            'CREATE TABLE {}.{}("pclass" Integer, "survived" Integer, "name" Varchar(164), "sex" Varchar(20), "age" Numeric(6,3), "sibsp" Integer, "parch" Integer, "ticket" Varchar(36), "fare" Numeric(10,5), "cabin" Varchar(30), "embarked" Varchar(20), "boat" Varchar(100), "body" Integer, "home.dest" Varchar(100));'.format(
                str_column(schema), str_column(name)
            )
        )
        try:
            csv_path = os.path.dirname(verticapy.__file__) + "/learn/data/titanic.csv"
            copy_query = 'COPY {}.{}("pclass", "survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest") FROM {} DELIMITER \',\' NULL \'\' ENCLOSED BY \'"\' ESCAPE AS \'\\\' SKIP 1;'.format(
                str_column(schema), str_column(name), "{}"
            )
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native cursor: stream the file through STDIN.
                with open(csv_path, "r") as source:
                    cursor.copy(copy_query.format("STDIN"), source)
            else:
                cursor.execute(copy_query.format("LOCAL '{}'".format(csv_path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except:
            # The load failed: drop the half-created table before re-raising.
            cursor.execute(
                "DROP TABLE {}.{}".format(str_column(schema), str_column(name))
            )
            raise
    return vdf
def roc_curve(
    y_true: str,
    y_score: str,
    input_relation: str,
    cursor=None,
    pos_label=1,
    nbins: int = 1000,
    auc_roc: bool = False,
    best_threshold: bool = False,
    ax=None,
):
    """
---------------------------------------------------------------------------
Draws the ROC Curve.

Parameters
----------
y_true: str
    Response column.
y_score: str
    Prediction Probability.
input_relation: str
    Relation used for the scoring. It can be a view, a table or even a
    customized relation such as "(SELECT ... FROM ...) x", as long as an
    alias is given at the end of the relation.
cursor: DBcursor, optional
    Vertica DB cursor.
pos_label: int/float/str, optional
    Class of the response column to consider as the positive one.
nbins: int, optional
    Curve number of bins.
auc_roc: bool, optional
    If set to True, returns the ROC AUC without drawing the curve.
best_threshold: bool, optional
    If set to True, returns the best threshold without drawing the curve.
    The best threshold is the threshold of the point which is the farthest
    from the random line.
ax: Matplotlib axes object, optional
    The axes to plot on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        ("y_true", y_true, [str],),
        ("y_score", y_score, [str],),
        ("input_relation", input_relation, [str],),
        ("nbins", nbins, [int, float],),
        ("auc_roc", auc_roc, [bool],),
        ("best_threshold", best_threshold, [bool],),
    ])
    conn = False
    if cursor:
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    version(cursor=cursor, condition=[8, 0, 0])
    cursor.execute(
        ("SELECT ROC(obs, prob USING PARAMETERS num_bins = {}) OVER() FROM "
         "(SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, "
         "{}::float AS prob FROM {}) AS prediction_output").format(
            nbins, y_true, pos_label, y_score, input_relation))
    rows = cursor.fetchall()
    if conn:
        conn.close()
    threshold = [row[0] for row in rows]
    false_positive = [row[1] for row in rows]
    true_positive = [row[2] for row in rows]
    # Trapezoidal integration of the curve. The points run from right to
    # left on the x axis, so the accumulated area is negated at the end.
    auc = 0
    for idx in range(len(false_positive) - 1):
        fpr_a, fpr_b = false_positive[idx], false_positive[idx + 1]
        tpr_a, tpr_b = true_positive[idx], true_positive[idx + 1]
        if fpr_b - fpr_a != 0.0:
            slope = (tpr_b - tpr_a) / (fpr_b - fpr_a)
            intercept = tpr_b - slope * fpr_b
            auc = (auc + slope * (fpr_b * fpr_b - fpr_a * fpr_a) / 2
                   + intercept * (fpr_b - fpr_a))
    auc = min(-auc, 1.0)
    if auc_roc:
        return auc
    if best_threshold:
        # Distance to the random (diagonal) line for each curve point;
        # ties are resolved in favour of the largest index, like the
        # original max-over-(value, index) idiom.
        distances = [abs(tpr - fpr)
                     for fpr, tpr in zip(false_positive, true_positive)]
        best_idx = max(range(len(distances)),
                       key=lambda j: (distances[j], j))
        # Clamp the chosen threshold into (0.001, 0.999).
        return min(max(threshold[best_idx], 0.001), 0.999)
    if ax is None:
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_xlabel("False Positive Rate (1-Specificity)")
    ax.set_ylabel("True Positive Rate (Sensitivity)")
    ax.plot(false_positive, true_positive, color="#FE5016")
    ax.plot([0, 1], [0, 1], color="#444444")
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    ax.set_title("ROC Curve\nAUC = " + str(auc))
    ax.set_axisbelow(True)
    ax.grid()
    return tablesample(values={
        "threshold": threshold,
        "false_positive": false_positive,
        "true_positive": true_positive,
    })
def best_k(
    X: list,
    input_relation: str,
    cursor=None,
    n_cluster=(1, 100),
    init="kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    elbow_score_stop: float = 0.8,
):
    """
---------------------------------------------------------------------------
Finds the KMeans K based on a score.

Parameters
----------
X: list
	List of the predictor columns.
input_relation: str
	Relation to use to train the model.
cursor: DBcursor, optional
	Vertica DB cursor.
n_cluster: tuple/list, optional
	Tuple representing the number of cluster to start with and to end with.
	It can also be customized list with the different K to test.
init: str/list, optional
	The method to use to find the initial cluster centers.
		kmeanspp : Use the KMeans++ method to initialize the centers.
		random   : Random initial centers.
	It can be also a list with the initial cluster centers to use.
max_iter: int, optional
	The maximum number of iterations the algorithm performs.
tol: float, optional
	Determines whether the algorithm has converged. The algorithm is considered 
	converged after no center has moved more than a distance of 'tol' from the 
	previous iteration.
elbow_score_stop: float, optional
	Stops the Parameters Search when this Elbow score is reached.

Returns
-------
int
	the KMeans K
	"""
    check_types([
        ("X", X, [list],),
        ("input_relation", input_relation, [str],),
        ("n_cluster", n_cluster, [list],),
        ("init", init, ["kmeanspp", "random"],),
        ("max_iter", max_iter, [int, float],),
        ("tol", tol, [int, float],),
        ("elbow_score_stop", elbow_score_stop, [int, float],),
    ])

    from verticapy.learn.cluster import KMeans

    if not (cursor):
        conn = read_auto_connect()
        cursor = conn.cursor()
    else:
        conn = False
        check_cursor(cursor)
    # BUG FIX: tuples ARE Iterable, so the previous
    # `not isinstance(n_cluster, Iterable)` test sent the default tuple to
    # the list branch, where `n_cluster.sort()` raised AttributeError.
    # Test explicitly for tuple, the same way elbow() does.
    if isinstance(n_cluster, tuple):
        L = range(n_cluster[0], n_cluster[1])
    else:
        L = n_cluster
        L.sort()
    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = "".join(ch for ch in relation if ch.isalnum())
    for i in L:
        cursor.execute(
            "DROP MODEL IF EXISTS {}.__vpython_kmeans_tmp_model_{}__".format(
                schema, relation_alpha))
        model = KMeans(
            "{}.__vpython_kmeans_tmp_model_{}__".format(
                schema, relation_alpha),
            cursor,
            i,
            init,
            max_iter,
            tol,
        )
        model.fit(input_relation, X)
        # BUG FIX: the fitted metrics are exposed as `metrics_`
        # (see elbow()), not `metrics`. Row 3 is the elbow score
        # (Between-Cluster SS / Total SS).
        score = float(model.metrics_.values["value"][3])
        # Drop the temporary model so it doesn't accumulate in the schema
        # (elbow() does the same).
        model.drop()
        if score > elbow_score_stop:
            # BUG FIX: close the auto-opened connection on the early
            # return too, not only after the loop.
            if conn:
                conn.close()
            return i
    if conn:
        conn.close()
    print(
        "\u26A0 The K was not found. The last K (= {}) is returned with an elbow score of {}"
        .format(i, score))
    return i
def prc_curve(
    y_true: str,
    y_score: str,
    input_relation: str,
    cursor=None,
    pos_label=1,
    nbins: int = 1000,
    auc_prc: bool = False,
    ax=None,
):
    """
---------------------------------------------------------------------------
Draws the PRC Curve.

Parameters
----------
y_true: str
    Response column.
y_score: str
    Prediction Probability.
input_relation: str
    Relation used for the scoring. It can be a view, a table or even a
    customized relation such as "(SELECT ... FROM ...) x", as long as an
    alias is given at the end of the relation.
cursor: DBcursor, optional
    Vertica DB cursor.
pos_label: int/float/str, optional
    Class of the response column to consider as the positive one.
nbins: int, optional
    Curve number of bins.
auc_prc: bool, optional
    If set to True, returns the PRC AUC without drawing the curve.
ax: Matplotlib axes object, optional
    The axes to plot on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        ("y_true", y_true, [str],),
        ("y_score", y_score, [str],),
        ("input_relation", input_relation, [str],),
        ("nbins", nbins, [int, float],),
        ("auc_prc", auc_prc, [bool],),
    ])
    conn = False
    if cursor:
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    version(cursor=cursor, condition=[9, 1, 0])
    cursor.execute(
        ("SELECT PRC(obs, prob USING PARAMETERS num_bins = {}) OVER() FROM "
         "(SELECT (CASE WHEN {} = '{}' THEN 1 ELSE 0 END) AS obs, "
         "{}::float AS prob FROM {}) AS prediction_output").format(
            nbins, y_true, pos_label, y_score, input_relation))
    rows = cursor.fetchall()
    if conn:
        conn.close()
    # Anchor the curve with its theoretical end points.
    threshold = [0] + [row[0] for row in rows] + [1]
    recall = [1] + [row[1] for row in rows] + [0]
    precision = [0] + [row[2] for row in rows] + [1]
    # Trapezoidal integration of the curve. Recall decreases along the
    # points, so the accumulated area is negated at the end.
    auc = 0
    for idx in range(len(recall) - 1):
        r_a, r_b = recall[idx], recall[idx + 1]
        p_a, p_b = precision[idx], precision[idx + 1]
        if r_b - r_a != 0.0:
            slope = (p_b - p_a) / (r_b - r_a)
            intercept = p_b - slope * r_b
            auc = (auc + slope * (r_b * r_b - r_a * r_a) / 2
                   + intercept * (r_b - r_a))
    auc = -auc
    if auc_prc:
        return auc
    if ax is None:
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_facecolor("#F5F5F5")
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.plot(recall, precision, color="#FE5016")
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    ax.set_title("PRC Curve\nAUC = " + str(auc))
    ax.set_axisbelow(True)
    ax.grid()
    return tablesample(values={
        "threshold": threshold,
        "recall": recall,
        "precision": precision,
    })
# Example #15 (score: 0)
def confusion_matrix(y_true: str,
                     y_score: str,
                     input_relation: str,
                     cursor=None,
                     pos_label=1):
    """
---------------------------------------------------------------------------
Computes the Confusion Matrix.

Parameters
----------
y_true: str
	Response column.
y_score: str
	Prediction.
input_relation: str
	Relation used for the scoring. It can be a view, a table or even a
	customized relation such as "(SELECT ... FROM ...) x", as long as an
	alias is given at the end of the relation.
cursor: DBcursor, optional
	Vertica DB cursor.
pos_label: int/float/str, optional
	To compute the one dimension Confusion Matrix, one of the response column 
	class has to be the positive one. The parameter 'pos_label' represents 
	this class.

Returns
-------
tablesample
 	An object containing the result. For more information, see
 	utilities.tablesample.
	"""
    check_types([
        ("y_true", y_true, [str],),
        ("y_score", y_score, [str],),
        ("input_relation", input_relation, [str],),
    ])
    conn = False
    if cursor:
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    version(cursor=cursor, condition=[8, 0, 0])
    # Binarize both columns against pos_label with DECODE, then let Vertica
    # build the 2x2 matrix.
    query = (
        "SELECT CONFUSION_MATRIX(obs, response USING PARAMETERS "
        "num_classes = 2) OVER() FROM (SELECT DECODE({0}, '{1}', 1, NULL, "
        "NULL, 0) AS obs, DECODE({2}, '{1}', 1, NULL, NULL, 0) AS response "
        "FROM {3}) VERTICAPY_SUBTABLE"
    ).format(y_true, pos_label, y_score, pos_label and pos_label, input_relation)
    result = to_tablesample(query, cursor)
    if conn:
        conn.close()
    labels = ([0, 1] if pos_label in (1, "1")
              else ["Non-{}".format(pos_label), pos_label])
    # Relabel the rows: transpose, rewrite the class column, transpose back.
    del result.values["comment"]
    result = result.transpose()
    result.values["actual_class"] = labels
    result = result.transpose()
    matrix = {"index": labels}
    matrix.update({col: vals for col, vals in result.values.items()
                   if col != "actual_class"})
    result.values = matrix
    return result
# Example #16 (score: 0)
def log_loss(y_true: str,
             y_score: str,
             input_relation: str,
             cursor=None,
             pos_label=1):
    """
---------------------------------------------------------------------------
Computes the Log Loss.

Parameters
----------
y_true: str
	Response column.
y_score: str
	Prediction Probability.
input_relation: str
	Relation used for the scoring. It can be a view, a table or even a
	customized relation such as "(SELECT ... FROM ...) x", as long as an
	alias is given at the end of the relation.
cursor: DBcursor, optional
	Vertica DB cursor.
pos_label: int/float/str, optional
	To compute the log loss, one of the response column class has to be the 
	positive one. The parameter 'pos_label' represents this class.

Returns
-------
float
	score
	"""
    check_types([
        ("y_true", y_true, [str],),
        ("y_score", y_score, [str],),
        ("input_relation", input_relation, [str],),
    ])
    conn = False
    if cursor:
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    # The 1e-90 term protects LOG against probabilities of exactly 0 or 1.
    cursor.execute(
        "SELECT AVG(CASE WHEN {} = '{}' THEN - LOG({}::float + 1e-90) else - LOG(1 - {}::float + 1e-90) END) FROM {};".format(
            y_true, pos_label, y_score, y_score, input_relation))
    result = cursor.fetchone()[0]
    if conn:
        conn.close()
    return result
# Example #17 (score: 0)
def sql(line, cell=""):
    """
---------------------------------------------------------------------------
IPython magic running SQL queries against the Vertica database.

Parameters
----------
line: str
    In line mode, the query itself. In cell mode, the magic options, given
    as '-option value' pairs:
        -limit   : maximum number of rows to display.
        -ncols   : maximum number of columns to display.
        -percent : if true, displays the percent bar.
        -vdf     : if true, returns a vDataFrame built on the last query.
cell: str, optional
    The cell content: one or several SQL queries separated by ';'.

Returns
-------
tablesample / vDataFrame
    The result of the last query (None if it is not a displayable SELECT).
    """
    from verticapy.connections.connect import read_auto_connect
    from verticapy.utilities import readSQL
    from verticapy.utilities import vdf_from_relation
    from IPython.core.display import HTML, display
    import time
    import re
    import vertica_python
    from verticapy.errors import QueryError

    def str_to_bool(value):
        # BUG FIX: bool("False") is True, so '-percent False' / '-vdf False'
        # used to ENABLE the option. Treat the common false spellings as
        # False instead.
        return value.lower() not in ("false", "0", "")

    version = vertica_python.__version__.split(".")
    version = [int(elem) for elem in version]
    conn = read_auto_connect()
    cursor = conn.cursor()
    queries = line if (not (cell) and (line)) else cell
    options = {
        "limit": 100,
        "columns": 100,
        "percent_bar": False,
        "vdf": False
    }
    # Normalize whitespace so the ';' splitter below only deals with single
    # spaces.
    queries = queries.replace("\t", " ")
    queries = queries.replace("\n", " ")
    queries = re.sub(" +", " ", queries)
    if (cell) and (line):
        # Cell mode: the line holds the '-option value' pairs.
        line = re.sub(" +", " ", line)
        all_options_tmp = line.split(" ")
        all_options = []
        for elem in all_options_tmp:
            if elem != "":
                all_options += [elem]
        n, i, all_options_dict = len(all_options), 0, {}
        while i < n:
            all_options_dict[all_options[i]] = all_options[i + 1]
            i += 2
        for option in all_options_dict:
            if option.lower() == "-limit":
                options["limit"] = int(all_options_dict[option])
            elif option.lower() == "-ncols":
                options["columns"] = int(all_options_dict[option])
            elif option.lower() == "-percent":
                options["percent_bar"] = str_to_bool(all_options_dict[option])
            elif option.lower() == "-vdf":
                options["vdf"] = str_to_bool(all_options_dict[option])
            else:
                print(
                    "\u26A0 Warning : The option '{}' doesn't exist, it was skipped."
                    .format(option))
    # Strip trailing separators, then split on ';' while skipping quoted
    # sections (a ';' inside quotes must not split the query).
    n, i, all_split = len(queries), 0, []
    while i < n and queries[n - i - 1] in (";", " ", "\n"):
        i += 1
    queries = queries[:n - i]
    i, n = 0, n - i
    while i < n:
        if queries[i] == '"':
            i += 1
            while i < n and queries[i] != '"':
                i += 1
        elif queries[i] == "'":
            i += 1
            while i < n and queries[i] != "'":
                i += 1
        elif queries[i] == ";":
            all_split += [i]
        i += 1
    all_split = [0] + all_split + [n]
    m = len(all_split)
    start_time = time.time()
    queries = [queries[all_split[i]:all_split[i + 1]] for i in range(m - 1)]
    n = len(queries)
    for i in range(n):
        query = queries[i]
        while len(query) > 0 and (query[-1] in (";", " ")):
            query = query[0:-1]
        while len(query) > 0 and (query[0] in (";", " ")):
            query = query[1:]
        queries[i] = query
    # Re-attach standalone 'END' statements to the previous query.
    queries_tmp, i = [], 0
    while i < n:
        query = queries[i]
        if (i < n - 1) and (queries[i + 1].lower() == "end"):
            query += "; {}".format(queries[i + 1])
            i += 1
        queries_tmp += [query]
        i += 1
    queries, n = queries_tmp, len(queries_tmp)
    result = None
    for i in range(n):
        query = queries[i]
        query_type = (query.split(" ")[0].upper() if
                      (query.split(" ")[0]) else query.split(" ")[1].upper())
        # Old vertica_python (< 0.11) has no COPY FROM LOCAL support:
        # stream the file through STDIN instead.
        if ((query_type == "COPY") and ("from local" in query.lower())
                and (version[0] == 0) and (version[1] < 11)):
            query = re.split("from local", query, flags=re.IGNORECASE)
            file_name = (query[1].split(" ")[0] if
                         (query[1].split(" ")[0]) else query[1].split(" ")[1])
            query = ("".join(query[0]) + "FROM" +
                     "".join(query[1]).replace(file_name, "STDIN"))
            if (file_name[0] == file_name[-1]) and (file_name[0]
                                                    in ('"', "'")):
                file_name = file_name[1:-1]
            with open(file_name, "r") as fs:
                cursor.copy(query, fs)
        elif (i < n - 1) or ((i == n - 1) and
                             (query_type.lower() != "select")):
            # Intermediate query, or a last query that is not a SELECT:
            # just execute it.
            cursor.execute(query)
            print(query_type)
        else:
            # Last query and it is a SELECT: build the displayable result.
            error = ""
            try:
                if options["vdf"]:
                    result = vdf_from_relation("({}) x".format(query),
                                               cursor=cursor)
                    result.set_display_parameters(
                        rows=options["limit"],
                        columns=options["columns"],
                        percent_bar=options["percent_bar"],
                    )
                else:
                    result = readSQL(
                        query,
                        cursor=cursor,
                        limit=options["limit"],
                        display_ncols=options["columns"],
                        percent_bar=options["percent_bar"],
                    )
            except:
                # Some statements start with SELECT but are not displayable;
                # fall back to a raw execute and print the first cell.
                try:
                    cursor.execute(query)
                    final_result = cursor.fetchone()
                    if final_result:
                        print(final_result[0])
                    else:
                        print(query_type)
                except Exception as e:
                    error = e
            if error:
                raise QueryError(error)
    # Keep the connection open when a vDataFrame is returned: it needs it.
    if not (options["vdf"]):
        conn.close()
    elapsed_time = time.time() - start_time
    display(
        HTML("<div><b>Execution: </b> {}s</div>".format(round(elapsed_time,
                                                              3))))
    return result
def elbow(
    X: list,
    input_relation: str,
    cursor=None,
    n_cluster=(1, 15),
    init="kmeanspp",
    max_iter: int = 50,
    tol: float = 1e-4,
    ax=None,
):
    """
---------------------------------------------------------------------------
Draws an Elbow Curve.

Parameters
----------
X: list
    List of the predictor columns.
input_relation: str
    Relation to use to train the model.
cursor: DBcursor, optional
    Vertica DB cursor.
n_cluster: tuple/list, optional
    Tuple representing the number of cluster to start with and to end with.
    It can also be customized list with the different K to test.
init: str/list, optional
    The method to use to find the initial cluster centers.
        kmeanspp : Use the KMeans++ method to initialize the centers.
        random   : Random initial centers.
    It can be also a list with the initial cluster centers to use.
max_iter: int, optional
    The maximum number of iterations the algorithm performs.
tol: float, optional
    Determines whether the algorithm has converged. The algorithm is considered 
    converged after no center has moved more than a distance of 'tol' from the 
    previous iteration.
ax: Matplotlib axes object, optional
    The axes to plot on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        ("X", X, [list],),
        ("input_relation", input_relation, [str],),
        ("n_cluster", n_cluster, [list],),
        ("init", init, ["kmeanspp", "random"],),
        ("max_iter", max_iter, [int, float],),
        ("tol", tol, [int, float],),
    ])
    conn = False
    if cursor:
        check_cursor(cursor)
    else:
        conn = read_auto_connect()
        cursor = conn.cursor()
    version(cursor=cursor, condition=[8, 0, 0])

    from verticapy.learn.cluster import KMeans

    schema, relation = schema_relation(input_relation)
    schema = str_column(schema)
    relation_alpha = "".join(ch for ch in relation if ch.isalnum())
    if isinstance(n_cluster, tuple):
        L = [i for i in range(n_cluster[0], n_cluster[1])]
    else:
        L = n_cluster
        L.sort()
    # One temporary model per K; each is dropped right after its score is
    # collected.
    tmp_model = "{}.VERTICAPY_KMEANS_TMP_{}".format(schema, relation_alpha)
    all_within_cluster_SS = []
    for k in L:
        cursor.execute("DROP MODEL IF EXISTS {}".format(tmp_model))
        model = KMeans(tmp_model, cursor, k, init, max_iter, tol)
        model.fit(input_relation, X)
        all_within_cluster_SS.append(
            float(model.metrics_.values["value"][3]))
        model.drop()
    if conn:
        conn.close()
    if ax is None:
        fig, ax = plt.subplots()
        if isnotebook():
            fig.set_size_inches(8, 6)
    ax.set_facecolor("#F5F5F5")
    ax.grid()
    ax.plot(L, all_within_cluster_SS, marker="s", color="#FE5016")
    ax.set_title("Elbow Curve")
    ax.set_xlabel("Number of Clusters")
    ax.set_ylabel("Between-Cluster SS / Total SS")
    return tablesample(
        values={"index": L, "Within-Cluster SS": all_within_cluster_SS})