Beispiel #1
0
def durbin_watson(
    vdf: vDataFrame, eps: str, ts: str, by: list = [],
):
    """
---------------------------------------------------------------------------
Durbin Watson test (residuals autocorrelation).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.

Returns
-------
float
    Durbin Watson statistic
    """
    check_types(
        [
            ("ts", ts, [str],),
            ("eps", eps, [str],),
            ("by", by, [list],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps] + [ts] + by, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    query = "(SELECT et, LAG(et) OVER({}ORDER BY {}) AS lag_et FROM (SELECT {} AS et, {}{} FROM {}) VERTICAPY_SUBTABLE) VERTICAPY_SUBTABLE".format(
        "PARTITION BY {} ".format(", ".join(by)) if (by) else "",
        ts,
        eps,
        ts,
        (", " + ", ".join(by)) if (by) else "",
        vdf.__genSQL__(),
    )
    vdf.__executeSQL__(
        "SELECT SUM(POWER(et - lag_et, 2)) / SUM(POWER(et, 2)) FROM {}".format(query),
        title="Computes the Durbin Watson d.",
    )
    d = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    return d
Beispiel #2
0
def mkt(vdf: vDataFrame, column: str, ts: str, alpha: float = 0.05):
    """
---------------------------------------------------------------------------
Mann Kendall test (Time Series trend).

\u26A0 Warning : This Test is computationally expensive. It is using a CROSS 
                 JOIN during the computation. The complexity is O(n * k), n 
                 being the total count of the vDataFrame and k the number
                 of rows to use to do the test.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Input vcolumn to test.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
alpha: float, optional
    Significance Level. Probability to accept H0.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("alpha", alpha, [int, float],),
            ("vdf", vdf, [vDataFrame,],),
        ],
    )
    columns_check([column, ts], vdf)
    column = vdf_columns_names([column], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    table = "(SELECT {}, {} FROM {})".format(column, ts, vdf.__genSQL__())
    query = "SELECT SUM(SIGN(y.{} - x.{})) FROM {} x CROSS JOIN {} y WHERE y.{} > x.{}".format(
        column, column, table, table, ts, ts
    )
    vdf.__executeSQL__(query, title="Computes the Mann Kendall S.")
    S = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    try:
        S = float(S)
    except:
        S = None
    n = vdf[column].count()
    query = "SELECT SQRT(({} * ({} - 1) * (2 * {} + 5) - SUM(row * (row - 1) * (2 * row + 5))) / 18) FROM (SELECT row FROM (SELECT ROW_NUMBER() OVER (PARTITION BY {}) AS row FROM {}) VERTICAPY_SUBTABLE GROUP BY row) VERTICAPY_SUBTABLE".format(
        n, n, n, column, vdf.__genSQL__()
    )
    vdf.__executeSQL__(query, title="Computes the Mann Kendall S standard deviation.")
    STDS = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    try:
        STDS = float(STDS)
    except:
        STDS = None
    if STDS in (None, 0) or S == None:
        return None
    if S > 0:
        ZMK = (S - 1) / STDS
        trend = "increasing"
    elif S < 0:
        ZMK = (S + 1) / STDS
        trend = "decreasing"
    else:
        ZMK = 0
        trend = "no trend"
    pvalue = 2 * norm.sf(abs(ZMK))
    result = (
        True
        if (ZMK <= 0 and pvalue < alpha) or (ZMK >= 0 and pvalue < alpha)
        else False
    )
    if not (result):
        trend = "no trend"
    result = tablesample(
        {
            "index": [
                "Mann Kendall Test Statistic",
                "S",
                "STDS",
                "p_value",
                "Monotonic Trend",
                "Trend",
            ],
            "value": [ZMK, S, STDS, pvalue, result, trend],
        }
    )
    return result