Beispiel #1
0
def het_white(
    vdf: vDataFrame, eps: str, X: list,
):
    """
---------------------------------------------------------------------------
White’s Lagrange Multiplier Test for heteroscedasticity.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
X: str
    Exogenous Variables to test the heteroscedasticity on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)
    X_0 = ["1"] + X
    variables = []
    variables_names = []
    for i in range(len(X_0)):
        for j in range(i, len(X_0)):
            if i != 0 or j != 0:
                variables += ["{} * {} AS var_{}_{}".format(X_0[i], X_0[j], i, j)]
                variables_names += ["var_{}_{}".format(i, j)]
    query = "(SELECT {}, POWER({}, 2) AS VERTICAPY_TEMP_eps2 FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(variables), eps, vdf.__genSQL__()
    )
    vdf_white = vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    try:
        model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
        R2 = model.score("r2")
        model.drop()
    except:
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    if len(X) > 1:
        k = 2 * len(X) + math.factorial(len(X)) / 2 / (math.factorial(len(X) - 2))
    else:
        k = 1
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
Beispiel #2
0
def het_breuschpagan(
    vdf: vDataFrame, eps: str, X: list,
):
    """
---------------------------------------------------------------------------
Breusch-Pagan test for heteroscedasticity.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
X: list
    Exogenous Variables to test the heteroscedasticity on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    vdf_copy = vdf.copy()
    vdf_copy["VERTICAPY_TEMP_eps2"] = vdf_copy[eps] ** 2
    try:
        model.fit(vdf_copy, X, "VERTICAPY_TEMP_eps2")
        R2 = model.score("r2")
        model.drop()
    except:
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_copy, X, "VERTICAPY_TEMP_eps2")
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    k = len(X)
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
Beispiel #3
0
def adfuller(
    vdf: vDataFrame,
    column: str,
    ts: str,
    by: list = [],
    p: int = 1,
    with_trend: bool = False,
    regresults: bool = False,
):
    """
---------------------------------------------------------------------------
Augmented Dickey Fuller test (Time Series stationarity).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Input vcolumn to test.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.
p: int, optional
    Number of lags to consider in the test.
with_trend: bool, optional
    Adds a trend in the Regression.
regresults: bool, optional
    If True, the full regression results are returned.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """

    def critical_value(alpha, N, with_trend):
        if not (with_trend):
            if N <= 25:
                if alpha == 0.01:
                    return -3.75
                elif alpha == 0.10:
                    return -2.62
                elif alpha == 0.025:
                    return -3.33
                else:
                    return -3.00
            elif N <= 50:
                if alpha == 0.01:
                    return -3.58
                elif alpha == 0.10:
                    return -2.60
                elif alpha == 0.025:
                    return -3.22
                else:
                    return -2.93
            elif N <= 100:
                if alpha == 0.01:
                    return -3.51
                elif alpha == 0.10:
                    return -2.58
                elif alpha == 0.025:
                    return -3.17
                else:
                    return -2.89
            elif N <= 250:
                if alpha == 0.01:
                    return -3.46
                elif alpha == 0.10:
                    return -2.57
                elif alpha == 0.025:
                    return -3.14
                else:
                    return -2.88
            elif N <= 500:
                if alpha == 0.01:
                    return -3.44
                elif alpha == 0.10:
                    return -2.57
                elif alpha == 0.025:
                    return -3.13
                else:
                    return -2.87
            else:
                if alpha == 0.01:
                    return -3.43
                elif alpha == 0.10:
                    return -2.57
                elif alpha == 0.025:
                    return -3.12
                else:
                    return -2.86
        else:
            if N <= 25:
                if alpha == 0.01:
                    return -4.38
                elif alpha == 0.10:
                    return -3.24
                elif alpha == 0.025:
                    return -3.95
                else:
                    return -3.60
            elif N <= 50:
                if alpha == 0.01:
                    return -4.15
                elif alpha == 0.10:
                    return -3.18
                elif alpha == 0.025:
                    return -3.80
                else:
                    return -3.50
            elif N <= 100:
                if alpha == 0.01:
                    return -4.04
                elif alpha == 0.10:
                    return -3.15
                elif alpha == 0.025:
                    return -3.73
                else:
                    return -5.45
            elif N <= 250:
                if alpha == 0.01:
                    return -3.99
                elif alpha == 0.10:
                    return -3.13
                elif alpha == 0.025:
                    return -3.69
                else:
                    return -3.43
            elif N <= 500:
                if alpha == 0.01:
                    return 3.98
                elif alpha == 0.10:
                    return -3.13
                elif alpha == 0.025:
                    return -3.68
                else:
                    return -3.42
            else:
                if alpha == 0.01:
                    return -3.96
                elif alpha == 0.10:
                    return -3.12
                elif alpha == 0.025:
                    return -3.66
                else:
                    return -3.41

    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("p", p, [int, float],),
            ("by", by, [list],),
            ("with_trend", with_trend, [bool],),
            ("regresults", regresults, [bool],),
            ("vdf", vdf, [vDataFrame,],),
        ],
    )
    columns_check([ts, column] + by, vdf)
    ts = vdf_columns_names([ts], vdf)[0]
    column = vdf_columns_names([column], vdf)[0]
    by = vdf_columns_names(by, vdf)
    schema = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema):
        schema = "public"
    name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        schema, gen_name([column]).upper()
    )
    relation_name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_VIEW_{}".format(
        schema, gen_name([column]).upper()
    )
    try:
        vdf._VERTICAPY_VARIABLES_["cursor"].execute(
            "DROP MODEL IF EXISTS {}".format(name)
        )
        vdf._VERTICAPY_VARIABLES_["cursor"].execute(
            "DROP VIEW IF EXISTS {}".format(relation_name)
        )
    except:
        pass
    lag = [
        "LAG({}, 1) OVER ({}ORDER BY {}) AS lag1".format(
            column, "PARTITION BY {}".format(", ".join(by)) if (by) else "", ts
        )
    ]
    lag += [
        "LAG({}, {}) OVER ({}ORDER BY {}) - LAG({}, {}) OVER ({}ORDER BY {}) AS delta{}".format(
            column,
            i,
            "PARTITION BY {}".format(", ".join(by)) if (by) else "",
            ts,
            column,
            i + 1,
            "PARTITION BY {}".format(", ".join(by)) if (by) else "",
            ts,
            i,
        )
        for i in range(1, p + 1)
    ]
    lag += [
        "{} - LAG({}, 1) OVER ({}ORDER BY {}) AS delta".format(
            column, column, "PARTITION BY {}".format(", ".join(by)) if (by) else "", ts
        )
    ]
    query = "CREATE VIEW {} AS SELECT {}, {} AS ts FROM {}".format(
        relation_name,
        ", ".join(lag),
        "TIMESTAMPDIFF(SECOND, {}, MIN({}) OVER ())".format(ts, ts)
        if vdf[ts].isdate()
        else ts,
        vdf.__genSQL__(),
    )
    vdf._VERTICAPY_VARIABLES_["cursor"].execute(query)
    model = LinearRegression(
        name, vdf._VERTICAPY_VARIABLES_["cursor"], solver="Newton", max_iter=1000
    )
    predictors = ["lag1"] + ["delta{}".format(i) for i in range(1, p + 1)]
    if with_trend:
        predictors += ["ts"]
    model.fit(
        relation_name, predictors, "delta",
    )
    coef = model.coef_
    vdf._VERTICAPY_VARIABLES_["cursor"].execute("DROP MODEL IF EXISTS {}".format(name))
    vdf._VERTICAPY_VARIABLES_["cursor"].execute(
        "DROP VIEW IF EXISTS {}".format(relation_name)
    )
    if regresults:
        return coef
    coef = coef.transpose()
    DF = coef.values["lag1"][0] / (max(coef.values["lag1"][1], 1e-99))
    p_value = coef.values["lag1"][3]
    count = vdf.shape()[0]
    result = tablesample(
        {
            "index": [
                "ADF Test Statistic",
                "p_value",
                "# Lags used",
                "# Observations Used",
                "Critical Value (1%)",
                "Critical Value (2.5%)",
                "Critical Value (5%)",
                "Critical Value (10%)",
                "Stationarity (alpha = 1%)",
            ],
            "value": [
                DF,
                p_value,
                p,
                count,
                critical_value(0.01, count, with_trend),
                critical_value(0.025, count, with_trend),
                critical_value(0.05, count, with_trend),
                critical_value(0.10, count, with_trend),
                DF < critical_value(0.01, count, with_trend) and p_value < 0.01,
            ],
        }
    )
    return result
Beispiel #4
0
def het_arch(
    vdf: vDataFrame, eps: str, ts: str, by: list = [], p: int = 1,
):
    """
---------------------------------------------------------------------------
Engle’s Test for Autoregressive Conditional Heteroscedasticity (ARCH).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.
p: int, optional
    Number of lags to consider in the test.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [
            ("eps", eps, [str],),
            ("ts", ts, [str],),
            ("p", p, [int, float],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps, ts] + by, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    X = []
    X_names = []
    for i in range(0, p + 1):
        X += [
            "LAG(POWER({}, 2), {}) OVER({}ORDER BY {}) AS lag_{}".format(
                eps, i, ("PARTITION BY " + ", ".join(by)) if (by) else "", ts, i
            )
        ]
        X_names += ["lag_{}".format(i)]
    query = "(SELECT {} FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(X), vdf.__genSQL__()
    )
    vdf_lags = vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    try:
        model.fit(vdf_lags, X_names[1:], X_names[0])
        R2 = model.score("r2")
        model.drop()
    except:
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_lags, X_names[1:], X_names[0])
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    k = len(X)
    LM = (n - p) * R2
    lm_pvalue = chi2.sf(LM, p)
    F = (n - 2 * p - 1) * R2 / (1 - R2) / p
    f_pvalue = f.sf(F, p, n - 2 * p - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result