def coordinate_converter(
    vdf: vDataFrame,
    x: str,
    y: str,
    x0: float = 0.0,
    earth_radius: float = 6371,
    reverse: bool = False,
):
    """
---------------------------------------------------------------------------
Converts between geographic coordinates (latitude and longitude) and
Euclidean coordinates (x,y).

Parameters
----------
vdf: vDataFrame
    input vDataFrame.
x: str
    vColumn used as the abscissa (longitude).
y: str
    vColumn used as the ordinate (latitude).
x0: float, optional
    The initial abscissa.
earth_radius: float, optional
    Earth radius in km.
reverse: bool, optional
    If set to True, the Euclidean coordinates are converted to latitude
    and longitude.

Returns
-------
vDataFrame
    result of the transformation.
    """
    check_types(
        [
            ("vdf", vdf, [vDataFrame]),
            ("x", x, [str]),
            ("y", y, [str]),
            ("x0", x0, [int, float]),
            ("earth_radius", earth_radius, [int, float]),
            ("reverse", reverse, [bool]),
        ]
    )
    vdf.are_namecols_in([x, y])

    # Work on a copy so the caller's vDataFrame is left untouched.
    projection = vdf.copy()
    if reverse:
        # Euclidean -> geographic: invert the Mercator-style projection below.
        projection[x] = projection[x] / earth_radius * 180 / st.pi + x0
        projection[y] = (
            (st.atan(st.exp(projection[y] / earth_radius)) - st.pi / 4) / st.pi * 360
        )
    else:
        # Geographic -> Euclidean: scale the longitude offset and apply the
        # log-tangent transform to the latitude.
        projection[x] = earth_radius * ((projection[x] - x0) * st.pi / 180)
        projection[y] = earth_radius * st.ln(
            st.tan(projection[y] * st.pi / 360 + st.pi / 4)
        )
    return projection
def durbin_watson(
    vdf: vDataFrame,
    eps: str,
    ts: str,
    by: list = [],
):
    """
---------------------------------------------------------------------------
Durbin Watson test (residuals autocorrelation).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.

Returns
-------
float
    Durbin Watson statistic
    """
    # Validate parameter types before building any SQL.
    check_types(
        [
            ("ts", ts, [str],),
            ("eps", eps, [str],),
            ("by", by, [list],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps] + [ts] + by, vdf)
    # Normalize the user-supplied names to their in-vDataFrame (quoted) form.
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    # Pair each residual e_t with its predecessor e_{t-1} (per partition,
    # ordered by the timeline) so d can be computed in a single aggregate pass.
    query = "(SELECT et, LAG(et) OVER({}ORDER BY {}) AS lag_et FROM (SELECT {} AS et, {}{} FROM {}) VERTICAPY_SUBTABLE) VERTICAPY_SUBTABLE".format(
        "PARTITION BY {} ".format(", ".join(by)) if (by) else "",
        ts,
        eps,
        ts,
        (", " + ", ".join(by)) if (by) else "",
        vdf.__genSQL__(),
    )
    # d = SUM((e_t - e_{t-1})^2) / SUM(e_t^2)
    vdf.__executeSQL__(
        "SELECT SUM(POWER(et - lag_et, 2)) / SUM(POWER(et, 2)) FROM {}".format(query),
        title="Computes the Durbin Watson d.",
    )
    d = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    return d
def intersect(
    vdf: vDataFrame,
    index: str,
    gid: str,
    g: str = "",
    x: str = "",
    y: str = "",
):
    """
---------------------------------------------------------------------------
Spatially intersects a point or points with a set of polygons.

Parameters
----------
vdf: vDataFrame
    vDataFrame to use to compute the spatial join.
index: str
    Name of the index.
gid: str
    An integer column or integer that uniquely identifies the spatial
    object(s) of g or x and y.
g: str, optional
    A geometry or geography (WGS84) column that contains points. The g
    column can contain only point geometries or geographies.
x: str, optional
    x-coordinate or longitude.
y: str, optional
    y-coordinate or latitude.

Returns
-------
vDataFrame
    object containing the result of the intersection.
    """
    check_types(
        [
            ("vdf", vdf, [vDataFrame],),
            ("gid", gid, [str],),
            ("g", g, [str],),
            ("x", x, [str],),
            ("y", y, [str],),
            ("index", index, [str],),
        ]
    )
    table = vdf.__genSQL__()
    columns_check([gid], vdf)
    # The point source is either a single geometry column or an (x, y) pair;
    # only the STV_Intersect argument list differs between the two cases.
    if g:
        columns_check([g], vdf)
        g = vdf_columns_names([g], vdf)[0]
        intersect_args = f"{gid}, {g}"
    elif x and y:
        columns_check([x, y], vdf)
        x, y = vdf_columns_names([x, y], vdf)
        intersect_args = f"{gid}, {x}, {y}"
    else:
        raise ParameterError("Either 'x' and 'y' or 'g' must not be empty.")
    query = f"(SELECT STV_Intersect({intersect_args} USING PARAMETERS index='{index}') OVER (PARTITION BEST) AS (point_id, polygon_gid) FROM {table}) x"
    return vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
def create_index(
    vdf: vDataFrame,
    gid: str,
    g: str,
    index: str,
    overwrite: bool = False,
    max_mem_mb: int = 256,
    skip_nonindexable_polygons: bool = False,
):
    """
---------------------------------------------------------------------------
Creates a spatial index on a set of polygons to speed up spatial
intersection with a set of points.

Parameters
----------
vdf: vDataFrame
    vDataFrame to use to compute the spatial join.
gid: str
    Name of an integer column that uniquely identifies the polygon.
    The gid cannot be NULL.
g: str
    Name of a geometry or geography (WGS84) column or expression that
    contains polygons and multipolygons. Only polygon and multipolygon
    can be indexed. Other shape types are excluded from the index.
index: str
    Name of the index.
overwrite: bool, optional
    BOOLEAN value that specifies whether to overwrite the index, if an
    index exists.
max_mem_mb: int, optional
    A positive integer that assigns a limit to the amount of memory in
    megabytes that create_index can allocate during index construction.
skip_nonindexable_polygons: bool, optional
    In rare cases, intricate polygons (for instance, with too high
    resolution or anomalous spikes) cannot be indexed. These polygons are
    considered non-indexable. When set to False, non-indexable polygons
    cause the index creation to fail. When set to True, index creation can
    succeed by excluding non-indexable polygons from the index.

Returns
-------
vDataFrame
    object result of the join.
    """
    check_types(
        [
            ("vdf", vdf, [vDataFrame]),
            ("gid", gid, [str]),
            ("index", index, [str]),
            ("g", g, [str]),
            ("overwrite", overwrite, [bool]),
            ("max_mem_mb", max_mem_mb, [int]),
            ("skip_nonindexable_polygons", skip_nonindexable_polygons, [bool]),
        ]
    )
    columns_check([gid, g], vdf)
    gid, g = vdf_columns_names([gid, g], vdf)
    # Booleans/ints render through str(), matching Vertica's parameter syntax.
    query = (
        f"SELECT STV_Create_Index({gid}, {g} USING PARAMETERS index='{index}', "
        f"overwrite={overwrite} , max_mem_mb={max_mem_mb}, "
        f"skip_nonindexable_polygons={skip_nonindexable_polygons}) OVER() "
        f"FROM {vdf.__genSQL__()}"
    )
    return to_tablesample(query, vdf._VERTICAPY_VARIABLES_["cursor"])
def ljungbox(
    vdf: vDataFrame,
    column: str,
    ts: str,
    by: list = [],
    p: int = 1,
    alpha: float = 0.05,
    box_pierce: bool = False,
):
    """
---------------------------------------------------------------------------
Ljung–Box test (whether any of a group of autocorrelations of a time series
are different from zero).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Input vcolumn to test.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.
p: int, optional
    Number of lags to consider in the test.
alpha: float, optional
    Significance Level. Probability to accept H0.
box_pierce: bool
    If set to True, the Box-Pierce statistic will be used.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("by", by, [list],),
            ("p", p, [int, float],),
            ("alpha", alpha, [int, float],),
            ("box_pierce", box_pierce, [bool],),
            ("vdf", vdf, [vDataFrame,],),
        ],
    )
    columns_check([column] + [ts] + by, vdf)
    column = vdf_columns_names([column], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    # Autocorrelations up to lag p, ordered by ts (optionally partitioned).
    acf = vdf.acf(column=column, ts=ts, by=by, p=p, show=False)
    # NOTE(review): for p >= 2, vdf.acf appears to return a tablesample whose
    # "value" list holds the per-lag autocorrelations, and a bare scalar for
    # p == 1 — confirm against vDataFrame.acf.
    if p >= 2:
        acf = acf.values["value"]
    else:
        acf = [acf]
    n = vdf[column].count()
    name = (
        "Ljung–Box Test Statistic" if not (box_pierce) else "Box-Pierce Test Statistic"
    )
    result = tablesample(
        {"index": [], name: [], "p_value": [], "Serial Correlation": []}
    )
    Q = 0
    # Cumulative statistic: for lag k+1,
    #   Ljung-Box:  Q += n * (n + 2) * acf[k]^2 / (n - (k + 1))
    #   Box-Pierce: Q += n * acf[k]^2
    # Q at step k is chi2-distributed with k + 1 degrees of freedom under H0.
    for k in range(p):
        div = n - k - 1 if not (box_pierce) else 1
        mult = n * (n + 2) if not (box_pierce) else n
        Q += mult * acf[k] ** 2 / div
        pvalue = chi2.sf(Q, k + 1)
        result.values["index"] += [k + 1]
        result.values[name] += [Q]
        result.values["p_value"] += [pvalue]
        result.values["Serial Correlation"] += [True if pvalue < alpha else False]
    return result
def het_white(
    vdf: vDataFrame,
    eps: str,
    X: list,
):
    """
---------------------------------------------------------------------------
White’s Lagrange Multiplier Test for heteroscedasticity.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
X: list
    Exogenous Variables to test the heteroscedasticity on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)
    # Auxiliary regressors: all products X_i * X_j (levels, squares and cross
    # terms); "1" stands in for the constant, and the pure constant*constant
    # term (i == j == 0) is skipped.
    X_0 = ["1"] + X
    variables = []
    variables_names = []
    for i in range(len(X_0)):
        for j in range(i, len(X_0)):
            if i != 0 or j != 0:
                variables += ["{} * {} AS var_{}_{}".format(X_0[i], X_0[j], i, j)]
                variables_names += ["var_{}_{}".format(i, j)]
    # Regress the squared residuals on the auxiliary regressors.
    query = "(SELECT {}, POWER({}, 2) AS VERTICAPY_TEMP_eps2 FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(variables), eps, vdf.__genSQL__()
    )
    vdf_white = vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    # Fit with the default solver; on failure retry once with BFGS, dropping
    # the temporary model either way before re-raising.
    try:
        model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
        R2 = model.score("r2")
        model.drop()
    except:
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    # Degrees of freedom: 2m regressors (levels + squares) plus C(m, 2) cross
    # terms; m! / (2 * (m - 2)!) == m * (m - 1) / 2.
    if len(X) > 1:
        k = 2 * len(X) + math.factorial(len(X)) / 2 / (math.factorial(len(X) - 2))
    else:
        k = 1
    # LM statistic n * R2 ~ chi2(k); companion F statistic reported alongside.
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
def het_goldfeldquandt(
    vdf: vDataFrame, y: str, X: list, idx: int = 0, split: float = 0.5
):
    """
---------------------------------------------------------------------------
Goldfeld-Quandt homoscedasticity test.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
y: str
    Response Column.
X: list
    Exogenous Variables.
idx: int, optional
    Column index of variable according to which observations are sorted
    for the split.
split: float, optional
    Float to indicate where to split (Example: 0.5 to split on the median).

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """

    def model_fit(input_relation, X, y, model):
        # Fits the model on each relation in turn and collects the variance
        # of the residuals; the temporary model is dropped between fits.
        var = []
        for vdf_tmp in input_relation:
            model.drop()
            model.fit(vdf_tmp, X, y)
            model.predict(vdf_tmp, name="verticapy_prediction")
            vdf_tmp["residual_0"] = vdf_tmp[y] - vdf_tmp["verticapy_prediction"]
            var += [vdf_tmp["residual_0"].var()]
        model.drop()
        return var

    check_types(
        [
            ("y", y, [str],),
            ("X", X, [list],),
            ("idx", idx, [int, float],),
            ("split", split, [int, float],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([y] + X, vdf)
    y = vdf_columns_names([y], vdf)[0]
    X = vdf_columns_names(X, vdf)
    # Split the data into two halves around the chosen quantile of X[idx].
    # Rows exactly equal to the split value belong to neither half.
    split_value = vdf[X[idx]].quantile(split)
    vdf_0_half = vdf.search(vdf[X[idx]] < split_value)
    vdf_1_half = vdf.search(vdf[X[idx]] > split_value)

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    # Fit with the default solver; on failure retry once with BFGS, dropping
    # the temporary model before re-raising.
    try:
        var0, var1 = model_fit([vdf_0_half, vdf_1_half], X, y, model)
    except:
        try:
            model.set_params({"solver": "bfgs"})
            var0, var1 = model_fit([vdf_0_half, vdf_1_half], X, y, model)
        except:
            model.drop()
            raise
    # F statistic: ratio of the residual variances of the two halves.
    n, m = vdf_0_half.shape()[0], vdf_1_half.shape()[0]
    F = var0 / var1
    f_pvalue = f.sf(F, n, m)
    result = tablesample({"index": ["F Value", "f_p_value",], "value": [F, f_pvalue],})
    return result
def adfuller(
    vdf: vDataFrame,
    column: str,
    ts: str,
    by: list = [],
    p: int = 1,
    with_trend: bool = False,
    regresults: bool = False,
):
    """
---------------------------------------------------------------------------
Augmented Dickey Fuller test (Time Series stationarity).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Input vcolumn to test.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.
p: int, optional
    Number of lags to consider in the test.
with_trend: bool, optional
    Adds a trend in the Regression.
regresults: bool, optional
    If True, the full regression results are returned.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """

    def critical_value(alpha, N, with_trend):
        # Dickey-Fuller critical values (Fuller 1976 tables), keyed by the
        # sample-size upper bound. Each row maps alpha in {0.01, 0.10, 0.025}
        # explicitly; any other alpha falls back to the 5% value, matching the
        # original if/elif/else chains.
        # Fixes two typos in the previous hard-coded chains:
        #   - with trend, N <= 100, 5% level: -3.45 (was -5.45)
        #   - with trend, N <= 500, 1% level: -3.98 (was +3.98)
        no_trend_table = [
            (25, {0.01: -3.75, 0.10: -2.62, 0.025: -3.33}, -3.00),
            (50, {0.01: -3.58, 0.10: -2.60, 0.025: -3.22}, -2.93),
            (100, {0.01: -3.51, 0.10: -2.58, 0.025: -3.17}, -2.89),
            (250, {0.01: -3.46, 0.10: -2.57, 0.025: -3.14}, -2.88),
            (500, {0.01: -3.44, 0.10: -2.57, 0.025: -3.13}, -2.87),
            (float("inf"), {0.01: -3.43, 0.10: -2.57, 0.025: -3.12}, -2.86),
        ]
        trend_table = [
            (25, {0.01: -4.38, 0.10: -3.24, 0.025: -3.95}, -3.60),
            (50, {0.01: -4.15, 0.10: -3.18, 0.025: -3.80}, -3.50),
            (100, {0.01: -4.04, 0.10: -3.15, 0.025: -3.73}, -3.45),
            (250, {0.01: -3.99, 0.10: -3.13, 0.025: -3.69}, -3.43),
            (500, {0.01: -3.98, 0.10: -3.13, 0.025: -3.68}, -3.42),
            (float("inf"), {0.01: -3.96, 0.10: -3.12, 0.025: -3.66}, -3.41),
        ]
        rows = trend_table if with_trend else no_trend_table
        for bound, values, default_5pct in rows:
            if N <= bound:
                return values.get(alpha, default_5pct)

    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("p", p, [int, float],),
            ("by", by, [list],),
            ("with_trend", with_trend, [bool],),
            ("regresults", regresults, [bool],),
            ("vdf", vdf, [vDataFrame,],),
        ],
    )
    columns_check([ts, column] + by, vdf)
    ts = vdf_columns_names([ts], vdf)[0]
    column = vdf_columns_names([column], vdf)[0]
    by = vdf_columns_names(by, vdf)
    schema = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema):
        schema = "public"
    name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        schema, gen_name([column]).upper()
    )
    relation_name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_VIEW_{}".format(
        schema, gen_name([column]).upper()
    )
    # Best-effort cleanup of leftovers from a previous interrupted run.
    try:
        vdf._VERTICAPY_VARIABLES_["cursor"].execute(
            "DROP MODEL IF EXISTS {}".format(name)
        )
        vdf._VERTICAPY_VARIABLES_["cursor"].execute(
            "DROP VIEW IF EXISTS {}".format(relation_name)
        )
    except:
        pass
    # ADF regression terms: lag1 (the level), delta1..deltap (lagged first
    # differences) and delta (the current first difference, the response).
    partition = "PARTITION BY {}".format(", ".join(by)) if (by) else ""
    lag = ["LAG({}, 1) OVER ({}ORDER BY {}) AS lag1".format(column, partition, ts)]
    lag += [
        "LAG({}, {}) OVER ({}ORDER BY {}) - LAG({}, {}) OVER ({}ORDER BY {}) AS delta{}".format(
            column, i, partition, ts, column, i + 1, partition, ts, i
        )
        for i in range(1, p + 1)
    ]
    lag += [
        "{} - LAG({}, 1) OVER ({}ORDER BY {}) AS delta".format(
            column, column, partition, ts
        )
    ]
    # Date timelines are converted to elapsed seconds so 'ts' can enter the
    # regression as a numeric trend term.
    query = "CREATE VIEW {} AS SELECT {}, {} AS ts FROM {}".format(
        relation_name,
        ", ".join(lag),
        "TIMESTAMPDIFF(SECOND, {}, MIN({}) OVER ())".format(ts, ts)
        if vdf[ts].isdate()
        else ts,
        vdf.__genSQL__(),
    )
    vdf._VERTICAPY_VARIABLES_["cursor"].execute(query)

    # Local import, consistent with the other tests in this module.
    from verticapy.learn.linear_model import LinearRegression

    model = LinearRegression(
        name, vdf._VERTICAPY_VARIABLES_["cursor"], solver="Newton", max_iter=1000
    )
    predictors = ["lag1"] + ["delta{}".format(i) for i in range(1, p + 1)]
    if with_trend:
        predictors += ["ts"]
    model.fit(
        relation_name, predictors, "delta",
    )
    coef = model.coef_
    vdf._VERTICAPY_VARIABLES_["cursor"].execute("DROP MODEL IF EXISTS {}".format(name))
    vdf._VERTICAPY_VARIABLES_["cursor"].execute(
        "DROP VIEW IF EXISTS {}".format(relation_name)
    )
    if regresults:
        return coef
    coef = coef.transpose()
    # ADF statistic: coefficient of lag1 over its standard error (guarded
    # against a zero standard error).
    DF = coef.values["lag1"][0] / (max(coef.values["lag1"][1], 1e-99))
    p_value = coef.values["lag1"][3]
    count = vdf.shape()[0]
    result = tablesample(
        {
            "index": [
                "ADF Test Statistic",
                "p_value",
                "# Lags used",
                "# Observations Used",
                "Critical Value (1%)",
                "Critical Value (2.5%)",
                "Critical Value (5%)",
                "Critical Value (10%)",
                "Stationarity (alpha = 1%)",
            ],
            "value": [
                DF,
                p_value,
                p,
                count,
                critical_value(0.01, count, with_trend),
                critical_value(0.025, count, with_trend),
                critical_value(0.05, count, with_trend),
                critical_value(0.10, count, with_trend),
                DF < critical_value(0.01, count, with_trend) and p_value < 0.01,
            ],
        }
    )
    return result
def het_breuschpagan(
    vdf: vDataFrame,
    eps: str,
    X: list,
):
    """
---------------------------------------------------------------------------
Breusch-Pagan test for heteroscedasticity.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
X: list
    Exogenous Variables to test the heteroscedasticity on.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not schema_writing:
        schema_writing = "public"
    session_id = get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    name = f"{schema_writing}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{session_id}"
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    # Regress the squared residuals on the exogenous variables.
    vdf_copy = vdf.copy()
    vdf_copy["VERTICAPY_TEMP_eps2"] = vdf_copy[eps] ** 2
    # First attempt uses the model's default solver; a single retry switches
    # to BFGS. The temporary model is always dropped, and the last failure is
    # re-raised to the caller.
    for attempt in range(2):
        if attempt == 1:
            model.set_params({"solver": "bfgs"})
        try:
            model.fit(vdf_copy, X, "VERTICAPY_TEMP_eps2")
            R2 = model.score("r2")
            model.drop()
            break
        except:
            if attempt == 1:
                model.drop()
                raise
    n = vdf.shape()[0]
    k = len(X)
    # LM statistic n * R2 ~ chi2(k); companion F statistic reported alongside.
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    return tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
def het_arch(
    vdf: vDataFrame,
    eps: str,
    ts: str,
    by: list = [],
    p: int = 1,
):
    """
---------------------------------------------------------------------------
Engle’s Test for Autoregressive Conditional Heteroscedasticity (ARCH).

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
eps: str
    Input residual vcolumn.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.
p: int, optional
    Number of lags to consider in the test.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    # NOTE(review): 'by' is validated by columns_check below but has no entry
    # in check_types, unlike the sibling tests in this module.
    check_types(
        [
            ("eps", eps, [str],),
            ("ts", ts, [str],),
            ("p", p, [int, float],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps, ts] + by, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)
    # Build lag_0..lag_p: the squared residual and its p lags, ordered by ts
    # (optionally partitioned). lag_0 is the response of the auxiliary
    # regression; lag_1..lag_p are its predictors.
    X = []
    X_names = []
    for i in range(0, p + 1):
        X += [
            "LAG(POWER({}, 2), {}) OVER({}ORDER BY {}) AS lag_{}".format(
                eps, i, ("PARTITION BY " + ", ".join(by)) if (by) else "", ts, i
            )
        ]
        X_names += ["lag_{}".format(i)]
    query = "(SELECT {} FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(X), vdf.__genSQL__()
    )
    vdf_lags = vdf_from_relation(query, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not (schema_writing):
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(vdf._VERTICAPY_VARIABLES_["cursor"])
    )
    model = LinearRegression(name, cursor=vdf._VERTICAPY_VARIABLES_["cursor"])
    # Fit with the default solver; on failure retry once with BFGS, dropping
    # the temporary model before re-raising.
    try:
        model.fit(vdf_lags, X_names[1:], X_names[0])
        R2 = model.score("r2")
        model.drop()
    except:
        try:
            model.set_params({"solver": "bfgs"})
            model.fit(vdf_lags, X_names[1:], X_names[0])
            R2 = model.score("r2")
            model.drop()
        except:
            model.drop()
            raise
    n = vdf.shape()[0]
    # NOTE(review): 'k' is assigned but never used — the statistics below use
    # 'p' for the degrees of freedom.
    k = len(X)
    # LM statistic (n - p) * R2 ~ chi2(p); companion F statistic alongside.
    LM = (n - p) * R2
    lm_pvalue = chi2.sf(LM, p)
    F = (n - 2 * p - 1) * R2 / (1 - R2) / p
    f_pvalue = f.sf(F, p, n - 2 * p - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
def seasonal_decompose(
    vdf: vDataFrame,
    column: str,
    ts: str,
    by: list = [],
    period: int = -1,
    polynomial_order: int = 1,
    estimate_seasonality: bool = True,
    rule: Union[str, datetime.timedelta] = None,
    mult: bool = False,
    two_sided: bool = False,
):
    """
---------------------------------------------------------------------------
Performs a seasonal time series decomposition.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Input vcolumn to decompose.
ts: str
    TS (Time Series) vcolumn to use to order the data. It can be of type date
    or a numerical vcolumn.
by: list, optional
    vcolumns used in the partition.
period: int, optional
    Time Series period. It is used to retrieve the seasonality component.
    if period <= 0, the seasonal component will be estimated using ACF. In
    this case, polynomial_order must be greater than 0.
polynomial_order: int, optional
    If greater than 0, the trend will be estimated using a polynomial of
    degree 'polynomial_order'. The parameter 'two_sided' will be ignored.
    If equal to 0, the trend will be estimated using Moving Averages.
estimate_seasonality: bool, optional
    If set to True, the seasonality will be estimated using cosine and sine
    functions.
rule: str / time, optional
    Interval to use to slice the time. For example, '5 minutes' will create
    records separated by '5 minutes' time interval.
mult: bool, optional
    If set to True, the decomposition type will be 'multiplicative'.
    Otherwise, it is 'additive'.
two_sided: bool, optional
    If set to True, a centered moving average is used for the trend
    isolation. Otherwise only past values are used.

Returns
-------
vDataFrame
    object containing (ts, column, TS seasonal part, TS trend, TS noise).
    """
    if isinstance(by, str):
        by = [by]
    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("by", by, [list],),
            ("rule", rule, [str, datetime.timedelta,],),
            ("vdf", vdf, [vDataFrame,],),
            ("period", period, [int,],),
            ("mult", mult, [bool,],),
            ("two_sided", two_sided, [bool,],),
            ("polynomial_order", polynomial_order, [int,],),
            ("estimate_seasonality", estimate_seasonality, [bool,],),
        ],
    )
    assert period > 0 or polynomial_order > 0, ParameterError(
        "Parameters 'polynomial_order' and 'period' can not be both null."
    )
    columns_check([column, ts] + by, vdf)
    ts, column, by = (
        vdf_columns_names([ts], vdf)[0],
        vdf_columns_names([column], vdf)[0],
        vdf_columns_names(by, vdf),
    )
    if rule:
        # BUGFIX: pass 'rule' (the slicing interval) to asfreq — the previous
        # code passed the integer 'period' here.
        vdf_tmp = vdf.asfreq(ts=ts, rule=rule, method={column: "linear"}, by=by)
    else:
        vdf_tmp = vdf[[ts, column]]
    # column is quoted; strip the quotes to derive the output column names.
    trend_name, seasonal_name, epsilon_name = (
        "{}_trend".format(column[1:-1]),
        "{}_seasonal".format(column[1:-1]),
        "{}_epsilon".format(column[1:-1]),
    )
    # 'by' becomes the "PARTITION BY ... " SQL fragment from here on;
    # 'by_tmp' keeps the original column list for the rolling call.
    # BUGFIX: the previous code referenced the undefined name 'self' here.
    by, by_tmp = (
        ""
        if not (by)
        else "PARTITION BY " + ", ".join(vdf_columns_names(by, vdf)) + " ",
        by,
    )
    if polynomial_order <= 0:
        # Trend isolation via a moving average (centered when two_sided).
        if two_sided:
            if period == 1:
                window = (-1, 1)
            else:
                if period % 2 == 0:
                    window = (-period / 2 + 1, period / 2)
                else:
                    window = (int(-period / 2), int(period / 2))
        else:
            if period == 1:
                window = (-2, 0)
            else:
                window = (-period + 1, 0)
        vdf_tmp.rolling("avg", window, column, by_tmp, ts, trend_name)
    else:
        # Trend isolation via a polynomial regression on the row number.
        vdf_poly = vdf_tmp.copy()
        X = []
        for i in range(1, polynomial_order + 1):
            vdf_poly[f"t_{i}"] = f"POWER(ROW_NUMBER() OVER ({by}ORDER BY {ts}), {i})"
            X += [f"t_{i}"]
        schema = vdf_poly._VERTICAPY_VARIABLES_["schema_writing"]
        if not (schema):
            schema = vdf_poly._VERTICAPY_VARIABLES_["schema"]
        if not (schema):
            schema = "public"

        from verticapy.learn.linear_model import LinearRegression

        model = LinearRegression(
            name="{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
                schema, get_session(vdf_poly._VERTICAPY_VARIABLES_["cursor"])
            ),
            cursor=vdf_poly._VERTICAPY_VARIABLES_["cursor"],
            solver="bfgs",
            max_iter=100,
            tol=1e-6,
        )
        model.drop()
        model.fit(vdf_poly, X, column)
        # Rebuild the fitted polynomial as a SQL expression of the row number.
        coefficients = model.coef_["coefficient"]
        coefficients = [str(coefficients[0])] + [
            f"{coefficients[i]} * POWER(ROW_NUMBER() OVER({by}ORDER BY {ts}), {i})"
            if i != 1
            else f"{coefficients[1]} * ROW_NUMBER() OVER({by}ORDER BY {ts})"
            for i in range(1, polynomial_order + 1)
        ]
        vdf_tmp[trend_name] = " + ".join(coefficients)
        model.drop()
    # Detrend: divide (multiplicative) or subtract (additive).
    if mult:
        vdf_tmp[seasonal_name] = f'{column} / NULLIFZERO("{trend_name}")'
    else:
        vdf_tmp[seasonal_name] = vdf_tmp[column] - vdf_tmp[trend_name]
    if period <= 0:
        # Estimate the period from the strongest non-trivial autocorrelation.
        acf = vdf_tmp.acf(
            column=seasonal_name, ts=ts, p=23, acf_type="heatmap", show=False
        )
        period = int(acf["index"][1].split("_")[1])
        if period == 1:
            period = int(acf["index"][2].split("_")[1])
    # Average the detrended series within each phase of the period.
    vdf_tmp["row_number_id"] = f"MOD(ROW_NUMBER() OVER ({by} ORDER BY {ts}), {period})"
    if mult:
        vdf_tmp[
            seasonal_name
        ] = f"AVG({seasonal_name}) OVER (PARTITION BY row_number_id) / NULLIFZERO(AVG({seasonal_name}) OVER ())"
    else:
        vdf_tmp[
            seasonal_name
        ] = f"AVG({seasonal_name}) OVER (PARTITION BY row_number_id) - AVG({seasonal_name}) OVER ()"
    if estimate_seasonality:
        # Smooth the seasonal component with a single-harmonic (cos/sin) fit.
        vdf_seasonality = vdf_tmp.copy()
        vdf_seasonality[
            "t_cos"
        ] = f"COS(2 * PI() * ROW_NUMBER() OVER ({by}ORDER BY {ts}) / {period})"
        vdf_seasonality[
            "t_sin"
        ] = f"SIN(2 * PI() * ROW_NUMBER() OVER ({by}ORDER BY {ts}) / {period})"
        X = [
            "t_cos",
            "t_sin",
        ]
        schema = vdf_seasonality._VERTICAPY_VARIABLES_["schema_writing"]
        if not (schema):
            schema = vdf_seasonality._VERTICAPY_VARIABLES_["schema"]
        if not (schema):
            schema = "public"

        from verticapy.learn.linear_model import LinearRegression

        model = LinearRegression(
            name="{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
                schema, get_session(vdf_seasonality._VERTICAPY_VARIABLES_["cursor"])
            ),
            cursor=vdf_seasonality._VERTICAPY_VARIABLES_["cursor"],
            solver="bfgs",
            max_iter=100,
            tol=1e-6,
        )
        model.drop()
        model.fit(vdf_seasonality, X, seasonal_name)
        coefficients = model.coef_["coefficient"]
        vdf_tmp[seasonal_name] = (
            f"{coefficients[0]} + {coefficients[1]} * COS(2 * PI() * ROW_NUMBER() "
            f"OVER ({by}ORDER BY {ts}) / {period}) + {coefficients[2]} * SIN(2 * PI() "
            f"* ROW_NUMBER() OVER ({by}ORDER BY {ts}) / {period})"
        )
        model.drop()
    # Residual = what trend and seasonality leave unexplained.
    if mult:
        vdf_tmp[
            epsilon_name
        ] = f'{column} / NULLIFZERO("{trend_name}") / NULLIFZERO("{seasonal_name}")'
    else:
        vdf_tmp[epsilon_name] = (
            vdf_tmp[column] - vdf_tmp[trend_name] - vdf_tmp[seasonal_name]
        )
    vdf_tmp["row_number_id"].drop()
    return vdf_tmp
def mkt(vdf: vDataFrame, column: str, ts: str, alpha: float = 0.05):
    """
---------------------------------------------------------------------------
Mann Kendall test (Time Series trend).

\u26A0 Warning : This Test is computationally expensive. It is using a CROSS
                JOIN during the computation. The complexity is O(n * k), n
                being the total count of the vDataFrame and k the number of
                rows to use to do the test.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Input vcolumn to test.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
alpha: float, optional
    Significance Level. Probability to accept H0.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("alpha", alpha, [int, float],),
            ("vdf", vdf, [vDataFrame,],),
        ],
    )
    columns_check([column, ts], vdf)
    column = vdf_columns_names([column], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    # S = sum over all later/earlier pairs of sign(x_later - x_earlier),
    # computed with a CROSS JOIN of the (column, ts) projection on itself.
    table = f"(SELECT {column}, {ts} FROM {vdf.__genSQL__()})"
    s_query = (
        f"SELECT SUM(SIGN(y.{column} - x.{column})) FROM {table} x "
        f"CROSS JOIN {table} y WHERE y.{ts} > x.{ts}"
    )
    vdf.__executeSQL__(s_query, title="Computes the Mann Kendall S.")
    S = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    try:
        S = float(S)
    except:
        S = None
    n = vdf[column].count()
    # Standard deviation of S with the tie correction (grouping duplicate
    # values of the tested column).
    std_query = (
        f"SELECT SQRT(({n} * ({n} - 1) * (2 * {n} + 5) - "
        f"SUM(row * (row - 1) * (2 * row + 5))) / 18) FROM "
        f"(SELECT row FROM (SELECT ROW_NUMBER() OVER (PARTITION BY {column}) "
        f"AS row FROM {vdf.__genSQL__()}) VERTICAPY_SUBTABLE GROUP BY row) "
        f"VERTICAPY_SUBTABLE"
    )
    vdf.__executeSQL__(std_query, title="Computes the Mann Kendall S standard deviation.")
    STDS = vdf._VERTICAPY_VARIABLES_["cursor"].fetchone()[0]
    try:
        STDS = float(STDS)
    except:
        STDS = None
    if STDS in (None, 0) or S is None:
        return None
    # Continuity-corrected Z statistic.
    if S > 0:
        ZMK, trend = (S - 1) / STDS, "increasing"
    elif S < 0:
        ZMK, trend = (S + 1) / STDS, "decreasing"
    else:
        ZMK, trend = 0, "no trend"
    pvalue = 2 * norm.sf(abs(ZMK))
    # ZMK is always <= 0 or >= 0, so significance reduces to pvalue < alpha.
    monotonic = pvalue < alpha
    if not monotonic:
        trend = "no trend"
    return tablesample(
        {
            "index": [
                "Mann Kendall Test Statistic",
                "S",
                "STDS",
                "p_value",
                "Monotonic Trend",
                "Trend",
            ],
            "value": [ZMK, S, STDS, pvalue, monotonic, trend],
        }
    )
def plot_acf_pacf(
    vdf: vDataFrame,
    column: str,
    ts: str,
    by: list = [],
    p: (int, list) = 15,
    **style_kwds,
):
    """
---------------------------------------------------------------------------
Draws the ACF and PACF Charts.

Parameters
----------
vdf: vDataFrame
    Input vDataFrame.
column: str
    Response column.
ts: str
    vcolumn used as timeline. It will be to use to order the data. It can be
    a numerical or type date like (date, datetime, timestamp...) vcolumn.
by: list, optional
    vcolumns used in the partition.
p: int/list, optional
    Int equals to the maximum number of lag to consider during the
    computation or List of the different lags to include during the
    computation. p must be positive or a list of positive integers.
**style_kwds
    Any optional parameter to pass to the Matplotlib functions.

Returns
-------
tablesample
    An object containing the result. For more information, see
    utilities.tablesample.
    """
    check_types([
        ("column", column, [str],),
        ("ts", ts, [str],),
        ("by", by, [list],),
        ("p", p, [int, float],),
        ("vdf", vdf, [vDataFrame,],),
    ])
    # Separate the color keywords (handled manually below) from the rest of
    # the Matplotlib styling options.
    tmp_style = {}
    for elem in style_kwds:
        if elem not in ("color", "colors"):
            tmp_style[elem] = style_kwds[elem]
    if "color" in style_kwds:
        color = style_kwds["color"]
    else:
        color = gen_colors()[0]
    columns_check([column, ts] + by, vdf)
    by = vdf_columns_names(by, vdf)
    column, ts = vdf_columns_names([column, ts], vdf)
    # Compute both correlograms without drawing; this function handles the
    # rendering itself.
    acf = vdf.acf(ts=ts, column=column, by=by, p=p, show=False)
    pacf = vdf.pacf(ts=ts, column=column, by=by, p=p, show=False)
    result = tablesample(
        {
            "index": [i for i in range(0, len(acf.values["value"]))],
            "acf": acf.values["value"],
            "pacf": pacf.values["value"],
            "confidence": pacf.values["confidence"],
        },
    )
    # NOTE(review): both branches of this ternary are identical; the
    # isnotebook() call has no effect on the figure size here.
    fig = plt.figure(figsize=(10, 6)) if isnotebook() else plt.figure(figsize=(10, 6))
    plt.rcParams["axes.facecolor"] = "#FCFCFC"
    # Top subplot: ACF as thin bars plus markers, with the symmetric
    # confidence band shaded around zero.
    ax1 = fig.add_subplot(211)
    x, y, confidence = (
        result.values["index"],
        result.values["acf"],
        result.values["confidence"],
    )
    plt.xlim(-1, x[-1] + 1)
    ax1.bar(
        x, y, width=0.007 * len(x), color="#444444", zorder=1, linewidth=0,
    )
    param = {
        "s": 90,
        "marker": "o",
        "facecolors": color,
        "edgecolors": "black",
        "zorder": 2,
    }
    ax1.scatter(
        x, y, **updated_dict(param, tmp_style,),
    )
    # Zero baseline across the whole lag range.
    ax1.plot(
        [-1] + x + [x[-1] + 1],
        [0 for elem in range(len(x) + 2)],
        color=color,
        zorder=0,
    )
    ax1.fill_between(x, confidence, color="#FE5016", alpha=0.1)
    ax1.fill_between(x, [-elem for elem in confidence], color="#FE5016", alpha=0.1)
    ax1.set_title("Autocorrelation")
    # Bottom subplot: same layout for the PACF (reusing the PACF confidence
    # band computed above).
    y = result.values["pacf"]
    ax2 = fig.add_subplot(212)
    ax2.bar(x, y, width=0.007 * len(x), color="#444444", zorder=1, linewidth=0)
    ax2.scatter(
        x, y, **updated_dict(param, tmp_style,),
    )
    ax2.plot(
        [-1] + x + [x[-1] + 1],
        [0 for elem in range(len(x) + 2)],
        color=color,
        zorder=0,
    )
    ax2.fill_between(x, confidence, color="#FE5016", alpha=0.1)
    ax2.fill_between(x, [-elem for elem in confidence], color="#FE5016", alpha=0.1)
    ax2.set_title("Partial Autocorrelation")
    plt.show()
    return result