def test_set_cursor(self, base):
    """Check that a model can still be trained after its cursor is re-set."""
    model_name = "linear_reg_cursor_test"
    regressor = LinearRegression(model_name, cursor=base.cursor)
    # TODO: create a new cursor instead of re-using the fixture's one
    regressor.set_cursor(base.cursor)
    regressor.drop()
    regressor.fit("public.winequality", ["alcohol"], "quality")

    # The trained model must be registered in the MODELS system table.
    base.cursor.execute(
        "SELECT model_name FROM models WHERE model_name = 'linear_reg_cursor_test'"
    )
    row = base.cursor.fetchone()
    assert row[0] == "linear_reg_cursor_test"

    regressor.drop()
def model(winequality_vd):
    """
    Yield a LinearRegression trained on public.winequality.

    The model is dropped before training (to start from a clean state) and
    again once the consumer of the generator is done with it.
    `winequality_vd` is not used in the body - presumably it is requested only
    so that the winequality table is loaded first (TODO confirm).
    """
    predictors = ["citric_acid", "residual_sugar", "alcohol"]
    regressor = LinearRegression("linreg_model_test", )
    regressor.drop()
    regressor.fit("public.winequality", predictors, "quality")

    yield regressor

    regressor.drop()
def test_contour(self, winequality_vd):
    """Check the number of artists drawn by the model's contour plot."""
    predictors = ["citric_acid", "residual_sugar"]
    contour_model = LinearRegression("model_contour", )
    contour_model.drop()
    contour_model.fit(winequality_vd, predictors, "quality")

    plot = contour_model.contour()
    artists = plot.get_default_bbox_extra_artists()
    assert len(artists) == 32

    contour_model.drop()
def model(winequality_vd):
    """
    Yield a (StandardScaler -> LinearRegression) Pipeline trained on
    public.winequality.

    The pipeline is dropped before training and again once the consumer of
    the generator is done with it. `winequality_vd` is not used in the body -
    presumably it only guarantees that the table is loaded (TODO confirm).
    """
    steps = [
        ("NormalizerWine", StandardScaler("std_model_test", )),
        ("LinearRegressionWine", LinearRegression("linreg_model_test", )),
    ]
    pipeline = Pipeline(steps)
    pipeline.drop()
    pipeline.fit(
        "public.winequality",
        ["citric_acid", "residual_sugar", "alcohol"],
        "quality",
    )

    yield pipeline

    pipeline.drop()
def model(base, winequality_vd):
    """
    Yield a LinearRegression trained on public.winequality through the
    cursor carried by `base`.

    Any left-over model with the same name is removed with plain SQL before
    training; the model is dropped once the consumer of the generator is done.
    `winequality_vd` is not used in the body - presumably it only guarantees
    that the table is loaded (TODO confirm).
    """
    base.cursor.execute("DROP MODEL IF EXISTS linreg_model_test")
    regressor = LinearRegression("linreg_model_test", cursor=base.cursor)
    predictors = ["citric_acid", "residual_sugar", "alcohol"]
    regressor.fit("public.winequality", predictors, "quality")

    yield regressor

    regressor.drop()
def test_model_from_vDF(self, base, winequality_vd):
    """Check that a model can be trained directly from a vDataFrame."""
    base.cursor.execute("DROP MODEL IF EXISTS linreg_from_vDF")
    regressor = LinearRegression("linreg_from_vDF", cursor=base.cursor)
    regressor.fit(winequality_vd, ["alcohol"], "quality")

    # The trained model must be registered in the MODELS system table.
    base.cursor.execute(
        "SELECT model_name FROM models WHERE model_name = 'linreg_from_vDF'"
    )
    row = base.cursor.fetchone()
    assert row[0] == "linreg_from_vDF"

    regressor.drop()
def test_model_from_vDF(self, base, winequality_vd):
    """Check that a Pipeline can be trained directly from a vDataFrame."""
    scaler = StandardScaler("std_model_test_vdf", cursor=base.cursor)
    regressor = LinearRegression("linreg_model_test_vdf", cursor=base.cursor)
    pipeline = Pipeline(
        [("NormalizerWine", scaler), ("LinearRegressionWine", regressor)]
    )
    pipeline.drop()
    pipeline.fit(
        winequality_vd, ["citric_acid", "residual_sugar", "alcohol"], "quality"
    )

    # Both steps of the pipeline must be registered in the MODELS table.
    pipeline.cursor.execute(
        "SELECT model_name FROM models WHERE model_name IN ('std_model_test_vdf', 'linreg_model_test_vdf')"
    )
    rows = base.cursor.fetchall()
    assert len(rows) == 2

    pipeline.drop()
def test_set_cursor(self, base):
    """Check that a Pipeline can still be trained after its cursor is re-set."""
    scaler = StandardScaler("std_model_test_vdf", cursor=base.cursor)
    regressor = LinearRegression("linreg_model_test_vdf", cursor=base.cursor)
    pipeline = Pipeline(
        [("NormalizerWine", scaler), ("LinearRegressionWine", regressor)]
    )
    pipeline.drop()
    pipeline.set_cursor(base.cursor)
    pipeline.fit("public.winequality", ["alcohol"], "quality")

    # Both steps of the pipeline must be registered in the MODELS table.
    pipeline.cursor.execute(
        "SELECT model_name FROM models WHERE model_name IN ('std_model_test_vdf', 'linreg_model_test_vdf')"
    )
    rows = base.cursor.fetchall()
    assert len(rows) == 2

    pipeline.drop()
def test_drop(self):
    """Check that drop() removes a trained model from the MODELS table."""

    def fetch_model_row():
        # Look the model up in the MODELS system table (None when absent).
        current_cursor().execute(
            "SELECT model_name FROM models WHERE model_name = 'linreg_model_test_drop'"
        )
        return current_cursor().fetchone()

    current_cursor().execute("DROP MODEL IF EXISTS linreg_model_test_drop")
    regressor = LinearRegression("linreg_model_test_drop", )
    regressor.fit("public.winequality", ["alcohol"], "quality")

    assert fetch_model_row()[0] == "linreg_model_test_drop"

    regressor.drop()
    assert fetch_model_row() is None
def test_drop(self, base):
    """Check that drop() removes a trained model from the MODELS table."""

    def fetch_model_row():
        # Look the model up in the MODELS system table (None when absent).
        base.cursor.execute(
            "SELECT model_name FROM models WHERE model_name = 'linreg_model_test_drop'"
        )
        return base.cursor.fetchone()

    base.cursor.execute("DROP MODEL IF EXISTS linreg_model_test_drop")
    regressor = LinearRegression("linreg_model_test_drop", cursor=base.cursor)
    regressor.fit("public.winequality", ["alcohol"], "quality")

    assert fetch_model_row()[0] == "linreg_model_test_drop"

    regressor.drop()
    assert fetch_model_row() is None
def test_drop(self, base, winequality_vd):
    """Check that dropping a Pipeline removes every model it is made of."""
    scaler = StandardScaler("std_model_test_drop", cursor=base.cursor)
    regressor = LinearRegression("linreg_model_test_drop", cursor=base.cursor)
    pipeline = Pipeline(
        [("NormalizerWine", scaler), ("LinearRegressionWine", regressor)]
    )

    def lookup_models():
        # Query the MODELS system table for the two models of the pipeline.
        pipeline.cursor.execute(
            "SELECT model_name FROM models WHERE model_name IN ('linreg_model_test_drop', 'std_model_test_drop')"
        )

    pipeline.drop()
    pipeline.fit(winequality_vd, ["alcohol"], "quality")

    lookup_models()
    assert len(pipeline.cursor.fetchall()) == 2

    pipeline.drop()
    lookup_models()
    assert pipeline.cursor.fetchone() is None
def test_cochrane_orcutt(self, airline_vd):
    """Check the coefficients returned by the Cochrane-Orcutt estimation."""
    airline = airline_vd.copy()
    # Predictor derived from the response with some random noise added.
    squared_passengers = airline["passengers"]**2
    airline["passengers_bias"] = (squared_passengers - 50 * st.random())

    drop("lin_cochrane_orcutt_model_test", method="model")
    regressor = LinearRegression("lin_cochrane_orcutt_model_test")
    regressor.fit(airline, ["passengers_bias"], "passengers")

    result = st.cochrane_orcutt(
        regressor, airline, ts="date", prais_winsten=True,
    )
    coefficients = result.coef_["coefficient"]
    assert coefficients[0] == pytest.approx(25.8582027191416, 1e-2)
    assert coefficients[1] == pytest.approx(0.00123563974547625, 1e-2)

    regressor.drop()
def test_cochrane_orcutt(self, airline_vd, base):
    """Check the coefficients returned by the Cochrane-Orcutt estimation."""
    airline = airline_vd.copy()
    # Predictor derived from the response with some random noise added.
    squared_passengers = airline["passengers"]**2
    airline["passengers_bias"] = squared_passengers - 50 * st.random()

    from verticapy.learn.linear_model import LinearRegression

    model_name = "lin_cochrane_orcutt_model_test"
    base.cursor.execute(
        "DROP MODEL IF EXISTS lin_cochrane_orcutt_model_test")
    regressor = LinearRegression(model_name, cursor=base.cursor)
    regressor.fit(airline, ["passengers_bias"], "passengers")

    result = st.cochrane_orcutt(
        regressor, airline, ts="date", prais_winsten=True,
    )
    coefficients = result.coef_["coefficient"]
    assert coefficients[0] == pytest.approx(25.8582027191416, 1e-2)
    assert coefficients[1] == pytest.approx(0.00123563974547625, 1e-2)

    regressor.drop()
def het_white(
    vdf: vDataFrame, eps: str, X: list,
):
    """
    ---------------------------------------------------------------------------
    White's Lagrange Multiplier Test for heteroscedasticity.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    eps: str
        Input residual vcolumn.
    X: list
        Exogenous Variables to test the heteroscedasticity on.

    Returns
    -------
    tablesample
        An object containing the result (Lagrange Multiplier statistic, F
        value and their p-values). For more information, see
        utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)

    # Regressors of the auxiliary regression: every product X_i * X_j (i <= j)
    # of the exogenous variables augmented with the constant "1". The pure
    # constant term (i = j = 0) is skipped.
    X_0 = ["1"] + X
    variables, variables_names = [], []
    for i in range(len(X_0)):
        for j in range(i, len(X_0)):
            if i != 0 or j != 0:
                variables += ["{} * {} AS var_{}_{}".format(X_0[i], X_0[j], i, j)]
                variables_names += ["var_{}_{}".format(i, j)]

    cursor = vdf._VERTICAPY_VARIABLES_["cursor"]
    query = "(SELECT {}, POWER({}, 2) AS VERTICAPY_TEMP_eps2 FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(variables), eps, vdf.__genSQL__()
    )
    vdf_white = vdf_from_relation(query, cursor=cursor)

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not schema_writing:
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(cursor)
    )
    model = LinearRegression(name, cursor=cursor)

    def fit_and_score():
        model.fit(vdf_white, variables_names, "VERTICAPY_TEMP_eps2")
        return model.score("r2")

    # First attempt with the default solver, second one with BFGS. The
    # temporary model is dropped whatever happens.
    try:
        try:
            R2 = fit_and_score()
        except Exception:
            # Remove anything the failed attempt may have left behind.
            model.drop()
            model.set_params({"solver": "bfgs"})
            R2 = fit_and_score()
    finally:
        model.drop()

    n = vdf.shape()[0]
    # Degrees of freedom = number of regressors of the auxiliary regression,
    # i.e. 2 * len(X) + C(len(X), 2). Counting the generated regressors also
    # gives the right value (2: x and x^2) when there is a single variable.
    k = len(variables_names)
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
def het_goldfeldquandt(
    vdf: vDataFrame, y: str, X: list, idx: int = 0, split: float = 0.5
):
    """
    ---------------------------------------------------------------------------
    Goldfeld-Quandt homoscedasticity test.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    y: str
        Response Column.
    X: list
        Exogenous Variables.
    idx: int, optional
        Column index of variable according to which observations are sorted
        for the split.
    split: float, optional
        Float to indicate where to split (Example: 0.5 to split on the median).

    Returns
    -------
    tablesample
        An object containing the result (F value and its p-value). For more
        information, see utilities.tablesample.
    """

    def model_fit(input_relation, X, y, model):
        # Fit the model on each relation and collect the residual variances.
        var = []
        for vdf_tmp in input_relation:
            model.drop()
            model.fit(vdf_tmp, X, y)
            model.predict(vdf_tmp, name="verticapy_prediction")
            vdf_tmp["residual_0"] = vdf_tmp[y] - vdf_tmp["verticapy_prediction"]
            var += [vdf_tmp["residual_0"].var()]
            model.drop()
        return var

    check_types(
        [
            ("y", y, [str],),
            ("X", X, [list],),
            ("idx", idx, [int, float],),
            ("split", split, [int, float],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([y] + X, vdf)
    y = vdf_columns_names([y], vdf)[0]
    X = vdf_columns_names(X, vdf)

    # Split the data around the requested quantile of the idx-th variable;
    # rows equal to the split value belong to neither half.
    split_value = vdf[X[idx]].quantile(split)
    vdf_0_half = vdf.search(vdf[X[idx]] < split_value)
    vdf_1_half = vdf.search(vdf[X[idx]] > split_value)

    from verticapy.learn.linear_model import LinearRegression

    cursor = vdf._VERTICAPY_VARIABLES_["cursor"]
    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not schema_writing:
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(cursor)
    )
    model = LinearRegression(name, cursor=cursor)

    # First attempt with the default solver, second one with BFGS. The
    # temporary model is dropped whatever happens.
    try:
        try:
            var0, var1 = model_fit([vdf_0_half, vdf_1_half], X, y, model)
        except Exception:
            model.set_params({"solver": "bfgs"})
            var0, var1 = model_fit([vdf_0_half, vdf_1_half], X, y, model)
    finally:
        model.drop()

    n, m = vdf_0_half.shape()[0], vdf_1_half.shape()[0]
    F = var0 / var1
    f_pvalue = f.sf(F, n, m)
    result = tablesample({"index": ["F Value", "f_p_value",], "value": [F, f_pvalue],})
    return result
def adfuller(
    vdf: vDataFrame,
    column: str,
    ts: str,
    by: list = [],
    p: int = 1,
    with_trend: bool = False,
    regresults: bool = False,
):
    """
    ---------------------------------------------------------------------------
    Augmented Dickey Fuller test (Time Series stationarity).

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    column: str
        Input vcolumn to test.
    ts: str
        vcolumn used as timeline. It will be used to order the data. It can be
        a numerical or type date like (date, datetime, timestamp...) vcolumn.
    by: list, optional
        vcolumns used in the partition.
    p: int, optional
        Number of lags to consider in the test.
    with_trend: bool, optional
        Adds a trend in the Regression.
    regresults: bool, optional
        If True, the full regression results are returned.

    Returns
    -------
    tablesample
        An object containing the result. For more information, see
        utilities.tablesample.
    """
    # Dickey-Fuller critical values, keyed by 'with_trend'. Each row is
    # (largest sample size covered, 1%, 2.5%, 5%, 10%).
    critical_values = {
        False: (
            (25, -3.75, -3.33, -3.00, -2.62),
            (50, -3.58, -3.22, -2.93, -2.60),
            (100, -3.51, -3.17, -2.89, -2.58),
            (250, -3.46, -3.14, -2.88, -2.57),
            (500, -3.44, -3.13, -2.87, -2.57),
            (float("inf"), -3.43, -3.12, -2.86, -2.57),
        ),
        True: (
            (25, -4.38, -3.95, -3.60, -3.24),
            (50, -4.15, -3.80, -3.50, -3.18),
            (100, -4.04, -3.73, -3.45, -3.15),
            (250, -3.99, -3.69, -3.43, -3.13),
            (500, -3.98, -3.68, -3.42, -3.13),
            (float("inf"), -3.96, -3.66, -3.41, -3.12),
        ),
    }

    def critical_value(alpha, N, with_trend):
        # Pick the first row whose sample size bound covers N; the last row
        # is unbounded, so it is the one kept when no smaller bound matches.
        for max_n, cv_1, cv_2_5, cv_5, cv_10 in critical_values[bool(with_trend)]:
            if N <= max_n:
                break
        # Any alpha other than 1%, 2.5% and 10% gets the 5% critical value.
        return {0.01: cv_1, 0.025: cv_2_5, 0.10: cv_10}.get(alpha, cv_5)

    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("p", p, [int, float],),
            ("by", by, [list],),
            ("with_trend", with_trend, [bool],),
            ("regresults", regresults, [bool],),
            ("vdf", vdf, [vDataFrame,],),
        ],
    )
    columns_check([ts, column] + by, vdf)
    ts = vdf_columns_names([ts], vdf)[0]
    column = vdf_columns_names([column], vdf)[0]
    by = vdf_columns_names(by, vdf)

    cursor = vdf._VERTICAPY_VARIABLES_["cursor"]
    schema = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not schema:
        schema = "public"
    name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        schema, gen_name([column]).upper()
    )
    relation_name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_VIEW_{}".format(
        schema, gen_name([column]).upper()
    )

    def drop_temp_objects():
        cursor.execute("DROP MODEL IF EXISTS {}".format(name))
        cursor.execute("DROP VIEW IF EXISTS {}".format(relation_name))

    try:
        drop_temp_objects()
    except Exception:
        # Best effort: left-overs of a previous run are removed when possible.
        pass

    # The partition clause ends with a space so that it never runs into the
    # ORDER BY keyword that follows it.
    partition = "PARTITION BY {} ".format(", ".join(by)) if by else ""
    over = "OVER ({}ORDER BY {})".format(partition, ts)
    lag = ["LAG({}, 1) {} AS lag1".format(column, over)]
    lag += [
        "LAG({0}, {1}) {3} - LAG({0}, {2}) {3} AS delta{1}".format(
            column, i, i + 1, over
        )
        for i in range(1, p + 1)
    ]
    lag += ["{0} - LAG({0}, 1) {1} AS delta".format(column, over)]
    query = "CREATE VIEW {} AS SELECT {}, {} AS ts FROM {}".format(
        relation_name,
        ", ".join(lag),
        "TIMESTAMPDIFF(SECOND, {}, MIN({}) OVER ())".format(ts, ts)
        if vdf[ts].isdate()
        else ts,
        vdf.__genSQL__(),
    )

    predictors = ["lag1"] + ["delta{}".format(i) for i in range(1, p + 1)]
    if with_trend:
        predictors += ["ts"]
    # The temporary view and model are removed even when the regression fails.
    try:
        cursor.execute(query)
        model = LinearRegression(name, cursor, solver="Newton", max_iter=1000)
        model.fit(
            relation_name, predictors, "delta",
        )
        coef = model.coef_
    finally:
        drop_temp_objects()

    if regresults:
        return coef
    coef = coef.transpose()
    # Test statistic = lag1 coefficient / its standard error (floored to
    # avoid a division by zero).
    DF = coef.values["lag1"][0] / (max(coef.values["lag1"][1], 1e-99))
    p_value = coef.values["lag1"][3]
    count = vdf.shape()[0]
    cv_1, cv_2_5, cv_5, cv_10 = (
        critical_value(alpha, count, with_trend)
        for alpha in (0.01, 0.025, 0.05, 0.10)
    )
    result = tablesample(
        {
            "index": [
                "ADF Test Statistic",
                "p_value",
                "# Lags used",
                "# Observations Used",
                "Critical Value (1%)",
                "Critical Value (2.5%)",
                "Critical Value (5%)",
                "Critical Value (10%)",
                "Stationarity (alpha = 1%)",
            ],
            "value": [
                DF,
                p_value,
                p,
                count,
                cv_1,
                cv_2_5,
                cv_5,
                cv_10,
                DF < cv_1 and p_value < 0.01,
            ],
        }
    )
    return result
def het_breuschpagan(
    vdf: vDataFrame, eps: str, X: list,
):
    """
    ---------------------------------------------------------------------------
    Breusch-Pagan test for heteroscedasticity.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    eps: str
        Input residual vcolumn.
    X: list
        Exogenous Variables to test the heteroscedasticity on.

    Returns
    -------
    tablesample
        An object containing the result (Lagrange Multiplier statistic, F
        value and their p-values). For more information, see
        utilities.tablesample.
    """
    check_types(
        [("eps", eps, [str],), ("X", X, [list],), ("vdf", vdf, [vDataFrame, str,],),],
    )
    columns_check([eps] + X, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    X = vdf_columns_names(X, vdf)

    from verticapy.learn.linear_model import LinearRegression

    cursor = vdf._VERTICAPY_VARIABLES_["cursor"]
    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not schema_writing:
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(cursor)
    )
    model = LinearRegression(name, cursor=cursor)

    # The squared residuals are regressed on the exogenous variables.
    vdf_copy = vdf.copy()
    vdf_copy["VERTICAPY_TEMP_eps2"] = vdf_copy[eps] ** 2

    def fit_and_score():
        model.fit(vdf_copy, X, "VERTICAPY_TEMP_eps2")
        return model.score("r2")

    # First attempt with the default solver, second one with BFGS. The
    # temporary model is dropped whatever happens.
    try:
        try:
            R2 = fit_and_score()
        except Exception:
            # Remove anything the failed attempt may have left behind.
            model.drop()
            model.set_params({"solver": "bfgs"})
            R2 = fit_and_score()
    finally:
        model.drop()

    n = vdf.shape()[0]
    k = len(X)
    LM = n * R2
    lm_pvalue = chi2.sf(LM, k)
    F = (n - k - 1) * R2 / (1 - R2) / k
    f_pvalue = f.sf(F, k, n - k - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
def het_arch(
    vdf: vDataFrame, eps: str, ts: str, by: list = [], p: int = 1,
):
    """
    ---------------------------------------------------------------------------
    Engle's Test for Autoregressive Conditional Heteroscedasticity (ARCH).

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    eps: str
        Input residual vcolumn.
    ts: str
        vcolumn used as timeline. It will be used to order the data. It can be
        a numerical or type date like (date, datetime, timestamp...) vcolumn.
    by: list, optional
        vcolumns used in the partition.
    p: int, optional
        Number of lags to consider in the test.

    Returns
    -------
    tablesample
        An object containing the result (Lagrange Multiplier statistic, F
        value and their p-values). For more information, see
        utilities.tablesample.
    """
    check_types(
        [
            ("eps", eps, [str],),
            ("ts", ts, [str],),
            ("by", by, [list],),
            ("p", p, [int, float],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check([eps, ts] + by, vdf)
    eps = vdf_columns_names([eps], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    by = vdf_columns_names(by, vdf)

    # lag_0 is the squared residual itself (the response), lag_1 ... lag_p
    # are its lagged values (the predictors).
    partition = ("PARTITION BY " + ", ".join(by)) if by else ""
    X, X_names = [], []
    for i in range(0, p + 1):
        X += [
            "LAG(POWER({}, 2), {}) OVER({}ORDER BY {}) AS lag_{}".format(
                eps, i, partition, ts, i
            )
        ]
        X_names += ["lag_{}".format(i)]

    cursor = vdf._VERTICAPY_VARIABLES_["cursor"]
    query = "(SELECT {} FROM {}) VERTICAPY_SUBTABLE".format(
        ", ".join(X), vdf.__genSQL__()
    )
    vdf_lags = vdf_from_relation(query, cursor=cursor)

    from verticapy.learn.linear_model import LinearRegression

    schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not schema_writing:
        schema_writing = "public"
    name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        get_session(cursor)
    )
    model = LinearRegression(name, cursor=cursor)

    def fit_and_score():
        model.fit(vdf_lags, X_names[1:], X_names[0])
        return model.score("r2")

    # First attempt with the default solver, second one with BFGS. The
    # temporary model is dropped whatever happens.
    try:
        try:
            R2 = fit_and_score()
        except Exception:
            # Remove anything the failed attempt may have left behind.
            model.drop()
            model.set_params({"solver": "bfgs"})
            R2 = fit_and_score()
    finally:
        model.drop()

    n = vdf.shape()[0]
    LM = (n - p) * R2
    lm_pvalue = chi2.sf(LM, p)
    F = (n - 2 * p - 1) * R2 / (1 - R2) / p
    f_pvalue = f.sf(F, p, n - 2 * p - 1)
    result = tablesample(
        {
            "index": [
                "Lagrange Multiplier Statistic",
                "lm_p_value",
                "F Value",
                "f_p_value",
            ],
            "value": [LM, lm_pvalue, F, f_pvalue],
        }
    )
    return result
def variance_inflation_factor(
    vdf: vDataFrame, X: list, X_idx: int = None,
):
    """
    ---------------------------------------------------------------------------
    Computes the variance inflation factor (VIF). It can be used to detect
    multicollinearity in an OLS Regression Analysis.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    X: list
        Input Variables.
    X_idx: int
        Index of the exogenous variable in X. If left to None, a tablesample
        will be returned with all the variables VIF.

    Returns
    -------
    float
        VIF of the variable at position X_idx (inf when the variable is
        perfectly explained by the other ones).
    tablesample
        VIF of every variable when X_idx is None.
    """
    check_types(
        [
            ("X_idx", X_idx, [int],),
            ("X", X, [list],),
            ("vdf", vdf, [vDataFrame, str,],),
        ],
    )
    columns_check(X, vdf)
    X = vdf_columns_names(X, vdf)

    # A column name is turned into its position in X.
    if isinstance(X_idx, str):
        columns_check([X_idx], vdf)
        for i, col in enumerate(X):
            if str_column(col) == str_column(X_idx):
                X_idx = i
                break

    if isinstance(X_idx, (int, float)):
        # Regress the selected variable on all the other ones.
        X_r = [col for i, col in enumerate(X) if i != X_idx]
        y_r = X[X_idx]

        from verticapy.learn.linear_model import LinearRegression

        cursor = vdf._VERTICAPY_VARIABLES_["cursor"]
        schema_writing = vdf._VERTICAPY_VARIABLES_["schema_writing"]
        if not schema_writing:
            schema_writing = "public"
        name = schema_writing + ".VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
            get_session(cursor)
        )
        model = LinearRegression(name, cursor=cursor)

        def fit_and_score():
            model.fit(vdf, X_r, y_r)
            return model.score("r2")

        # First attempt with the default solver, second one with BFGS. The
        # temporary model is dropped whatever happens.
        try:
            try:
                R2 = fit_and_score()
            except Exception:
                # Remove anything the failed attempt may have left behind.
                model.drop()
                model.set_params({"solver": "bfgs"})
                R2 = fit_and_score()
        finally:
            model.drop()

        if 1 - R2 != 0:
            return 1 / (1 - R2)
        return np.inf
    elif X_idx is None:
        VIF = [variance_inflation_factor(vdf, X, i) for i in range(len(X))]
        return tablesample({"X_idx": X, "VIF": VIF})
    else:
        raise ParameterError(
            f"Wrong type for Parameter X_idx.\nExpected integer, found {type(X_idx)}."
        )
def seasonal_decompose(
    vdf: vDataFrame,
    column: str,
    ts: str,
    by: list = [],
    period: int = -1,
    polynomial_order: int = 1,
    estimate_seasonality: bool = True,
    rule: Union[str, datetime.timedelta] = None,
    mult: bool = False,
    two_sided: bool = False,
):
    """
    ---------------------------------------------------------------------------
    Performs a seasonal time series decomposition.

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    column: str
        Input vcolumn to decompose.
    ts: str
        TS (Time Series) vcolumn to use to order the data. It can be of type
        date or a numerical vcolumn.
    by: list, optional
        vcolumns used in the partition.
    period: int, optional
        Time Series period. It is used to retrieve the seasonality component.
        If period <= 0, the seasonal component will be estimated using ACF. In
        this case, polynomial_order must be greater than 0.
    polynomial_order: int, optional
        If greater than 0, the trend will be estimated using a polynomial of
        degree 'polynomial_order'. The parameter 'two_sided' will be ignored.
        If equal to 0, the trend will be estimated using Moving Averages.
    estimate_seasonality: bool, optional
        If set to True, the seasonality will be estimated using cosine and
        sine functions.
    rule: str / time, optional
        Interval to use to slice the time. For example, '5 minutes' will
        create records separated by '5 minutes' time interval.
    mult: bool, optional
        If set to True, the decomposition type will be 'multiplicative'.
        Otherwise, it is 'additive'.
    two_sided: bool, optional
        If set to True, a centered moving average is used for the trend
        isolation. Otherwise only past values are used.

    Returns
    -------
    vDataFrame
        object containing
        (ts, column, TS seasonal part, TS trend, TS noise).
    """

    def fit_temp_regression(relation, predictors, response):
        # Fit a throw-away LinearRegression and return its coefficients. The
        # temporary model is dropped even when the fit fails.
        from verticapy.learn.linear_model import LinearRegression

        variables = relation._VERTICAPY_VARIABLES_
        schema = variables["schema_writing"] or variables["schema"] or "public"
        model = LinearRegression(
            name="{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
                schema, get_session(variables["cursor"])
            ),
            cursor=variables["cursor"],
            solver="bfgs",
            max_iter=100,
            tol=1e-6,
        )
        model.drop()
        try:
            model.fit(relation, predictors, response)
            return model.coef_["coefficient"]
        finally:
            model.drop()

    if isinstance(by, str):
        by = [by]
    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("by", by, [list],),
            ("rule", rule, [str, datetime.timedelta,],),
            ("vdf", vdf, [vDataFrame,],),
            ("period", period, [int,],),
            ("mult", mult, [bool,],),
            ("two_sided", two_sided, [bool,],),
            ("polynomial_order", polynomial_order, [int,],),
            ("estimate_seasonality", estimate_seasonality, [bool,],),
        ],
    )
    assert period > 0 or polynomial_order > 0, ParameterError("Parameters 'polynomial_order' and 'period' can not be both null.")
    columns_check([column, ts] + by, vdf)
    ts, column, by = (
        vdf_columns_names([ts], vdf)[0],
        vdf_columns_names([column], vdf)[0],
        vdf_columns_names(by, vdf),
    )
    if rule:
        # Slice the time with the requested interval (not with the period).
        vdf_tmp = vdf.asfreq(ts=ts, rule=rule, method={column: "linear"}, by=by)
    else:
        # The partition columns are kept: the window functions below use them.
        vdf_tmp = vdf[[ts, column] + by]
    trend_name, seasonal_name, epsilon_name = (
        "{}_trend".format(column[1:-1]),
        "{}_seasonal".format(column[1:-1]),
        "{}_epsilon".format(column[1:-1]),
    )
    # 'by' becomes the SQL partition clause; 'by_tmp' keeps the column list.
    by_tmp = by
    by = "PARTITION BY " + ", ".join(by_tmp) + " " if by_tmp else ""

    # Trend: moving average or polynomial regression on the row number.
    if polynomial_order <= 0:
        if two_sided:
            if period == 1:
                window = (-1, 1)
            elif period % 2 == 0:
                # Integer bounds (the period is even, so nothing is lost).
                window = (-period // 2 + 1, period // 2)
            else:
                window = (int(-period / 2), int(period / 2))
        elif period == 1:
            window = (-2, 0)
        else:
            window = (-period + 1, 0)
        vdf_tmp.rolling("avg", window, column, by_tmp, ts, trend_name)
    else:
        vdf_poly = vdf_tmp.copy()
        X = []
        for i in range(1, polynomial_order + 1):
            vdf_poly[f"t_{i}"] = f"POWER(ROW_NUMBER() OVER ({by}ORDER BY {ts}), {i})"
            X += [f"t_{i}"]
        coefficients = fit_temp_regression(vdf_poly, X, column)
        # coefficients[0] is the intercept, coefficients[i] the weight of t^i.
        trend_terms = [str(coefficients[0])]
        for i in range(1, polynomial_order + 1):
            if i == 1:
                trend_terms.append(
                    f"{coefficients[1]} * ROW_NUMBER() OVER({by}ORDER BY {ts})"
                )
            else:
                trend_terms.append(
                    f"{coefficients[i]} * POWER(ROW_NUMBER() OVER({by}ORDER BY {ts}), {i})"
                )
        vdf_tmp[trend_name] = " + ".join(trend_terms)

    # De-trended series.
    if mult:
        vdf_tmp[seasonal_name] = f'{column} / NULLIFZERO("{trend_name}")'
    else:
        vdf_tmp[seasonal_name] = vdf_tmp[column] - vdf_tmp[trend_name]

    # Unknown period: read it from the index of the ACF result (second
    # entry, or third one when the second gives a period of 1).
    if period <= 0:
        acf = vdf_tmp.acf(column=seasonal_name, ts=ts, p=23, acf_type="heatmap", show=False)
        period = int(acf["index"][1].split("_")[1])
        if period == 1:
            period = int(acf["index"][2].split("_")[1])

    # Seasonality: average of the de-trended series per position in the
    # period, centered (additive) or normalized (multiplicative).
    vdf_tmp["row_number_id"] = f"MOD(ROW_NUMBER() OVER ({by} ORDER BY {ts}), {period})"
    if mult:
        vdf_tmp[
            seasonal_name
        ] = f"AVG({seasonal_name}) OVER (PARTITION BY row_number_id) / NULLIFZERO(AVG({seasonal_name}) OVER ())"
    else:
        vdf_tmp[
            seasonal_name
        ] = f"AVG({seasonal_name}) OVER (PARTITION BY row_number_id) - AVG({seasonal_name}) OVER ()"
    if estimate_seasonality:
        # Smooth the seasonal component with a cosine / sine regression.
        vdf_seasonality = vdf_tmp.copy()
        vdf_seasonality["t_cos"] = f"COS(2 * PI() * ROW_NUMBER() OVER ({by}ORDER BY {ts}) / {period})"
        vdf_seasonality["t_sin"] = f"SIN(2 * PI() * ROW_NUMBER() OVER ({by}ORDER BY {ts}) / {period})"
        X = ["t_cos", "t_sin",]
        coefficients = fit_temp_regression(vdf_seasonality, X, seasonal_name)
        vdf_tmp[seasonal_name] = f"{coefficients[0]} + {coefficients[1]} * COS(2 * PI() * ROW_NUMBER() OVER ({by}ORDER BY {ts}) / {period}) + {coefficients[2]} * SIN(2 * PI() * ROW_NUMBER() OVER ({by}ORDER BY {ts}) / {period})"

    # Noise: what is left once trend and seasonality are removed.
    if mult:
        vdf_tmp[
            epsilon_name
        ] = f'{column} / NULLIFZERO("{trend_name}") / NULLIFZERO("{seasonal_name}")'
    else:
        vdf_tmp[epsilon_name] = (
            vdf_tmp[column] - vdf_tmp[trend_name] - vdf_tmp[seasonal_name]
        )
    vdf_tmp["row_number_id"].drop()
    return vdf_tmp
def durbin_watson(
    vdf,
    column: str,
    ts: str,
    X: list,
    by: list = [],
):
    """
    ---------------------------------------------------------------------------
    Durbin Watson test (residuals autocorrelation).

    Parameters
    ----------
    vdf: vDataFrame
        Input vDataFrame.
    column: str
        Input vcolumn used as response.
    ts: str
        vcolumn used as timeline. It will be used to order the data. It can be
        a numerical or type date like (date, datetime, timestamp...) vcolumn.
    X: list
        Input vcolumns used as predictors.
    by: list, optional
        vcolumns used in the partition.

    Returns
    -------
    tablesample
        An object containing the result (the Durbin Watson index and a flag
        which is False when the index is above 2.5 or below 1.5). For more
        information, see utilities.tablesample.
    """
    check_types(
        [
            ("ts", ts, [str],),
            ("column", column, [str],),
            ("X", X, [list],),
            ("by", by, [list],),
        ],
        vdf=["vdf", vdf],
    )
    columns_check(X + [column] + [ts] + by, vdf)
    column = vdf_columns_names([column], vdf)[0]
    ts = vdf_columns_names([ts], vdf)[0]
    X = vdf_columns_names(X, vdf)
    by = vdf_columns_names(by, vdf)

    cursor = vdf._VERTICAPY_VARIABLES_["cursor"]
    schema = vdf._VERTICAPY_VARIABLES_["schema_writing"]
    if not schema:
        schema = "public"
    name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_{}".format(
        schema, gen_name([column]).upper())
    relation_name = "{}.VERTICAPY_TEMP_MODEL_LINEAR_REGRESSION_VIEW_{}".format(
        schema, gen_name([column]).upper())

    def drop_temp_objects():
        cursor.execute("DROP MODEL IF EXISTS {}".format(name))
        cursor.execute("DROP VIEW IF EXISTS {}".format(relation_name))

    try:
        drop_temp_objects()
    except Exception:
        # Best effort: left-overs of a previous run are removed when possible.
        pass

    by_columns = ", ".join(by)
    query = "CREATE VIEW {} AS SELECT {}, {}, {}{} FROM {}".format(
        relation_name,
        ", ".join(X),
        column,
        ts,
        ", {}".format(by_columns) if by else "",
        vdf.__genSQL__(),
    )
    # The temporary view and model are removed even when a step fails.
    try:
        cursor.execute(query)
        model = LinearRegression(name, cursor, solver="Newton", max_iter=1000)
        model.fit(relation_name, X, column)
        # Residuals (et) of the regression next to their previous value.
        query = "(SELECT et, LAG(et) OVER({}ORDER BY {}) AS lag_et FROM (SELECT {}{}, {} - PREDICT_LINEAR_REG({} USING PARAMETERS model_name = '{}') AS et FROM {}) VERTICAPY_SUBTABLE) VERTICAPY_SUBTABLE".format(
            "PARTITION BY {} ".format(by_columns) if by else "",
            ts,
            "{}, ".format(by_columns) if by else "",
            ts,
            column,
            ", ".join(X),
            name,
            relation_name,
        )
        vdf.__executeSQL__(
            "SELECT SUM(POWER(et - lag_et, 2)) / SUM(POWER(et, 2)) FROM {}".format(
                query),
            title="Computes the Durbin Watson d.",
        )
        d = cursor.fetchone()[0]
    finally:
        drop_temp_objects()

    # The flag is False as soon as d leaves the [1.5, 2.5] interval.
    stationarity = not (d > 2.5 or d < 1.5)
    result = tablesample({
        "index": ["Durbin Watson Index", "Residuals Stationarity"],
        "value": [d, stationarity],
    })
    return result
def test_get_plot(self, winequality_vd):
    """Check the number of artists of the model plot for 1 and 2 predictors."""
    current_cursor().execute("DROP MODEL IF EXISTS model_test_plot")
    plot_model = LinearRegression("model_test_plot", )

    # (predictors, expected number of artists in the resulting plot)
    cases = (
        (["alcohol"], 9),
        (["alcohol", "residual_sugar"], 3),
    )
    for predictors, n_artists in cases:
        plot_model.fit(winequality_vd, predictors, "quality")
        plot = plot_model.plot(color="r")
        assert len(plot.get_default_bbox_extra_artists()) == n_artists
        plt.close("all")
        plot_model.drop()
def test_repr(self, model):
    """Check the representation of a trained and of an untrained model."""
    # A trained model shows the header of its coefficient table.
    trained_repr = model.__repr__()
    assert "|coefficient|std_err |t_value |p_value" in trained_repr

    # A model that has been dropped without being trained shows its type.
    untrained = LinearRegression("lin_repr")
    untrained.drop()
    untrained_repr = untrained.__repr__()
    assert untrained_repr == "<LinearRegression>"
def load_model(name: str, cursor=None, input_relation: str = "", test_relation: str = ""):
    """
    ---------------------------------------------------------------------------
    Loads a Vertica model and returns the associated object.

    Two loading paths exist, selected by the value returned by
    does_model_exist:
      - value 2: the model attributes are read from the verticapy.attr
        table and the matching VerticaPy object is rebuilt from them.
      - any other truthy value: the model type and parameters are parsed
        from the model's summary / call string stored in Vertica.

    Parameters
    ----------
    name: str
        Model Name.
    cursor: DBcursor, optional
        Vertica database cursor.
    input_relation: str, optional
        Some automated functions may depend on the input relation. If the
        load_model function cannot find the input relation from the call
        string, you should fill it manually.
    test_relation: str, optional
        Relation to use to do the testing. All the methods will use this
        relation for the scoring. If empty, the training relation will be
        used as testing.

    Returns
    -------
    model
        The model.

    Raises
    ------
    AssertionError
        If does_model_exist returns a falsy value for the given name.
    """
    check_types([("name", name, [str],), ("test_relation", test_relation, [str],), ("input_relation", input_relation, [str],),])
    cursor = check_cursor(cursor)[0]
    does_exist = does_model_exist(name=name, cursor=cursor, raise_error=False)
    schema, name = schema_relation(name)
    # Strip the first and last character of each part
    # (presumably the surrounding quotes added by schema_relation; TODO confirm).
    schema, name = schema[1:-1], name[1:-1]
    # NOTE(review): this raises AssertionError (with a NameError instance as its
    # message) and is skipped entirely when Python runs with -O.
    assert does_exist, NameError("The model '{}' doesn't exist.".format(name))
    if does_exist == 2:
        # Path 1: rebuild the model from the attributes saved in verticapy.attr.
        cursor.execute(
            "SELECT attr_name, value FROM verticapy.attr WHERE LOWER(model_name) = LOWER('{}')".format(
                str_column(name.lower())
            )
        )
        result = cursor.fetchall()
        model_save = {}
        for elem in result:
            # NOTE(review): the stored value is evaluated with exec(); values
            # coming from the database are executed as Python code. Confirm
            # the table content is trusted.
            ldic = {}
            try:
                exec("result_tmp = {}".format(elem[1]), globals(), ldic)
            except:
                # Evaluation failed: fall back to treating the value as a
                # quoted string.
                exec(
                    "result_tmp = '{}'".format(elem[1].replace("'", "''")),
                    globals(),
                    ldic,
                )
            result_tmp = ldic["result_tmp"]
            # Convert to float when possible; otherwise keep the value as is.
            try:
                result_tmp = float(result_tmp)
            except:
                pass
            if result_tmp == None:
                result_tmp = "None"
            model_save[elem[0]] = result_tmp
        # Instantiate the class matching the saved "type" attribute and
        # restore its fitted attributes.
        if model_save["type"] == "NearestCentroid":
            from verticapy.learn.neighbors import NearestCentroid

            model = NearestCentroid(name, cursor, model_save["p"])
            model.centroids_ = tablesample(model_save["centroids"])
            model.classes_ = model_save["classes"]
        elif model_save["type"] == "KNeighborsClassifier":
            from verticapy.learn.neighbors import KNeighborsClassifier

            model = KNeighborsClassifier(
                name, cursor, model_save["n_neighbors"], model_save["p"]
            )
            model.classes_ = model_save["classes"]
        elif model_save["type"] == "KNeighborsRegressor":
            from verticapy.learn.neighbors import KNeighborsRegressor

            model = KNeighborsRegressor(
                name, cursor, model_save["n_neighbors"], model_save["p"]
            )
        elif model_save["type"] == "KernelDensity":
            from verticapy.learn.neighbors import KernelDensity

            model = KernelDensity(
                name,
                cursor,
                model_save["bandwidth"],
                model_save["kernel"],
                model_save["p"],
                model_save["max_leaf_nodes"],
                model_save["max_depth"],
                model_save["min_samples_leaf"],
                model_save["nbins"],
                model_save["xlim"],
            )
            model.y = "KDE"
            model.map = model_save["map"]
            model.tree_name = model_save["tree_name"]
        elif model_save["type"] == "LocalOutlierFactor":
            from verticapy.learn.neighbors import LocalOutlierFactor

            model = LocalOutlierFactor(
                name, cursor, model_save["n_neighbors"], model_save["p"]
            )
            model.n_errors_ = model_save["n_errors"]
        elif model_save["type"] == "DBSCAN":
            from verticapy.learn.cluster import DBSCAN

            model = DBSCAN(
                name,
                cursor,
                model_save["eps"],
                model_save["min_samples"],
                model_save["p"],
            )
            model.n_cluster_ = model_save["n_cluster"]
            model.n_noise_ = model_save["n_noise"]
        elif model_save["type"] == "CountVectorizer":
            from verticapy.learn.preprocessing import CountVectorizer

            model = CountVectorizer(
                name,
                cursor,
                model_save["lowercase"],
                model_save["max_df"],
                model_save["min_df"],
                model_save["max_features"],
                model_save["ignore_special"],
                model_save["max_text_size"],
            )
            model.vocabulary_ = model_save["vocabulary"]
            model.stop_words_ = model_save["stop_words"]
        elif model_save["type"] == "SARIMAX":
            from verticapy.learn.tsa import SARIMAX

            model = SARIMAX(
                name,
                cursor,
                model_save["p"],
                model_save["d"],
                model_save["q"],
                model_save["P"],
                model_save["D"],
                model_save["Q"],
                model_save["s"],
                model_save["tol"],
                model_save["max_iter"],
                model_save["solver"],
                model_save["max_pik"],
                model_save["papprox_ma"],
            )
            model.transform_relation = model_save["transform_relation"]
            model.coef_ = tablesample(model_save["coef"])
            model.ma_avg_ = model_save["ma_avg"]
            # ma_piq is only wrapped in a tablesample when it was saved as a dict.
            if isinstance(model_save["ma_piq"], dict):
                model.ma_piq_ = tablesample(model_save["ma_piq"])
            else:
                model.ma_piq_ = None
            model.ts = model_save["ts"]
            model.exogenous = model_save["exogenous"]
            model.deploy_predict_ = model_save["deploy_predict"]
        elif model_save["type"] == "VAR":
            from verticapy.learn.tsa import VAR

            model = VAR(
                name,
                cursor,
                model_save["p"],
                model_save["tol"],
                model_save["max_iter"],
                model_save["solver"],
            )
            model.transform_relation = model_save["transform_relation"]
            # One saved coefficient table ("coef_<i>") per entry of X.
            model.coef_ = []
            for i in range(len(model_save["X"])):
                model.coef_ += [tablesample(model_save["coef_{}".format(i)])]
            model.ts = model_save["ts"]
            model.deploy_predict_ = model_save["deploy_predict"]
            model.X = model_save["X"]
        # Attributes common to every model loaded from verticapy.attr.
        # An explicit input_relation argument overrides the saved one.
        if not(input_relation):
            model.input_relation = model_save["input_relation"]
        else:
            model.input_relation = input_relation
        model.X = model_save["X"]
        if model_save["type"] in (
            "KNeighborsRegressor",
            "KNeighborsClassifier",
            "NearestCentroid",
            "SARIMAX",
        ):
            model.y = model_save["y"]
            model.test_relation = model_save["test_relation"]
        elif model_save["type"] not in ("CountVectorizer", "VAR"):
            model.key_columns = model_save["key_columns"]
    else:
        # Path 2: rebuild the model from the information stored by Vertica.
        model_type = does_model_exist(name="{}.{}".format(schema, name), cursor=cursor, raise_error=False, return_model_type=True,)
        if model_type.lower() == "kmeans":
            # For kmeans the call is extracted from the model summary text.
            cursor.execute(
                "SELECT GET_MODEL_SUMMARY (USING PARAMETERS model_name = '" + name + "')"
            )
            info = cursor.fetchone()[0].replace("\n", " ")
            info = "kmeans(" + info.split("kmeans(")[1]
        elif model_type.lower() == "normalize_fit":
            # Normalizer is fully rebuilt here and returned immediately.
            from verticapy.learn.preprocessing import Normalizer

            model = Normalizer(name, cursor)
            model.param_ = model.get_attr("details")
            model.X = [
                '"' + item + '"' for item in model.param_.values["column_name"]
            ]
            # The method is deduced from the columns present in the details.
            if "avg" in model.param_.values:
                model.parameters["method"] = "zscore"
            elif "max" in model.param_.values:
                model.parameters["method"] = "minmax"
            else:
                model.parameters["method"] = "robust_zscore"
            return model
        else:
            cursor.execute(
                "SELECT GET_MODEL_ATTRIBUTE (USING PARAMETERS model_name = '" + name + "', attr_name = 'call_string')"
            )
            info = cursor.fetchone()[0].replace("\n", " ")
        # Split the call into [function name, argument text].
        if "SELECT " in info:
            info = info.split("SELECT ")[1].split("(")
        else:
            info = info.split("(")
        model_type = info[0].lower()
        # Remove all spaces, then separate the positional arguments (info[0])
        # from the "USING PARAMETERS" list (info[1]).
        info = info[1].split(")")[0].replace(" ", "").split("USINGPARAMETERS")
        if model_type == "svm_classifier" and "class_weights='none'" not in " ".join(info).lower():
            # class_weights is a quoted value that itself contains commas, so
            # it is handled separately from the plain comma split.
            parameters = "".join(info[1].split("class_weights=")[1].split("'"))
            parameters = parameters[3 : len(parameters)].split(",")
            del parameters[0]
            parameters += [
                "class_weights=" + info[1].split("class_weights=")[1].split("'")[1]
            ]
        elif model_type != "svd":
            parameters = info[1].split(",")
        else:
            parameters = []
        # Build {parameter name: raw string value}; items without "=" are skipped.
        parameters = [item.split("=") for item in parameters]
        parameters_dict = {}
        for item in parameters:
            if len(item) > 1:
                parameters_dict[item[0]] = item[1]
        info = info[0]
        # Drop the single quotes from the string values.
        for elem in parameters_dict:
            if isinstance(parameters_dict[elem], str):
                parameters_dict[elem] = parameters_dict[elem].replace("'", "")
        # Instantiate the VerticaPy class matching the Vertica function name,
        # casting the string parameters to the expected Python types.
        if model_type == "rf_regressor":
            from verticapy.learn.ensemble import RandomForestRegressor

            model = RandomForestRegressor(
                name,
                cursor,
                int(parameters_dict["ntree"]),
                int(parameters_dict["mtry"]),
                int(parameters_dict["max_breadth"]),
                float(parameters_dict["sampling_size"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["min_leaf_size"]),
                float(parameters_dict["min_info_gain"]),
                int(parameters_dict["nbins"]),
            )
        elif model_type == "rf_classifier":
            from verticapy.learn.ensemble import RandomForestClassifier

            model = RandomForestClassifier(
                name,
                cursor,
                int(parameters_dict["ntree"]),
                int(parameters_dict["mtry"]),
                int(parameters_dict["max_breadth"]),
                float(parameters_dict["sampling_size"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["min_leaf_size"]),
                float(parameters_dict["min_info_gain"]),
                int(parameters_dict["nbins"]),
            )
        elif model_type == "xgb_classifier":
            from verticapy.learn.ensemble import XGBoostClassifier

            model = XGBoostClassifier(
                name,
                cursor,
                int(parameters_dict["max_ntree"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["nbins"]),
                parameters_dict["objective"],
                parameters_dict["split_proposal_method"],
                float(parameters_dict["epsilon"]),
                float(parameters_dict["learning_rate"]),
                float(parameters_dict["min_split_loss"]),
                float(parameters_dict["weight_reg"]),
                float(parameters_dict["sampling_size"]),
            )
        elif model_type == "xgb_regressor":
            from verticapy.learn.ensemble import XGBoostRegressor

            model = XGBoostRegressor(
                name,
                cursor,
                int(parameters_dict["max_ntree"]),
                int(parameters_dict["max_depth"]),
                int(parameters_dict["nbins"]),
                parameters_dict["objective"],
                parameters_dict["split_proposal_method"],
                float(parameters_dict["epsilon"]),
                float(parameters_dict["learning_rate"]),
                float(parameters_dict["min_split_loss"]),
                float(parameters_dict["weight_reg"]),
                float(parameters_dict["sampling_size"]),
            )
        elif model_type == "logistic_reg":
            from verticapy.learn.linear_model import LogisticRegression

            model = LogisticRegression(
                name,
                cursor,
                parameters_dict["regularization"],
                float(parameters_dict["epsilon"]),
                float(parameters_dict["lambda"]),
                int(parameters_dict["max_iterations"]),
                parameters_dict["optimizer"],
                float(parameters_dict["alpha"]),
            )
        elif model_type == "linear_reg":
            from verticapy.learn.linear_model import (
                LinearRegression,
                Lasso,
                Ridge,
                ElasticNet,
            )

            # The regularization parameter selects the Python class.
            if parameters_dict["regularization"] == "none":
                model = LinearRegression(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                )
            elif parameters_dict["regularization"] == "l1":
                model = Lasso(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    float(parameters_dict["lambda"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                )
            elif parameters_dict["regularization"] == "l2":
                model = Ridge(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    float(parameters_dict["lambda"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                )
            else:
                model = ElasticNet(
                    name,
                    cursor,
                    float(parameters_dict["epsilon"]),
                    float(parameters_dict["lambda"]),
                    int(parameters_dict["max_iterations"]),
                    parameters_dict["optimizer"],
                    float(parameters_dict["alpha"]),
                )
        elif model_type == "naive_bayes":
            from verticapy.learn.naive_bayes import NaiveBayes

            model = NaiveBayes(name, cursor, float(parameters_dict["alpha"]))
        elif model_type == "svm_regressor":
            from verticapy.learn.svm import LinearSVR

            model = LinearSVR(
                name,
                cursor,
                float(parameters_dict["epsilon"]),
                float(parameters_dict["C"]),
                True,
                float(parameters_dict["intercept_scaling"]),
                parameters_dict["intercept_mode"],
                float(parameters_dict["error_tolerance"]),
                int(parameters_dict["max_iterations"]),
            )
        elif model_type == "svm_classifier":
            from verticapy.learn.svm import LinearSVC

            # Each weight is converted to float; non-numeric entries become None.
            class_weights = parameters_dict["class_weights"].split(",")
            for idx, elem in enumerate(class_weights):
                try:
                    class_weights[idx] = float(class_weights[idx])
                except:
                    class_weights[idx] = None
            model = LinearSVC(
                name,
                cursor,
                float(parameters_dict["epsilon"]),
                float(parameters_dict["C"]),
                True,
                float(parameters_dict["intercept_scaling"]),
                parameters_dict["intercept_mode"],
                class_weights,
                int(parameters_dict["max_iterations"]),
            )
        elif model_type == "kmeans":
            from verticapy.learn.cluster import KMeans

            # The last positional argument of the call is passed as the
            # third constructor argument.
            model = KMeans(
                name,
                cursor,
                int(info.split(",")[-1]),
                parameters_dict["init_method"],
                int(parameters_dict["max_iterations"]),
                float(parameters_dict["epsilon"]),
            )
            model.cluster_centers_ = model.get_attr("centers")
            # The metrics attribute is a text block; each value is extracted
            # by splitting on its label.
            result = model.get_attr("metrics").values["metrics"][0]
            values = {
                "index": [
                    "Between-Cluster Sum of Squares",
                    "Total Sum of Squares",
                    "Total Within-Cluster Sum of Squares",
                    "Between-Cluster SS / Total SS",
                    "converged",
                ]
            }
            values["value"] = [
                float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0]),
                float(result.split("Total Sum of Squares: ")[1].split("\n")[0]),
                float(
                    result.split("Total Within-Cluster Sum of Squares: ")[1].split("\n")[0]
                ),
                float(result.split("Between-Cluster Sum of Squares: ")[1].split("\n")[0])
                / float(result.split("Total Sum of Squares: ")[1].split("\n")[0]),
                result.split("Converged: ")[1].split("\n")[0] == "True",
            ]
            model.metrics_ = tablesample(values)
        elif model_type == "bisecting_kmeans":
            from verticapy.learn.cluster import BisectingKMeans

            model = BisectingKMeans(
                name,
                cursor,
                int(info.split(",")[-1]),
                int(parameters_dict["bisection_iterations"]),
                parameters_dict["split_method"],
                int(parameters_dict["min_divisible_cluster_size"]),
                parameters_dict["distance_method"],
                parameters_dict["kmeans_center_init_method"],
                int(parameters_dict["kmeans_max_iterations"]),
                float(parameters_dict["kmeans_epsilon"]),
            )
            model.metrics_ = model.get_attr("Metrics")
            model.cluster_centers_ = model.get_attr("BKTree")
        elif model_type == "pca":
            from verticapy.learn.decomposition import PCA

            # NOTE(review): parameters_dict values are strings, so bool(...)
            # is True for any non-empty value (including "false"); confirm
            # this is intended.
            model = PCA(name, cursor, 0, bool(parameters_dict["scale"]))
            model.components_ = model.get_attr("principal_components")
            model.explained_variance_ = model.get_attr("singular_values")
            model.mean_ = model.get_attr("columns")
        elif model_type == "svd":
            from verticapy.learn.decomposition import SVD

            model = SVD(name, cursor)
            model.singular_values_ = model.get_attr("right_singular_vectors")
            model.explained_variance_ = model.get_attr("singular_values")
        elif model_type == "one_hot_encoder_fit":
            from verticapy.learn.preprocessing import OneHotEncoder

            model = OneHotEncoder(name, cursor)
            # Try to combine integer and varchar categories; if that fails,
            # fall back to whichever single attribute can be read.
            try:
                model.param_ = to_tablesample(
                    query="SELECT category_name, category_level::varchar, category_level_index FROM (SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'integer_categories')) VERTICAPY_SUBTABLE UNION ALL SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'varchar_categories')".format(
                        model.name, model.name
                    ),
                    cursor=model.cursor,
                )
            except:
                try:
                    model.param_ = model.get_attr("integer_categories")
                except:
                    model.param_ = model.get_attr("varchar_categories")
        # Relations: an explicit argument wins, otherwise the second
        # positional argument of the call string is used.
        if not(input_relation):
            model.input_relation = info.split(",")[1].replace("'", "").replace("\\", "")
        else:
            model.input_relation = input_relation
        model.test_relation = test_relation if (test_relation) else model.input_relation
        # Call strings with a response: third argument is y, the rest are X.
        # NOTE(review): "bisecting_kmeans" is not excluded here and the elif
        # below lists "bisectingkmeans" / "normalizer", which do not match the
        # model_type values tested above; verify the intended handling.
        if model_type not in ("kmeans", "pca", "svd", "one_hot_encoder_fit"):
            model.X = info.split(",")[3 : len(info.split(","))]
            model.X = [item.replace("'", "").replace("\\", "") for item in model.X]
            model.y = info.split(",")[2].replace("'", "").replace("\\", "")
        elif model_type in (
            "svd",
            "pca",
            "one_hot_encoder_fit",
            "normalizer",
            "kmeans",
            "bisectingkmeans",
        ):
            model.X = info.split(",")[2 : len(info.split(","))]
            model.X = [item.replace("'", "").replace("\\", "") for item in model.X]
        if model_type in ("naive_bayes", "rf_classifier", "xgb_classifier"):
            # Classes are the distinct non-null response values; if the query
            # fails, [0, 1] is used instead.
            try:
                cursor.execute(
                    "SELECT DISTINCT {} FROM {} WHERE {} IS NOT NULL ORDER BY 1".format(
                        model.y, model.input_relation, model.y
                    )
                )
                classes = cursor.fetchall()
                model.classes_ = [item[0] for item in classes]
            except:
                model.classes_ = [0, 1]
        elif model_type in ("svm_classifier", "logistic_reg"):
            model.classes_ = [0, 1]
        if model_type in ("svm_classifier", "svm_regressor", "logistic_reg", "linear_reg",):
            model.coef_ = model.get_attr("details")
    return model