def __init__(
    self,
    name: str,
    max_ntree: int = 10,
    max_depth: int = 5,
    nbins: int = 32,
    split_proposal_method: str = "global",
    tol: float = 0.001,
    learning_rate: float = 0.1,
    min_split_loss: float = 0.0,
    weight_reg: float = 0.0,
    sample: float = 1.0,
    col_sample_by_tree: float = 1.0,
    col_sample_by_node: float = 1.0,
):
    version(condition=[10, 1, 0])
    check_types([("name", name, [str], False)])
    self.type, self.name = "XGBoostRegressor", name
    params = {
        "max_ntree": max_ntree,
        "max_depth": max_depth,
        "nbins": nbins,
        "split_proposal_method": split_proposal_method,
        "tol": tol,
        "learning_rate": learning_rate,
        "min_split_loss": min_split_loss,
        "weight_reg": weight_reg,
        "sample": sample,
    }
    # Column sampling is only available on Vertica versions strictly
    # greater than 11.0.0.
    v = version()
    v = v[0] > 11 or (v[0] == 11 and (v[1] >= 1 or v[2] >= 1))
    if v:
        params["col_sample_by_tree"] = col_sample_by_tree
        params["col_sample_by_node"] = col_sample_by_node
    self.set_params(params)
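
# Usage sketch (illustrative, not part of the original module): assumes the
# enclosing XGBoostRegressor class, an open VerticaPy connection, and a
# relation "public.houses" with numeric predictors and a "price" response.
# Wrapped in a function so importing the module stays side-effect free.
def _example_xgboost_regressor():
    model = XGBoostRegressor("xgb_houses", max_ntree=50, learning_rate=0.05)
    model.fit("public.houses", ["area", "rooms"], "price")
    return model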
def __init__(
    self,
    name: str,
    n_estimators: int = 10,
    max_features: Union[int, str] = "auto",
    max_leaf_nodes: int = int(1e9),
    sample: float = 0.632,
    max_depth: int = 5,
    min_samples_leaf: int = 1,
    min_info_gain: float = 0.0,
    nbins: int = 32,
):
    version(condition=[9, 0, 1])
    check_types([("name", name, [str], False)])
    self.type, self.name = "RandomForestRegressor", name
    self.set_params(
        {
            "n_estimators": n_estimators,
            "max_features": max_features,
            "max_leaf_nodes": max_leaf_nodes,
            "sample": sample,
            "max_depth": max_depth,
            "min_samples_leaf": min_samples_leaf,
            "min_info_gain": min_info_gain,
            "nbins": nbins,
        }
    )
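
# Usage sketch (illustrative, not part of the original module): same
# assumptions as the XGBoostRegressor example above.
def _example_random_forest_regressor():
    model = RandomForestRegressor("rf_houses", n_estimators=40, max_depth=6)
    model.fit("public.houses", ["area", "rooms"], "price")
    return model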
def xgboost_tree_dict_list(model):
    n = model.get_attr("tree_count")["tree_count"][0]
    if model.type == "XGBoostClassifier" and (
        len(model.classes_) > 2 or model.classes_[1] != 1 or model.classes_[0] != 0
    ):
        # Multiclass (or non-{0, 1} binary) classifier: one tree per class
        # per boosting round.
        trees = []
        for i in range(n):
            for c in model.classes_:
                trees += [xgboost_tree_dict(model, i, str(c))]
        # True for Vertica versions strictly greater than 11.0.0.
        v = version()
        v = v[0] > 11 or (v[0] == 11 and (v[1] >= 1 or v[2] >= 1))
        if not v:
            # Older versions require one extra dummy tree per class.
            for i in range(len(model.classes_)):
                trees += [xgboost_dummy_tree_dict(model, i)]
        tree_info = [i for i in range(len(model.classes_))] * (n + int(not v))
        for idx, tree in enumerate(trees):
            tree["id"] = idx
    else:
        # Regression or standard {0, 1} binary classification: one tree per
        # boosting round, all mapped to class 0.
        trees = [xgboost_tree_dict(model, i) for i in range(n)]
        tree_info = [0 for i in range(n)]
    return {
        "model": {
            "trees": trees,
            "tree_info": tree_info,
            "gbtree_model_param": {
                "num_trees": str(len(trees)),
                "size_leaf_vector": "0",
            },
        },
        "name": "gbtree",
    }
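
# Shape sketch (illustrative, not part of the original module): inspects the
# booster dict built above. For a 3-class classifier trained for n rounds on
# Vertica > 11.0.0, tree_info reads [0, 1, 2, 0, 1, 2, ...] and num_trees is
# str(3 * n).
def _example_booster_shape(model):
    booster = xgboost_tree_dict_list(model)
    assert booster["name"] == "gbtree"
    params = booster["model"]["gbtree_model_param"]
    return params["num_trees"], booster["model"]["tree_info"]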
def __init__(
    self,
    name: str,
    n_estimators: int = 100,
    max_depth: int = 10,
    nbins: int = 32,
    sample: float = 0.632,
    col_sample_by_tree: float = 1.0,
):
    version(condition=[12, 0, 0])
    check_types([("name", name, [str], False)])
    self.type, self.name = "IsolationForest", name
    params = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "nbins": nbins,
        "sample": sample,
        "col_sample_by_tree": col_sample_by_tree,
    }
    self.set_params(params)
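
# Usage sketch (illustrative, not part of the original module): assumes the
# enclosing IsolationForest class and a relation "public.sensors"; anomaly
# detection is unsupervised, so the sketch passes no response column.
def _example_isolation_forest():
    model = IsolationForest("if_sensors", n_estimators=100, sample=0.632)
    model.fit("public.sensors", ["temp", "pressure"])
    return model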
def get_prior(self):
    """
---------------------------------------------------------------------------
Returns the XGB priors.

Returns
-------
float / list
    XGB priors: the average of the response for regression and standard
    {0, 1} binary classification, one value per class otherwise.
    """
    from verticapy.utilities import version

    condition = ["{} IS NOT NULL".format(elem) for elem in self.X] + [
        "{} IS NOT NULL".format(self.y)
    ]
    # True for Vertica versions strictly greater than 11.0.0.
    v = version()
    v = v[0] > 11 or (v[0] == 11 and (v[1] >= 1 or v[2] >= 1))
    if self.type == "XGBoostRegressor" or (
        len(self.classes_) == 2 and self.classes_[1] == 1 and self.classes_[0] == 0
    ):
        # Regression or standard {0, 1} binary classification: the prior is
        # the average of the response on the non-NULL rows.
        prior_ = executeSQL(
            "SELECT AVG({}) FROM {} WHERE {}".format(
                self.y, self.input_relation, " AND ".join(condition)
            ),
            method="fetchfirstelem",
            print_time_sql=False,
        )
    elif not v:
        # Older Vertica versions: one log-odds prior per class, computed
        # from the class frequency.
        prior_ = []
        for elem in self.classes_:
            avg = executeSQL(
                "SELECT COUNT(*) FROM {} WHERE {} AND {} = '{}'".format(
                    self.input_relation, " AND ".join(condition), self.y, elem
                ),
                method="fetchfirstelem",
                print_time_sql=False,
            )
            avg /= executeSQL(
                "SELECT COUNT(*) FROM {} WHERE {}".format(
                    self.input_relation, " AND ".join(condition)
                ),
                method="fetchfirstelem",
                print_time_sql=False,
            )
            logodds = np.log(avg / (1 - avg))
            prior_ += [logodds]
    else:
        # Recent Vertica versions use a zero prior per class.
        prior_ = [0.0 for elem in self.classes_]
    return prior_
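
# Worked sketch (illustrative, values invented): the per-class log-odds prior
# computed above for a class covering 30 of 100 filtered rows.
def _example_logodds_prior():
    import numpy as np

    p = 30 / 100
    return np.log(p / (1 - p))  # ≈ -0.8473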
def xgboost_learner(model):
    # Column sampling parameters exist only on Vertica versions strictly
    # greater than 11.0.0; otherwise they are exported as JSON nulls.
    v = version()
    v = v[0] > 11 or (v[0] == 11 and (v[1] >= 1 or v[2] >= 1))
    if v:
        col_sample_by_tree = model.parameters["col_sample_by_tree"]
        col_sample_by_node = model.parameters["col_sample_by_node"]
    else:
        col_sample_by_tree = "null"
        col_sample_by_node = "null"
    condition = ["{} IS NOT NULL".format(elem) for elem in model.X] + [
        "{} IS NOT NULL".format(model.y)
    ]
    n = model.get_attr("tree_count")["tree_count"][0]
    if model.type == "XGBoostRegressor" or (
        len(model.classes_) == 2
        and model.classes_[1] == 1
        and model.classes_[0] == 0
    ):
        bs, num_class, param, param_val = (
            model.prior_,
            "0",
            "reg_loss_param",
            {"scale_pos_weight": "1"},
        )
        if model.type == "XGBoostRegressor":
            objective = "reg:squarederror"
            attributes_dict = {
                "scikit_learn": '{"n_estimators": '
                + str(n)
                + ', "objective": "reg:squarederror", "max_depth": '
                + str(model.parameters["max_depth"])
                + ', "learning_rate": '
                + str(model.parameters["learning_rate"])
                + ', "verbosity": null, "booster": null, "tree_method": null,'
                + ' "gamma": null, "min_child_weight": null, "max_delta_step":'
                + ' null, "subsample": null, "colsample_bytree": '
                + str(col_sample_by_tree)
                + ', "colsample_bylevel": null, "colsample_bynode": '
                + str(col_sample_by_node)
                + ', "reg_alpha": null, "reg_lambda": null, "scale_pos_weight":'
                + ' null, "base_score": null, "missing": NaN, "num_parallel_tree"'
                + ': null, "kwargs": {}, "random_state": null, "n_jobs": null, '
                + '"monotone_constraints": null, "interaction_constraints": null,'
                + ' "importance_type": "gain", "gpu_id": null, "validate_parameters"'
                + ': null, "_estimator_type": "regressor"}'
            }
        else:
            objective = "binary:logistic"
            attributes_dict = {
                "scikit_learn": '{"use_label_encoder": true, "n_estimators": '
                + str(n)
                + ', "objective": "binary:logistic", "max_depth": '
                + str(model.parameters["max_depth"])
                + ', "learning_rate": '
                + str(model.parameters["learning_rate"])
                + ', "verbosity": null, "booster": null, "tree_method": null,'
                + ' "gamma": null, "min_child_weight": null, "max_delta_step":'
                + ' null, "subsample": null, "colsample_bytree": '
                + str(col_sample_by_tree)
                + ', "colsample_bylevel": null, "colsample_bynode": '
                + str(col_sample_by_node)
                + ', "reg_alpha": null, "reg_lambda": null, "scale_pos_weight":'
                + ' null, "base_score": null, "missing": NaN, "num_parallel_tree"'
                + ': null, "kwargs": {}, "random_state": null, "n_jobs": null,'
                + ' "monotone_constraints": null, "interaction_constraints": null,'
                + ' "importance_type": "gain", "gpu_id": null, "validate_parameters"'
                + ': null, "classes_": [0, 1], "n_classes_": 2, "_le": {"classes_": '
                + '[0, 1]}, "_estimator_type": "classifier"}'
            }
    else:
        objective, bs, num_class, param, param_val = (
            "multi:softprob",
            0.5,
            str(len(model.classes_)),
            "softmax_multiclass_param",
            {"num_class": str(len(model.classes_))},
        )
        attributes_dict = {
            "scikit_learn": '{"use_label_encoder": true, "n_estimators": '
            + str(n)
            + ', "objective": "multi:softprob", "max_depth": '
            + str(model.parameters["max_depth"])
            + ', "learning_rate": '
            + str(model.parameters["learning_rate"])
            + ', "verbosity": null, "booster": null, "tree_method": null, '
            + '"gamma": null, "min_child_weight": null, "max_delta_step": '
            + 'null, "subsample": null, "colsample_bytree": '
            + str(col_sample_by_tree)
            + ', "colsample_bylevel": null, "colsample_bynode": '
            + str(col_sample_by_node)
            + ', "reg_alpha": null, "reg_lambda": null, "scale_pos_weight":'
            + ' null, "base_score": null, "missing": NaN, "num_parallel_tree":'
            + ' null, "kwargs": {}, "random_state": null, "n_jobs": null, '
            + '"monotone_constraints": null, "interaction_constraints": null, '
            + '"importance_type": "gain", "gpu_id": null, "validate_parameters":'
            + ' null, "classes_": '
            + str(model.classes_)
            + ', "n_classes_": '
            + str(len(model.classes_))
            + ', "_le": {"classes_": '
            + str(model.classes_)
            + '}, "_estimator_type": "classifier"}'
        }
    # Mask the double quotes so the attribute string survives the outer JSON
    # serialization; they can be restored as escaped quotes afterwards.
    attributes_dict["scikit_learn"] = attributes_dict["scikit_learn"].replace(
        '"', "++++"
    )
    gradient_booster = xgboost_tree_dict_list(model)
    return {
        "attributes": attributes_dict,
        "feature_names": [],
        "feature_types": [],
        "gradient_booster": gradient_booster,
        "learner_model_param": {
            "base_score": np.format_float_scientific(bs, precision=7).upper(),
            "num_class": num_class,
            "num_feature": str(len(model.X)),
        },
        "objective": {"name": objective, param: param_val},
    }