Ejemplo n.º 1
0
 def __init__(
     self,
     name: str,
     max_ntree: int = 10,
     max_depth: int = 5,
     nbins: int = 32,
     split_proposal_method: str = "global",
     tol: float = 0.001,
     learning_rate: float = 0.1,
     min_split_loss: float = 0.0,
     weight_reg: float = 0.0,
     sample: float = 1.0,
     col_sample_by_tree: float = 1.0,
     col_sample_by_node: float = 1.0,
 ):
     """
     Record the model name and hyper-parameters of an XGBoostRegressor.

     The two column-sampling parameters are only stored when the version
     reported by ``version()`` is later than 11.0.0; otherwise they are
     silently left out of the parameter set.
     """
     # NOTE(review): presumably enforces a minimum supported version and
     # validates that `name` is a string -- both helpers live elsewhere.
     version(condition=[10, 1, 0])
     check_types([("name", name, [str], False)])
     self.type = "XGBoostRegressor"
     self.name = name
     hyperparameters = dict(
         max_ntree=max_ntree,
         max_depth=max_depth,
         nbins=nbins,
         split_proposal_method=split_proposal_method,
         tol=tol,
         learning_rate=learning_rate,
         min_split_loss=min_split_loss,
         weight_reg=weight_reg,
         sample=sample,
     )
     current = version()
     # True for any version strictly after 11.0.0 (11.0.1, 11.1.0, 12.x, ...).
     has_col_sampling = current[0] > 11 or (
         current[0] == 11 and (current[1] >= 1 or current[2] >= 1)
     )
     if has_col_sampling:
         hyperparameters.update(
             col_sample_by_tree=col_sample_by_tree,
             col_sample_by_node=col_sample_by_node,
         )
     self.set_params(hyperparameters)
Ejemplo n.º 2
0
 def __init__(
     self,
     name: str,
     n_estimators: int = 10,
     max_features: Union[int, str] = "auto",
     max_leaf_nodes: int = 1e9,
     sample: float = 0.632,
     max_depth: int = 5,
     min_samples_leaf: int = 1,
     min_info_gain: float = 0.0,
     nbins: int = 32,
 ):
     """
     Record the model name and hyper-parameters of a RandomForestRegressor.

     All keyword arguments are forwarded unchanged to ``set_params``.
     """
     # NOTE(review): the default of `max_leaf_nodes` is the float 1e9 even
     # though it is annotated as int -- confirm which one is intended.
     version(condition=[9, 0, 1])
     check_types([("name", name, [str], False)])
     self.type = "RandomForestRegressor"
     self.name = name
     hyperparameters = dict(
         n_estimators=n_estimators,
         max_features=max_features,
         max_leaf_nodes=max_leaf_nodes,
         sample=sample,
         max_depth=max_depth,
         min_samples_leaf=min_samples_leaf,
         min_info_gain=min_info_gain,
         nbins=nbins,
     )
     self.set_params(hyperparameters)
Ejemplo n.º 3
0
 def xgboost_tree_dict_list(model):
     """
     Build the "gradient_booster" dictionary describing every tree of `model`.

     For an XGBoostClassifier whose classes are not exactly [0, 1], one tree
     per (iteration, class) pair is emitted; when the version reported by
     ``version()`` is not later than 11.0.0, one extra dummy tree per class is
     appended. In every other case a single tree per iteration is emitted.

     Returns
     -------
     dict
         {"model": {"trees", "tree_info", "gbtree_model_param"}, "name": "gbtree"}
     """
     tree_count = model.get_attr("tree_count")["tree_count"][0]
     one_tree_per_class = model.type == "XGBoostClassifier" and not (
         len(model.classes_) <= 2
         and model.classes_[1] == 1
         and model.classes_[0] == 0
     )
     if not one_tree_per_class:
         trees = [xgboost_tree_dict(model, it) for it in range(tree_count)]
         tree_info = [0 for _ in range(tree_count)]
     else:
         trees = [
             xgboost_tree_dict(model, it, str(label))
             for it in range(tree_count)
             for label in model.classes_
         ]
         current = version()
         # True for any version strictly after 11.0.0.
         is_recent = current[0] > 11 or (
             current[0] == 11 and (current[1] >= 1 or current[2] >= 1)
         )
         if not is_recent:
             trees.extend(
                 xgboost_dummy_tree_dict(model, class_idx)
                 for class_idx in range(len(model.classes_))
             )
         # Class index of each tree: 0..k-1 repeated once per round (plus one
         # extra round for the dummy trees when they were appended).
         tree_info = list(range(len(model.classes_))) * (
             tree_count + int(not is_recent)
         )
         for position, tree in enumerate(trees):
             tree["id"] = position
     gbtree_model_param = {
         "num_trees": str(len(trees)),
         "size_leaf_vector": "0",
     }
     return {
         "model": {
             "trees": trees,
             "tree_info": tree_info,
             "gbtree_model_param": gbtree_model_param,
         },
         "name": "gbtree",
     }
Ejemplo n.º 4
0
 def __init__(
     self,
     name: str,
     n_estimators: int = 100,
     max_depth: int = 10,
     nbins: int = 32,
     sample: float = 0.632,
     col_sample_by_tree: float = 1.0,
 ):
     """
     Record the model name and hyper-parameters of an IsolationForest.

     All keyword arguments are forwarded unchanged to ``set_params``.
     """
     # NOTE(review): presumably enforces a minimum supported version and
     # validates that `name` is a string -- both helpers live elsewhere.
     version(condition=[12, 0, 0])
     check_types([("name", name, [str], False)])
     self.type = "IsolationForest"
     self.name = name
     self.set_params(
         dict(
             n_estimators=n_estimators,
             max_depth=max_depth,
             nbins=nbins,
             sample=sample,
             col_sample_by_tree=col_sample_by_tree,
         )
     )
Ejemplo n.º 5
0
    def get_prior(self):
        """
        ---------------------------------------------------------------------------
        Returns the XGB Priors.

        Only rows where every predictor and the response are non-NULL are
        taken into account.

        Returns
        -------
        float or list
            - XGBoostRegressor, or a model whose classes are exactly [0, 1]:
              the value returned by ``AVG(y)`` (a single scalar, not a list).
            - Any other model, when the version reported by ``version()`` is
              later than 11.0.0: a list of 0.0, one entry per class.
            - Any other model on older versions: a list holding, for each
              class, the log-odds of its relative frequency.
        """
        from verticapy.utilities import version

        not_null_filter = " AND ".join(
            ["{} IS NOT NULL".format(column) for column in self.X]
            + ["{} IS NOT NULL".format(self.y)]
        )
        v = version()
        # True for any version strictly after 11.0.0.
        v = v[0] > 11 or (v[0] == 11 and (v[1] >= 1 or v[2] >= 1))
        if self.type == "XGBoostRegressor" or (
            len(self.classes_) == 2 and self.classes_[1] == 1 and self.classes_[0] == 0
        ):
            return executeSQL(
                "SELECT AVG({}) FROM {} WHERE {}".format(
                    self.y, self.input_relation, not_null_filter
                ),
                method="fetchfirstelem",
                print_time_sql=False,
            )
        if v:
            return [0.0 for _ in self.classes_]
        # The denominator does not depend on the class, so it is queried once
        # instead of once per class.
        total_rows = executeSQL(
            "SELECT COUNT(*) FROM {} WHERE {}".format(
                self.input_relation, not_null_filter
            ),
            method="fetchfirstelem",
            print_time_sql=False,
        )
        prior_ = []
        for label in self.classes_:
            # Double embedded single quotes so that a label such as O'Brien
            # stays a valid SQL string literal.
            escaped_label = str(label).replace("'", "''")
            class_rows = executeSQL(
                "SELECT COUNT(*) FROM {} WHERE {} AND {} = '{}'".format(
                    self.input_relation, not_null_filter, self.y, escaped_label
                ),
                method="fetchfirstelem",
                print_time_sql=False,
            )
            frequency = class_rows / total_rows
            prior_.append(np.log(frequency / (1 - frequency)))
        return prior_
Ejemplo n.º 6
0
 def _xgboost_sklearn_attributes(
     n_estimators,
     objective,
     max_depth,
     learning_rate,
     col_sample_by_tree,
     col_sample_by_node,
     classes=None,
     n_classes=None,
 ):
     """
     Render the JSON-like "scikit_learn" attribute string of an XGBoost model.

     When `classes` (an already rendered string such as "[0, 1]") is given the
     classifier layout is produced, otherwise the regressor layout.
     """
     if classes is None:
         head = "{"
         tail = '"_estimator_type": "regressor"}'
     else:
         head = '{"use_label_encoder": true, '
         tail = (
             '"classes_": '
             + classes
             + ', "n_classes_": '
             + str(n_classes)
             + ', "_le": {"classes_": '
             + classes
             + '}, "_estimator_type": "classifier"}'
         )
     return (
         head
         + '"n_estimators": '
         + str(n_estimators)
         + ', "objective": "'
         + objective
         + '", "max_depth": '
         + str(max_depth)
         + ', "learning_rate": '
         + str(learning_rate)
         + ', "verbosity": null, "booster": null, "tree_method": null,'
         + ' "gamma": null, "min_child_weight": null, "max_delta_step":'
         + ' null, "subsample": null, "colsample_bytree": '
         + str(col_sample_by_tree)
         + ', "colsample_bylevel": null, "colsample_bynode": '
         + str(col_sample_by_node)
         + ', "reg_alpha": null, "reg_lambda": null, "scale_pos_weight":'
         + ' null, "base_score": null, "missing": NaN, "num_parallel_tree"'
         + ': null, "kwargs": {}, "random_state": null, "n_jobs": null,'
         + ' "monotone_constraints": null, "interaction_constraints": null,'
         + ' "importance_type": "gain", "gpu_id": null, "validate_parameters"'
         + ": null, "
         + tail
     )

 def xgboost_learner(model):
     """
     Build the "learner" dictionary of an XGBoost-style export of `model`.

     Three layouts are produced depending on the model:
     - XGBoostRegressor: objective "reg:squarederror";
     - classifier whose classes are exactly [0, 1]: "binary:logistic";
     - any other classifier: "multi:softprob" with a fixed base score of 0.5.

     Returns
     -------
     dict
         Keys: "attributes", "feature_names", "feature_types",
         "gradient_booster", "learner_model_param" and "objective".
     """
     v = version()
     # True for any version strictly after 11.0.0; older versions do not
     # store the column-sampling parameters, so "null" is emitted instead.
     v = v[0] > 11 or (v[0] == 11 and (v[1] >= 1 or v[2] >= 1))
     if v:
         col_sample_by_tree = model.parameters["col_sample_by_tree"]
         col_sample_by_node = model.parameters["col_sample_by_node"]
     else:
         col_sample_by_tree = "null"
         col_sample_by_node = "null"
     n = model.get_attr("tree_count")["tree_count"][0]
     is_regressor = model.type == "XGBoostRegressor"
     if is_regressor or (
         len(model.classes_) == 2
         and model.classes_[1] == 1
         and model.classes_[0] == 0
     ):
         bs, num_class, param, param_val = (
             model.prior_,
             "0",
             "reg_loss_param",
             {"scale_pos_weight": "1"},
         )
         if is_regressor:
             objective, classes, n_classes = "reg:squarederror", None, None
         else:
             objective, classes, n_classes = "binary:logistic", "[0, 1]", 2
     else:
         objective, bs, num_class, param, param_val = (
             "multi:softprob",
             0.5,
             str(len(model.classes_)),
             "softmax_multiclass_param",
             {"num_class": str(len(model.classes_))},
         )
         classes, n_classes = str(model.classes_), len(model.classes_)
     scikit_learn = _xgboost_sklearn_attributes(
         n,
         objective,
         model.parameters["max_depth"],
         model.parameters["learning_rate"],
         col_sample_by_tree,
         col_sample_by_node,
         classes=classes,
         n_classes=n_classes,
     )
     # Every double quote is swapped for the "++++" placeholder; the reason is
     # not visible here -- presumably the caller restores it when serializing.
     attributes_dict = {"scikit_learn": scikit_learn.replace('"', "++++")}
     gradient_booster = xgboost_tree_dict_list(model)
     return {
         "attributes": attributes_dict,
         "feature_names": [],
         "feature_types": [],
         "gradient_booster": gradient_booster,
         "learner_model_param": {
             "base_score": np.format_float_scientific(
                 bs, precision=7
             ).upper(),
             "num_class": num_class,
             "num_feature": str(len(model.X)),
         },
         "objective": {"name": objective, param: param_val},
     }