Example no. 1
def _update_max_compilations_limit(es, limit="10000/1m"):
    """Update the cluster's transient script compilation rate limit,
    using the setting name appropriate for the connected ES version."""
    print(f"Updating script.max_compilations_rate to {limit}")
    # The setting became per-context in Elasticsearch 7.8:
    # 'script.max_compilations_rate' -> 'script.context.field.max_compilations_rate'
    if es_version(es) < (7, 8):
        body = {"transient": {"script.max_compilations_rate": limit}}
    else:
        body = {"transient": {"script.context.field.max_compilations_rate": limit}}
    es.cluster.put_settings(body=body)
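
A minimal usage sketch, assuming an elasticsearch-py client and that es_version is the helper from eland.common (the import path is an assumption; it isn't shown in the snippet):

from elasticsearch import Elasticsearch
from eland.common import es_version  # assumed import path

es = Elasticsearch("localhost")
_update_max_compilations_limit(es)            # default limit "10000/1m"
_update_max_compilations_limit(es, "500/5m")  # custom rate limit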
Example no. 2
    def _trained_model_config(self) -> Dict[str, Any]:
        """Lazily loads an ML models 'trained_model_config' information"""
        if self._trained_model_config_cache is None:

            # In Elasticsearch 7.7 and earlier you can't get the
            # target type without pulling the model definition,
            # so we check the version first.
            if es_version(self._client) < (7, 8):
                resp = self._client.ml.get_trained_models(
                    model_id=self._model_id, include_model_definition=True
                )
            else:
                resp = self._client.ml.get_trained_models(model_id=self._model_id)

            if resp["count"] > 1:
                raise ValueError(f"Model ID {self._model_id!r} wasn't unambiguous")
            elif resp["count"] == 0:
                raise ValueError(f"Model with Model ID {self._model_id!r} wasn't found")
            self._trained_model_config_cache = resp["trained_model_configs"][0]
        return self._trained_model_config_cache
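
For reference, a sketch of the response shape this property unpacks; the values are illustrative, not taken from a real cluster:

# Illustrative GET trained models response -- the code above reads
# resp["count"] and resp["trained_model_configs"][0].
resp = {
    "count": 1,
    "trained_model_configs": [
        {
            "model_id": "my_model",  # hypothetical model ID
            "input": {"field_names": ["f0", "f1"]},
            # "definition": {...},  # present only in the ES < 7.8 branch
        }
    ],
}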
Example no. 3
    def predict(
        self, X: Union[np.ndarray, List[float], List[List[float]]]
    ) -> np.ndarray:
        """
        Make a prediction using a trained model stored in Elasticsearch.

        Parameters for this method are not yet fully compatible with standard sklearn.predict.

        Parameters
        ----------
        X: Input feature vector.
           Must be a numpy ndarray, a list of floats, or a list of
           lists of floats. TODO: support DataFrame and other formats

        Returns
        -------
        y: np.ndarray of dtype float for regressors or int for classifiers

        Examples
        --------
        >>> import numpy as np
        >>> from sklearn import datasets
        >>> from xgboost import XGBRegressor
        >>> from eland.ml import MLModel

        >>> # Train model
        >>> training_data = datasets.make_classification(n_features=6, random_state=0)
        >>> test_data = [[-1, -2, -3, -4, -5, -6], [10, 20, 30, 40, 50, 60]]
        >>> regressor = XGBRegressor(objective='reg:squarederror')
        >>> regressor = regressor.fit(training_data[0], training_data[1])

        >>> # Get some test results
        >>> regressor.predict(np.array(test_data))  # doctest: +SKIP
        array([0.06062475, 0.9990102 ], dtype=float32)

        >>> # Serialise the model to Elasticsearch
        >>> feature_names = ["f0", "f1", "f2", "f3", "f4", "f5"]
        >>> model_id = "test_xgb_regressor"
        >>> es_model = MLModel.import_model('localhost', model_id, regressor, feature_names, es_if_exists='replace')

        >>> # Get some test results from Elasticsearch model
        >>> es_model.predict(test_data)  # doctest: +SKIP
        array([0.0606248 , 0.99901026], dtype=float32)

        >>> # Delete model from Elasticsearch
        >>> es_model.delete_model()
        """
        docs = []
        if isinstance(X, np.ndarray):
            # Convert the ndarray into (possibly nested) lists of floats
            # so it falls through to the list handling below.
            def to_list_or_float(x: Any) -> Union[List[Any], float]:
                if isinstance(x, np.ndarray):
                    return [to_list_or_float(i) for i in x.tolist()]
                elif isinstance(x, list):
                    return [to_list_or_float(i) for i in x]
                return float(x)

            X = to_list_or_float(X)

        # Is it a flat list of floats?
        if isinstance(X, list) and all(isinstance(i, (float, int)) for i in X):
            features = cast(List[List[float]], [X])
        # Is it a list of lists of floats?
        elif isinstance(X, list) and all(
            isinstance(i, list) and all(isinstance(ix, (float, int)) for ix in i)
            for i in X
        ):
            features = cast(List[List[float]], X)
        # Anything else isn't supported, so we error out.
        else:
            raise NotImplementedError(
                f"Prediction for type {type(X)}, not supported: {X!r}"
            )

        for i in features:
            doc = {"_source": dict(zip(self.feature_names, i))}
            docs.append(doc)

        # The parameter was renamed from 'field_mappings' to 'field_map' in ES 7.7
        field_map_name = (
            "field_map" if es_version(self._client) >= (7, 7) else "field_mappings"
        )

        results = self._client.ingest.simulate(
            body={
                "pipeline": {
                    "processors": [
                        {
                            "inference": {
                                "model_id": self._model_id,
                                "inference_config": {self.model_type: {}},
                                field_map_name: {},
                            }
                        }
                    ]
                },
                "docs": docs,
            }
        )

        # Unpack results into an array. Errors can be present
        # within the response without a non-2XX HTTP status code.
        y = []
        for res in results["docs"]:
            if "error" in res:
                raise RuntimeError(
                    f"Failed to run prediction for model ID {self._model_id!r}",
                    res["error"],
                )

            y.append(res["doc"]["_source"]["ml"]["inference"][self.results_field])

        # Return results as np.ndarray of float32 or int (consistent with sklearn/xgboost)
        if self.model_type == TYPE_CLASSIFICATION:
            dt = int  # 'np.int' is a deprecated alias for the builtin int
        else:
            dt = np.float32
        return np.asarray(y, dtype=dt)
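
For reference, a sketch of the ingest-simulate response unpacked in the loop above; the prediction values and the "predicted_value" results field are illustrative assumptions, not output from a real cluster:

# Illustrative _ingest/pipeline/_simulate response -- values are made up.
results = {
    "docs": [
        {"doc": {"_source": {"ml": {"inference": {"predicted_value": 0.0606}}}}},
        {"doc": {"_source": {"ml": {"inference": {"predicted_value": 0.999}}}}},
    ]
}
# With self.results_field == "predicted_value" (assumed), each prediction is
# read from res["doc"]["_source"]["ml"]["inference"]["predicted_value"].
# A failed document carries an "error" key in place of "doc".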
Example no. 4
    def import_model(
        cls,
        es_client: Union[str, List[str], Tuple[str, ...], "Elasticsearch"],
        model_id: str,
        model: Union[
            "DecisionTreeClassifier",
            "DecisionTreeRegressor",
            "RandomForestRegressor",
            "RandomForestClassifier",
            "XGBClassifier",
            "XGBRegressor",
            "LGBMRegressor",
            "LGBMClassifier",
        ],
        feature_names: List[str],
        classification_labels: Optional[List[str]] = None,
        classification_weights: Optional[List[float]] = None,
        es_if_exists: Optional[str] = None,
        es_compress_model_definition: bool = True,
    ) -> "MLModel":
        """
        Transform and serialize a trained 3rd party model into Elasticsearch.
        This model can then be used for inference in the Elastic Stack.

        Parameters
        ----------
        es_client: Elasticsearch client argument(s)
            - elasticsearch-py parameters or
            - elasticsearch-py instance

        model_id: str
            The unique identifier of the trained inference model in Elasticsearch.

        model: An instance of a supported Python model. We support the following model types:
            - sklearn.tree.DecisionTreeClassifier
            - sklearn.tree.DecisionTreeRegressor
            - sklearn.ensemble.RandomForestRegressor
            - sklearn.ensemble.RandomForestClassifier
            - lightgbm.LGBMRegressor
                - Categorical fields are expected to already be processed
                - Only the following objectives are supported:
                    - "regression"
                    - "regression_l1"
                    - "huber"
                    - "fair"
                    - "quantile"
                    - "mape"
            - lightgbm.LGBMClassifier
                - Categorical fields are expected to already be processed
                - Only the following objectives are supported:
                    - "binary"
                    - "multiclass"
                    - "multiclassova"
            - xgboost.XGBClassifier
                - Only the following objectives are supported:
                    - "binary:logistic"
                    - "multi:softmax"
                    - "multi:softprob"
            - xgboost.XGBRegressor
                - Only the following objectives are supported:
                    - "reg:squarederror"
                    - "reg:linear"
                    - "reg:squaredlogerror"
                    - "reg:logistic"
                    - "reg:pseudohubererror"

        feature_names: List[str]
            Names of the features (required)

        classification_labels: List[str]
            Labels of the classification targets

        classification_weights: List[float]
            Weights of the classification targets

        es_if_exists: {'fail', 'replace'}, default 'fail'
            How to behave if the model already exists

            - fail: Raise a ValueError
            - replace: Overwrite the existing model

        es_compress_model_definition: bool
            If True, uses 'compressed_definition', which sends gzipped
            JSON instead of raw JSON to reduce the amount of data sent
            over the wire in HTTP requests. Defaults to True.

        Examples
        --------
        >>> from sklearn import datasets
        >>> from sklearn.tree import DecisionTreeClassifier
        >>> from eland.ml import MLModel

        >>> # Train model
        >>> training_data = datasets.make_classification(n_features=5, random_state=0)
        >>> test_data = [[-50.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
        >>> classifier = DecisionTreeClassifier()
        >>> classifier = classifier.fit(training_data[0], training_data[1])

        >>> # Get some test results
        >>> classifier.predict(test_data)
        array([0, 1])

        >>> # Serialise the model to Elasticsearch
        >>> feature_names = ["f0", "f1", "f2", "f3", "f4"]
        >>> model_id = "test_decision_tree_classifier"
        >>> es_model = MLModel.import_model(
        ...   'localhost',
        ...   model_id=model_id,
        ...   model=classifier,
        ...   feature_names=feature_names,
        ...   es_if_exists='replace'
        ... )

        >>> # Get some test results from Elasticsearch model
        >>> es_model.predict(test_data)
        array([0, 1])

        >>> # Delete model from Elasticsearch
        >>> es_model.delete_model()
        """
        es_client = ensure_es_client(es_client)
        transformer = get_model_transformer(
            model,
            feature_names=feature_names,
            classification_labels=classification_labels,
            classification_weights=classification_weights,
        )
        serializer = transformer.transform()
        model_type = transformer.model_type

        if es_if_exists is None:
            es_if_exists = "fail"

        ml_model = MLModel(
            es_client=es_client,
            model_id=model_id,
        )
        if es_if_exists not in ("fail", "replace"):
            raise ValueError("'es_if_exists' must be either 'fail' or 'replace'")
        elif es_if_exists == "fail":
            if ml_model.exists_model():
                raise ValueError(
                    f"Trained machine learning model {model_id} already exists"
                )
        elif es_if_exists == "replace":
            ml_model.delete_model()

        body: Dict[str, Any] = {
            "input": {"field_names": feature_names},
        }
        # 'inference_config' is required in 7.8+ but isn't available in <=7.7
        if es_version(es_client) >= (7, 8):
            body["inference_config"] = {model_type: {}}

        if es_compress_model_definition:
            body["compressed_definition"] = serializer.serialize_and_compress_model()
        else:
            body["definition"] = serializer.serialize_model()

        ml_model._client.ml.put_trained_model(
            model_id=model_id,
            body=body,
        )
        return ml_model
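
To make the version gating concrete, here is a sketch of the request body assembled above for a 7.8+ cluster with compression enabled; all values are illustrative:

# Illustrative ml.put_trained_model body for ES 7.8+ with
# es_compress_model_definition=True -- the values are made up.
body = {
    "input": {"field_names": ["f0", "f1", "f2", "f3", "f4"]},
    "inference_config": {"classification": {}},  # model_type from the transformer
    "compressed_definition": "H4sIAAAA...",  # base64 of gzipped JSON (truncated)
}

On a 7.7 or earlier cluster the "inference_config" key is omitted, since that parameter isn't available there.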
Example no. 5
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# Define the Elasticsearch host used in tests
ELASTICSEARCH_HOST = os.environ.get("ELASTICSEARCH_HOST") or "localhost"

# Define client to use in tests
TEST_SUITE = os.environ.get("TEST_SUITE", "xpack")
if TEST_SUITE == "xpack":
    ES_TEST_CLIENT = Elasticsearch(
        ELASTICSEARCH_HOST,
        http_auth=("elastic", "changeme"),
    )
else:
    ES_TEST_CLIENT = Elasticsearch(ELASTICSEARCH_HOST)

ES_VERSION = es_version(ES_TEST_CLIENT)

FLIGHTS_INDEX_NAME = "flights"
FLIGHTS_MAPPING = {
    "mappings": {
        "properties": {
            "AvgTicketPrice": {"type": "float"},
            "Cancelled": {"type": "boolean"},
            "Carrier": {"type": "keyword"},
            "Dest": {
Example no. 6
    def __init__(
        self,
        es_client,
        model_id: str,
        model: Union[
            DecisionTreeClassifier,
            DecisionTreeRegressor,
            RandomForestRegressor,
            RandomForestClassifier,
            XGBClassifier,
            XGBRegressor,
        ],
        feature_names: List[str],
        classification_labels: Optional[List[str]] = None,
        classification_weights: Optional[List[float]] = None,
        overwrite: bool = False,
    ):
        super().__init__(es_client, model_id)

        self._feature_names = feature_names
        self._model_type = None

        # Transform model
        if isinstance(model, DecisionTreeRegressor):
            serializer = SKLearnDecisionTreeTransformer(
                model, feature_names).transform()
            self._model_type = MLModel.TYPE_REGRESSION
        elif isinstance(model, DecisionTreeClassifier):
            serializer = SKLearnDecisionTreeTransformer(
                model, feature_names, classification_labels).transform()
            self._model_type = MLModel.TYPE_CLASSIFICATION
        elif isinstance(model, RandomForestRegressor):
            serializer = SKLearnForestRegressorTransformer(
                model, feature_names).transform()
            self._model_type = MLModel.TYPE_REGRESSION
        elif isinstance(model, RandomForestClassifier):
            serializer = SKLearnForestClassifierTransformer(
                model, feature_names, classification_labels).transform()
            self._model_type = MLModel.TYPE_CLASSIFICATION
        elif isinstance(model, XGBRegressor):
            serializer = XGBoostRegressorTransformer(
                model, feature_names).transform()
            self._model_type = MLModel.TYPE_REGRESSION
        elif isinstance(model, XGBClassifier):
            serializer = XGBoostClassifierTransformer(
                model, feature_names, classification_labels).transform()
            self._model_type = MLModel.TYPE_CLASSIFICATION
        else:
            raise NotImplementedError(
                f"ML model of type {type(model)} is not currently implemented")

        if overwrite:
            self.delete_model()

        serialized_model = serializer.serialize_and_compress_model()
        body = {
            "compressed_definition": serialized_model,
            "input": {
                "field_names": feature_names
            },
        }
        # 'inference_config' is required in 7.8+ but isn't available in <=7.7
        if es_version(self._client) >= (7, 8):
            body["inference_config"] = {self._model_type: {}}

        self._client.ml.put_trained_model(
            model_id=self._model_id,
            body=body,
        )
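
A hypothetical usage sketch of this older constructor-based API; the class name ImportedMLModel is an assumption, since the snippet doesn't show the class declaration:

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier

X, y = datasets.make_classification(n_features=5, random_state=0)
classifier = DecisionTreeClassifier().fit(X, y)

# ImportedMLModel is assumed from the __init__ shown above.
es_model = ImportedMLModel(
    "localhost",
    model_id="test_decision_tree_classifier",
    model=classifier,
    feature_names=["f0", "f1", "f2", "f3", "f4"],
    overwrite=True,  # delete any existing model with this ID first
)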